<a href="https://colab.research.google.com/github/AAdewunmi/Breast-Cancer-Risk-Prediction-Project/blob/main/Breast_Cancer_Risk_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Breast Cancer Prediction Project

We will build a deep learning model, trained on a breast cancer autopsy image dataset, to predict whether a person has breast cancer and whether the tumor is benign or malignant.

We will be using DenseNet-201, a convolutional neural network that is 201 layers deep. We can either load a pretrained version of the network or retrain the model; in this project we retrain it.

We will then build our own website using the Django framework and host it on AWS.

In [1]:
# Mount Google Drive at /content/drive so that later cells can read the image
# dataset stored under MyDrive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install core ML/CV dependencies

!pip install numpy pandas matplotlib tensorflow keras opencv-python Pillow scikit-learn scipy tqdm



In [5]:
# Import all core libraries required for the model training script,
# including TensorFlow/Keras, NumPy, Pandas, scikit-learn, OpenCV (cv2),
# and Pillow.

import json
import math
import os
import cv2
from PIL import Image
import numpy as np
import tensorflow
from keras import layers
from tensorflow.keras.applications.densenet import DenseNet201
from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score
import scipy
from tqdm import tqdm
import gc
from functools import partial
from sklearn import metrics
from collections import Counter
import json
import itertools


In [10]:
# Implement Dataset_loader and init benign/malign datasets

import os
from typing import Tuple, Union, List

import cv2
import numpy as np
from PIL import Image
from tqdm import tqdm


def Dataset_loader(DIR: str, RESIZE: Union[int, Tuple[int, int]], sigmaX: float = 0.0) -> np.ndarray:
    """
    Load all PNG images from a directory, convert them to RGB, resize, and return a NumPy array.

    Parameters
    ----------
    DIR : str
        Directory containing images. Only regular files with a .png extension
        (case-insensitive) are loaded; sub-directories are ignored even if
        their name ends in .png.
    RESIZE : int | tuple[int, int]
        Target size. If an integer (Python int or NumPy integer), images are
        resized to (RESIZE, RESIZE). If a tuple, interpret as (width, height)
        per OpenCV's convention.
    sigmaX : float, optional
        Standard deviation for Gaussian blur. If > 0, apply cv2.GaussianBlur; default is 0 (no blur).

    Returns
    -------
    np.ndarray
        Array of shape (N, H, W, 3) in uint8 RGB, where N is the number of images.
        If the directory has no PNGs, returns an empty array with shape (0, H, W, 3).

    Raises
    ------
    FileNotFoundError
        If the provided directory does not exist.
    ValueError
        If RESIZE is a non-positive integer, or a sequence that does not hold
        exactly two values that are positive after integer conversion.

    Notes
    -----
    - Uses PIL to read and ensure RGB; uses OpenCV for resizing (INTER_AREA).
    - Processing order is deterministic (alphabetical file name sort).
    - Consider renaming to `load_dataset` for PEP 8 compliance in future refactors.

    Examples
    --------
    >>> imgs = Dataset_loader("/path/to/benign", 224)
    >>> imgs.shape
    (N, 224, 224, 3)
    """
    if not os.path.isdir(DIR):
        raise FileNotFoundError(f"Directory not found: {DIR}")

    # Normalize RESIZE into a validated (width, height) tuple for OpenCV.
    # NumPy integers are accepted too, since sizes often come from array shapes.
    if isinstance(RESIZE, (int, np.integer)):
        if RESIZE <= 0:
            raise ValueError("RESIZE must be a positive integer.")
        target_size: Tuple[int, int] = (int(RESIZE), int(RESIZE))
    else:
        if len(RESIZE) != 2 or RESIZE[0] <= 0 or RESIZE[1] <= 0:
            raise ValueError("RESIZE tuple must be (width, height) with positive integers.")
        target_size = (int(RESIZE[0]), int(RESIZE[1]))
        # Fractional values such as 0.5 pass the check above but truncate to 0.
        if target_size[0] <= 0 or target_size[1] <= 0:
            raise ValueError("RESIZE tuple must be (width, height) with positive integers.")

    # Collect .png files only (case-insensitive), sorted for reproducibility.
    # The isfile() check skips directories that merely carry a .png suffix.
    entries: List[str] = sorted(os.listdir(DIR))
    png_paths = [
        path
        for path in (os.path.join(DIR, name) for name in entries)
        if os.path.splitext(path)[1].lower() == ".png" and os.path.isfile(path)
    ]

    # Early return with empty (0, H, W, 3) if no PNGs found
    if not png_paths:
        return np.empty((0, target_size[1], target_size[0], 3), dtype=np.uint8)

    images = []
    for path in tqdm(png_paths, desc=f"Loading {os.path.basename(DIR) or DIR}"):
        # Read with PIL, enforce RGB; the context manager releases the file
        # handle deterministically instead of relying on garbage collection.
        with Image.open(path) as img:
            rgb = np.asarray(img.convert("RGB"))
        # Resize with OpenCV (expects (width, height))
        resized = cv2.resize(rgb, target_size, interpolation=cv2.INTER_AREA)
        # Optional blur for preprocessing
        if sigmaX and sigmaX > 0:
            resized = cv2.GaussianBlur(resized, ksize=(0, 0), sigmaX=sigmaX)
        # Guarantee uint8 without copying when the array already has that dtype.
        images.append(resized.astype(np.uint8, copy=False))

    return np.stack(images, axis=0)

# Load both image classes at 224x224. The paths are relative, so they resolve
# against the notebook's current working directory.
benign = Dataset_loader(
    DIR="drive/MyDrive/Colab Notebooks/Breast-Cancer-Risk-Prediction/data/benign",
    RESIZE=224,
)
malign = Dataset_loader(
    DIR="drive/MyDrive/Colab Notebooks/Breast-Cancer-Risk-Prediction/data/malign",
    RESIZE=224,
)


Loading benign: 100%|██████████| 112/112 [00:05<00:00, 19.89it/s]
Loading malign: 100%|██████████| 127/127 [00:04<00:00, 26.45it/s]
