# Breast Cancer Detection

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# image processing
from PIL import Image
import cv2

# plotting (we will need this later for our plots)
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning models (we want to try a lot of them and see which one performs the best!)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# balancing the dataset (we need that later too)
from imblearn.over_sampling import SMOTE


In [2]:
# Root directory of the image dataset.
# Set the BREAST_CANCER_DATA_DIR environment variable to point the notebook at
# a different location; when it is unset, the original local path is used.
data_dir = os.environ.get(
    'BREAST_CANCER_DATA_DIR',
    '/Users/josuegodeme/Downloads/AI4All Project/archive',
)


In [3]:
import glob

# Parallel lists: labels[i] is the class of the file at image_paths[i].
image_paths = []
labels = []

# Walk the dataset directory tree and collect every labelled PNG file.
for dirname, subdirs, filenames in os.walk(data_dir):
    # Sorting in place makes os.walk descend in a fixed order, so the
    # resulting lists (and any seeded split built on them) do not depend on
    # the order in which the filesystem happens to list entries.
    subdirs.sort()
    for filename in sorted(filenames):
        if not filename.endswith('.png'):
            continue

        # The class label is encoded in the filename.
        if 'class0' in filename:
            label = 0
        elif 'class1' in filename:
            label = 1
        else:
            # No recognizable label: skip the file entirely. Appending the
            # path without a label would shift every later label by one.
            continue

        image_paths.append(os.path.join(dirname, filename))
        labels.append(label)


In [4]:
from collections import Counter

print(f'Total images: {len(image_paths)}')
print(f'Total labels: {len(labels)}')

# Each image must have exactly one label; a mismatch means the two lists are
# misaligned and everything computed from them downstream would be wrong.
assert len(image_paths) == len(labels), (
    f'{len(image_paths)} images but {len(labels)} labels'
)

# Check the distribution of classes
print('Class distribution:', Counter(labels))


Total images: 277524
Total labels: 277524
Class distribution: Counter({0: 198738, 1: 78786})


In [5]:
# Decoded images, appended in the same order as image_paths.
images = []

for path in image_paths:
    # cv2.imread reports an unreadable or missing file by returning None
    # instead of raising.
    img = cv2.imread(path)
    if img is None:
        # Stop with the offending path rather than failing inside cvtColor
        # with an opaque assertion. Skipping the file instead would leave
        # `images` shorter than `image_paths`.
        raise IOError(f'Could not read image: {path}')

    # Convert BGR to RGB
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Normalize the image (presumably 8-bit pixels, giving values in [0, 1])
    img = img / 255.0

    images.append(img)


In [6]:
import numpy as np
from typing import List
import sys

def debug_image_data(images: List) -> None:
    """
    Print a diagnostic summary of an image collection.

    Reports the container's type and length, then for each of the first five
    images its type, array shape, dtype, min/max values and approximate size.
    A failure while inspecting one image is printed and does not abort the
    report.

    Args:
        images: List of images to analyze

    Returns:
        None. All information is written to stdout.
    """
    print("=== Data Structure Analysis ===")
    print(f"Type of images container: {type(images)}")
    print(f"Length of images container: {len(images)}")

    # Check first few images
    for i in range(min(5, len(images))):
        img = images[i]
        print(f"\nImage {i}:")
        print(f"  Type: {type(img)}")
        try:
            # Convert once and reuse the array for every statistic.
            arr = np.array(img)
            print(f"  Shape: {arr.shape}")
            print(f"  Data type: {arr.dtype}")
            print(f"  Min value: {np.min(arr)}")
            print(f"  Max value: {np.max(arr)}")
        except Exception as e:
            print(f"  Error converting to array: {str(e)}")

        # Memory size
        try:
            print(f"  Approximate memory size: {sys.getsizeof(img) / 1024:.2f} KB")
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit still stop the notebook.
            print("  Could not determine memory size")

def convert_images_safely(images: List, expected_shape: tuple = (50, 50, 3)) -> np.ndarray:
    """
    Stack a list of images into one array, skipping any that do not fit.

    Each image is converted with ``np.asarray`` and kept only if its shape
    equals ``expected_shape``. Images that fail conversion or have another
    shape are reported on stdout and skipped, so the returned array may have
    fewer rows than ``images`` has items; callers that pair the result with a
    separate label list must account for that.

    Args:
        images: List of images
        expected_shape: Shape every image must have to be kept.
            Defaults to (50, 50, 3).

    Returns:
        Numpy array of shape (n_kept, *expected_shape)

    Raises:
        ValueError: If no image could be kept.
    """
    required_shape = tuple(expected_shape)
    processed_images = []
    for i, img in enumerate(images):
        try:
            img_array = np.asarray(img)
        except Exception as e:
            print(f"Error processing image {i}: {str(e)}")
            continue

        if img_array.shape != required_shape:
            # Report and skip directly instead of raising an exception only to
            # catch it again two lines later.
            print(f"Error processing image {i}: Image {i} has unexpected shape: {img_array.shape}")
            continue

        processed_images.append(img_array)

    if not processed_images:
        raise ValueError("No images could be processed successfully")

    return np.stack(processed_images)

# Example usage:
def process_image_data(images):
    """
    Print a diagnostic report for `images`, then try to stack them.

    Returns the stacked array on success. If the conversion raises, the
    reason is printed and None is returned instead.
    """
    print("Analyzing image data structure...")
    debug_image_data(images)

    print("\nAttempting safe conversion...")
    try:
        stacked = convert_images_safely(images)
        print(f"Success! Final array shape: {stacked.shape}")
    except Exception as err:
        print(f"Conversion failed: {str(err)}")
        return None
    return stacked

In [7]:
import numpy as np
from typing import List, Tuple
from PIL import Image

def resize_and_normalize_images(images: List[np.ndarray], 
                              target_size: Tuple[int, int] = (50, 50)) -> np.ndarray:
    """
    Resize images to target size and normalize pixel values.

    Images that raise during processing are counted, reported (first five
    only) and skipped, so the result may contain fewer images than the input.

    Args:
        images: List of numpy arrays representing images, with float pixel
            values expected in [0, 1]
        target_size: Tuple of (height, width) for output images

    Returns:
        float32 numpy array of processed images with shape
        (n_images, height, width, 3) for 3-channel inputs, values in [0, 1]

    Raises:
        ValueError: If no image could be processed.
    """
    height, width = target_size
    processed_images = []
    errors = 0
    
    for i, img in enumerate(images):
        try:
            # Back to 8-bit for PIL. Round instead of truncating so that a
            # value such as 0.99999 maps to 255 rather than 254, and clip so
            # out-of-range inputs cannot wrap around in uint8.
            pixels = np.clip(np.rint(np.asarray(img) * 255.0), 0, 255).astype(np.uint8)
            pil_img = Image.fromarray(pixels)
            
            # PIL takes the size as (width, height), the reverse of the
            # (height, width) order this function documents.
            resized_img = pil_img.resize((width, height), Image.Resampling.BILINEAR)
            
            # Convert back to numpy array and normalize
            img_array = np.array(resized_img).astype(np.float32) / 255.0
            
            processed_images.append(img_array)
            
            # Print progress every 10000 images
            if (i + 1) % 10000 == 0:
                print(f"Processed {i + 1}/{len(images)} images...")
                
        except Exception as e:
            errors += 1
            if errors <= 5:  # Only show first 5 errors
                print(f"Error processing image {i}: {str(e)}")
            continue
    
    if errors > 0:
        print(f"\nTotal errors encountered: {errors}")
    
    # np.stack on an empty list fails with an unhelpful message; say what
    # actually went wrong.
    if not processed_images:
        raise ValueError("No images could be processed successfully")
    
    # Stack all processed images into a single array
    X = np.stack(processed_images)
    
    print(f"\nFinal array shape: {X.shape}")
    print(f"Value range: [{X.min():.3f}, {X.max():.3f}]")
    
    return X

# Helper: human-readable size of an array's data buffer
def get_memory_usage(X: np.ndarray) -> str:
    """Return the array's `nbytes` expressed in MB, formatted to two decimals."""
    bytes_per_mb = 1024 * 1024
    megabytes = X.nbytes / bytes_per_mb
    return f"{megabytes:.2f} MB"

# Example usage
def process_dataset(images, labels):
    """
    Process complete dataset with images and labels.

    Args:
        images: List of image arrays, passed to resize_and_normalize_images
        labels: Sequence of class labels, one per image and in the same order

    Returns:
        Tuple (X, y): the processed image array and the labels as a numpy array

    Raises:
        ValueError: If the number of processed images differs from the number
            of labels.
    """
    print("Starting image processing...")
    X = resize_and_normalize_images(images)
    y = np.array(labels)
    
    # resize_and_normalize_images skips images it cannot process. If that
    # happened, X[i] no longer corresponds to y[i]; stop here rather than
    # hand back silently mislabelled data.
    if len(X) != len(y):
        raise ValueError(
            f"Processed {len(X)} images but received {len(y)} labels; "
            "images and labels are no longer aligned"
        )
    
    print("\nDataset summary:")
    print(f"Images shape: {X.shape}")
    print(f"Labels shape: {y.shape}")
    print(f"Memory usage: {get_memory_usage(X)}")
    
    return X, y

In [None]:
# Build the feature array X and the label vector y from the loaded data
X, y = process_dataset(images=images, labels=labels)

Starting image processing...
Processed 10000/277524 images...
Processed 20000/277524 images...
Processed 30000/277524 images...
Processed 40000/277524 images...
Processed 50000/277524 images...
Processed 60000/277524 images...
Processed 70000/277524 images...
Processed 80000/277524 images...
Processed 90000/277524 images...
Processed 100000/277524 images...
Processed 110000/277524 images...
Processed 120000/277524 images...
Processed 130000/277524 images...
Processed 140000/277524 images...
Processed 150000/277524 images...
Processed 160000/277524 images...
Processed 170000/277524 images...
Processed 180000/277524 images...
Processed 190000/277524 images...
Processed 200000/277524 images...
Processed 210000/277524 images...
Processed 220000/277524 images...
Processed 230000/277524 images...
Processed 240000/277524 images...
Processed 250000/277524 images...
Processed 260000/277524 images...
Processed 270000/277524 images...


In [None]:
# Features and labels should report the same number of samples
print(f'X shape: {X.shape}', f'y shape: {y.shape}', sep='\n')


X shape: (277524, 50, 50, 3)
y shape: (277524,)


In [None]:
# Step 1: hold out 20% of the data (stratified by class) for validation + test
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

# Step 2: cut the held-out portion in half: one half validation, one half test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

print(
    f'Training set shape: {X_train.shape}',
    f'Validation set shape: {X_val.shape}',
    f'Test set shape: {X_test.shape}',
    sep='\n',
)


Training set shape: (222019, 50, 50, 3)
Validation set shape: (27752, 50, 50, 3)
Test set shape: (27753, 50, 50, 3)


In [None]:
# Collapse every image into a single feature row: (n_samples, n_features)
X_train_flat = X_train.reshape(len(X_train), -1)
X_val_flat = X_val.reshape(len(X_val), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

print(f'Flattened training set shape: {X_train_flat.shape}')


Flattened training set shape: (222019, 7500)


To be uncommented later, because these are our KNN models.

In [None]:
# Baseline 1: 5-nearest-neighbours on the flattened pixel features
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train_flat, y=y_train)

# Score the fitted model on the validation split
y_val_pred_knn = knn.predict(X=X_val_flat)

# Per-class precision / recall / F1
print("KNN Classification Report (Validation Set):")
print(classification_report(y_true=y_val, y_pred=y_val_pred_knn))


In [None]:
# Baseline 2: Gaussian Naive Bayes on the flattened pixel features
gnb = GaussianNB()
gnb.fit(X=X_train_flat, y=y_train)

# Score the fitted model on the validation split
y_val_pred_gnb = gnb.predict(X=X_val_flat)

# Per-class precision / recall / F1
print("Gaussian Naive Bayes Classification Report (Validation Set):")
print(classification_report(y_true=y_val, y_pred=y_val_pred_gnb))


In [None]:
## THIS DOES NOT WORK YET - PLEASE SKIP THIS CELL
# Baseline 3: Logistic Regression, capped at 500 solver iterations
lr = LogisticRegression(max_iter=500)
lr.fit(X=X_train_flat, y=y_train)

# Score the fitted model on the validation split
y_val_pred_lr = lr.predict(X=X_val_flat)

# Per-class precision / recall / F1
print("Logistic Regression Classification Report (Validation Set):")
print(classification_report(y_true=y_val, y_pred=y_val_pred_lr))


# RESAMPLING OUR DATASET: Fixing the Imbalance Issue

### Try 1 : Undersampling



In [None]:
from collections import Counter

# Per-class sample counts in the training split before any resampling.
# (The printed label says "shape", but the value shown is a Counter of labels.)
print('Original training set shape:', Counter(y_train))



Original training set shape: Counter({0: 158990, 1: 63029})


In [None]:
import numpy as np
from sklearn.utils import shuffle

# Row positions of each class within the training split
class_0_indices = np.flatnonzero(y_train == 0)
class_1_indices = np.flatnonzero(y_train == 1)

# Class 1 is the smaller class; its size is the target for class 0
n_class_1 = class_1_indices.size

# Draw that many class-0 rows without replacement (seeded for reproducibility)
np.random.seed(42)
class_0_selected_indices = np.random.choice(
    class_0_indices,
    size=n_class_1,
    replace=False,
)

# Balanced index set: the sampled class-0 rows followed by all class-1 rows
undersampled_indices = np.concatenate((class_0_selected_indices, class_1_indices))

# Pull the balanced subset out of the flattened training data
X_train_undersampled = X_train_flat[undersampled_indices]
y_train_undersampled = y_train[undersampled_indices]

# Mix the two classes together so they are no longer stored back to back
X_train_undersampled, y_train_undersampled = shuffle(
    X_train_undersampled,
    y_train_undersampled,
    random_state=42,
)

# Both classes should now have the same count
print('Undersampled training set shape:', Counter(y_train_undersampled))


In [None]:
# 5-nearest-neighbours trained on the undersampled (balanced) training data
knn_undersampled = KNeighborsClassifier(n_neighbors=5)
knn_undersampled.fit(X=X_train_undersampled, y=y_train_undersampled)

# Score on the validation split, which is left unresampled
y_val_pred_knn_undersampled = knn_undersampled.predict(X=X_val_flat)

# Per-class precision / recall / F1
print("KNN Classification Report after Undersampling (Validation Set):")
print(classification_report(y_true=y_val, y_pred=y_val_pred_knn_undersampled))


In [None]:
# Gaussian Naive Bayes trained on the undersampled (balanced) training data
gnb_undersampled = GaussianNB()
gnb_undersampled.fit(X=X_train_undersampled, y=y_train_undersampled)

# Score on the validation split, which is left unresampled
y_val_pred_gnb_undersampled = gnb_undersampled.predict(X=X_val_flat)

# Per-class precision / recall / F1
print("Gaussian Naive Bayes Classification Report after Undersampling (Validation Set):")
print(classification_report(y_true=y_val, y_pred=y_val_pred_gnb_undersampled))


In [None]:
## THIS DOES NOT WORK YET - PLEASE SKIP THIS CELL

# Logistic Regression (max 500 iterations) on the undersampled training data
lr_undersampled = LogisticRegression(max_iter=500)
lr_undersampled.fit(X=X_train_undersampled, y=y_train_undersampled)

# Score on the validation split, which is left unresampled
y_val_pred_lr_undersampled = lr_undersampled.predict(X=X_val_flat)

# Per-class precision / recall / F1
print("Logistic Regression Classification Report after Undersampling (Validation Set):")
print(classification_report(y_true=y_val, y_pred=y_val_pred_lr_undersampled))


### Try 2: SYNTHETIC MINORITY OVER SAMPLING TECHNIQUE

Information about SMOTE: https://medium.com/@corymaklin/synthetic-minority-over-sampling-technique-smote-7d419696b88c

Synthetic Minority Over-sampling Technique (SMOTE): it creates new examples of the minority class by generating synthetic data points that are close to the existing ones.

In [None]:
## None of the code below has been run yet (I get an error when running locally that I haven't figured out yet)

In [None]:
# Class counts before resampling
# (the printed label says "shape", but the value shown is a Counter of labels)
print('Original dataset shape:', Counter(y_train))

# Resample with SMOTE using only the flattened training features; the
# validation and test splits are not passed in here.
# NOTE(review): the recorded output stops after the first print, so this step
# appears not to have completed. Confirm it fits in memory at this data size.
sm = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = sm.fit_resample(X_train_flat, y_train)

# Class counts after resampling
print('Resampled dataset shape:', Counter(y_train_balanced))


Original dataset shape: Counter({0: 158990, 1: 63029})


: 

In [None]:
# Retrain KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_balanced, y_train_balanced)

# Predict on the validation set
y_val_pred_knn_balanced = knn.predict(X_val_flat)

# Evaluate the model
print("KNN Classification Report after SMOTE (Validation Set):")
print(classification_report(y_val, y_val_pred_knn_balanced))


In [None]:
# Fresh Gaussian Naive Bayes model fitted on the SMOTE-balanced training data
# (this rebinds the name `gnb` used by the earlier baseline cell)
gnb = GaussianNB()
gnb.fit(X=X_train_balanced, y=y_train_balanced)

# Score on the validation split, which is left unresampled
y_val_pred_gnb_balanced = gnb.predict(X=X_val_flat)

# Per-class precision / recall / F1
print("Gaussian Naive Bayes Classification Report after SMOTE (Validation Set):")
print(classification_report(y_true=y_val, y_pred=y_val_pred_gnb_balanced))


In [None]:
# Retrain Logistic Regression on the SMOTE-balanced training data.
# A new estimator is created here, as the KNN and Naive Bayes cells above do,
# because the only other definition of `lr` is in a cell readers are told to
# skip; relying on it would raise NameError on a fresh run.
lr = LogisticRegression(max_iter=500)
lr.fit(X_train_balanced, y_train_balanced)

# Predict on the validation set
y_val_pred_lr_balanced = lr.predict(X_val_flat)

# Evaluate the model
print("Logistic Regression Classification Report after SMOTE (Validation Set):")
print(classification_report(y_val, y_val_pred_lr_balanced))
