In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from tensorflow.keras.utils import to_categorical


In [None]:
# -------- PREPROCESSING PARAMETERS  ----------
# Notebook-wide settings. Both paths are placeholders and must be edited
# before the data-loading code can find anything.
METADATA_CSV = r""  # path to the metadata CSV (empty placeholder; raw string so Windows paths can be pasted as-is)
IMAGE_DIR = r"."  # path to the folder with images (defaults to the current working directory)

# Target image size and batch size -- presumably handed to the preprocessing
# helpers as `img_size` / `batch_size`; TODO confirm at the call sites.
IMG_SIZE = (224, 224)
BATCH_SIZE = 16

In [None]:
# ------- LOADING DATA ------ #

# Load metadata and print first few rows
def load_metadata(csv_path):
    """Read the metadata CSV into a DataFrame and print a short preview.

    Args:
        csv_path: Location of the metadata CSV; forwarded unchanged to
            ``pandas.read_csv``.

    Returns:
        pandas.DataFrame holding the full contents of the CSV.
    """
    # read_csv is a pandas function, so it has to be reached through ``pd``.
    df = pd.read_csv(csv_path)
    # Preview the first rows so the column layout can be checked by eye.
    print(df.head())
    return df

# Split metadata into training and testing sets using train_test_split
def split_metadata(df, label_col, test_size=0.2, random_state=42):
    """Split the metadata into train/test feature frames and label series.

    Args:
        df: Metadata DataFrame that contains ``label_col``.
        label_col: Name of the column holding the class label.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The two feature frames
        are ``df`` without ``label_col``; the two label objects come from
        ``df[label_col]``.
    """
    features = df.drop(columns=[label_col])
    labels = df[label_col]
    # The labels are also passed as ``stratify`` so the split is stratified
    # on the class column.
    split_options = dict(
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, **split_options
    )
    return features_train, features_test, labels_train, labels_test

In [None]:
# ---------- PREPROCESSING FUNCTIONS ----------


# Preprocess a single image: load, resize and normalize
def preprocess_image(img_path, img_size):
    """Load one image from disk, resize it and scale its pixel values.

    Args:
        img_path: Path of the image file to load.
        img_size: Target size, passed to ``load_img`` as ``target_size``.

    Returns:
        The image as an array whose values have been divided by 255.0.
    """
    # load_img resizes while loading because target_size is given.
    img = load_img(img_path, target_size=img_size)
    # Convert to an array, then scale the pixel values by 1/255.
    return img_to_array(img) / 255.0


# Preprocess every image listed in the metadata
def preprocess_images(image_dir, metadata_df, img_size):
    """Preprocess all images referenced by ``metadata_df``.

    Each file is looked up at ``<image_dir>/<label>/<file name>``, where the
    file name is the base name of the row's ``image_path`` value. This is
    the same layout ``preprocess_all_images`` uses.

    Args:
        image_dir: Root folder that contains one sub-folder per label.
        metadata_df: DataFrame with ``image_path`` and ``label`` columns.
        img_size: Target size handed to ``preprocess_image``.

    Returns:
        NumPy array built from the preprocessed images, in row order.
    """
    images = [
        preprocess_image(
            os.path.join(image_dir, label, os.path.basename(image_path)),
            img_size,
        )
        for label, image_path in zip(metadata_df['label'], metadata_df['image_path'])
    ]
    return np.array(images)


# Test out the image preprocessing by visualizing a few sample images
def visualize_sample_images(generator, num_images=9):
    """Plot images from the next batch of ``generator`` in a grid.

    At most ``num_images`` images are shown, and never more than the batch
    contains. The grid is as close to square as possible, so the default of
    nine images yields a 3x3 layout on an 8x8 inch figure.

    Args:
        generator: Iterator whose ``next()`` yields ``(images, labels)``.
        num_images: Upper bound on how many images to draw.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    images, labels = next(generator)

    # Clamp to the batch size so a short batch cannot cause an IndexError.
    count = min(num_images, len(images))
    if count <= 0:
        return

    # Derive the grid from the image count instead of assuming 3x3.
    n_cols = int(np.ceil(np.sqrt(count)))
    n_rows = int(np.ceil(count / n_cols))

    # 8/3 inch per cell reproduces the 8x8 inch figure of the 3x3 case.
    plt.figure(figsize=(n_cols * 8 / 3, n_rows * 8 / 3))
    for i in range(count):
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(images[i])
        # Labels above 0.5 are titled as the positive class.
        label = "Tumor" if labels[i] > 0.5 else "No Tumor"
        plt.title(label)
        plt.axis("off")
    plt.tight_layout()
    plt.show()

# Preprocess all images in the dataframe
def preprocess_all_images(df, image_dir, img_size):
    """Load, resize and normalize every image in ``df`` and encode its label.

    Each file is looked up at ``<image_dir>/<label>/<file name>``, where the
    file name is the base name of the row's ``image_path`` value.

    Args:
        df: Metadata DataFrame with ``image_path`` and ``label`` columns.
        image_dir: Root folder that contains one sub-folder per label.
        img_size: Target size, passed to ``load_img`` as ``target_size``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` holds the images with pixel
        values divided by 255.0, ``y`` holds 1 where the label is ``'yes'``
        and 0 otherwise.
    """
    X, y = [], []
    for label, image_path in zip(df['label'], df['image_path']):
        # Only the file name of image_path is kept; the folder comes from
        # image_dir and the label.
        img_path = os.path.join(image_dir, label, os.path.basename(image_path))
        # Load at the target size, convert to an array and scale by 1/255.
        img = load_img(img_path, target_size=img_size)
        X.append(img_to_array(img) / 255.0)
        y.append(1 if label == 'yes' else 0)

    X = np.array(X)
    y = np.array(y)
    print(f"Processed {len(X)} images. Shape: {X.shape}")
    return X, y

# Create ImageDataGenerators for training and validation sets

def create_datagens(train_df, val_df, base_dir, img_size, batch_size):
    """Build the training and validation image generators.

    The training generator rescales and augments (rotation, shifts, shear,
    zoom, horizontal flip); the validation generator only rescales.

    Args:
        train_df: DataFrame with ``image_path`` and ``label`` columns for training.
        val_df: DataFrame with the same columns for validation.
        base_dir: Accepted for interface compatibility; not used here, since
            ``directory=None`` is passed to ``flow_from_dataframe``.
        img_size: Passed to ``flow_from_dataframe`` as ``target_size``.
        batch_size: Passed to ``flow_from_dataframe`` as ``batch_size``.

    Returns:
        Tuple ``(train_generator, val_generator)``.
    """
    # Augmentation applies to training data only.
    augmenting_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=20,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
    )
    # Validation images are rescaled but otherwise untouched.
    rescaling_datagen = ImageDataGenerator(rescale=1./255)

    # Options shared by both flows; only the shuffle flag differs.
    shared_flow_options = {
        'x_col': 'image_path',
        'y_col': 'label',
        'directory': None,
        'target_size': img_size,
        'batch_size': batch_size,
        'class_mode': 'binary',
    }

    train_generator = augmenting_datagen.flow_from_dataframe(
        train_df, shuffle=True, **shared_flow_options
    )
    val_generator = rescaling_datagen.flow_from_dataframe(
        val_df, shuffle=False, **shared_flow_options
    )
    return train_generator, val_generator

