In [1]:
from google.colab import drive
# Attach Google Drive so files stored under MyDrive become readable at /content/drive.
drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [2]:
# Location of the competition archive inside the mounted Google Drive.
zip_path = (
    '/content/drive/MyDrive/'
    'micro-club-pinktober-breast-cancer-detection.zip'
)


In [3]:
import zipfile

# Unpack every member of the archive into /content/breast_cancer_data.
# The context manager closes the archive handle once extraction finishes.
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path='/content/breast_cancer_data')


In [4]:
import os
# Sanity check: confirm the extracted dataset folders are reachable through
# the relative paths used below before any training code runs.
DATA_DIR = './breast_cancer_data'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')

data_dir_found = os.path.exists(DATA_DIR)
train_dir_found = os.path.exists(TRAIN_DIR)

print(f"Checking existence of DATA_DIR: {data_dir_found}")
print(f"Checking existence of TRAIN_DIR: {train_dir_found}")

Checking existence of DATA_DIR: True
Checking existence of TRAIN_DIR: True


In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

import numpy as np
import pandas as pd
import os

# --- 1. CONFIGURATION ---

# ⚠️ UPDATE THESE PATHS
# Relative paths: they resolve against the notebook's current working directory.
TRAIN_DIR = "./breast_cancer_data/train"
TEST_DIR = "./breast_cancer_data/test"
SUBMISSION_FILE = "submission.csv"

# Model & Training Parameters
IMAGE_SIZE = (224, 224)   # (height, width) every image is resized to on load
BATCH_SIZE = 32
EPOCHS = 20               # upper bound for Stage 1; early stopping may end sooner
# FIX ATTEMPT: Reset to standard transfer learning LR (0.001)
# since the model was stuck at 50% with the lower LR (0.0001).
LEARNING_RATE = 0.001

# Reproducibility: seed the Python, NumPy and TensorFlow random generators in
# one call so that weight initialisation, dropout and the random augmentation
# layers start from the same state on every Restart & Run All.
SEED = 42
keras.utils.set_random_seed(SEED)


# --- 2. LOAD & PREPARE DATA ---

print("Loading and preparing data...")

# 1. Load Training and Validation Datasets
# Both subsets are read from the same directory with the same split fraction
# and seed; only the `subset` argument differs between the two calls.
_split_options = dict(
    validation_split=0.2,
    seed=42,
    image_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    label_mode='binary',
)

train_ds = tf.keras.utils.image_dataset_from_directory(
    TRAIN_DIR, subset="training", **_split_options
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    TRAIN_DIR, subset="validation", **_split_options
)

# 2. Get class names and map them to M/N
# Read the names now: the datasets are re-assigned to mapped pipelines further
# down, so the attribute is captured from the freshly loaded dataset here.
class_names = train_ds.class_names
print(f"Class names found (Index 0/1): {class_names}")

def map_to_submission_label(index, names=None):
    """Translate a predicted class index into its submission code.

    Args:
        index: Integer position into the class-name list.
        names: Optional sequence of class-name strings to index into. When
            omitted, the notebook-level ``class_names`` list is used, which
            keeps existing single-argument calls working unchanged.

    Returns:
        'M' when the selected name is 'malignant', 'N' when it is 'normal'
        (both compared case-insensitively), and '?' for any other name.

    Raises:
        IndexError: if ``index`` is outside the range of the name list.
    """
    if names is None:
        names = class_names
    # Table lookup instead of an if/elif chain; unknown names fall back to '?'.
    code_by_name = {'malignant': 'M', 'normal': 'N'}
    return code_by_name.get(names[index].lower(), '?')

# 3. Create Data Augmentation layer
# Random flips on both axes, then rotation, zoom and contrast jitter, each
# configured with a factor of 0.2. The layers run in the order listed.
_augmentation_steps = [
    layers.RandomFlip("horizontal_and_vertical"),
    layers.RandomRotation(0.2),
    layers.RandomZoom(0.2),
    layers.RandomContrast(0.2),
]
data_augmentation = keras.Sequential(_augmentation_steps, name="data_augmentation")


# 4. Define preprocessing functions (MobileNetV2 normalization)
def preprocess_and_augment(image, label):
    """Augment the image tensor, then apply MobileNetV2 input preprocessing.

    The augmentation layers are called with ``training=True`` so the random
    transforms are active. The label is passed through untouched.
    """
    augmented = data_augmentation(image, training=True)
    return preprocess_input(augmented), label

def preprocess_only(image, label):
    """Apply MobileNetV2 input preprocessing without any augmentation.

    The label is passed through untouched.
    """
    return preprocess_input(image), label

# 5. Apply preprocessing and optimize pipelines
# Training data is augmented and preprocessed; validation data is only
# preprocessed. Both pipelines map in parallel and prefetch with AUTOTUNE.
train_ds = (
    train_ds
    .map(preprocess_and_augment, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

val_ds = (
    val_ds
    .map(preprocess_only, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)


# --- 3. BUILD THE MODEL (TRANSFER LEARNING) ---

print("Building model...")

# Height/width from the config plus three colour channels.
_input_shape = IMAGE_SIZE + (3,)

# 1. Pre-trained MobileNetV2 backbone with ImageNet weights, without its
#    original classification top.
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=_input_shape,
)

# 2. Freeze the backbone so Stage 1 only trains the new head.
base_model.trainable = False

# 3. New classification head.
# NOTE: no preprocess_input layer inside the model; the dataset pipelines
# already apply it, and a second copy was the cause of the 50% accuracy.
inputs = keras.Input(shape=_input_shape)
x = base_model(inputs, training=False)  # backbone is called in inference mode
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.5)(x)  # Increased Dropout for stability
outputs = layers.Dense(1, activation='sigmoid')(x)

# 4. Assemble the final model.
model = Model(inputs=inputs, outputs=outputs)

# 5. Compile for binary classification.
_stage1_optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(
    optimizer=_stage1_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()


# --- 4. TRAIN THE MODEL ---

print("Starting model training (Stage 1: Frozen Base)...")

# Generous patience because training has been unstable: stop only after
# 7 epochs without a val_accuracy improvement, then restore the best weights.
_early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=7,
    restore_best_weights=True,
)
callbacks = [_early_stopping]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
)


# ----------------------------------------------------------------------
# ⚡ OPTIONAL STAGE 2: Fine-Tuning
# Run this block ONLY if Stage 1 (above) results in accuracy > 65%
# ----------------------------------------------------------------------

print("\n--- Starting Stage 2: Fine-Tuning ---")

FINE_TUNE_EPOCHS = 5    # number of additional epochs to train in Stage 2
FINE_TUNE_LAYERS = 20   # number of trailing backbone layers left trainable

# Unfreeze the base model, then re-freeze everything except its last
# FINE_TUNE_LAYERS layers so only the top of the backbone is adapted.
base_model.trainable = True
for layer in base_model.layers[:-FINE_TUNE_LAYERS]:
    layer.trainable = False

# Re-compile after changing `trainable`, using a 10x lower learning rate
# (LEARNING_RATE / 10) so the pre-trained weights are only nudged.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE / 10),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# `history.epoch` holds the 0-based indices of the epochs Stage 1 actually ran,
# so its length is the epoch number to resume from. Using `history.epoch[-1]`
# would repeat the last epoch number, and a fixed `EPOCHS + 5` would schedule
# far more than FINE_TUNE_EPOCHS epochs whenever early stopping ended Stage 1
# before EPOCHS was reached.
stage1_epochs_run = len(history.epoch)

# Continue training for exactly FINE_TUNE_EPOCHS more epochs
# (early stopping may still end this stage sooner).
history_ft = model.fit(
    train_ds,
    epochs=stage1_epochs_run + FINE_TUNE_EPOCHS,
    initial_epoch=stage1_epochs_run,
    validation_data=val_ds,
    callbacks=callbacks
)

# ----------------------------------------------------------------------

# --- 5. MAKE PREDICTIONS AND CREATE SUBMISSION FILE ---

print(f"\nLoading test data from: {TEST_DIR}")

# 1. Load the test dataset: unlabeled, and unshuffled so that predictions
#    line up with the file order reported by `file_paths`.
test_ds = tf.keras.utils.image_dataset_from_directory(
    TEST_DIR,
    image_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    labels=None,
    shuffle=False,
)

# 2. Keep only the bare file names, in dataset order. This must happen before
#    the dataset variable is re-assigned to the mapped pipeline below.
test_filenames = list(map(os.path.basename, test_ds.file_paths))


# 3. Apply the MobileNetV2 preprocessing used for train/val, then prefetch.
def _preprocess_test_images(image):
    return preprocess_input(image)


test_ds = (
    test_ds
    .map(_preprocess_test_images, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

# 4. Run the model over the whole test set.
print("Running predictions on test set...")
raw_predictions = model.predict(test_ds)

# 5. Threshold the sigmoid outputs at 0.5 to get class indices (0 or 1).
predicted_indices = (raw_predictions > 0.5).astype(int)

# 6. Translate each index (first column of every row) into 'M' or 'N'.
submission_labels = [
    map_to_submission_label(class_index)
    for class_index in predicted_indices[:, 0]
]

# 7. Assemble the submission table and write it out. The identifier column
#    header is 'ID'.
submission_columns = {
    'ID': test_filenames,
    'label': submission_labels,
}
submission_df = pd.DataFrame(submission_columns)

submission_df.to_csv(SUBMISSION_FILE, index=False)

print("\n-------------------------------------------------")
print(f"SUCCESS! Submission file created: {SUBMISSION_FILE}")
print("First 10 lines of submission (Check Header!):")
print(submission_df.head(10))
print("-------------------------------------------------")

Loading and preparing data...
Found 700 files belonging to 2 classes.
Using 560 files for training.
Found 700 files belonging to 2 classes.
Using 140 files for validation.
Class names found (Index 0/1): ['malignant', 'normal']
Building model...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Starting model training (Stage 1: Frozen Base)...
Epoch 1/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 2s/step - accuracy: 0.4859 - loss: 0.9174 - val_accuracy: 0.5357 - val_loss: 0.7331
Epoch 2/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 468ms/step - accuracy: 0.5045 - loss: 0.8770 - val_accuracy: 0.5500 - val_loss: 0.7303
Epoch 3/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 428ms/step - accuracy: 0.5210 - loss: 0.8330 - val_accuracy: 0.5071 - val_loss: 0.7189
Epoch 4/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 372ms/step - accuracy: 0.5170 - loss: 0.8299 - val_accuracy: 0.5143 - val_loss: 0.7154
Epoch 5/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 476ms/step - accuracy: 0.5563 - loss: 0.7566 - val_accuracy: 0.5071 - val_loss: 0.7277
Epoch 6/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 395ms/step - accuracy: 0.4924 - loss: 0.8112 - val_accuracy: 0.514