<a href="https://colab.research.google.com/github/ArnavBharti/Skin-Cancer-CNN/blob/main/skin_cancer_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Skin Cancer Classification with MobileNet

This notebook contains the complete pipeline for classifying 7 types of skin lesions using two different MobileNet-based architectures.

**The process includes:**
1.  **Setup**: Mounting Google Drive and importing libraries.
2.  **Configuration**: Setting up paths, hyperparameters, and class weights.
3.  **Data Loading**: Creating a custom PyTorch Dataset and DataLoader.
4.  **Model Definition**: Defining two MobileNet-based models.
5.  **Training & Evaluation**: Training the models, saving the best weights, and evaluating performance.
6.  **Results**: Visualizing training history, confusion matrices, and detailed classification reports.

# SECTION 1: SETUP AND CONFIGURATION

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from tqdm.notebook import tqdm
import time
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight

# Report the environment in one go: import status, PyTorch version and
# whether a CUDA device is visible to this runtime.
print("\n".join((
    "Libraries imported successfully.",
    f"PyTorch Version: {torch.__version__}",
    f"Using device: {'cuda' if torch.cuda.is_available() else 'cpu'}",
)))

In [None]:
# --- Paths ---
# Everything (training images, validation images, saved checkpoints) lives
# under one project folder on the mounted Google Drive.
BASE_DRIVE_PATH = "/content/drive/My Drive/dl_project/"
TRAIN_DIR, VAL_DIR, MODEL_SAVE_DIR = (
    os.path.join(BASE_DRIVE_PATH, subdir) for subdir in ("train", "val", "models")
)

# The checkpoint folder is created up front so that saving never fails on a
# missing directory; an already existing folder is left untouched.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
print(f"Models will be saved to: {MODEL_SAVE_DIR}")

# --- Device ---
# Use the GPU when the runtime exposes one, otherwise fall back to the CPU.
_device_kind = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_kind)

# --- Model & Training Hyperparameters ---
NUM_CLASSES = 7          # number of lesion categories listed in CLASS_NAMES
IMAGE_SIZE = 224         # images are resized to IMAGE_SIZE x IMAGE_SIZE
BATCH_SIZE = 24
NUM_EPOCHS = 42
LEARNING_RATE = 0.001
NUM_WORKERS = 2          # DataLoader worker processes

# --- Class Definitions ---
# Zero-based class index -> short lesion code.
CLASS_NAMES = dict(enumerate((
    'akiec',  # Actinic keratoses
    'bcc',    # Basal cell carcinoma
    'bkl',    # Benign keratosis-like lesions
    'df',     # Dermatofibroma
    'mel',    # Melanoma
    'nv',     # Melanocytic nevi
    'vasc',   # Vascular lesions
)))
# Same names as an index-ordered list (used for plot ticks and reports).
CLASS_IDX_TO_NAME = [CLASS_NAMES[class_idx] for class_idx in range(NUM_CLASSES)]

# SECTION 2: DATA LOADING AND AUGMENTATION

In [None]:
# Training pipeline: resize to the network input size, apply random
# flips / rotation / colour jitter as augmentation, then convert to a tensor
# and normalise each channel.
train_transforms = transforms.Compose([
    transforms.Resize(size=(IMAGE_SIZE, IMAGE_SIZE)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(degrees=30),
    transforms.ColorJitter(
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.1,
    ),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Validation pipeline: deterministic - same resize and normalisation as
# training, but without any random augmentation.
val_transforms = transforms.Compose([
    transforms.Resize(size=(IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [None]:
class SkinCancerDataset(Dataset):
    """Skin-lesion image dataset whose labels are encoded in the file names.

    Every image is expected to be named ``<class_id>_<rest>.<ext>`` where
    ``class_id`` is a 1-based integer (e.g. ``1_abc.jpg``); it is converted
    to a 0-based label. Labels are parsed once at construction time, and the
    directory listing is sorted so that sample indices are reproducible.

    Image files whose name does not start with a positive integer are
    reported and skipped instead of being turned into placeholder samples.

    Args:
        data_dir: Directory holding the image files (not searched recursively).
        transform: Optional callable applied to the loaded RGB ``PIL.Image``.

    Raises:
        FileNotFoundError: If ``data_dir`` contains no usable image file.
    """

    # File extensions (compared case-insensitively) that count as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.image_files = []
        self.labels = []  # 0-based label of image_files[i]

        # os.listdir order is filesystem dependent; sorting keeps the
        # index -> sample mapping stable between runs.
        for name in sorted(os.listdir(data_dir)):
            if not name.lower().endswith(self.IMAGE_EXTENSIONS):
                continue
            try:
                # The label is the first number in the filename (e.g., '1_abc.jpg')
                label = int(name.split('_')[0]) - 1
            except ValueError:
                label = -1
            if label < 0:
                print(f"Skipping {name}: filename does not start with a positive class id.")
                continue
            self.image_files.append(name)
            self.labels.append(label)

        if not self.image_files:
            raise FileNotFoundError(f"No images found in directory: {data_dir}")

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        If the image file cannot be decoded, the error is printed and a
        random ``(3, IMAGE_SIZE, IMAGE_SIZE)`` tensor with label 0 is
        returned so that one corrupt file does not abort a training run.
        """
        img_name = self.image_files[idx]
        img_path = os.path.join(self.data_dir, img_name)
        label = self.labels[idx]

        # Only decoding is guarded: a failing transform is a programming
        # error and should surface instead of being replaced by noise.
        try:
            with Image.open(img_path) as raw_image:
                image = raw_image.convert('RGB')
        except Exception as e:
            print(f"Error loading image or label for {img_path}: {e}")
            # Return a placeholder if an image is corrupt
            return torch.randn(3, IMAGE_SIZE, IMAGE_SIZE), 0

        if self.transform:
            image = self.transform(image)

        return image, label

In [None]:
# Compute 'balanced' per-class loss weights from the training file names.
# The result always has exactly NUM_CLASSES entries so it can be passed to
# nn.CrossEntropyLoss even when a class is absent from the training folder.
try:
    # Only image files carry a label prefix; any other file in the folder
    # would make int() fail and silently discard the class weighting.
    train_labels = np.array(
        [
            int(f.split('_')[0]) - 1
            for f in os.listdir(TRAIN_DIR)
            if f.lower().endswith(('.jpg', '.jpeg', '.png'))
        ],
        dtype=np.int64,
    )
    if train_labels.size == 0:
        raise ValueError(f"no labelled training images found in {TRAIN_DIR}")
    if train_labels.min() < 0 or train_labels.max() >= NUM_CLASSES:
        raise ValueError(f"labels must lie in [0, {NUM_CLASSES - 1}]")

    present_classes = np.unique(train_labels)
    balanced_weights = compute_class_weight('balanced', classes=present_classes, y=train_labels)

    # Scatter the weights of the classes that actually occur into a
    # full-length vector; classes without training samples keep weight 1.0.
    full_weights = np.ones(NUM_CLASSES, dtype=np.float64)
    full_weights[present_classes] = balanced_weights
    if len(present_classes) < NUM_CLASSES:
        print("Note: some classes have no training images; their weight is set to 1.0.")

    class_weights = torch.tensor(full_weights, dtype=torch.float).to(DEVICE)
    print("Calculated Class Weights:")
    for i, weight in enumerate(class_weights):
        print(f"- {CLASS_NAMES[i]}: {weight:.2f}")
except (OSError, ValueError) as e:
    # Unreadable folder or malformed file names: train with uniform weights.
    print(f"Could not calculate class weights, using uniform weights. Error: {e}")
    class_weights = torch.ones(NUM_CLASSES).to(DEVICE)

In [None]:
# Datasets: augmented transforms for training, deterministic ones for validation.
train_dataset = SkinCancerDataset(TRAIN_DIR, transform=train_transforms)
val_dataset = SkinCancerDataset(VAL_DIR, transform=val_transforms)

# Loaders: only the training data is reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

print("\nDataLoaders created.")
print(f"Training images: {len(train_dataset)}")
print(f"Validation images: {len(val_dataset)}")

# SECTION 3: MODEL ARCHITECTURES

In [None]:
class Mish(nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise."""

    def forward(self, x):
        softplus_x = nn.functional.softplus(x)
        return x * softplus_x.tanh()

In [None]:
class MobileNetModel1(nn.Module):
    """MobileNetV2 (ImageNet weights) with a Dropout(0.2) + Linear classifier head.

    Args:
        num_classes: Number of output logits.
        freeze_layers: If True, the parameters of the ``features`` sub-module
            are excluded from gradient updates.
    """

    def __init__(self, num_classes=NUM_CLASSES, freeze_layers=False):
        super().__init__()
        backbone = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)

        if freeze_layers:
            for weight in backbone.features.parameters():
                weight.requires_grad = False

        # Read the input width of the stock Linear layer before replacing the head.
        head_inputs = backbone.classifier[1].in_features
        backbone.classifier = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(head_inputs, num_classes),
        )
        self.mobilenet = backbone

    def forward(self, x):
        logits = self.mobilenet(x)
        return logits

In [None]:
class MobileNetModel2(nn.Module):
    """MobileNetV3-Small (ImageNet weights) with a custom classifier head.

    The head is Linear(in, 256) -> BatchNorm1d -> Mish -> Dropout(0.4) ->
    Linear(256, num_classes).

    Args:
        num_classes: Number of output logits.
        freeze_layers: If True, the parameters of the ``features`` sub-module
            are excluded from gradient updates.
    """

    def __init__(self, num_classes=NUM_CLASSES, freeze_layers=False):
        super().__init__()
        backbone = models.mobilenet_v3_small(weights=models.MobileNet_V3_Small_Weights.IMAGENET1K_V1)

        if freeze_layers:
            for weight in backbone.features.parameters():
                weight.requires_grad = False

        # Read the input width of the stock head before replacing it.
        head_inputs = backbone.classifier[0].in_features
        hidden_units = 256
        backbone.classifier = nn.Sequential(
            nn.Linear(head_inputs, hidden_units),
            nn.BatchNorm1d(hidden_units),
            Mish(),
            nn.Dropout(p=0.4),
            nn.Linear(hidden_units, num_classes),
        )
        self.mobilenet = backbone

    def forward(self, x):
        logits = self.mobilenet(x)
        return logits

print("Model architectures defined.")

# SECTION 4: UTILITY FUNCTIONS

In [None]:
def plot_training_history(history, model_name):
    """Draw training-vs-validation loss and accuracy curves side by side.

    Args:
        history: Dict with per-epoch lists under the keys 'train_loss',
            'val_loss', 'train_acc' and 'val_acc'; all are plotted against
            the same epoch axis (1 .. number of 'train_loss' entries).
        model_name: Prefix used in both subplot titles.
    """
    num_epochs = len(history['train_loss'])
    epochs = range(1, num_epochs + 1)

    plt.figure(figsize=(16, 6))

    # (subplot position, history key suffix, metric name shown in labels)
    panels = ((1, 'loss', 'Loss'), (2, 'acc', 'Accuracy'))
    for position, key, metric in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, history[f'train_{key}'], 'bo-', label=f'Training {metric}')
        plt.plot(epochs, history[f'val_{key}'], 'ro-', label=f'Validation {metric}')
        plt.title(f'{model_name} - Training & Validation {metric}', fontsize=14)
        plt.xlabel('Epoch', fontsize=12)
        plt.ylabel(metric, fontsize=12)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()


def plot_confusion_matrix(conf_matrix, model_name):
    """Show ``conf_matrix`` as an annotated Seaborn heatmap.

    Rows are labelled as true classes and columns as predicted classes,
    both using the names in ``CLASS_IDX_TO_NAME``.
    """
    heatmap_options = dict(
        annot=True,   # write the count into every cell
        fmt='d',      # counts are integers
        cmap='Blues',
        xticklabels=CLASS_IDX_TO_NAME,
        yticklabels=CLASS_IDX_TO_NAME,
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, **heatmap_options)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    plt.title(f'{model_name} - Confusion Matrix (Validation Set)', fontsize=14)
    plt.show()

print("Utility functions defined.")

# SECTION 5: TRAINING & EVALUATION LOGIC

In [None]:
## 5.1. Training Function
def _run_epoch(model, loader, criterion, desc, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is switched to train mode and updated
    after every batch; without one it is switched to eval mode and gradient
    tracking is disabled. Both metrics are averaged over the samples that
    were actually processed.
    """
    is_training = optimizer is not None
    model.train(is_training)
    running_loss, correct_preds, total_samples = 0.0, 0, 0
    pbar = tqdm(loader, desc=desc)

    with torch.set_grad_enabled(is_training):
        for inputs, labels in pbar:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

            if is_training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            batch_size = labels.size(0)
            # criterion returns a batch mean, so weight it by the batch size.
            running_loss += loss.item() * batch_size
            total_samples += batch_size
            correct_preds += (outputs.argmax(dim=1) == labels).sum().item()
            pbar.set_postfix({'loss': running_loss/total_samples, 'acc': correct_preds/total_samples})

    # Guard against a loader that yields no batches (metrics are then 0.0).
    seen = max(total_samples, 1)
    return running_loss / seen, correct_preds / seen


def train_model(model, model_name, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs):
    """Train ``model`` and checkpoint the weights with the best validation accuracy.

    Args:
        model: Network to train; expected to already be on ``DEVICE``.
        model_name: Used in log messages and as the checkpoint file stem
            (``<MODEL_SAVE_DIR>/<model_name>_best.pth``).
        train_loader: Loader yielding ``(inputs, labels)`` training batches.
        val_loader: Loader yielding ``(inputs, labels)`` validation batches.
        criterion: Loss callable ``criterion(outputs, labels)``.
        optimizer: Optimizer updating the parameters of ``model``.
        scheduler: Optional LR scheduler, or ``None``. A ``ReduceLROnPlateau``
            is stepped with the epoch's validation loss; any other scheduler
            is stepped without arguments.
        num_epochs: Number of epochs to run.

    Returns:
        Dict with the per-epoch lists 'train_loss', 'train_acc', 'val_loss'
        and 'val_acc'.
    """
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    best_val_acc = 0.0
    model_save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_best.pth")

    print(f"\n--- Starting Training for {model_name} ---")
    start_time = time.time()

    for epoch in range(num_epochs):
        epoch_tag = f"Epoch {epoch+1}/{num_epochs}"

        # --- Training Phase ---
        epoch_train_loss, epoch_train_acc = _run_epoch(
            model, train_loader, criterion, f"{epoch_tag} [Train]", optimizer)
        history['train_loss'].append(epoch_train_loss)
        history['train_acc'].append(epoch_train_acc)

        # --- Validation Phase ---
        epoch_val_loss, epoch_val_acc = _run_epoch(
            model, val_loader, criterion, f"{epoch_tag} [Val]")
        history['val_loss'].append(epoch_val_loss)
        history['val_acc'].append(epoch_val_acc)

        print(f"\nEpoch {epoch+1}: Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f} | Val Loss: {epoch_val_loss:.4f} Acc: {epoch_val_acc:.4f}")

        # Update learning rate
        if scheduler is not None:
            if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(epoch_val_loss)
            else:
                scheduler.step()

        # Save the best model. The first epoch is always saved so that a
        # checkpoint from this run exists even if accuracy never rises above 0,
        # and a stale file from an earlier run is never picked up later.
        if epoch == 0 or epoch_val_acc > best_val_acc:
            best_val_acc = epoch_val_acc
            torch.save(model.state_dict(), model_save_path)
            print(f"New best validation accuracy: {best_val_acc:.4f}. Model saved to {model_save_path}")

    total_time = time.time() - start_time
    print(f"\n--- Training Complete for {model_name} ({total_time // 60:.0f}m {total_time % 60:.0f}s) ---")
    print(f"Best Validation Accuracy: {best_val_acc:.4f}")
    return history

In [None]:
def evaluate_model(model, data_loader, model_name):
    """Evaluate ``model`` on ``data_loader`` and display the results.

    Prints a per-class classification report and plots the confusion matrix.
    Both are computed over every class in ``CLASS_IDX_TO_NAME``, so a class
    that never occurs in the targets or predictions still gets its own
    row/column instead of causing a size mismatch with the class names.

    Args:
        model: Trained network; expected to already be on ``DEVICE``.
        data_loader: Loader yielding ``(inputs, labels)`` batches.
        model_name: Name shown in the report header and the plot title.
    """
    model.eval()
    all_labels, all_preds = [], []

    print(f"\n--- Evaluating {model_name} on Validation Set ---")
    eval_pbar = tqdm(data_loader, desc="Evaluating")

    with torch.no_grad():
        for inputs, labels in eval_pbar:
            outputs = model(inputs.to(DEVICE))
            # Labels are only collected, so they never need to leave the CPU.
            all_labels.extend(labels.tolist())
            all_preds.extend(outputs.argmax(dim=1).tolist())

    # --- Metrics Calculation ---
    # Passing the full label set keeps the matrix shape and the report rows
    # aligned with CLASS_IDX_TO_NAME regardless of which classes appear.
    class_indices = list(range(len(CLASS_IDX_TO_NAME)))
    conf_matrix = confusion_matrix(all_labels, all_preds, labels=class_indices)
    class_report = classification_report(
        all_labels, all_preds,
        labels=class_indices,
        target_names=CLASS_IDX_TO_NAME,
        zero_division=0
    )

    # --- Display Results ---
    print("\n" + "="*50)
    print(f"               FINAL EVALUATION REPORT: {model_name}")
    print("="*50)
    print("\nFull Classification Report:")
    print(class_report)

    # Plot confusion matrix
    plot_confusion_matrix(conf_matrix, model_name)


# SECTION 6: MODEL 1 - TRAINING AND EVALUATION

In [None]:
# Model 1: MobileNetV2, trained with the class-weighted cross-entropy loss.
model1_name = "MobileNetV2_FineTuned"
model1 = MobileNetModel1().to(DEVICE)
criterion1 = nn.CrossEntropyLoss(weight=class_weights)
optimizer1 = optim.Adam(model1.parameters(), lr=LEARNING_RATE)
# Multiply the LR by 0.2 after 3 epochs without validation-loss improvement.
# `verbose` is not passed: it is deprecated since PyTorch 2.2 and newer
# releases may reject it.
scheduler1 = optim.lr_scheduler.ReduceLROnPlateau(optimizer1, mode='min', factor=0.2, patience=3)

history1 = train_model(model1, model1_name, train_loader, val_loader, criterion1, optimizer1, scheduler1, num_epochs=NUM_EPOCHS)




In [None]:
plot_training_history(history1, model1_name)

print(f"\n--- Loading best weights for {model1_name} for final evaluation ---")
model1_best_path = os.path.join(MODEL_SAVE_DIR, f"{model1_name}_best.pth")
try:
    # map_location lets a checkpoint saved on a GPU runtime load on a
    # CPU-only runtime (and vice versa).
    model1.load_state_dict(torch.load(model1_best_path, map_location=DEVICE))
except FileNotFoundError:
    print(f"ERROR: Could not find the model file at {model1_best_path}. Evaluation skipped.")
except Exception as e:
    print(f"An error occurred while loading the model: {e}")
else:
    # Evaluation runs outside the try body so that its own failures are not
    # misreported as checkpoint-loading errors.
    print(f"Successfully loaded weights from {model1_best_path}")
    evaluate_model(model1, val_loader, model1_name)

# SECTION 7: MODEL 2 - TRAINING AND EVALUATION

In [None]:
# Model 2: MobileNetV3-Small with the custom head, same loss and optimiser setup.
model2_name = "MobileNetV3_CustomClassifier"
model2 = MobileNetModel2().to(DEVICE)
criterion2 = nn.CrossEntropyLoss(weight=class_weights)
optimizer2 = optim.Adam(model2.parameters(), lr=LEARNING_RATE)
# Multiply the LR by 0.2 after 3 epochs without validation-loss improvement.
# `verbose` is not passed: it is deprecated since PyTorch 2.2 and newer
# releases may reject it.
scheduler2 = optim.lr_scheduler.ReduceLROnPlateau(optimizer2, mode='min', factor=0.2, patience=3)

history2 = train_model(model2, model2_name, train_loader, val_loader, criterion2, optimizer2, scheduler2, num_epochs=NUM_EPOCHS)


In [None]:
plot_training_history(history2, model2_name)

print(f"\n--- Loading best weights for {model2_name} for final evaluation ---")
model2_best_path = os.path.join(MODEL_SAVE_DIR, f"{model2_name}_best.pth")
try:
    # map_location lets a checkpoint saved on a GPU runtime load on a
    # CPU-only runtime (and vice versa).
    model2.load_state_dict(torch.load(model2_best_path, map_location=DEVICE))
except FileNotFoundError:
    print(f"ERROR: Could not find the model file at {model2_best_path}. Evaluation skipped.")
except Exception as e:
    print(f"An error occurred while loading the model: {e}")
else:
    # Evaluation runs outside the try body so that its own failures are not
    # misreported as checkpoint-loading errors.
    print(f"Successfully loaded weights from {model2_best_path}")
    evaluate_model(model2, val_loader, model2_name)