<a href="https://colab.research.google.com/github/Ethaniconic/Swalambh_mvp/blob/Dev/PreTrainingUpdated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Kaggle API key

In [1]:
from google.colab import files
files.upload() # Select your kaggle.json file here

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"devashishkamble28","key":"9e53c80e1f11eae9a65edb91f98614c6"}'}

DataSet download (HAM10000)

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset (approx 2.6GB)
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000

# Unzip quietly (-q) to avoid millions of print lines
!unzip -q skin-cancer-mnist-ham10000.zip -d ham10000_data
print("âœ… Data downloaded and unzipped!")

Dataset URL: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000
License(s): CC-BY-NC-SA-4.0
Downloading skin-cancer-mnist-ham10000.zip to /content
 99% 5.15G/5.20G [01:29<00:00, 63.7MB/s]
100% 5.20G/5.20G [01:29<00:00, 62.3MB/s]
âœ… Data downloaded and unzipped!


Google Drive linking

In [3]:
from google.colab import drive
drive.mount('/content/drive')

# Create a folder for your models if it doesn't exist
!mkdir -p /content/drive/MyDrive/DermSight_Models

Mounted at /content/drive


Model training

In [4]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm # Progress bar

# --- CONFIGURATION ---
BATCH_SIZE = 64 # T4 GPU can handle this easily
EPOCHS = 10     # Train a bit longer since Colab is fast
LR = 0.001
IMG_SIZE = 224
DATA_DIR = './ham10000_data/HAM10000_images_part_1' # Both parts usually merge, but check paths
# Note: The Kaggle dataset structure varies sometimes.
# We will combine part 1 and part 2 folders essentially by looking in both or moving them.

# --- PATH CORRECTION ---
# The KMader dataset unzips into multiple folders. Let's find where the images are.
image_paths = {}
for folder in ['./ham10000_data/HAM10000_images_part_1', './ham10000_data/HAM10000_images_part_2']:
    if os.path.exists(folder):
        for img in os.listdir(folder):
            image_paths[os.path.splitext(img)[0]] = os.path.join(folder, img)

print(f"Found {len(image_paths)} images.")

CSV_FILE = './ham10000_data/HAM10000_metadata.csv'
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- 1. DATASET CLASS ---
class HAMDataset(Dataset):
    def __init__(self, df, img_paths, transform=None):
        self.df = df
        self.img_paths = img_paths
        self.transform = transform
        self.class_map = {
            'nv': 0, 'mel': 1, 'bkl': 2, 'bcc': 3,
            'akiec': 4, 'vasc': 5, 'df': 6
        }

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_id = row['image_id']

        # Get path from our dictionary
        img_path = self.img_paths.get(img_id)

        if img_path is None:
            # Skip missing images (rare but happens)
            return self.__getitem__((idx + 1) % len(self.df))

        image = Image.open(img_path).convert('RGB')
        label = self.class_map[row['dx']]

        if self.transform:
            image = self.transform(image)

        return image, label

# --- 2. PREPARATION ---
# Augmentation
train_transforms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

val_transforms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Load Data
full_df = pd.read_csv(CSV_FILE)
train_df, val_df = train_test_split(full_df, test_size=0.1, stratify=full_df['dx'])

train_dataset = HAMDataset(train_df, image_paths, transform=train_transforms)
val_dataset = HAMDataset(val_df, image_paths, transform=val_transforms)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

# --- 3. MODEL (EfficientNet-B0) ---
model = models.efficientnet_b0(weights='DEFAULT')
num_ftrs = model.classifier[1].in_features
model.classifier[1] = nn.Linear(num_ftrs, 7) # 7 Classes
model = model.to(DEVICE)

# Class Weights for Imbalance
class_counts = train_df['dx'].value_counts().sort_index()
ordered_counts = [class_counts[k] for k in ['nv','mel','bkl','bcc','akiec','vasc','df']]
weights = [1.0 / c for c in ordered_counts]
class_weights = torch.FloatTensor(weights).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=LR)
scaler = GradScaler()

# --- 4. TRAINING LOOP ---
print(f"ðŸš€ Starting training on {DEVICE} for {EPOCHS} epochs...")

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")

    for images, labels in loop:
        images, labels = images.to(DEVICE), labels.to(DEVICE)

        optimizer.zero_grad()
        with autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Epoch {epoch+1} -> Train Loss: {running_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f} | Val Acc: {100 * correct / total:.2f}%")

# --- 5. SAVE MODEL TO DRIVE ---
save_path = '/content/drive/MyDrive/DermSight_Models/ham10000_pretrained.pth'
torch.save(model.state_dict(), save_path)
print(f"âœ… Model saved successfully to: {save_path}")
print("You can now download this file and use it in your local Fusion project!")

Found 10015 images.
Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20.5M/20.5M [00:00<00:00, 207MB/s]
  scaler = GradScaler()


ðŸš€ Starting training on cuda for 10 epochs...


  with autocast():
Epoch 1/10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 141/141 [01:56<00:00,  1.21it/s, loss=1.09]


Epoch 1 -> Train Loss: 1.2185 | Val Loss: 0.9675 | Val Acc: 59.98%


Epoch 2/10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 141/141 [01:26<00:00,  1.63it/s, loss=0.725]


Epoch 2 -> Train Loss: 1.0059 | Val Loss: 0.7777 | Val Acc: 64.97%


Epoch 3/10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 141/141 [01:26<00:00,  1.63it/s, loss=1.06]


Epoch 3 -> Train Loss: 0.7938 | Val Loss: 0.6789 | Val Acc: 74.15%


Epoch 4/10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 141/141 [01:25<00:00,  1.64it/s, loss=0.852]


Epoch 4 -> Train Loss: 0.6449 | Val Loss: 0.6538 | Val Acc: 72.36%


Epoch 5/10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 141/141 [01:26<00:00,  1.62it/s, loss=0.756]


Epoch 5 -> Train Loss: 0.5924 | Val Loss: 0.6589 | Val Acc: 71.66%


Epoch 6/10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 141/141 [01:27<00:00,  1.62it/s, loss=0.695]


Epoch 6 -> Train Loss: 0.5608 | Val Loss: 0.7178 | Val Acc: 73.15%


Epoch 7/10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 141/141 [01:27<00:00,  1.60it/s, loss=0.526]


Epoch 7 -> Train Loss: 0.4813 | Val Loss: 0.5305 | Val Acc: 76.65%


Epoch 8/10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 141/141 [01:27<00:00,  1.62it/s, loss=0.639]


Epoch 8 -> Train Loss: 0.4865 | Val Loss: 0.5666 | Val Acc: 73.05%


Epoch 9/10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 141/141 [01:26<00:00,  1.63it/s, loss=0.665]


Epoch 9 -> Train Loss: 0.4863 | Val Loss: 0.6247 | Val Acc: 73.75%


Epoch 10/10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 141/141 [01:26<00:00,  1.63it/s, loss=0.375]


Epoch 10 -> Train Loss: 0.4316 | Val Loss: 0.4921 | Val Acc: 77.94%
âœ… Model saved successfully to: /content/drive/MyDrive/DermSight_Models/ham10000_pretrained.pth
You can now download this file and use it in your local Fusion project!
