<a href="https://colab.research.google.com/github/ElizabethChacko/AI-based-Intelligent-System-for-Skin-Disease-Detection-and-Healthcare-Recommendation/blob/main/Skin_Disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Create project folders in Drive
import os
base_path = '/content/drive/MyDrive/Skin_project'  # Changed root to match your theme
os.makedirs(f'{base_path}/data_raw', exist_ok=True)
os.makedirs(f'{base_path}/dataset', exist_ok=True)
os.makedirs(f'{base_path}/models', exist_ok=True)

print("Folders ready! Base path:", base_path)

Mounted at /content/drive
Folders ready! Base path: /content/drive/MyDrive/Skin_project


In [5]:
# CELL 2 – Kaggle API Setup (your file is named kaggle(2).json)
from google.colab import files
import os

print("Please upload your kaggle(2).json file now...")
uploaded = files.upload()   # ← Upload kaggle(2).json when the file picker appears

# Automatically detect and rename the uploaded file
for filename in uploaded.keys():
    if filename.startswith("kaggle") and filename.endswith(".json"):
        print(f"Found: {filename}")
        # Rename it to the expected name
        os.rename(filename, "kaggle.json")
        print("Renamed to kaggle.json")
        break

# Install kaggle & set permissions
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Test that it works
!kaggle datasets list -m 5
print("Kaggle API is ready!")

Please upload your kaggle(2).json file now...


Saving kaggle (2).json to kaggle (2).json
Found: kaggle (2).json
Renamed to kaggle.json
usage: kaggle [-h] [-v] [-W]
              {competitions,c,datasets,d,kernels,k,models,m,files,f,config}
              ...
kaggle: error: unrecognized arguments: 5
Kaggle API is ready!


In [6]:
# FINAL CELL 3 – 100% WORKING (run after cleanup)
%cd /content/drive/MyDrive/Skin_project/data_raw

# Download only if not present
if not os.path.exists("skin-cancer-mnist-ham10000.zip"):
    !kaggle datasets download -d kmader/skin-cancer-mnist-ham10000
    print("Download complete")
else:
    print("Zip already exists")

# UNZIP
!unzip -q skin-cancer-mnist-ham10000.zip
print("Unzipped successfully")

# Create clean ham10000 folder and move everything inside
!mkdir -p ham10000
!mv HAM10000_images_part_1 ham10000/
!mv HAM10000_images_part_2 ham10000/
!mv HAM10000_metadata.csv ham10000/
!mv hmnist_* ham10000/ 2>/dev/null || true   # optional small CSV files

# Remove the big zip to save space
!rm skin-cancer-mnist-ham10000.zip

# FINAL CHECK — YOU MUST SEE THIS
print("\nFINAL STRUCTURE (this must appear):")
!ls -la ham10000/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
-rw------- 1 root root   279172 Oct  6  2019 ISIC_0024306.jpg
-rw------- 1 root root   274405 Oct  6  2019 ISIC_0024307.jpg
-rw------- 1 root root   295747 Oct  6  2019 ISIC_0024308.jpg
-rw------- 1 root root   278615 Oct  6  2019 ISIC_0024309.jpg
-rw------- 1 root root   372386 Oct  6  2019 ISIC_0024310.jpg
-rw------- 1 root root   273101 Oct  6  2019 ISIC_0024311.jpg
-rw------- 1 root root   257771 Oct  6  2019 ISIC_0024312.jpg
-rw------- 1 root root   247904 Oct  6  2019 ISIC_0024313.jpg
-rw------- 1 root root   269124 Oct  6  2019 ISIC_0024314.jpg
-rw------- 1 root root   302692 Oct  6  2019 ISIC_0024315.jpg
-rw------- 1 root root   284060 Oct  6  2019 ISIC_0024316.jpg
-rw------- 1 root root   273903 Oct  6  2019 ISIC_0024317.jpg
-rw------- 1 root root   298082 Oct  6  2019 ISIC_0024318.jpg
-rw------- 1 root root   290761 Oct  6  2019 ISIC_0024319.jpg
-rw------- 1 root root   319930 Oct  6  2019 ISIC_0024320.jpg
-rw--

In [9]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import shutil
from tqdm import tqdm

# Paths
base_path = '/content/drive/MyDrive/Skin_project'
data_raw = f'{base_path}/data_raw/ham10000'
metadata_path = f'{data_raw}/HAM10000_metadata.csv'
image_part1 = f'{data_raw}/HAM10000_images_part_1'
image_part2 = f'{data_raw}/HAM10000_images_part_2'

# Load metadata
df = pd.read_csv(metadata_path)

# Class names (7 classes for HAM10000)
class_names = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']
print("Classes:", class_names)

# Add filepath column (check both parts)
def get_image_path(image_id):
    path1 = os.path.join(image_part1, f"{image_id}.jpg")
    path2 = os.path.join(image_part2, f"{image_id}.jpg")
    return path1 if os.path.exists(path1) else path2

df['path'] = df['image_id'].apply(get_image_path)

# Check missing files
missing = df[~df['path'].apply(os.path.exists)]
print(f"Missing images: {len(missing)}")  # Should be ~0

# Labels
df['label'] = df['dx']
df['label_idx'] = df['label'].map({name: idx for idx, name in enumerate(class_names)})

# Stratified split: 70% train, 15% val, 15% test
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Copy to ImageFolder structure
def copy_to_folder(df_split, split_name):
    split_dir = f'{base_path}/dataset/{split_name}'
    for class_name in class_names:
        os.makedirs(os.path.join(split_dir, class_name), exist_ok=True)

    for _, row in tqdm(df_split.iterrows(), total=len(df_split), desc=f"Copying {split_name}"):
        src = row['path']
        dest = os.path.join(split_dir, row['label'], os.path.basename(src))
        if os.path.exists(src):
            shutil.copy2(src, dest)

copy_to_folder(train_df, 'train')
copy_to_folder(val_df, 'val')
copy_to_folder(test_df, 'test')

print("Data organized! Ready for training.")

Classes: ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']
Missing images: 0
Train: 7010, Val: 1502, Test: 1503


Copying train: 100%|██████████| 7010/7010 [22:19<00:00,  5.24it/s]
Copying val: 100%|██████████| 1502/1502 [04:40<00:00,  5.36it/s]
Copying test: 100%|██████████| 1503/1503 [01:09<00:00, 21.55it/s]

Data organized! Ready for training.





In [10]:
!pip install -q timm torchmetrics

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import timm
from torchmetrics import Accuracy
import matplotlib.pyplot as plt
from tqdm import tqdm
import os



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/983.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m983.0/983.2 kB[0m [31m40.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
# Augmentations for train
train_transform = transforms.Compose([
    transforms.Resize((300, 300)),
    transforms.RandomResizedCrop(300, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# For val/test
val_test_transform = transforms.Compose([
    transforms.Resize((300, 300)),
    transforms.CenterCrop(300),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

num_classes = len(class_names) # Define num_classes
model = timm.create_model('efficientnet_b3', pretrained=True, num_classes=num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

# Define DataLoaders
batch_size = 32

train_dataset = datasets.ImageFolder(os.path.join(base_path, 'dataset', 'train'), transform=train_transform)
val_dataset = datasets.ImageFolder(os.path.join(base_path, 'dataset', 'val'), transform=val_test_transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


Using device: cuda


In [None]:
num_epochs = 20
best_acc = 0.0
save_path = f'{base_path}/models/best_model.pth'

train_losses, val_accs = [], []

for epoch in range(num_epochs):
    # Train
    model.train()
    running_loss = 0.0
    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
    for inputs, labels in pbar:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        pbar.set_postfix({'loss': loss.item()})

    avg_loss = running_loss / len(train_loader)
    train_losses.append(avg_loss)

    # Val
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_acc = correct / total
    val_accs.append(val_acc)

    print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, Val Acc={val_acc:.4f}")

    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), save_path)
        print(f"  → Best model saved! Acc: {best_acc:.4f}")

# Plot
plt.plot(train_losses, label='Train Loss')
plt.plot(val_accs, label='Val Acc')
plt.legend()
plt.show()

print("Training done! Best Acc:", best_acc)

Epoch 1/20: 100%|██████████| 220/220 [04:48<00:00,  1.31s/it, loss=0.731]


Epoch 1: Loss=0.9515, Val Acc=0.7483
  → Best model saved! Acc: 0.7483


Epoch 2/20:  60%|█████▉    | 131/220 [02:45<01:48,  1.22s/it, loss=0.362]