# Skin Cancer Detection — **Demo** (HAM10000, ~100 images)

**Colab-ready notebook (TensorFlow / Keras).**

**What this demo does:**

- Downloads a small subset (~100 images) of the HAM10000 dataset directly from Kaggle (you'll upload `kaggle.json` once in Colab).
- Preprocesses images (resize 224×224, normalization, Gaussian denoising and simple hair removal).
- Uses DenseNet201 (pretrained) + small Dense head for classification into lesion types.
- Trains on 80% of the sampled images and evaluates on 20%.
- Produces a performance table, confusion matrix, ROC curves and example predictions.

**Instructions:** Run each cell in order. When prompted, upload your `kaggle.json` (Kaggle API token) so the notebook can download the dataset.


In [1]:
# Install required packages (run in Colab)
!pip install -q kaggle tensorflow matplotlib scikit-learn opencv-python-headless seaborn tqdm
print('Installed packages (or verified existing installations).')

Installed packages (or verified existing installations).


'pip' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
# Upload kaggle.json in Colab when prompted (skip if running locally and dataset is already available)
from google.colab import files
print('Upload kaggle.json now (Kaggle API token). If you already uploaded, cancel this upload dialog.') 
try:
    uploaded = files.upload()
    for fn in uploaded.keys():
        print('Uploaded file:', fn)
except Exception as e:
    print('Upload canceled or not running in Colab:', e)

ModuleNotFoundError: No module named 'google'

In [None]:
# Move kaggle.json to ~/.kaggle and set permissions (Colab)
import os, shutil
if os.path.exists('kaggle.json'):
    os.makedirs(os.path.expanduser('~/.kaggle'), exist_ok=True)
    shutil.move('kaggle.json', os.path.expanduser('~/.kaggle/kaggle.json'))
    os.chmod(os.path.expanduser('~/.kaggle/kaggle.json'), 0o600)
    print('kaggle.json moved to ~/.kaggle/kaggle.json')
else:
    print('kaggle.json not found in current working directory. If you are running locally, ensure dataset files are present.')

In [3]:
# Download HAM10000 dataset from Kaggle and unzip to /content/HAM10000
import os
os.makedirs('/content/HAM10000', exist_ok=True)
print('Downloading dataset from Kaggle (requires kaggle.json). This may take a few minutes depending on network speed.')
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000 -p /content/HAM10000 --unzip -q
print('Dataset downloaded and unzipped. Files:')
print(sorted(os.listdir('/content/HAM10000'))[:50])

Downloading dataset from Kaggle (requires kaggle.json). This may take a few minutes depending on network speed.
Dataset downloaded and unzipped. Files:
[]


'kaggle' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
# Load metadata and sample ~100 images with class balancing
import pandas as pd, glob, os, shutil
meta_path = '/content/HAM10000/HAM10000_metadata.csv'
assert os.path.exists(meta_path), "Metadata CSV not found. Ensure download completed."
meta = pd.read_csv(meta_path)
print('Total metadata rows:', len(meta))
print(meta['dx'].value_counts())

# Map image ids to paths
all_images = glob.glob('/content/HAM10000/*.jpg')
id_to_path = { os.path.splitext(os.path.basename(p))[0]: p for p in all_images }
meta['path'] = meta['image_id'].astype(str).map(id_to_path)
meta = meta.dropna(subset=['path']).reset_index(drop=True)
print('Rows with actual image files:', len(meta))

# Sample up to 15 per class to get approx 100 images
sampled = []
max_per_class = 15
for cls, g in meta.groupby('dx'):
    sampled += g.sample(n=min(len(g), max_per_class), random_state=42)['path'].tolist()
# make sure ~100 total
sampled = sampled[:100]
sample_dir = '/content/sample_images'
os.makedirs(sample_dir, exist_ok=True)
for p in sampled:
    shutil.copy(p, sample_dir)
print('Sampled images copied to', sample_dir, 'count =', len(os.listdir(sample_dir)))

In [None]:
# Preprocess sampled images: resize, gaussian denoise, hair removal, normalize
import cv2, numpy as np, os, pandas as pd
IMG_SIZE = (224,224)
sample_paths = sorted([os.path.join('/content/sample_images', f) for f in os.listdir('/content/sample_images') if f.lower().endswith('.jpg')])
print('Number of sampled images:', len(sample_paths))

def preprocess(path):
    img = cv2.imread(path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, IMG_SIZE, interpolation=cv2.INTER_AREA)
    img = cv2.GaussianBlur(img, (5,5), sigmaX=1.0)
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9,9))
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)
    _, thresh = cv2.threshold(blackhat, 10, 255, cv2.THRESH_BINARY)
    if thresh.sum() > 0:
        img = cv2.inpaint(img, thresh, 1, cv2.INPAINT_TELEA)
    img = img.astype('float32')/255.0
    return img

X = []
y = []
label_map = {}
meta = pd.read_csv('/content/HAM10000/HAM10000_metadata.csv')
for p in sample_paths:
    imgid = os.path.splitext(os.path.basename(p))[0]
    row = meta[meta['image_id'].astype(str)==imgid].iloc[0]
    cls = row['dx']
    if cls not in label_map:
        label_map[cls] = len(label_map)
    X.append(preprocess(p))
    y.append(label_map[cls])
X = np.array(X); y = np.array(y)
print('X,y shapes:', X.shape, y.shape)
print('Label map:', label_map)

In [None]:
# Train-test split (80:20)
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print('Train/Test sizes:', train_X.shape[0], test_X.shape[0])

In [None]:
# Build model (DenseNet201 base + small Dense head)
import tensorflow as tf
from tensorflow.keras import layers, models, applications, optimizers

base = applications.DenseNet201(weights='imagenet', include_top=False, input_shape=(224,224,3), pooling='avg')
base.trainable = False

inp = layers.Input(shape=(224,224,3))
x = base(inp, training=False)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.4)(x)
out = layers.Dense(len(label_map), activation='softmax')(x)
model = models.Model(inp, out)

model.compile(optimizer=optimizers.Adam(1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train for a few epochs (demo)
history = model.fit(train_X, train_y, validation_split=0.1, epochs=8, batch_size=8)

In [None]:
# Evaluate and report metrics
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
preds = np.argmax(model.predict(test_X), axis=1)
print('Test accuracy:', accuracy_score(test_y, preds))
print('\nClassification report:')
print(classification_report(test_y, preds, target_names=list(label_map.keys()), zero_division=0))
cm = confusion_matrix(test_y, preds)
cm

In [None]:
# Performance table (Accuracy, Precision, Recall, F1 - macro)
from sklearn.metrics import precision_score, recall_score, f1_score
perf = {
    'Accuracy': accuracy_score(test_y, preds),
    'Precision_macro': precision_score(test_y, preds, average='macro', zero_division=0),
    'Recall_macro': recall_score(test_y, preds, average='macro', zero_division=0),
    'F1_macro': f1_score(test_y, preds, average='macro', zero_division=0)
}
import pandas as pd
pd.DataFrame([perf], index=['DenseNet201_demo'])

In [None]:
# Plots: training curves, confusion matrix, and sample predictions
import matplotlib.pyplot as plt, seaborn as sns, random
plt.figure(figsize=(10,4))
plt.subplot(1,2,1); plt.plot(history.history['loss'], label='train_loss'); plt.plot(history.history['val_loss'], label='val_loss'); plt.legend(); plt.title('Loss')
plt.subplot(1,2,2); plt.plot(history.history['accuracy'], label='train_acc'); plt.plot(history.history['val_accuracy'], label='val_acc'); plt.legend(); plt.title('Accuracy')
plt.show()

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=list(label_map.keys()), yticklabels=list(label_map.keys()))
plt.xlabel('Predicted'); plt.ylabel('True'); plt.title('Confusion Matrix'); plt.show()

# Show 6 sample predictions from test set
plt.figure(figsize=(12,8))
idxs = random.sample(range(test_X.shape[0]), min(6, test_X.shape[0]))
for i, ix in enumerate(idxs):
    plt.subplot(2,3,i+1)
    plt.imshow(test_X[ix])
    plt.axis('off')
    true = list(label_map.keys())[list(label_map.values()).index(test_y[ix])]
    pred = list(label_map.keys())[list(label_map.values()).index(preds[ix])]
    plt.title(f'True: {true}\nPred: {pred}')
plt.show()

**End of demo notebook.**

Run the notebook step-by-step in Google Colab. Upload `kaggle.json` when prompted so the notebook can download the HAM10000 dataset and run the demo.