In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\aditya\.cache\kagglehub\datasets\kmader\skin-cancer-mnist-ham10000\versions\2


In [3]:
import zipfile
import os
for zip_name in ["HAM10000_images_part_1.zip", "HAM10000_images_part_2.zip"]:
    zip_path = os.path.join(path, zip_name)
    if os.path.exists(zip_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(path)
            print(f"Extracted {zip_name}")

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight


import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, CSVLogger



In [5]:
import math
import random
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [6]:
from pathlib import Path

DATA_DIR = Path(r"C:\Users\aditya\.cache\kagglehub\datasets\kmader\skin-cancer-mnist-ham10000\versions\2")
META_CSV = DATA_DIR / "HAM10000_metadata.csv"
IMG_DIRS = [DATA_DIR / "HAM10000_images_part_1", DATA_DIR / "HAM10000_images_part_2"]
ARTIFACTS_DIR = Path("artifacts")
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)


In [7]:
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
EPOCHS = 30
BASE_LR = 1e-4
VAL_SPLIT = 0.15
TEST_SPLIT = 0.15

In [8]:
assert META_CSV.exists(), f"Metadata CSV not found at {META_CSV}"
for d in IMG_DIRS:
    assert d.exists(), f"Image directory not found: {d}"


print("Loading metadata…")
df = pd.read_csv(META_CSV)


# Map image_id -> file path (look in both parts)
print("Resolving image paths…")
image_paths = {}
for img_dir in IMG_DIRS:
    for fname in os.listdir(img_dir):
        if fname.lower().endswith((".jpg", ".jpeg", ".png")):
            key = Path(fname).stem # 'ISIC_XXXXX'
            image_paths[key] = str(img_dir / fname)


# Keep only rows whose image file exists
df["filepath"] = df["image_id"].map(image_paths)
df = df[~df["filepath"].isnull()].copy()

Loading metadata…
Resolving image paths…


In [9]:
df["filepath"] = df["image_id"].map(image_paths)
df = df[~df["filepath"].isnull()].copy()

print("Encoding labels…")
le = LabelEncoder()
df["label"] = le.fit_transform(df["dx"].values)
num_classes = len(le.classes_)
print("Classes:", list(le.classes_))

Encoding labels…
Classes: ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']


In [10]:
print("Splitting dataset (stratified)…")
train_df, temp_df = train_test_split(
df, test_size=(VAL_SPLIT + TEST_SPLIT), stratify=df["label"], random_state=SEED
)
relative_val = VAL_SPLIT / (VAL_SPLIT + TEST_SPLIT)
val_df, test_df = train_test_split(
temp_df, test_size=(1 - relative_val), stratify=temp_df["label"], random_state=SEED
)


print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Splitting dataset (stratified)…
Train: 7010, Val: 1502, Test: 1503


In [11]:
print("Computing class weights…")
class_weights_array = compute_class_weight(
class_weight="balanced",
classes=np.unique(train_df["label"]),
y=train_df["label"].values,
)
class_weights = {i: w for i, w in enumerate(class_weights_array)}
print("Class weights:", class_weights)

Computing class weights…
Class weights: {0: np.float64(4.37305053025577), 1: np.float64(2.7817460317460316), 2: np.float64(1.3022478172023035), 3: np.float64(12.36331569664903), 4: np.float64(1.285530900421786), 5: np.float64(0.21338772031292808), 6: np.float64(10.115440115440116)}


In [12]:
AUTOTUNE = tf.data.AUTOTUNE


def decode_img(path):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMG_SIZE)
    img = tf.cast(img, tf.float32)
    img = preprocess_input(img) # ResNet50 preprocessing
    return img

In [13]:
augmenter = tf.keras.Sequential([
layers.RandomFlip("horizontal"),
layers.RandomRotation(0.1),
layers.RandomZoom(0.1),
layers.RandomContrast(0.1),
])

In [14]:
def make_dataset(df_, training=False):
    paths = df_["filepath"].values
    labels = df_["label"].values
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    ds = ds.shuffle(len(df_), seed=SEED) if training else ds

    def _load(path, label):
        img = decode_img(path)
        if training:
            img = augmenter(img)  
        label_onehot = tf.one_hot(label, num_classes)
        return img, label_onehot

    ds = ds.map(_load, num_parallel_calls=AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    return ds

train_ds = make_dataset(train_df, training=True)
val_ds = make_dataset(val_df, training=False)
test_ds = make_dataset(test_df, training=False)

In [15]:
print("Building model…")
base_model = ResNet50(
include_top=False,
weights="imagenet",
input_shape=(*IMG_SIZE, 3)
)
for layer in base_model.layers[:-10]:
    layer.trainable = False


inputs = layers.Input(shape=(*IMG_SIZE, 3))
x = base_model(inputs, training=False)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = models.Model(inputs, outputs)


model.compile(
optimizer=tf.keras.optimizers.Adam(learning_rate=BASE_LR),
loss="categorical_crossentropy",
metrics=["accuracy"]
)


model.summary()

Building model…


In [16]:
checkpoint_path = ARTIFACTS_DIR / "best_model.keras"
callbacks = [
EarlyStopping(monitor="val_accuracy", patience=6, restore_best_weights=True),
ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6),
ModelCheckpoint(filepath=str(checkpoint_path), monitor="val_accuracy", save_best_only=True),
CSVLogger(str(ARTIFACTS_DIR / "history.csv"), append=False)
]

In [None]:
print("Training (stage 1)…")
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    class_weight=class_weights,
    callbacks=callbacks,
    verbose=1
)

Training (stage 1)…
Epoch 1/30


[1m  8/220[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:46[0m 1s/step - accuracy: 0.0695 - loss: 3.7712

In [None]:
print("Fine-tuning (stage 2)…")
for layer in base_model.layers[-50:]:
    layer.trainable = True


model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=BASE_LR * 0.2),
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)


history_ft = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=max(8, EPOCHS // 2),
    class_weight=class_weights,
    callbacks=callbacks,
    verbose=1
)

In [None]:
print("Evaluating on test set…")
loss, acc = model.evaluate(test_ds, verbose=0)
print(f"Test Accuracy: {acc:.4f} | Test Loss: {loss:.4f}")


# Predictions for detailed metrics
print("Generating classification report…")
probs = model.predict(test_ds, verbose=0)
y_true = np.concatenate([np.argmax(y.numpy(), axis=1) for _, y in test_ds], axis=0)
y_pred = probs.argmax(axis=1)


print(classification_report(y_true, y_pred, target_names=list(le.classes_)))
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

In [None]:
import json
print("Saving artifacts…")
model.save(checkpoint_path) # ensure final model persisted
with open(ARTIFACTS_DIR / "label_map.json", "w") as f:
    json.dump({cls: int(idx) for idx, cls in enumerate(le.classes_)}, f, indent=2)


print("Done. Artifacts saved in:", ARTIFACTS_DIR.resolve())

In [None]:
def predict_image(image_path: str):
    """Return (class_name, confidence) for a single image file."""
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMG_SIZE)
    img = tf.cast(img, tf.float32)
    img = preprocess_input(img)
    img = tf.expand_dims(img, axis=0)
    preds = model.predict(img, verbose=0)[0]
    idx = int(np.argmax(preds))
    class_name = le.classes_[idx]
    conf = float(preds[idx])
    return class_name, conf