In [105]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import keras
from keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.applications import EfficientNetB2, VGG16
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, concatenate, Flatten, Input
from keras import models, regularizers, Model, optimizers
from keras.losses import CategoricalCrossentropy
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import tensorflow as tf

In [106]:
target_size = (260, 260)
batch_size = 32
num_classes = 21

In [107]:
path = "bttai-ajl-2025/train/train"
df = pd.read_csv("bttai-ajl-2025/train.csv")

df['md5hash'] = df['md5hash'].astype(str) + '.jpg'

# Combine label and md5hash to form the correct path
df['file_path'] = df['label'] + '/' + df['md5hash']

In [108]:
df.describe()

Unnamed: 0,fitzpatrick_scale,fitzpatrick_centaur,ddi_scale
count,2860.0,2860.0,2860.0
mean,2.524476,2.095455,23.547552
std,1.474428,1.510942,15.530522
min,-1.0,-1.0,-1.0
25%,2.0,1.0,12.0
50%,2.0,2.0,12.0
75%,3.0,3.0,34.0
max,6.0,6.0,56.0


In [109]:
df.isnull().sum()

md5hash                     0
fitzpatrick_scale           0
fitzpatrick_centaur         0
label                       0
nine_partition_label        0
three_partition_label       0
qc                       2770
ddi_scale                   0
file_path                   0
dtype: int64

In [110]:
df = df[df["qc"] != "3 Wrongly labelled"].copy()

df = df.drop(columns=["qc"])


In [111]:
df[df.fitzpatrick_scale == -1].shape

(108, 8)

In [112]:
df['fitzpatrick_scale'] = df['fitzpatrick_scale'].replace(-1, 0)
df['fitzpatrick_centaur'] = df['fitzpatrick_centaur'].replace(-1, 0)

In [113]:
df = pd.get_dummies(df, columns=['fitzpatrick_scale'], prefix='fst', dtype=int)

In [114]:
df.head()

Unnamed: 0,md5hash,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,ddi_scale,file_path,fst_0,fst_1,fst_2,fst_3,fst_4,fst_5,fst_6
0,fd06d13de341cc75ad679916c5d7e6a6.jpg,4,prurigo-nodularis,benign-epidermal,benign,34,prurigo-nodularis/fd06d13de341cc75ad679916c5d7...,0,0,0,0,1,0,0
1,a4bb4e5206c4e89a303f470576fc5253.jpg,1,basal-cell-carcinoma-morpheiform,malignant-epidermal,malignant,12,basal-cell-carcinoma-morpheiform/a4bb4e5206c4e...,0,1,0,0,0,0,0
2,c94ce27e389f96bda998e7c3fa5c4a2e.jpg,5,keloid,inflammatory,non-neoplastic,56,keloid/c94ce27e389f96bda998e7c3fa5c4a2e.jpg,0,0,0,0,0,1,0
3,ebcf2b50dd943c700d4e2b586fcd4425.jpg,3,basal-cell-carcinoma,malignant-epidermal,malignant,34,basal-cell-carcinoma/ebcf2b50dd943c700d4e2b586...,0,0,0,1,0,0,0
4,c77d6c895f05fea73a8f3704307036c0.jpg,1,prurigo-nodularis,benign-epidermal,benign,12,prurigo-nodularis/c77d6c895f05fea73a8f37043070...,0,1,0,0,0,0,0


In [None]:
train_df, validation_df = train_test_split(df, 
                                           test_size = 0.2, 
                                           stratify=df["label"])

In [116]:
train_df.shape

(2142, 14)

In [117]:
train_df.label.value_counts()

label
squamous-cell-carcinoma               305
basal-cell-carcinoma                  246
folliculitis                          178
acne-vulgaris                         176
melanoma                              136
eczema                                107
mycosis-fungoides                      95
acne                                   95
actinic-keratosis                      92
prurigo-nodularis                      89
kaposi-sarcoma                         82
keloid                                 82
dermatomyositis                        80
superficial-spreading-melanoma-ssm     62
malignant-melanoma                     59
pyogenic-granuloma                     59
epidermal-nevus                        48
dyshidrotic-eczema                     43
dermatofibroma                         41
seborrheic-keratosis                   35
basal-cell-carcinoma-morpheiform       32
Name: count, dtype: int64

In [118]:
validation_df.shape

(714, 14)

In [119]:
validation_df.label.value_counts()

label
squamous-cell-carcinoma               102
basal-cell-carcinoma                   82
folliculitis                           59
acne-vulgaris                          58
melanoma                               45
eczema                                 36
acne                                   32
mycosis-fungoides                      32
prurigo-nodularis                      30
actinic-keratosis                      30
kaposi-sarcoma                         27
keloid                                 27
dermatomyositis                        26
superficial-spreading-melanoma-ssm     21
pyogenic-granuloma                     20
malignant-melanoma                     19
epidermal-nevus                        16
dyshidrotic-eczema                     15
dermatofibroma                         14
seborrheic-keratosis                   12
basal-cell-carcinoma-morpheiform       11
Name: count, dtype: int64

In [120]:
datagen = ImageDataGenerator(
    # we have to use efficient net preprocessing lib to be consistent
    preprocessing_function=tf.keras.applications.vgg16.preprocess_input,
    validation_split=0.15,
    rotation_range = 30,
    shear_range=0.2,
    zoom_range=0.2,
    # we may not need this if we use efficient net since we have to resize anyway
    # rescale = 1./255,
    horizontal_flip = True,
    vertical_flip = True,
    brightness_range=[0.7, 1.2],
    height_shift_range = 0.1,
    width_shift_range = 0.1
)

# flow the data into the datagens
train_data = datagen.flow_from_dataframe(
    dataframe = train_df,
    directory = "bttai-ajl-2025/train/train/",
    x_col = "file_path",
    y_col = "label",
    target_size = target_size,
    batch_size = 32,
    class_mode = "categorical",
    subset = "training",
    shuffle = True,
    seed = 42
)

val_data = datagen.flow_from_dataframe(
    dataframe = validation_df,
    directory = "bttai-ajl-2025/train/train/",
    x_col = "file_path",
    y_col = "label",
    target_size = target_size,
    batch_size = 32,
    class_mode = "categorical",
    subset = "validation",
    shuffle = True,
    seed=42
)

Found 1821 validated image filenames belonging to 21 classes.
Found 107 validated image filenames belonging to 21 classes.


In [121]:
train_metadata = train_df[['fst_0', 'fst_1', 'fst_2', 'fst_3', 'fst_4', 'fst_5', 'fst_6']].values
val_metadata = validation_df[['fst_0', 'fst_1', 'fst_2', 'fst_3', 'fst_4', 'fst_5', 'fst_6']].values

In [122]:
def combined_generator(image_gen, metadata, batch_size):
    while True:
        # Get the next batch of images and labels from ImageDataGenerator
        images, labels = next(image_gen)
        
        # Get the corresponding metadata for this batch
        metadata_batch = metadata[image_gen.index_array]
        
        # Yield the combined data
        yield (images, metadata_batch), labels

In [123]:
train_gen = combined_generator(train_data, train_metadata, batch_size=32)

# Validation generator
val_gen = combined_generator(val_data, val_metadata, batch_size=32)

In [124]:
base_model = EfficientNetB2(
    include_top=False,
    weights="imagenet",
    input_shape=(260, 260, 3)
)
# make trianable false for now
base_model.trainable = False

In [125]:
# add a custom layers
x = base_model.output
x = keras.layers.GlobalAveragePooling2D()(x)
x = keras.layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001))(x)
x = keras.layers.Dropout(0.5)(x)
outputs = keras.layers.Dense(num_classes, activation="softmax")(x)

# make/define the model
model = models.Model(inputs=base_model.input, outputs=outputs)
model.summary()

In [126]:
#label smoothing and loss
loss = CategoricalCrossentropy(label_smoothing=0.1)
optimizer = optimizers.Adam(learning_rate=0.0003)

model.compile(optimizer=optimizer, 
              loss=loss, 
              metrics=['accuracy']
             )

In [127]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7)
]

In [128]:
history = model.fit(
    train_data,
    epochs=50,
    validation_data=val_data,
    callbacks=callbacks
)

  self._warn_if_super_not_called()


Epoch 1/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 708ms/step - accuracy: 0.0879 - loss: 3.4809 - val_accuracy: 0.1589 - val_loss: 3.1997 - learning_rate: 3.0000e-04
Epoch 2/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 667ms/step - accuracy: 0.1417 - loss: 3.2521 - val_accuracy: 0.2243 - val_loss: 3.0665 - learning_rate: 3.0000e-04
Epoch 3/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 668ms/step - accuracy: 0.2015 - loss: 3.1245 - val_accuracy: 0.2430 - val_loss: 3.0064 - learning_rate: 3.0000e-04
Epoch 4/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 719ms/step - accuracy: 0.2253 - loss: 3.0526 - val_accuracy: 0.2430 - val_loss: 2.9512 - learning_rate: 3.0000e-04
Epoch 5/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 711ms/step - accuracy: 0.2487 - loss: 2.9828 - val_accuracy: 0.2991 - val_loss: 2.9081 - learning_rate: 3.0000e-04
Epoch 6/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━

In [130]:
base_model.trainable = True

'''
we can do this but idk if we want to try running it first or not
# Unfreeze the last 50 layers
for layer in base_model.layers[:-50]:
    layer.trainable = False
'''

# compile again w/ smaller rate
model.compile(
    optimizer = optimizers.Adam(learning_rate = 0.00001),
    loss = loss,
    metrics = ["accuracy"]
)

In [None]:
history = model.fit(
    train_data,
    # we can add more or less
    epochs = 20,
    validation_data = val_data,
    callbacks = callbacks
)

Epoch 1/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 3s/step - accuracy: 0.2503 - loss: 2.8687 - val_accuracy: 0.3551 - val_loss: 2.4387 - learning_rate: 1.0000e-05
Epoch 2/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 3s/step - accuracy: 0.2883 - loss: 2.7906 - val_accuracy: 0.3271 - val_loss: 2.5645 - learning_rate: 1.0000e-05
Epoch 3/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 3s/step - accuracy: 0.3250 - loss: 2.7038 - val_accuracy: 0.4206 - val_loss: 2.5066 - learning_rate: 1.0000e-05
Epoch 4/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 3s/step - accuracy: 0.3333 - loss: 2.6119 - val_accuracy: 0.3551 - val_loss: 2.5257 - learning_rate: 1.0000e-05
Epoch 5/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 3s/step - accuracy: 0.3566 - loss: 2.5847 - val_accuracy: 0.4206 - val_loss: 2.4384 - learning_rate: 1.0000e-05
Epoch 6/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
import matplotlib.pyplot as plt

# Extract loss and accuracy from history
def plot_training(history, title="Training Performance"):
    plt.figure(figsize=(12, 5))

    # Plot Accuracy
    plt.subplot(1, 2, 1)
    plt.plot(history.history["accuracy"], label="Train Accuracy")
    plt.plot(history.history["val_accuracy"], label="Validation Accuracy")
    plt.title(f"{title} - Accuracy")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend()

    # Plot Loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history["loss"], label="Train Loss")
    plt.plot(history.history["val_loss"], label="Validation Loss")
    plt.title(f"{title} - Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()

    plt.show()

# Plot initial training results
plot_training(history, title="Initial Training")

# Plot fine-tuning results
plot_training(history_finetune, title="Fine-Tuning Training")