In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import keras
from keras.preprocessing.image import load_img, img_to_array
from keras.applications import EfficientNetB2, EfficientNetB7, VGG16
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, concatenate, Flatten, Input, Concatenate
from keras import models, regularizers, Model, optimizers
from keras.losses import CategoricalCrossentropy
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Shared configuration for the notebook.
target_size = (224, 224)  # (height, width) passed to flow_from_dataframe; matches the model's 224x224 input
batch_size = 32           # images per batch
num_classes = 21          # width of the final softmax layer

In [None]:
# Root directory handed to flow_from_dataframe; images are resolved as <path>/<file_path>.
path = "/content/drive/MyDrive/Colab Notebooks/AJL Competition/train/train"

# Read the annotation table, then derive in one pass:
#   md5hash   -> image file name ("<md5hash>.jpg")
#   file_path -> "<label>/<md5hash>.jpg", relative to `path`
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/AJL Competition/train.csv"
).assign(
    md5hash=lambda table: table['md5hash'].astype(str) + '.jpg',
    file_path=lambda table: table['label'] + '/' + table['md5hash'],
)

In [None]:
# Summary statistics of the numeric columns (quick check of value ranges).
df.describe()

Unnamed: 0,fitzpatrick_scale,fitzpatrick_centaur,ddi_scale
count,2860.0,2860.0,2860.0
mean,2.524476,2.095455,23.547552
std,1.474428,1.510942,15.530522
min,-1.0,-1.0,-1.0
25%,2.0,1.0,12.0
50%,2.0,2.0,12.0
75%,3.0,3.0,34.0
max,6.0,6.0,56.0


In [None]:
# Number of missing values per column.
df.isnull().sum()

Unnamed: 0,0
md5hash,0
fitzpatrick_scale,0
fitzpatrick_centaur,0
label,0
nine_partition_label,0
three_partition_label,0
qc,2770
ddi_scale,0
file_path,0


In [None]:
# Discard rows whose quality-control flag is "3 Wrongly labelled", then remove the qc column.
# Rows with a missing qc value compare as "not equal" and are therefore kept.
df = df.loc[df["qc"].ne("3 Wrongly labelled")].drop(columns=["qc"])



In [None]:
# Shape of the subset whose fitzpatrick_scale is -1
# (presumably a "not annotated" marker; confirm against the dataset documentation).
df[df.fitzpatrick_scale == -1].shape

(108, 8)

In [None]:
# Recode -1 to 0 in both Fitzpatrick columns (per-column mapping; other columns are untouched).
df = df.replace({
    'fitzpatrick_scale': {-1: 0},
    'fitzpatrick_centaur': {-1: 0},
})

In [None]:
# One-hot encode fitzpatrick_scale into integer indicator columns named fst_<value>;
# the original fitzpatrick_scale column is replaced by them.
df = pd.get_dummies(
    df,
    prefix='fst',
    columns=['fitzpatrick_scale'],
    dtype=int,
)

In [None]:
# Preview the first rows after cleaning and one-hot encoding.
df.head()

Unnamed: 0,md5hash,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,ddi_scale,file_path,fst_0,fst_1,fst_2,fst_3,fst_4,fst_5,fst_6
0,fd06d13de341cc75ad679916c5d7e6a6.jpg,4,prurigo-nodularis,benign-epidermal,benign,34,prurigo-nodularis/fd06d13de341cc75ad679916c5d7...,0,0,0,0,1,0,0
1,a4bb4e5206c4e89a303f470576fc5253.jpg,1,basal-cell-carcinoma-morpheiform,malignant-epidermal,malignant,12,basal-cell-carcinoma-morpheiform/a4bb4e5206c4e...,0,1,0,0,0,0,0
2,c94ce27e389f96bda998e7c3fa5c4a2e.jpg,5,keloid,inflammatory,non-neoplastic,56,keloid/c94ce27e389f96bda998e7c3fa5c4a2e.jpg,0,0,0,0,0,1,0
3,ebcf2b50dd943c700d4e2b586fcd4425.jpg,3,basal-cell-carcinoma,malignant-epidermal,malignant,34,basal-cell-carcinoma/ebcf2b50dd943c700d4e2b586...,0,0,0,1,0,0,0
4,c77d6c895f05fea73a8f3704307036c0.jpg,1,prurigo-nodularis,benign-epidermal,benign,12,prurigo-nodularis/c77d6c895f05fea73a8f37043070...,0,1,0,0,0,0,0


In [None]:
# 80/20 split, stratified on the label so both parts keep the same class proportions.
# random_state pins the shuffle so that re-running the notebook produces the same
# train/validation partition and therefore comparable metrics between runs.
train_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

In [None]:
# (rows, columns) of the training part of the split.
train_df.shape

(2284, 14)

In [None]:
# Samples per label in the training part (shows the class imbalance).
train_df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
squamous-cell-carcinoma,326
basal-cell-carcinoma,262
folliculitis,190
acne-vulgaris,187
melanoma,145
eczema,114
acne,102
mycosis-fungoides,102
actinic-keratosis,98
prurigo-nodularis,95


In [None]:
# (rows, columns) of the validation part of the split.
validation_df.shape

(572, 14)

In [None]:
# Samples per label in the validation part.
validation_df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
squamous-cell-carcinoma,81
basal-cell-carcinoma,66
folliculitis,47
acne-vulgaris,47
melanoma,36
eczema,29
acne,25
mycosis-fungoides,25
actinic-keratosis,24
prurigo-nodularis,24


In [None]:
# Training-time generator: random augmentation plus EfficientNet's own preprocess_input
# (used instead of rescale=1./255 to stay consistent with the backbone).
#
# NOTE: no `validation_split` here. train_df / validation_df are already separate frames, and
# combining `validation_split=0.15` with `subset=` on top of that discarded data on both sides
# (1942 of 2284 training images and only 85 of 572 validation images were actually used).
datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.efficientnet.preprocess_input,
    rotation_range=30,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[0.7, 1.2],
    height_shift_range=0.1,
    width_shift_range=0.1,
)

# Validation-time generator: same preprocessing, NO augmentation, so validation metrics are
# measured on unmodified images and are comparable from epoch to epoch.
val_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.efficientnet.preprocess_input
)

# Passing one explicit class list to both iterators guarantees an identical label -> index
# mapping for training and validation.
class_names = sorted(df["label"].unique())

# Flow the dataframes through the generators.
train_data = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=path,
    x_col="file_path",
    y_col="label",
    classes=class_names,
    target_size=target_size,
    batch_size=batch_size,
    class_mode="categorical",
    shuffle=True,
    seed=42,
)

val_data = val_datagen.flow_from_dataframe(
    dataframe=validation_df,
    directory=path,
    x_col="file_path",
    y_col="label",
    classes=class_names,
    target_size=target_size,
    batch_size=batch_size,
    class_mode="categorical",
    shuffle=False,  # deterministic order for evaluation
)

Found 1942 validated image filenames belonging to 21 classes.
Found 85 validated image filenames belonging to 21 classes.


In [None]:
# One-hot Fitzpatrick columns fed to the metadata branch of the model.
fst_columns = ['fst_0', 'fst_1', 'fst_2', 'fst_3', 'fst_4', 'fst_5', 'fst_6']

# Build each metadata array in the *iterator's* sample order rather than the dataframe's row
# order. The iterator may hold fewer rows than the frame it was created from (for example
# after subset slicing or after dropping unreadable files), in which case positional indices
# coming from the iterator would otherwise select metadata belonging to a different image.
# Looking rows up through `iterator.filenames` keeps row i of the array paired with sample i.
train_metadata = (
    train_df.drop_duplicates("file_path")
    .set_index("file_path")
    .loc[train_data.filenames, fst_columns]
    .values
)
val_metadata = (
    validation_df.drop_duplicates("file_path")
    .set_index("file_path")
    .loc[val_data.filenames, fst_columns]
    .values
)

In [None]:
def combined_generator(image_gen, metadata, batch_size):
    """Endlessly yield ``((images, metadata_batch), labels)`` for a two-input model.

    Args:
        image_gen: Keras image iterator (e.g. the result of ``flow_from_dataframe``).
            It must expose ``batch_index``, ``batch_size``, ``n`` and ``index_array``
            and return ``(images, labels)`` from ``next()``.
        metadata: array indexed by the iterator's sample position; row ``i`` must describe
            sample ``i`` of ``image_gen``.
        batch_size: unused; kept so existing call sites keep working. The batch size is
            taken from ``image_gen`` itself.

    Yields:
        ``((images, metadata_batch), labels)`` where ``metadata_batch`` holds exactly the
        metadata rows of the samples in ``images``, in the same order.
    """
    while True:
        # Offset of the upcoming batch inside the epoch permutation. It has to be read
        # BEFORE next(), because next() advances (or wraps) batch_index.
        total = image_gen.n
        start = (image_gen.batch_index * image_gen.batch_size) % total if total else 0

        images, labels = next(image_gen)

        # index_array is the permutation of the WHOLE epoch and is (re)built inside next()
        # at the start of every epoch, so it is read AFTER next() and sliced down to the
        # samples of this batch. Indexing metadata with the full array would return every
        # row of the dataset instead of the current batch.
        batch_indices = image_gen.index_array[start:start + len(images)]
        metadata_batch = metadata[batch_indices]
        yield (images, metadata_batch), labels

In [None]:
# Pair each image iterator with its metadata array. The generators are lazy, so nothing is
# read from disk until the first batch is requested.
val_gen = combined_generator(image_gen=val_data, metadata=val_metadata, batch_size=32)
train_gen = combined_generator(image_gen=train_data, metadata=train_metadata, batch_size=32)

In [None]:
# Integer class index of every training sample, as assigned by the image iterator.
train_classes = train_data.classes
class_ids = np.unique(train_classes)

# "balanced" weights are inversely proportional to class frequency, so rare classes
# contribute as much to the loss as frequent ones.
cw = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=class_ids,
    y=train_classes
)

# Key every weight by the class id it was computed for. enumerate() would key by position,
# which silently shifts weights onto the wrong classes if any class id is absent.
class_weights = {int(class_id): float(weight) for class_id, weight in zip(class_ids, cw)}

print("Class Weights:", class_weights)

Class Weights: {0: 1.0162218733647306, 1: 0.589020321504398, 2: 1.1277584204413473, 3: 0.4091866835229667, 4: 3.302721088435374, 5: 2.3119047619047617, 6: 1.2167919799498748, 7: 2.371184371184371, 8: 0.9734335839598998, 9: 2.2555168408826947, 10: 0.6044195455960162, 11: 1.3210884353741497, 12: 1.2330158730158731, 13: 1.6813852813852814, 14: 0.7224702380952381, 15: 1.1141709695926563, 16: 1.2330158730158731, 17: 1.622389306599833, 18: 2.9831029185867894, 19: 0.3256203890006707, 20: 1.651360544217687}


In [None]:
# ImageNet-pretrained EfficientNetB7 without its classification head, for 224x224 RGB input.
# NOTE(review): `base_model` is assigned again in the model-building cell below, which is the
# instance the final model actually uses.
base_model = EfficientNetB7(weights="imagenet", include_top=False, input_shape=(224, 224, 3))

# Freeze the backbone for now.
base_model.trainable = False

In [None]:
# --- Image branch -----------------------------------------------------------------------
image_input = Input(shape=(224, 224, 3), name='image_input')
base_model = EfficientNetB7(weights='imagenet', include_top=False, input_tensor=image_input)

# Freeze the backbone for the first training phase. This instance replaces any earlier
# `base_model`, so the freeze has to be applied here; otherwise all pretrained weights are
# updated at the initial (higher) learning rate and the later "unfreeze + lower learning
# rate" fine-tuning step has no effect.
base_model.trainable = False

x = base_model.output
x = GlobalAveragePooling2D()(x)  # collapse the spatial dimensions into one feature vector

# --- Metadata branch --------------------------------------------------------------------
# 7 features: the one-hot encoded Fitzpatrick scale (fst_0 ... fst_6)
metadata_input = Input(shape=(7,), name='metadata_input')
metadata_branch = Dense(32, activation='relu')(metadata_input)

# --- Fusion and classification head -----------------------------------------------------
combined = Concatenate()([x, metadata_branch])
combined = Dense(128, activation='relu')(combined)
output = Dense(num_classes, activation='softmax')(combined)

# Two-input model: [image, metadata] -> class probabilities
model = Model(inputs=[image_input, metadata_input], outputs=output)

# Print model summary
model.summary()

In [None]:
#label smoothing and loss
loss = CategoricalCrossentropy(label_smoothing=0.1)
optimizer = optimizers.Adam(learning_rate=0.0003)

model.compile(optimizer=optimizer,
              loss=loss,
              metrics=['accuracy']
             )

In [None]:
# Both callbacks watch the validation loss:
#  - EarlyStopping stops after 10 epochs without improvement and restores the best weights.
#  - ReduceLROnPlateau halves the learning rate after 5 stagnant epochs, down to 1e-7.
callbacks = [
    EarlyStopping(
        restore_best_weights=True,
        patience=10,
        monitor='val_loss',
    ),
    ReduceLROnPlateau(
        min_lr=1e-7,
        patience=5,
        factor=0.5,
        monitor='val_loss',
    ),
]

In [None]:
# Phase 1: train the model as compiled above.
# train_gen / val_gen are endless Python generators, so Keras cannot infer where an epoch
# ends; steps_per_epoch / validation_steps set one pass over each underlying iterator.
# NOTE(review): `class_weights` is computed earlier but not passed to fit(); whether
# `class_weight=` is accepted for Python-generator input depends on the Keras version. Confirm
# before enabling it.
history = model.fit(
    train_gen,
    epochs=50,
    steps_per_epoch=len(train_data),
    validation_data=val_gen,
    validation_steps=len(val_data),
    callbacks=callbacks
)

Found 2856 validated image filenames belonging to 21 classes.


KeyboardInterrupt: 

In [None]:
# Phase 2 setup: make every backbone layer trainable for fine-tuning.
base_model.trainable = True

'''
we can do this but idk if we want to try running it first or not
# Unfreeze the last 50 layers
for layer in base_model.layers[:-50]:
    layer.trainable = False
'''

# Changing `trainable` only takes effect after compiling again; use a much smaller learning
# rate (1e-5) for fine-tuning and keep the same loss and metric.
model.compile(
    loss=loss,
    metrics=["accuracy"],
    optimizer=optimizers.Adam(learning_rate=1e-5),
)

In [None]:
# Phase 2: fine-tune. The result is stored under its own name so the History of the initial
# training run in `history` is not overwritten; the plotting cell reads both `history` and
# `history_finetune`.
# steps_per_epoch / validation_steps are required because the inputs are endless generators.
history_finetune = model.fit(
    train_gen,
    # we can add more or less
    epochs=20,
    steps_per_epoch=len(train_data),
    validation_data=val_gen,
    validation_steps=len(val_data),
    callbacks=callbacks
)

Epoch 1/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 3s/step - accuracy: 0.2503 - loss: 2.8687 - val_accuracy: 0.3551 - val_loss: 2.4387 - learning_rate: 1.0000e-05
Epoch 2/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 3s/step - accuracy: 0.2883 - loss: 2.7906 - val_accuracy: 0.3271 - val_loss: 2.5645 - learning_rate: 1.0000e-05
Epoch 3/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 3s/step - accuracy: 0.3250 - loss: 2.7038 - val_accuracy: 0.4206 - val_loss: 2.5066 - learning_rate: 1.0000e-05
Epoch 4/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 3s/step - accuracy: 0.3333 - loss: 2.6119 - val_accuracy: 0.3551 - val_loss: 2.5257 - learning_rate: 1.0000e-05
Epoch 5/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 3s/step - accuracy: 0.3566 - loss: 2.5847 - val_accuracy: 0.4206 - val_loss: 2.4384 - learning_rate: 1.0000e-05
Epoch 6/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
import matplotlib.pyplot as plt

# Extract loss and accuracy from history
def plot_training(history, title="Training Performance"):
    """Draw accuracy and loss curves (train vs. validation) side by side.

    Args:
        history: object whose ``history`` attribute is a dict with the keys
            "accuracy", "val_accuracy", "loss" and "val_loss" (one value per epoch).
        title: prefix used in both subplot titles.
    """
    # (key in history.history, human-readable name), in left-to-right panel order
    panels = (("accuracy", "Accuracy"), ("loss", "Loss"))

    plt.figure(figsize=(12, 5))

    for position, (metric, pretty) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label=f"Train {pretty}")
        plt.plot(history.history[f"val_{metric}"], label=f"Validation {pretty}")
        plt.title(f"{title} - {pretty}")
        plt.xlabel("Epochs")
        plt.ylabel(pretty)
        plt.legend()

    plt.show()

# Plot initial training results
# (`history` is expected to hold the History returned by the first model.fit call)
plot_training(history, title="Initial Training")

# Plot fine-tuning results
# (`history_finetune` is expected to hold the History returned by the fine-tuning model.fit call)
plot_training(history_finetune, title="Fine-Tuning Training")