In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import keras
from keras import layers, models, regularizers, Model, optimizers
from keras.applications import EfficientNetB2
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, concatenate, Flatten, Input, Concatenate
from keras.losses import CategoricalCrossentropy
from keras.preprocessing.image import load_img, img_to_array

In [2]:
# Spatial size every image is resized to by the data generators below.
target_size = (260, 260)
# Number of images per batch yielded by the data generators below.
batch_size = 32
# NOTE(review): not referenced by any cell visible here, and the classifier
# head built further down has 3 outputs -- confirm whether this is still needed.
num_classes = 21

In [3]:
# Root directory that the image paths in `file_path` are relative to.
path = "../bttai-ajl-2025/train/train"

# Read the metadata and derive each row's image location in one pass:
#   md5hash   -> "<md5hash>.jpg"
#   file_path -> "<label>/<md5hash>.jpg"
df = pd.read_csv("../bttai-ajl-2025/train.csv").assign(
    md5hash=lambda frame: frame['md5hash'].astype(str) + '.jpg',
    file_path=lambda frame: frame['label'] + '/' + frame['md5hash'],
)

In [4]:
# Inspect the column names now that `file_path` has been added.
df.columns

Index(['md5hash', 'fitzpatrick_scale', 'fitzpatrick_centaur', 'label',
       'nine_partition_label', 'three_partition_label', 'qc', 'ddi_scale',
       'file_path'],
      dtype='object')

In [5]:
# Row count for every (three-way, nine-way, fine-grained label) combination.
df.groupby(['three_partition_label', 'nine_partition_label', 'label']).size()

three_partition_label  nine_partition_label          label                             
benign                 benign-dermal                 dermatofibroma                         55
                                                     pyogenic-granuloma                     79
                       benign-epidermal              epidermal-nevus                        64
                                                     prurigo-nodularis                     119
                                                     seborrheic-keratosis                   48
malignant              malignant-cutaneous-lymphoma  mycosis-fungoides                     127
                       malignant-dermal              kaposi-sarcoma                        109
                       malignant-epidermal           actinic-keratosis                     122
                                                     basal-cell-carcinoma                  328
                                                     basa

In [6]:
# Discard rows whose quality-control flag marks them as wrongly labelled, then
# drop the "qc" column itself. The guard makes the cell safe to re-run: once
# "qc" is gone there is nothing left to filter, so it becomes a no-op instead
# of raising KeyError.
if "qc" in df.columns:
    df = df[df["qc"] != "3 Wrongly labelled"].drop(columns=["qc"])


In [7]:
def sample_indices_per_class(y, num_samples=291, random_state=None):
    """Draw the same number of random row positions from every class in ``y``.

    Parameters
    ----------
    y : array-like
        One label per row (for example a pandas Series or a NumPy array).
    num_samples : int, default 291
        How many positions to draw, without replacement, from each class.
    random_state : int or None, default None
        Seed for a dedicated random generator, making the draw repeatable.
        When None the global ``np.random`` state is used, so the result
        depends on any earlier ``np.random.seed`` call.

    Returns
    -------
    numpy.ndarray
        Integer positions into ``y`` (usable with ``.iloc``), grouped by class
        in the sorted order produced by ``np.unique``.

    Raises
    ------
    ValueError
        If any class has fewer than ``num_samples`` members.
    """
    # Work on a plain array so positions are independent of any pandas index.
    labels = np.asarray(y)
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    sampled_indices = []
    for cls in np.unique(labels):
        class_indices = np.flatnonzero(labels == cls)  # positions of this class
        if len(class_indices) < num_samples:
            # Fail with an actionable message instead of NumPy's generic one.
            raise ValueError(
                f"Class {cls!r} has only {len(class_indices)} samples; "
                f"cannot draw {num_samples} without replacement."
            )
        sampled_indices.extend(rng.choice(class_indices, num_samples, replace=False))

    # Explicit integer dtype keeps the result usable as an index even when empty.
    return np.array(sampled_indices, dtype=np.intp)

In [8]:
# Pick an equal number of row positions from each three-way partition class
# (the helper's default sample size applies).
sampled_indices = sample_indices_per_class(df["three_partition_label"])

# Materialise the selected rows as an independent frame.
balanced_df = df.iloc[sampled_indices].copy()

In [9]:
# Check that every three-way class contributes the same number of rows to balanced_df.
balanced_df.three_partition_label.value_counts()

three_partition_label
benign            291
malignant         291
non-neoplastic    291
Name: count, dtype: int64

In [10]:
# Hold out 20% of the rows of `df` for validation. Stratifying on the
# three-way label keeps the class proportions the same in both splits, and
# the fixed seed makes the split repeatable.
train_df, validation_df = train_test_split(
    df,
    stratify=df["three_partition_label"],
    test_size=0.2,
    random_state=42,
)

In [11]:
# Class distribution of the training split.
train_df.three_partition_label.value_counts()

three_partition_label
malignant         1182
non-neoplastic     811
benign             291
Name: count, dtype: int64

In [12]:
# Class distribution of the validation split.
validation_df.three_partition_label.value_counts()

three_partition_label
malignant         296
non-neoplastic    203
benign             73
Name: count, dtype: int64

In [13]:
# Training pipeline: random geometric and brightness augmentation.
# No `rescale=1./255` is applied: Keras' EfficientNet models carry their own
# input rescaling and expect raw pixel values in [0, 255]. The EfficientNet
# preprocess_input hook is kept so both pipelines go through the same
# library entry point.
train_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.efficientnet.preprocess_input,
    rotation_range = 30,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip = True,
    vertical_flip = True,
    brightness_range=[0.7, 1.2],
    height_shift_range = 0.1,
    width_shift_range = 0.1
)

# Validation pipeline: same preprocessing hook, no augmentation.
validation_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.efficientnet.preprocess_input
)

# Stream training images from disk; `file_path` is resolved relative to `path`
# and the three-way label is one-hot encoded.
train_data = train_datagen.flow_from_dataframe(
    dataframe = train_df,
    directory = path,
    x_col = "file_path",
    y_col = "three_partition_label",
    target_size = target_size,
    batch_size = batch_size,
    class_mode = "categorical",
    seed = 42
)

# Validation images are served in dataframe order (shuffle=False) so that
# model predictions line up with `val_data.classes` and `val_data.filenames`
# when the model is evaluated afterwards.
val_data = validation_datagen.flow_from_dataframe(
    dataframe = validation_df,
    directory = path,
    x_col = "file_path",
    y_col = "three_partition_label",
    target_size = target_size,
    batch_size = batch_size,
    class_mode = "categorical",
    shuffle = False,
    seed=42
)

Found 2284 validated image filenames belonging to 3 classes.
Found 572 validated image filenames belonging to 3 classes.


In [14]:
# Label-name -> class-index mapping assigned by the training generator.
train_data.class_indices

{'benign': 0, 'malignant': 1, 'non-neoplastic': 2}

In [15]:
# Label-name -> class-index mapping assigned by the validation generator
# (should match the training mapping).
val_data.class_indices

{'benign': 0, 'malignant': 1, 'non-neoplastic': 2}

In [16]:
# Integer class index of every training image, as assigned by the generator.
train_classes = train_data.classes
present_classes = np.unique(train_classes)

# "balanced" weights are inversely proportional to class frequency, so the
# rarer classes count more in the loss.
cw = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=present_classes,
    y=train_classes
)

# Key every weight by the class index it was computed for, rather than by its
# position in `cw`, so the mapping stays correct even if a class index is
# missing from the training data.
class_weights = {int(cls): float(weight) for cls, weight in zip(present_classes, cw)}

print("Class Weights:", class_weights)

Class Weights: {0: 2.616265750286369, 1: 0.6441060349689791, 2: 0.9387587340731607}


In [1]:
# ImageNet-pretrained EfficientNetB2 backbone without its classification head.
# The input size is derived from `target_size` so it cannot drift from the
# size the data generators resize images to.
base_model = EfficientNetB2(
    include_top=False,
    weights="imagenet",
    input_shape=(*target_size, 3)
)
# The backbone is left trainable, i.e. its weights are updated together with
# the new head during training. Set this to False to train only the head.
base_model.trainable = True

NameError: name 'EfficientNetB2' is not defined

In [18]:
# Collapse the backbone's spatial feature map into one feature vector per image.
x = base_model.output
x = layers.GlobalAveragePooling2D()(x)

# Disabled experiment: a wider first dense block in front of the one below.
# x = layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001))(x)
# x = layers.BatchNormalization()(x)  # Helps stabilize training
# x = layers.Dropout(0.3)(x)  # Reduce to avoid excessive information loss

# Dense block: L2-regularised projection, batch normalisation, then dropout.
x = layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001))(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.5)(x)

# One softmax output per class the training generator actually produces, so
# the head always matches the one-hot label width.
outputs = layers.Dense(len(train_data.class_indices), activation="softmax")(x)

# make/define the model
model = models.Model(inputs=base_model.input, outputs=outputs)

In [19]:
# Candidate optimizers, all at the same learning rate; only `optimizer1`
# (Adam) is handed to compile() below.
optimizer1 = optimizers.Adam(learning_rate=3e-4)
optimizer2 = optimizers.SGD(learning_rate=3e-4)
optimizer3 = optimizers.RMSprop(learning_rate=3e-4)

# Cross-entropy over one-hot labels with label smoothing of 0.1.
loss = CategoricalCrossentropy(label_smoothing=0.1)

model.compile(
    loss=loss,
    optimizer=optimizer1,
    metrics=['accuracy'],
)

In [20]:
callbacks = [
    # Stop once val_loss has gone 10 epochs without improving and roll the
    # model back to the best weights seen.
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
    ),
    # Halve the learning rate after 5 epochs without val_loss improvement,
    # never going below 1e-7.
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
    ),
]

In [21]:
# Train for at most 20 epochs, validating after each one. The class weights
# offset the imbalance in the training split, and the callbacks may lower the
# learning rate or stop training early.
history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=20,
    class_weight=class_weights,
    callbacks=callbacks,
)

  self._warn_if_super_not_called()


Epoch 1/20
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 3s/step - accuracy: 0.4609 - loss: 2.0685 - val_accuracy: 0.5892 - val_loss: 1.3301 - learning_rate: 3.0000e-04
Epoch 2/20
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 3s/step - accuracy: 0.5922 - loss: 1.5798 - val_accuracy: 0.6294 - val_loss: 1.3331 - learning_rate: 3.0000e-04
Epoch 3/20
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 3s/step - accuracy: 0.6530 - loss: 1.4332 - val_accuracy: 0.6888 - val_loss: 1.2893 - learning_rate: 3.0000e-04
Epoch 4/20
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 3s/step - accuracy: 0.6785 - loss: 1.3773 - val_accuracy: 0.6923 - val_loss: 1.3218 - learning_rate: 3.0000e-04
Epoch 5/20
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 3s/step - accuracy: 0.7261 - loss: 1.2466 - val_accuracy: 0.7570 - val_loss: 1.2788 - learning_rate: 3.0000e-04
Epoch 6/20
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [23]:
# Create the output directory first so saving does not fail on a fresh
# checkout where "models/" does not exist yet.
Path("models").mkdir(parents=True, exist_ok=True)
model.save("models/parentModel.keras")