In [None]:
# 1. Import Necessary Libraries
import pandas as pd
import numpy as np
import os
import shutil
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, GlobalAveragePooling2D, Dense, BatchNormalization, Activation, Add, Input, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet101
from sklearn.metrics import f1_score
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 2. Load Data ** The PATH for the train_df and test_df need to be changed **
# Each frame is built in one chain:
#   - md5hash gets a '.jpg' suffix so it matches the image file names
#   - (train only) file_path = '<label>/<md5hash>.jpg'
#   - the 'ddi_scale' and 'qc' columns are dropped
train_df = (
    pd.read_csv('/content/drive/MyDrive/Break through AI/AI-Studio-Ceramide/bttai-ajl-2025/train.csv')
    .assign(md5hash=lambda frame: frame['md5hash'].astype(str) + '.jpg')
    .assign(file_path=lambda frame: frame['label'] + '/' + frame['md5hash'])
    .drop(columns=['ddi_scale', 'qc'])
)

test_df = (
    pd.read_csv('/content/drive/MyDrive/Break through AI/AI-Studio-Ceramide/bttai-ajl-2025/test.csv')
    .assign(md5hash=lambda frame: frame['md5hash'].astype(str) + '.jpg')
    .drop(columns=['ddi_scale', 'qc'])
)

# Show the first rows of the training frame (swap in test_df to inspect the test set)
print('train_df: ')
train_df.head(10)

train_df: 


Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,file_path
0,fd06d13de341cc75ad679916c5d7e6a6.jpg,4,4,prurigo-nodularis,benign-epidermal,benign,prurigo-nodularis/fd06d13de341cc75ad679916c5d7...
1,a4bb4e5206c4e89a303f470576fc5253.jpg,1,1,basal-cell-carcinoma-morpheiform,malignant-epidermal,malignant,basal-cell-carcinoma-morpheiform/a4bb4e5206c4e...
2,c94ce27e389f96bda998e7c3fa5c4a2e.jpg,5,5,keloid,inflammatory,non-neoplastic,keloid/c94ce27e389f96bda998e7c3fa5c4a2e.jpg
3,ebcf2b50dd943c700d4e2b586fcd4425.jpg,3,3,basal-cell-carcinoma,malignant-epidermal,malignant,basal-cell-carcinoma/ebcf2b50dd943c700d4e2b586...
4,c77d6c895f05fea73a8f3704307036c0.jpg,1,1,prurigo-nodularis,benign-epidermal,benign,prurigo-nodularis/c77d6c895f05fea73a8f37043070...
5,9d5a90fa3f6934608add10e698001760.jpg,3,5,prurigo-nodularis,benign-epidermal,benign,prurigo-nodularis/9d5a90fa3f6934608add10e69800...
6,57885e3f5a3c043c3621a06bca196282.jpg,2,1,seborrheic-keratosis,benign-epidermal,benign,seborrheic-keratosis/57885e3f5a3c043c3621a06bc...
7,8adbbbc4e50a0df8b89710dfd495d3c5.jpg,5,4,eczema,inflammatory,non-neoplastic,eczema/8adbbbc4e50a0df8b89710dfd495d3c5.jpg
8,763ed484fcc50bf7b67cc44f95bac95e.jpg,-1,-1,folliculitis,inflammatory,non-neoplastic,folliculitis/763ed484fcc50bf7b67cc44f95bac95e.jpg
9,0198c74d604fde7055671d1b35869664.jpg,3,4,squamous-cell-carcinoma,malignant-epidermal,malignant,squamous-cell-carcinoma/0198c74d604fde7055671d...


In [None]:
# 3. Data Preprocessing
# Folder the training image paths in 'file_path' are resolved against
train_dir = '/content/drive/MyDrive/Break through AI/AI-Studio-Ceramide/bttai-ajl-2025/train/train/'

# Learn the label -> integer mapping, then store the integer ids next to the labels
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
train_df['encoded_label'] = label_encoder.transform(train_df['label'])

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
train_data, val_data = train_test_split(train_df, random_state=42, test_size=0.2)

# Both generators only multiply pixel values by 1/255; no augmentation is applied
val_datagen = ImageDataGenerator(rescale=1./255)
train_datagen = ImageDataGenerator(rescale=1./255)

# Report the size of each split
print("Train_data shape: ", train_data.shape)
print("Val_data shape: ", val_data.shape)

Train_data shape:  (2288, 8)
Val_data shape:  (572, 8)


In [None]:
def create_generator(dataframe, directory, batch_size=32, target_size=(128, 128), shuffle=True):
    """
    Create a Keras image generator from a DataFrame of image paths and labels.

    The generator is built from the notebook-level `train_datagen`, reads image
    paths from the 'file_path' column (resolved relative to `directory`) and
    yields the values of the 'encoded_label' column unchanged as targets
    (class_mode='raw').

    Parameters:
    dataframe - DataFrame with 'file_path' and 'encoded_label' columns
    directory - folder the 'file_path' entries are relative to
    batch_size - number of images per batch (default=32)
    target_size - (height, width) every image is resized to (default=(128, 128))
    shuffle - whether rows are shuffled between epochs (default=True). Pass
              False when the output order must match the DataFrame row order,
              e.g. to compare predictions against ground-truth labels.

    Returns:
    generator - the iterator returned by flow_from_dataframe
    """
    generator = train_datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=directory,
        x_col='file_path',  # '<label>/<md5hash>.jpg', relative to `directory`
        y_col='encoded_label',
        target_size=target_size,
        batch_size=batch_size,
        class_mode='raw',
        shuffle=shuffle,
        validate_filenames=False  # Skip the per-file existence/extension check
    )
    return generator

In [None]:
# Build one generator per split; both read their images from the training folder
train_generator = create_generator(dataframe=train_data, directory=train_dir)
val_generator = create_generator(dataframe=val_data, directory=train_dir)

Found 2288 non-validated image filenames.
Found 572 non-validated image filenames.


In [None]:
# TODO: You should implement the model architecture here.
# Feel free to explore different model types that best serve your purpose.

def bottleneck_block(x, filters, strides=(1, 1), downsample=False):
    """
    A bottleneck residual block for ResNet-101
    (1x1 conv -> 3x3 conv -> 1x1 conv, added to a skip connection).

    Parameters:
    x - input tensor
    filters - number of filters in the first 1x1 conv and the 3x3 conv layer;
              the block outputs 4 * filters channels
    strides - strides for the 3x3 conv layer (default=(1, 1)); an int is
              accepted and used for both spatial dimensions
    downsample - boolean forcing a projection (1x1 conv + batch norm) on the
                 skip connection. A projection is also applied automatically
                 when the strides or the channel count would otherwise make
                 the skip connection incompatible with the main branch.

    Returns:
    x - output tensor with 4 * filters channels
    """
    out_channels = 4 * filters
    if isinstance(strides, int):
        strides = (strides, strides)
    else:
        strides = tuple(strides)

    # To save original input x for the skip connection
    shortcut = x

    # (1x1 Conv) Reduce dimensionality
    x = Conv2D(filters=filters, kernel_size=(1, 1), strides=(1, 1), padding='valid', use_bias=False)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)

    # (3x3 Conv) Main computation; this is where spatial downsampling happens
    x = Conv2D(filters=filters, kernel_size=(3, 3), strides=strides, padding='same', use_bias=False)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)

    # (1x1 Conv) Expand back -- must consume the main branch `x`, not the shortcut
    x = Conv2D(filters=out_channels, kernel_size=(1, 1), strides=(1, 1), padding='valid', use_bias=False)(x)
    x = BatchNormalization()(x)

    # Project the shortcut whenever it would not match the main branch in
    # spatial size (strided block) or channel count (first block of a stage).
    needs_projection = (
        downsample
        or strides != (1, 1)
        or shortcut.shape[-1] != out_channels
    )
    if needs_projection:
        shortcut = Conv2D(filters=out_channels, kernel_size=(1, 1), strides=strides, padding='valid', use_bias=False)(shortcut)
        shortcut = BatchNormalization()(shortcut)

    # Add skip connection
    x = Add()([shortcut, x])
    x = Activation('relu')(x)

    return x


In [None]:
# TODO: You should implement the model architecture here.
# Feel free to explore different model types that best serve your purpose.

def build_resnet101(input_shape, num_classes=1000):
    """
    Builds a ResNet-101 style model from bottleneck blocks.

    Layout: 7x7 stride-2 convolution and 3x3 stride-2 max pooling, followed by
    four residual stages of 3, 4, 23 and 3 bottleneck blocks, global average
    pooling and a softmax classification layer.

    Parameters:
    input_shape (tuple): The shape of the input images, e.g. (128, 128, 3).
    num_classes (int): The number of output classes (units of the softmax layer).

    Returns:
    model (tf.keras.Model): The (uncompiled) ResNet-101 model.
    """
    inputs = Input(shape=input_shape)

    # Initial Convolution + Max Pool
    x = Conv2D(filters=64, kernel_size=(7, 7), strides=(2, 2), padding='same', use_bias=False)(inputs)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)

    # Residual stages: (filters, number of blocks, strides of the first block).
    # Only the first block of a stage changes resolution / channel count, so
    # only that block receives the stage strides and a projection shortcut.
    stages = (
        (64, 3, (1, 1)),     # Conv2_x
        (128, 4, (2, 2)),    # Conv3_x
        (256, 23, (2, 2)),   # Conv4_x
        (512, 3, (2, 2)),    # Conv5_x
    )
    for filters, num_blocks, first_strides in stages:
        for block_index in range(num_blocks):
            if block_index == 0:
                x = bottleneck_block(x, filters=filters, strides=first_strides, downsample=True)
            else:
                x = bottleneck_block(x, filters=filters)

    # Global Average Pooling & Fully Connected Layer
    x = GlobalAveragePooling2D()(x)
    output = Dense(num_classes, activation='softmax')(x)

    # Create model
    model = Model(inputs=inputs, outputs=output)

    return model

In [None]:
# Create the ResNet_101 model with one softmax unit per class known to the
# label encoder (the builder's default of 1000 units does not match this dataset)
num_classes = len(label_encoder.classes_)
resnet101_model = build_resnet101(input_shape=(128, 128, 3), num_classes=num_classes)

# Compile model
# Labels are integer class ids, hence the sparse categorical loss.
# 1e-3 is Adam's default step size; the previous 0.055 is far larger than Adam
# is normally run with and made training diverge.
resnet101_model.compile(optimizer=Adam(learning_rate=1e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display model summary
resnet101_model.summary()

In [None]:
# TODO: Train your model here.
# One pass over the training generator, scored on the validation generator
history = resnet101_model.fit(
    x=train_generator,
    validation_data=val_generator,
    epochs=1,
)

# Validation accuracy recorded for the last epoch of this run
print("Val Accuracy: ", history.history['val_accuracy'][-1])

  self._warn_if_super_not_called()


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1925s[0m 26s/step - accuracy: 0.0953 - loss: 15.7123 - val_accuracy: 0.1154 - val_loss: 3.4342
Val Accuracy:  0.11538461595773697


In [None]:
from os.path import join
# 6. Make Predictions on Test Data
def preprocess_test_data(test_df, directory, target_size=(128, 128), batch_size=32):
    """
    Create an order-preserving image generator for the unlabeled test set.

    Parameters:
    test_df - DataFrame with an 'md5hash' column holding image file names.
              A 'file_path' column (directory joined with the file name) is
              added to this DataFrame in place.
    directory - folder containing the test images
    target_size - (height, width) every image is resized to (default=(128, 128))
    batch_size - number of images per batch (default=32)

    Returns:
    test_generator - generator yielding batches of images only (no labels),
                     in the same order as the rows of test_df
    """
    # Create file_path column holding the full path of every image
    test_df['file_path'] = test_df['md5hash'].apply(lambda file_name: os.path.join(directory, file_name))

    test_datagen = ImageDataGenerator(rescale=1./255)
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_df,
        # file_path already includes `directory`; passing it again would
        # duplicate the prefix whenever `directory` is a relative path.
        directory=None,
        x_col='file_path',
        y_col=None,
        target_size=target_size,
        batch_size=batch_size,
        class_mode=None,
        # Predictions are matched to test_df rows by position, so the
        # generator must not shuffle (Keras shuffles by default).
        shuffle=False,
        validate_filenames=False
    )
    return test_generator

In [None]:
# Point at the folder of test images and build the generator that reads them
test_dir = '/content/drive/MyDrive/Break through AI/AI-Studio-Ceramide/bttai-ajl-2025/test/test/'
test_generator = preprocess_test_data(test_df=test_df, directory=test_dir)

Found 1227 non-validated image filenames.


In [None]:
# Generate predictions for the test set with the trained model.
# predict() returns rows in the order the generator yields them, so the labels
# below only line up with test_df if the generator does not shuffle.
if getattr(test_generator, 'shuffle', False):
    print("WARNING: test_generator shuffles its rows; predicted labels will not line up with test_df.")
predictions = resnet101_model.predict(test_generator)

# Convert predictions to class indices
predicted_labels = np.argmax(predictions, axis=1)

# Map class indices back to label strings and attach them to test_df
predicted_labels = label_encoder.inverse_transform(predicted_labels)
test_df['predicted_label'] = predicted_labels

# Extract the true labels for the validation set
val_true_labels = val_data['label']

# The F1 score must compare validation labels with predictions made on the
# validation images (not on the test images). A dedicated non-shuffling
# generator keeps the predictions in the same order as val_data.
val_eval_generator = val_datagen.flow_from_dataframe(
    dataframe=val_data,
    directory=train_dir,
    x_col='file_path',
    y_col='encoded_label',
    target_size=(128, 128),
    batch_size=32,
    class_mode='raw',
    shuffle=False,
    validate_filenames=False
)
val_predictions = resnet101_model.predict(val_eval_generator)

# Extract predicted labels for the validation set
val_predicted_labels = label_encoder.inverse_transform(np.argmax(val_predictions, axis=1))

# Calculate the f1_score
f1 = f1_score(val_true_labels, val_predicted_labels, average='weighted')
print("F1 Score: ", f1)

# Remove .jpg to match sample submission
test_df['md5hash'] = test_df['md5hash'].str.replace('.jpg', '', regex=False)

# Save the predictions straight into the Izabel_google_colabs folder for submission
folder_path = '/content/drive/MyDrive/Break through AI/AI-Studio-Ceramide/Izabel_google_colabs'
os.makedirs(folder_path, exist_ok=True)
output_path = os.path.join(folder_path, 'predictions.csv')
test_df[['md5hash', 'predicted_label']].to_csv(output_path, index=False)
print("Predictions saved to '" + output_path + "'")

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 355ms/step
F1 Score:  0.04345916627626186
Predictions saved to 'predictions.csv'


'/content/drive/MyDrive/Break through AI/AI-Studio-Ceramide/Izabel_google_colabs/predictions.csv'

In [None]:
# Read the saved submission file back from Drive
predictions_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Break through AI/AI-Studio-Ceramide/Izabel_google_colabs/predictions.csv'
)

# Show the first five rows to compare the layout against sample_submission
predictions_df.head(5)

Unnamed: 0,md5hash,predicted_label
0,0844ae634f0e6e7ef1f73c2aeecbae0e,basal-cell-carcinoma
1,3b290d262098f761d719aa07cf36c040,basal-cell-carcinoma
2,cf561d08ac46d0fda678bff6621005ee,basal-cell-carcinoma
3,e6371069be05c6b0a95b4b3f1bacc9a5,basal-cell-carcinoma
4,f76cddb37265f97508f159078dcc7e7c,basal-cell-carcinoma
