# Equitable AI For Dermatology - Break Through Tech Team Selenium

# Set Up

## Importing Libraries

In [15]:
# 1. Import Necessary Libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.applications import ResNet50
from sklearn.utils.class_weight import compute_class_weight

## Loading Data (Mounting Google Drive)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!ls /content/drive/My\ Drive/

'07 01-07 04 Weekend Itinerary.gdoc'
 1XnoQXeeTKtBC4gk9ZJwN4P_FZ4IrNmFi
 Accounts
'AP PSYCH'
'AR Instructions for Section 2 of I-9 Form (pdf version).pdf'
'Arizona Renting June-August 2023'
'AvenueE 2023-24 Program Acceptance.pdf'
 building_data_loader.ipynb
 ca_building_load_forecasting.ipynb
'Christmas Photo Album 2022'
'Colab Notebooks'
'Computer Science.gsheet'
'Consent To Record Form.pdf'
'Copy of Buget- again..gsheet'
'Copy of Copy of 0. Coding Interview Checklist.gdoc'
'Copy of Copy of MM5: Ideal vs. Actual Professional Roadmap Progress (Template).gdoc'
'Copy of ECS 164 Assignment 1, Winter 2025.gdoc'
'Copy of [FINAL TEMPLATE] Team Alliance_Fall 2024 AI Studio.gdoc'
'Copy of Job Hunter Google Sheets Tracking Template.gsheet'
'Copy of Melville Scholarship Essay (500 words on impacting society).gdoc'
'Copy of MM3: Behavioral Interview Case Study Handout (FY25).gdoc'
'Copy of Professional Roadmap Template.gdoc'
 Counselor.gdoc
'Cover Letter (1).gdoc'
'Cover Letter.gdoc'
'C++ Progra

In [4]:
train_folder_path = '/content/drive/MyDrive/VIR_AJL_Team_Selenium/Data/train.csv'
train_df = pd.read_csv(train_folder_path)

test_folder_path = '/content/drive/MyDrive/VIR_AJL_Team_Selenium/Data/test.csv'
test_df = pd.read_csv(test_folder_path)

In [5]:
# Add .jpg extension to md5hash column to reference the file_name
train_df['md5hash'] = train_df['md5hash'].astype(str) + '.jpg'
test_df['md5hash'] = test_df['md5hash'].astype(str) + '.jpg'

# Combine label and md5hash to form the correct path
train_df['file_path'] = train_df['label'] + '/' + train_df['md5hash']

# Exploratory Data Analysis

In [6]:
# View information about data in each column
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2860 entries, 0 to 2859
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   md5hash                2860 non-null   object
 1   fitzpatrick_scale      2860 non-null   int64 
 2   fitzpatrick_centaur    2860 non-null   int64 
 3   label                  2860 non-null   object
 4   nine_partition_label   2860 non-null   object
 5   three_partition_label  2860 non-null   object
 6   qc                     90 non-null     object
 7   ddi_scale              2860 non-null   int64 
 8   file_path              2860 non-null   object
dtypes: int64(3), object(6)
memory usage: 201.2+ KB


In [7]:
# Check the first few rows to understand the structure of the data set
train_df.head()

Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,ddi_scale,file_path
0,fd06d13de341cc75ad679916c5d7e6a6.jpg,4,4,prurigo-nodularis,benign-epidermal,benign,,34,prurigo-nodularis/fd06d13de341cc75ad679916c5d7...
1,a4bb4e5206c4e89a303f470576fc5253.jpg,1,1,basal-cell-carcinoma-morpheiform,malignant-epidermal,malignant,,12,basal-cell-carcinoma-morpheiform/a4bb4e5206c4e...
2,c94ce27e389f96bda998e7c3fa5c4a2e.jpg,5,5,keloid,inflammatory,non-neoplastic,1 Diagnostic,56,keloid/c94ce27e389f96bda998e7c3fa5c4a2e.jpg
3,ebcf2b50dd943c700d4e2b586fcd4425.jpg,3,3,basal-cell-carcinoma,malignant-epidermal,malignant,,34,basal-cell-carcinoma/ebcf2b50dd943c700d4e2b586...
4,c77d6c895f05fea73a8f3704307036c0.jpg,1,1,prurigo-nodularis,benign-epidermal,benign,,12,prurigo-nodularis/c77d6c895f05fea73a8f37043070...


# Data Preprocessing

In [8]:
# 3. Data Preprocessing
# Encode the labels
label_encoder = LabelEncoder()
train_df['encoded_label'] = label_encoder.fit_transform(train_df['label'])

class_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Label Mapping:", class_mapping)

# Compute Class Weights for Imbalance
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_df['encoded_label']),
    y=train_df['encoded_label']
)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# Split the data into training and validation sets
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

# 5. Data Augmentation with ImageDataGenerator
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,  # Increased rotation range
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.3,  # Slightly increased zoom
    horizontal_flip=True,
    brightness_range=[0.7, 1.3],  # Adjust brightness more
    fill_mode='nearest'
)
val_datagen = ImageDataGenerator(rescale=1./255)

# Define the directory paths
train_dir = '/content/drive/MyDrive/VIR_AJL_Team_Selenium/Data/Pictures/train'

Label Mapping: {'acne': 0, 'acne-vulgaris': 1, 'actinic-keratosis': 2, 'basal-cell-carcinoma': 3, 'basal-cell-carcinoma-morpheiform': 4, 'dermatofibroma': 5, 'dermatomyositis': 6, 'dyshidrotic-eczema': 7, 'eczema': 8, 'epidermal-nevus': 9, 'folliculitis': 10, 'kaposi-sarcoma': 11, 'keloid': 12, 'malignant-melanoma': 13, 'melanoma': 14, 'mycosis-fungoides': 15, 'prurigo-nodularis': 16, 'pyogenic-granuloma': 17, 'seborrheic-keratosis': 18, 'squamous-cell-carcinoma': 19, 'superficial-spreading-melanoma-ssm': 20}


In [9]:
print(train_df['label'].unique())
print (train_df['label'].nunique())

['prurigo-nodularis' 'basal-cell-carcinoma-morpheiform' 'keloid'
 'basal-cell-carcinoma' 'seborrheic-keratosis' 'eczema' 'folliculitis'
 'squamous-cell-carcinoma' 'actinic-keratosis' 'mycosis-fungoides'
 'acne-vulgaris' 'dyshidrotic-eczema' 'melanoma' 'epidermal-nevus'
 'malignant-melanoma' 'pyogenic-granuloma' 'dermatofibroma'
 'kaposi-sarcoma' 'acne' 'dermatomyositis'
 'superficial-spreading-melanoma-ssm']
21


In [10]:
train_df['encoded_label'] = train_df['encoded_label'].astype(int)
val_data['encoded_label'] = val_data['encoded_label'].astype(int)

In [11]:
def create_generator(dataframe, directory, batch_size=32, target_size=(128, 128), shuffle=True):
    generator = train_datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=directory,
        x_col='file_path',
        y_col='encoded_label',
        target_size=target_size,
        batch_size=batch_size,
        class_mode='raw',
        shuffle=shuffle,
        validate_filenames=False
    )
    return generator

In [12]:
# Create Training & Validation Generators
train_generator = create_generator(train_data, train_dir)
val_generator = create_generator(val_data, train_dir, shuffle=False)

Found 2288 non-validated image filenames.
Found 572 non-validated image filenames.


In [13]:
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

# Freeze the layers of ResNet50 so we don't train them
base_model.trainable = True

# Build the full model by adding custom layers
model = Sequential([
    base_model,
    Flatten(),
    Dense(256, activation='relu'),  # Dense layer for classification
    Dropout(0.5),  # Dropout to prevent overfitting
    Dense(len(label_encoder.classes_), activation='softmax')  # Output layer with softmax
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [17]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# 10. Train the Model
model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=30,  # Increased epochs for better learning
    class_weight=class_weight_dict,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

Epoch 1/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m731s[0m 10s/step - accuracy: 0.0626 - loss: 3.0306 - val_accuracy: 0.0385 - val_loss: 3.0660 - learning_rate: 5.0000e-04
Epoch 2/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m722s[0m 10s/step - accuracy: 0.0760 - loss: 3.0442 - val_accuracy: 0.0699 - val_loss: 3.0231 - learning_rate: 5.0000e-04
Epoch 3/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m716s[0m 10s/step - accuracy: 0.1093 - loss: 2.9957 - val_accuracy: 0.0979 - val_loss: 3.0368 - learning_rate: 5.0000e-04
Epoch 4/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m721s[0m 10s/step - accuracy: 0.0979 - loss: 3.0716 - val_accuracy: 0.0420 - val_loss: 3.0558 - learning_rate: 5.0000e-04
Epoch 5/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m721s[0m 10s/step - accuracy: 0.1239 - loss: 2.9030 - val_accuracy: 0.0175 - val_loss: 3.1689 - learning_rate: 5.0000e-04
Epoch 6/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7efa6cfefe10>

In [18]:
# 6. Make Predictions on Test Data
test_df['file_path'] = test_df['md5hash']

def preprocess_test_data(test_df, directory, target_size=(128, 128), batch_size=32):
    """
    Loads and preprocesses test images.

    Args:
    - test_df: DataFrame containing test image file names.
    - directory: Directory path where test images are stored.
    - target_size: Image size (default: 128x128).
    - batch_size: Number of images to process at a time.

    Returns:
    - test_generator: Preprocessed test data generator.
    """
    test_datagen = ImageDataGenerator(rescale=1./255)  # Normalize pixel values

    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_df,
        directory=directory,
        x_col='file_path',
        y_col=None,  # No labels for test data
        target_size=target_size,
        batch_size=batch_size,
        class_mode=None,  # No labels since we are predicting
        shuffle=False  # Keep order for correct result mapping
    )

    return test_generator

# Model Development

# Model Evaluation

In [19]:
# Load test data
test_dir = '/content/drive/MyDrive/VIR_AJL_Team_Selenium/Data/Pictures/test'
test_generator = preprocess_test_data(test_df, test_dir)

Found 1227 validated image filenames.


In [20]:
# Predict the labels on the validation set
predictions = model.predict(test_generator)  # Model predictions
predicted_labels = np.argmax(predictions, axis=1)  # Convert probabilities to class indices


  self._warn_if_super_not_called()


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2s/step


In [21]:
# Map numeric labels back to original category names
test_df['predicted_label'] = label_encoder.inverse_transform(predicted_labels)

print(test_df[['md5hash', 'predicted_label']].head(10))


                                md5hash predicted_label
0  0844ae634f0e6e7ef1f73c2aeecbae0e.jpg   acne-vulgaris
1  3b290d262098f761d719aa07cf36c040.jpg   acne-vulgaris
2  cf561d08ac46d0fda678bff6621005ee.jpg   acne-vulgaris
3  e6371069be05c6b0a95b4b3f1bacc9a5.jpg   acne-vulgaris
4  f76cddb37265f97508f159078dcc7e7c.jpg   acne-vulgaris
5  ed522f3617a30ab79fa11e140d442e2d.jpg   acne-vulgaris
6  13aa2de8804ae601cd45c1c4cac9bc6f.jpg   acne-vulgaris
7  76c6ff58a8babae647dcc37ea074939d.jpg   acne-vulgaris
8  d04ce86b818b31edf84e54444ec97295.jpg   acne-vulgaris
9  62c9e3126c690939ea356694a047d23b.jpg   acne-vulgaris


In [22]:
from sklearn.metrics import classification_report

# Predict on validation data
val_predictions = model.predict(val_generator)

# Convert softmax probabilities to class labels
val_predicted_labels = np.argmax(val_predictions, axis=1)

# True labels from validation set
true_labels = val_generator.labels

# Generate classification report
report = classification_report(true_labels, val_predicted_labels)

print(report)

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2s/step
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.07      1.00      0.13        40
           2       0.00      0.00      0.00        24
           3       0.00      0.00      0.00        59
           4       0.00      0.00      0.00         7
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        24
           7       0.00      0.00      0.00         8
           8       0.00      0.00      0.00        26
           9       0.00      0.00      0.00        15
          10       0.00      0.00      0.00        55
          11       0.00      0.00      0.00        25
          12       0.00      0.00      0.00        22
          13       0.00      0.00      0.00        17
          14       0.00      0.00      0.00        35
          15       0.00      0.00      0.00        20
        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
