# Skintelligence
## Dataset splitting

Importing packages

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

Set the project and file paths

In [2]:
# The project root is the parent of the current working directory
proj_dir = os.path.join(os.getcwd(), os.pardir)

# Every input and output of this notebook lives in <project>/Data/Final
final_data_dir = os.path.join(proj_dir, 'Data', 'Final')

# Input: the complete VQA dataset
vqa_file = os.path.join(final_data_dir, 'Final Complete Dataset.csv')

# Outputs: one CSV per image split
train_img_file = os.path.join(final_data_dir, 'img_train.csv')
val_img_file = os.path.join(final_data_dir, 'img_val.csv')
test_img_file = os.path.join(final_data_dir, 'img_test.csv')

Read the VQA CSV file

In [3]:
# Load the complete VQA dataset from its CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer=vqa_file)

In [4]:
# Display the column labels of the loaded frame
df.columns

Index(['id', 'skincap_file_path', 'ori_file_path', 'caption_zh_polish_en',
       'disease', 'question', 'answer', 'caption_zh', 'caption_zh_polish',
       'remark', 'source', 'skin_tone', 'malignant', 'fitzpatrick_scale',
       'fitzpatrick_centaur', 'nine_partition_label', 'three_partition_label',
       'url', 'Vesicle', 'Papule', 'Macule', 'Plaque', 'Abscess', 'Pustule',
       'Bulla', 'Patch', 'Nodule', 'Ulcer', 'Crust', 'Erosion', 'Excoriation',
       'Atrophy', 'Exudate', 'Purpura/Petechiae', 'Fissure', 'Induration',
       'Xerosis', 'Telangiectasia', 'Scale', 'Scar', 'Friable', 'Sclerosis',
       'Pedunculated', 'Exophytic/Fungating', 'Warty/Papillomatous',
       'Dome-shaped', 'Flat topped', 'Brown(Hyperpigmentation)', 'Translucent',
       'White(Hypopigmentation)', 'Purple', 'Yellow', 'Black', 'Erythema',
       'Comedo', 'Lichenification', 'Blue', 'Umbilicated', 'Poikiloderma',
       'Salmon', 'Wheal', 'Acuminate', 'Burrow', 'Gray', 'Pigmented', 'Cyst',
       'Do n

Process data

In [5]:
# Normalise the disease names: every hyphen becomes a space
df['disease'] = df['disease'].str.replace('-', ' ')

# Columns that are not wanted in the per-image table
unwanted_columns = [
    'id',
    'ori_file_path',
    'caption_zh_polish_en',
    'question',
    'answer',
    'caption_zh',
    'caption_zh_polish',
    'remark',
    'source',
    'skin_tone',
    'malignant',
    'fitzpatrick_scale',
    'fitzpatrick_centaur',
    'nine_partition_label',
    'three_partition_label',
    'url',
    'Do not consider this image',
]

# Remove those columns, collapse rows that are now exact duplicates,
# and renumber the index from zero
df_images = (
    df.drop(columns=unwanted_columns)
    .drop_duplicates()
    .reset_index(drop=True)
)

Function to perform stratified data split

In [6]:
# Defining a function to split the data based on the disease column
def stratified_split(df, disease_column, min_representation, train_size=0.6, val_size=0.2, test_size=0.2, random_state=None):
    """Split ``df`` into train, validation and test frames, stratified by disease.

    Diseases that occur at most ``min_representation`` times are treated as
    under-represented: all of their rows are placed in the training set. The
    remaining rows are divided with two stratified ``train_test_split`` calls
    (train vs. hold-out, then hold-out into validation vs. test).

    Parameters
    ----------
    df : pandas.DataFrame
        Table to split.
    disease_column : str
        Name of the column holding the label used for stratification.
    min_representation : int
        Inclusive threshold; a disease with this many rows or fewer is not
        stratified and goes to the training set in full.
    train_size : float, default 0.6
        Accepted for interface compatibility. It does not enter the
        computation: the training share is whatever is left after
        ``val_size + test_size`` has been held out. A ``UserWarning`` is
        issued when the three fractions do not sum to 1, because the value
        passed here is then not the fraction actually used.
    val_size : float, default 0.2
        Fraction of the stratified rows assigned to the validation set.
    test_size : float, default 0.2
        Fraction of the stratified rows assigned to the test set.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to both ``train_test_split`` calls. Pass an integer to get
        the same split on every run; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data, test_data)``. The fractions apply only to the
        stratified rows; the under-represented rows are added on top of the
        training share.

    Notes
    -----
    A stratified split needs at least two rows per disease on each call, so a
    disease only slightly above ``min_representation`` can still make
    ``train_test_split`` raise ``ValueError``.
    """
    import warnings

    # train_size is never used below; tell the caller when it disagrees with
    # the fractions that really drive the split instead of ignoring it silently.
    if abs((train_size + val_size + test_size) - 1.0) > 1e-9:
        warnings.warn(
            "train_size, val_size and test_size do not sum to 1; train_size is "
            "ignored and the training share is 1 - (val_size + test_size).",
            UserWarning,
            stacklevel=2,
        )

    # Identify under-represented diseases once and reuse the mask for both halves
    disease_counts = df[disease_column].value_counts()
    rare_diseases = disease_counts[disease_counts <= min_representation].index
    is_rare = df[disease_column].isin(rare_diseases)
    small_disease_df = df[is_rare]
    large_disease_df = df[~is_rare]

    # Nothing left to stratify: everything is training data, and the other two
    # sets are empty frames that keep the original columns.
    if large_disease_df.empty:
        return small_disease_df.copy(), large_disease_df.copy(), large_disease_df.copy()

    holdout_size = val_size + test_size

    # Split the remaining data for larger disease categories
    train_data, temp_data = train_test_split(
        large_disease_df,
        stratify=large_disease_df[disease_column],
        test_size=holdout_size,
        random_state=random_state,
    )

    # Split the temporary dataset into validation and test sets; the test
    # fraction is rescaled because it now applies to the hold-out only.
    val_data, test_data = train_test_split(
        temp_data,
        stratify=temp_data[disease_column],
        test_size=test_size / holdout_size,
        random_state=random_state,
    )

    # Add the small disease entries to the training set
    train_data = pd.concat([train_data, small_disease_df])

    return train_data, val_data, test_data

In [7]:
# Perform a stratified split based on diseases.
# Seed NumPy's global random generator immediately before splitting:
# train_test_split draws from that generator when it is given no random_state,
# so re-running this cell (or the whole notebook) yields the same
# train/validation/test membership instead of a fresh shuffle each time.
np.random.seed(42)
df_img_train, df_img_val, df_img_test = stratified_split(df_images, 'disease', 4)

Check the length of the training, validation and testing sets

In [8]:
# Report how many rows ended up in each split
for size_label, split_frame in (
    ("Training set size:", df_img_train),
    ("Validation set size:", df_img_val),
    ("Test set size:", df_img_test),
):
    print(size_label, len(split_frame))

Training set size: 2426
Validation set size: 785
Test set size: 785


Save the split datasets into separate files

In [9]:
# Write each split to its own CSV file, leaving out the DataFrame index
for split_frame, out_path in (
    (df_img_train, train_img_file),
    (df_img_val, val_img_file),
    (df_img_test, test_img_file),
):
    split_frame.to_csv(out_path, index=False)