In [3]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
!pip install torch


Collecting torch
  Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl (150.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.8/150.8 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Installing collected packages: typing-extensions, torch
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.7.1
    Uninstalling typing_extensions-4.7.1:
      Successfully uninstalled typing_extensions-4.7.1
Successfully installed torch-2.2.2 typing-extensions-4.12.2


In [4]:
# Load the train and test metadata CSVs from the working directory.

train_path = 'train-metadata.csv'
test_path = 'test-metadata.csv'

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The original read emitted a warning on this line
# (presumably a mixed-type DtypeWarning); whole-file inference avoids it and
# keeps column dtypes stable between runs.
train_metadata = pd.read_csv(train_path, low_memory=False)
test_metadata = pd.read_csv(test_path, low_memory=False)

# Rich display of the training frame (pandas truncates long frames).
train_metadata

  train_metadata = pd.read_csv(train_path)


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,31.712570,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,22.575830,...,,Benign,Benign,,,,,,,99.804040
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.725520,...,,Benign,Benign,,,,,,,70.442510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,22.574335,...,IL_9520694,Benign,Benign,,,,,,,99.999988
401055,ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,19.977640,...,,Benign,Benign,,,,,,,99.999820
401056,ISIC_9999960,0,IP_0076153,65.0,female,anterior torso,2.05,TBP tile: close-up,3D: XP,17.332567,...,IL_9852274,Benign,Benign,,,,,,,99.999416
401057,ISIC_9999964,0,IP_5231513,30.0,female,anterior torso,2.80,TBP tile: close-up,3D: XP,22.288570,...,,Benign,Benign,,,,,,,100.000000


In [5]:
# Data preparation: count the null entries in every column of the training frame.

missing_val = train_metadata.isna().sum()
print("Missing Values:\n", missing_val)

Missing Values:
 isic_id                              0
target                               0
patient_id                           0
age_approx                        2798
sex                              11517
anatom_site_general               5756
clin_size_long_diam_mm               0
image_type                           0
tbp_tile_type                        0
tbp_lv_A                             0
tbp_lv_Aext                          0
tbp_lv_B                             0
tbp_lv_Bext                          0
tbp_lv_C                             0
tbp_lv_Cext                          0
tbp_lv_H                             0
tbp_lv_Hext                          0
tbp_lv_L                             0
tbp_lv_Lext                          0
tbp_lv_areaMM2                       0
tbp_lv_area_perim_ratio              0
tbp_lv_color_std_mean                0
tbp_lv_deltaA                        0
tbp_lv_deltaB                        0
tbp_lv_deltaL                        0
tbp_lv_d

In [7]:
# Columns excluded from the working frame.
columns_to_remove = [
    'lesion_id',
    'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
    'mel_mitotic_index', 'mel_thick_mm',
]

# Drop those columns; errors='ignore' skips any listed name that is not present.
train_metadata_cleaned = train_metadata.drop(columns=columns_to_remove, errors='ignore')

# Keep only the rows that have no missing value in any remaining column.
train_metadata_cleaned_no_nulls = train_metadata_cleaned.dropna(axis=0, how='any')

In [14]:
# Class distribution: number of rows for each distinct value of the target column.
train_metadata_cleaned_no_nulls.loc[:, 'target'].value_counts()

0    381533
1       381
Name: target, dtype: int64

In [15]:
# The target classes are heavily imbalanced, so undersample the majority class.

# Split the cleaned frame into one frame per target label.
class_0 = train_metadata_cleaned_no_nulls.loc[train_metadata_cleaned_no_nulls['target'].eq(0)]
class_1 = train_metadata_cleaned_no_nulls.loc[train_metadata_cleaned_no_nulls['target'].eq(1)]

# Draw, without replacement, as many class-0 rows as there are class-1 rows.
# The fixed random_state keeps the sampled subset reproducible.
class_0_downsampled = resample(
    class_0,
    replace=False,
    n_samples=class_1.shape[0],
    random_state=42,
)

# Stack the sampled class-0 rows on top of all class-1 rows.
balanced_data = pd.concat([class_0_downsampled, class_1])


print("Shape of balanced data:", balanced_data.shape)
print("Distribution after balancing:\n", balanced_data['target'].value_counts())


Shape of balanced data: (762, 48)
Distribution after balancing:
 0    381
1    381
Name: target, dtype: int64


In [18]:
# Bind train_metadata_balanced to the balanced frame. This is a second name for
# the same DataFrame object as balanced_data, not a copy.
train_metadata_balanced = balanced_data

In [17]:
# One-hot encode sex and anatomical site. This rebinds train_metadata_balanced
# to the encoded frame, in which those two raw columns are replaced by
# indicator columns; the unencoded columns remain available in balanced_data.
train_metadata_balanced = pd.get_dummies(data=balanced_data, columns=['sex', 'anatom_site_general'])

In [19]:
# One-hot encode the categorical columns, dropping the first level of each so
# the indicator columns are not redundant.
# Read from balanced_data rather than train_metadata_balanced: another cell
# rebinds train_metadata_balanced to an already-encoded frame that no longer
# has 'sex' / 'anatom_site_general', which makes this call raise KeyError when
# the notebook is run top to bottom. balanced_data always has the raw columns.
encoded_data = pd.get_dummies(balanced_data, columns=['sex', 'anatom_site_general', 'image_type'], drop_first=True)

In [20]:
# Report the encoded frame's (rows, columns) shape and preview its first five rows.
print("Shape of encoded data:", encoded_data.shape)
print("First few rows of encoded data:\n", encoded_data.head(n=5))

Shape of encoded data: (762, 50)
First few rows of encoded data:
              isic_id  target  patient_id  age_approx  clin_size_long_diam_mm  \
44730   ISIC_1176500       0  IP_4013104        50.0                    3.35   
362429  ISIC_9047349       0  IP_9057861        80.0                    2.82   
17389   ISIC_0500366       0  IP_6894380        45.0                    4.81   
367815  ISIC_9176171       0  IP_6422845        70.0                    4.19   
192982  ISIC_4857877       0  IP_7331742        65.0                    2.52   

       tbp_tile_type   tbp_lv_A  tbp_lv_Aext   tbp_lv_B  tbp_lv_Bext  ...  \
44730      3D: white  21.195278    17.154688  31.151941    27.220952  ...   
362429     3D: white  17.323980    13.713120  27.650850    24.298250  ...   
17389      3D: white  19.021500    13.692660  25.292780    24.429680  ...   
367815     3D: white  19.501620    15.038260  28.079500    25.470960  ...   
192982        3D: XP  21.714430    15.247080  31.490850    29.277120

In [21]:
# List every column label of the encoded frame to confirm which indicator
# columns the one-hot encoding produced.
print("Columns after encoding:\n", encoded_data.columns)

Columns after encoding:
 Index(['isic_id', 'target', 'patient_id', 'age_approx',
       'clin_size_long_diam_mm', 'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext',
       'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H',
       'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_areaMM2',
       'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA',
       'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB',
       'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_location',
       'tbp_lv_location_simple', 'tbp_lv_minorAxisMM',
       'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
       'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL',
       'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
       'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'attribution', 'copyright_license',
       'iddx_full', 'iddx_1', 'tbp_lv_dnn_lesion_confidence', 'sex_male',
       'anatom_site_general_head/neck', 'anatom_site_general_low