In [26]:
import pandas as pd
import numpy as np
import time

from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
import scipy.stats as ss


import seaborn as sns
import matplotlib.pyplot as plt


In [27]:
# Load the lesion metadata table from the working directory and preview it.
csv_cancer = pd.read_csv(filepath_or_buffer="./metadata.csv")
csv_cancer

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
0,PAT_1516,1765,,,,,8,,,,...,,NEV,False,False,False,False,False,False,PAT_1516_1765_530.png,False
1,PAT_46,881,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,...,5.0,BCC,True,True,False,True,True,True,PAT_46_881_939.png,True
2,PAT_1545,1867,,,,,77,,,,...,,ACK,True,False,False,False,False,False,PAT_1545_1867_547.png,False
3,PAT_1989,4061,,,,,75,,,,...,,ACK,True,False,False,False,False,False,PAT_1989_4061_934.png,False
4,PAT_684,1302,False,True,POMERANIA,POMERANIA,79,False,MALE,True,...,5.0,BCC,True,True,False,False,True,True,PAT_684_1302_588.png,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2293,PAT_1708,3156,,,,,73,,,,...,,ACK,True,False,False,False,False,False,PAT_1708_3156_175.png,False
2294,PAT_46,880,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,...,12.0,BCC,True,True,False,True,False,False,PAT_46_880_140.png,True
2295,PAT_1343,1217,,,,,74,,,,...,,SEK,False,False,False,False,False,False,PAT_1343_1217_404.png,False
2296,PAT_326,690,False,False,POMERANIA,POMERANIA,58,True,FEMALE,True,...,4.0,BCC,True,False,False,False,False,True,PAT_326_690_823.png,True


In [32]:
def filter_rows(df, diagnostic_col, max_rows=110):
    """Keep, for every diagnostic class, the rows with the fewest missing values.

    Rows are grouped by ``diagnostic_col``; inside each group a ``nan_count``
    column (number of missing cells in the row) is computed, the group is
    sorted by it in ascending order and only the first ``max_rows`` rows are
    kept. The per-class selections are concatenated in group order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    diagnostic_col : str
        Name of the column holding the class label to group by.
    max_rows : int, default 110
        Maximum number of rows kept per class.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and an added ``nan_count`` column.
        When no group exists (e.g. empty input) an empty frame with the same
        columns plus ``nan_count`` is returned.
    """
    selected_rows = []

    for _, group_df in df.groupby(diagnostic_col):
        # `assign` builds a new frame instead of writing into the groupby
        # slice, which avoids SettingWithCopyWarning on the original data.
        ranked = group_df.assign(nan_count=group_df.isna().sum(axis=1)).sort_values(
            by='nan_count',
            ascending=True,
            kind='stable',  # ties keep input order, so the selection is reproducible
        )
        selected_rows.append(ranked.head(max_rows))

    if not selected_rows:
        # pd.concat([]) raises; return an empty frame with the expected schema.
        empty = df.iloc[0:0]
        return empty.assign(nan_count=empty.isna().sum(axis=1))

    # ignore_index=True already yields a clean RangeIndex.
    return pd.concat(selected_rows, ignore_index=True)

In [33]:
# Select the most complete rows per diagnostic class, then preview them with
# the rows that have the most missing values listed first.
filtered_df = filter_rows(df=csv_cancer, diagnostic_col="diagnostic")
filtered_df.sort_values("nan_count", ascending=False)

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed,nan_count
601,PAT_1942,3918,,,,,55,,,,...,SEK,False,False,False,False,False,False,PAT_1942_3918_497.png,False,13
349,PAT_1344,1222,,,,,41,,,,...,NEV,False,False,False,False,False,True,PAT_1344_1222_456.png,False,13
347,PAT_1297,1048,,,,,66,,,,...,NEV,False,True,False,False,False,False,PAT_1297_1048_353.png,False,13
375,PAT_1391,1351,,,,,53,,,,...,NEV,False,False,False,False,False,True,PAT_1391_1351_396.png,False,13
376,PAT_1245,845,,,,,22,,,,...,NEV,False,False,False,False,False,False,PAT_1245_845_32.png,False,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,PAT_311,667,True,False,ITALY,GERMANY,66,False,FEMALE,False,...,BCC,True,False,False,False,False,True,PAT_311_667_416.png,True,0
202,PAT_147,219,False,False,POMERANIA,POMERANIA,39,False,MALE,False,...,BCC,True,True,True,False,True,True,PAT_147_219_206.png,True,0
203,PAT_885,1687,False,True,POMERANIA,POMERANIA,51,False,FEMALE,False,...,BCC,False,False,False,False,False,True,PAT_885_1687_823.png,True,0
204,PAT_734,1390,False,False,GERMANY,GERMANY,41,False,MALE,False,...,BCC,True,True,True,False,True,True,PAT_734_1390_631.png,True,0


In [34]:
# Columns that the per-class mode fill should not touch.
columns_to_remove = ["lesion_id", "patient_id", "img_id", "biopsed"]

# Start from every column of the filtered frame and strip the excluded ones;
# list.remove raises ValueError if an expected column is absent.
columns_name = list(filtered_df.columns)
for col in columns_to_remove:
    columns_name.remove(col)

def fill_na(df, target_column: str, columns=None):
    """Fill missing values of one diagnostic class with that class's column modes.

    Only rows whose ``"diagnostic"`` value equals ``target_column`` are
    changed; ``df`` is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``"diagnostic"`` column.
    target_column : str
        Despite the name, this is the diagnostic *label* (e.g. ``"NEV"``)
        whose rows are filled, not a column name.
    columns : iterable of str, optional
        Columns to fill. When omitted, the module-level ``columns_name`` list
        is used, which is what the original implementation always did.
    """
    if columns is None:
        columns = columns_name

    # The class mask does not depend on the column, so compute it once.
    class_mask = df["diagnostic"] == target_column
    if not class_mask.any():
        # No row carries this label: nothing to fill.
        return

    for column in columns:
        class_values = df.loc[class_mask, column]
        modes = class_values.mode()
        if modes.empty:
            # Column is entirely missing within this class, so there is no
            # mode to fill with; leave it for a later imputation step.
            continue
        df.loc[class_mask, column] = class_values.fillna(modes.iloc[0])

# Mode-fill the rows labelled "NEV" only; other classes are not touched here.
fill_na(df=filtered_df, target_column="NEV")

In [35]:
def fill_na_with_knn(df, n_neighbors=5):
    """Impute missing values in place: KNN for numeric columns, mode for the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is modified in place and nothing is returned.
    n_neighbors : int, default 5
        Number of neighbours passed to ``KNNImputer``.

    Notes
    -----
    Every ``float64``/``int64`` column takes part in the KNN distance, so any
    numeric identifier column still present in ``df`` influences the result
    and comes back as float.
    """
    # Separate the DataFrame into numeric and categorical columns
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object', 'bool']).columns

    # Fill numeric NaN values with KNN imputation. Columns without a single
    # observed value are skipped: the imputer cannot fill them and would
    # otherwise return fewer columns than are being assigned.
    imputable_cols = [col for col in numeric_cols if df[col].notna().any()]
    if imputable_cols:
        imputer_numeric = KNNImputer(n_neighbors=n_neighbors)
        df[imputable_cols] = imputer_numeric.fit_transform(df[imputable_cols])

    # Fill categorical NaN values with mode imputation
    for col in categorical_cols:
        modes = df[col].mode()
        if modes.empty:
            # Entirely missing column: there is no mode to fill with.
            continue
        # Assign back instead of `df[col].fillna(..., inplace=True)`: the
        # chained in-place form acts on a temporary and does not update `df`
        # when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(modes.iloc[0])

# Impute whatever is still missing across the whole filtered frame (in place).
fill_na_with_knn(df=filtered_df)

In [36]:
# Recount missing cells per row after imputation; sorting in descending order
# would surface any row that still has gaps at the top of the preview.
filtered_df["nan_count"] = filtered_df.isnull().sum(axis="columns")
filtered_df.sort_values("nan_count", ascending=False)

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed,nan_count
0,PAT_753,1427.0,False,False,GERMANY,GERMANY,61.0,False,FEMALE,False,...,ACK,False,True,False,True,False,True,PAT_753_1427_496.png,True,0
395,PAT_271,418.0,False,False,GERMANY,GERMANY,79.0,False,FEMALE,False,...,SCC,True,False,True,False,False,True,PAT_271_418_14.png,True,0
397,PAT_978,1844.0,True,False,POMERANIA,POMERANIA,72.0,True,MALE,False,...,SCC,True,UNK,True,UNK,True,True,PAT_978_1844_599.png,True,0
398,PAT_989,1861.0,False,True,POMERANIA,POMERANIA,41.0,True,MALE,True,...,SCC,True,UNK,True,UNK,True,True,PAT_989_1861_156.png,True,0
399,PAT_380,1540.0,False,False,NETHERLANDS,GERMANY,60.0,True,MALE,False,...,SCC,True,False,False,False,False,False,PAT_380_1540_432.png,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,PAT_311,667.0,True,False,ITALY,GERMANY,66.0,False,FEMALE,False,...,BCC,True,False,False,False,False,True,PAT_311_667_416.png,True,0
202,PAT_147,219.0,False,False,POMERANIA,POMERANIA,39.0,False,MALE,False,...,BCC,True,True,True,False,True,True,PAT_147_219_206.png,True,0
203,PAT_885,1687.0,False,True,POMERANIA,POMERANIA,51.0,False,FEMALE,False,...,BCC,False,False,False,False,False,True,PAT_885_1687_823.png,True,0
204,PAT_734,1390.0,False,False,GERMANY,GERMANY,41.0,False,MALE,False,...,BCC,True,True,True,False,True,True,PAT_734_1390_631.png,True,0


In [37]:
# Remove identifier / bookkeeping columns that must not reach the model.
# The frame is rebound rather than mutated with inplace=True, and
# errors="ignore" lets the cell be re-run after the columns are already gone
# (the in-place version raised KeyError on a second run). The reset index is
# now kept instead of only being displayed.
filtered_df = (
    filtered_df
    .drop(columns=["patient_id", "lesion_id", "biopsed", "nan_count"], errors="ignore")
    .reset_index(drop=True)
)
filtered_df

Unnamed: 0,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,...,diameter_1,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id
0,False,False,GERMANY,GERMANY,61.0,False,FEMALE,False,True,True,...,5.0,5.0,ACK,False,True,False,True,False,True,PAT_753_1427_496.png
1,False,False,POMERANIA,POMERANIA,80.0,False,FEMALE,True,False,False,...,7.0,5.0,ACK,True,False,False,False,False,False,PAT_419_2767_323.png
2,False,False,ITALY,BRAZIL,57.0,False,MALE,False,False,True,...,4.0,3.0,ACK,True,False,False,False,False,False,PAT_566_178_625.png
3,False,False,GERMANY,GERMANY,66.0,False,FEMALE,False,True,True,...,6.0,6.0,ACK,True,False,False,False,False,False,PAT_809_1528_118.png
4,False,False,POMERANIA,POMERANIA,78.0,False,FEMALE,False,False,True,...,15.0,10.0,ACK,False,False,False,True,False,True,PAT_742_1402_818.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,False,False,POMERANIA,POMERANIA,77.0,False,FEMALE,False,True,True,...,9.8,7.0,SEK,False,False,False,False,False,True,PAT_1304_1081_84.png
598,False,False,POMERANIA,POMERANIA,79.0,False,FEMALE,False,True,True,...,11.6,9.4,SEK,True,True,False,False,False,True,PAT_2056_4364_112.png
599,False,False,POMERANIA,POMERANIA,67.0,False,FEMALE,False,True,True,...,5.8,5.0,SEK,True,True,False,False,False,True,PAT_2016_4155_600.png
600,False,False,POMERANIA,POMERANIA,60.0,False,FEMALE,False,True,True,...,8.0,6.6,SEK,True,False,True,False,False,False,PAT_1185_679_115.png


In [39]:
# Object-dtype feature columns, excluding the target label and the image id.
cat_cols = (
    filtered_df
    .drop(columns=["diagnostic", "img_id"])
    .select_dtypes(include=object)
    .columns
)
cat_cols

Index(['background_father', 'background_mother', 'gender', 'region', 'itch',
       'grew', 'hurt', 'changed', 'bleed', 'elevation'],
      dtype='object')

In [40]:
# Object columns left after removing the four listed text columns; these are
# the ones the next cell converts from "True"/"False" strings.
bool_col = cat_cols.drop(
    labels=['background_father', 'background_mother', 'gender', 'region']
)
bool_col

Index(['itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation'], dtype='object')

In [41]:
# Translate the textual flags "True"/"False" into real booleans, column by column.
# NOTE(review): Series.map with a dict turns every value that is not a key into
# NaN, so entries such as "UNK" (visible in the preview above) become missing
# again after imputation — confirm this is intended.
filtered_df[bool_col] = filtered_df[bool_col].apply(
    lambda series: series.map({"True": True, "False": False})
)

In [42]:
# Columns stored as 64-bit integers or floats; these get standardized later.
num_cols = filtered_df.select_dtypes(include=[np.int64, np.float64]).columns
num_cols

Index(['age', 'fitspatrick', 'diameter_1', 'diameter_2'], dtype='object')

In [43]:
# `df` is a second name for the same object as `filtered_df` (no copy is made),
# so the in-place column updates in the following cells change both.
df = filtered_df
df

Unnamed: 0,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,...,diameter_1,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id
0,False,False,GERMANY,GERMANY,61.0,False,FEMALE,False,True,True,...,5.0,5.0,ACK,False,True,False,True,False,True,PAT_753_1427_496.png
1,False,False,POMERANIA,POMERANIA,80.0,False,FEMALE,True,False,False,...,7.0,5.0,ACK,True,False,False,False,False,False,PAT_419_2767_323.png
2,False,False,ITALY,BRAZIL,57.0,False,MALE,False,False,True,...,4.0,3.0,ACK,True,False,False,False,False,False,PAT_566_178_625.png
3,False,False,GERMANY,GERMANY,66.0,False,FEMALE,False,True,True,...,6.0,6.0,ACK,True,False,False,False,False,False,PAT_809_1528_118.png
4,False,False,POMERANIA,POMERANIA,78.0,False,FEMALE,False,False,True,...,15.0,10.0,ACK,False,False,False,True,False,True,PAT_742_1402_818.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,False,False,POMERANIA,POMERANIA,77.0,False,FEMALE,False,True,True,...,9.8,7.0,SEK,False,False,False,False,False,True,PAT_1304_1081_84.png
598,False,False,POMERANIA,POMERANIA,79.0,False,FEMALE,False,True,True,...,11.6,9.4,SEK,True,True,False,False,False,True,PAT_2056_4364_112.png
599,False,False,POMERANIA,POMERANIA,67.0,False,FEMALE,False,True,True,...,5.8,5.0,SEK,True,True,False,False,False,True,PAT_2016_4155_600.png
600,False,False,POMERANIA,POMERANIA,60.0,False,FEMALE,False,True,True,...,8.0,6.6,SEK,True,False,True,False,False,False,PAT_1185_679_115.png


In [44]:
# Print column dtypes and non-null counts of the prepared frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602 entries, 0 to 601
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   smoke                602 non-null    bool   
 1   drink                602 non-null    bool   
 2   background_father    602 non-null    object 
 3   background_mother    602 non-null    object 
 4   age                  602 non-null    float64
 5   pesticide            602 non-null    bool   
 6   gender               602 non-null    object 
 7   skin_cancer_history  602 non-null    bool   
 8   cancer_history       602 non-null    bool   
 9   has_piped_water      602 non-null    bool   
 10  has_sewage_system    602 non-null    bool   
 11  fitspatrick          602 non-null    float64
 12  region               602 non-null    object 
 13  diameter_1           602 non-null    float64
 14  diameter_2           602 non-null    float64
 15  diagnostic           602 non-null    obj

In [45]:
# Per-column standardization parameters, stored as [mean, sample std].
ST_PARAMS = {
    column_name: [df[column_name].mean(), df[column_name].std()]
    for column_name in num_cols
}

ST_PARAMS

{'age': [60.03654485049834, 17.89845623127993],
 'fitspatrick': [2.295016611295681, 0.7207929745792525],
 'diameter_1': [11.322923588039867, 9.223065174087516],
 'diameter_2': [8.19734219269103, 4.999579973623725]}

In [46]:
# Z-score each numeric column with the parameters recorded in ST_PARAMS.
# Running this cell twice standardizes already-standardized values again.
for column_name in num_cols:
    center, spread = ST_PARAMS[column_name]
    df[column_name] = df[column_name].sub(center).div(spread)


In [47]:
# Rebinds `bool_col` (previously the symptom columns) to the columns whose
# dtype is genuinely bool.
bool_col = df.select_dtypes(include=bool).columns
bool_col

Index(['smoke', 'drink', 'pesticide', 'skin_cancer_history', 'cancer_history',
       'has_piped_water', 'has_sewage_system'],
      dtype='object')

In [48]:
# Store the boolean flags as 0/1 integers, one column at a time.
for column_name in bool_col:
    flags = df[column_name]
    df[column_name] = flags.astype(int)

df

Unnamed: 0,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,...,diameter_1,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id
0,0,0,GERMANY,GERMANY,0.053829,0,FEMALE,0,1,1,...,-0.685556,-0.639522,ACK,False,True,False,True,False,True,PAT_753_1427_496.png
1,0,0,POMERANIA,POMERANIA,1.115373,0,FEMALE,1,0,0,...,-0.468708,-0.639522,ACK,True,False,False,False,False,False,PAT_419_2767_323.png
2,0,0,ITALY,BRAZIL,-0.169654,0,MALE,0,0,1,...,-0.793979,-1.039556,ACK,True,False,False,False,False,False,PAT_566_178_625.png
3,0,0,GERMANY,GERMANY,0.333183,0,FEMALE,0,1,1,...,-0.577132,-0.439505,ACK,True,False,False,False,False,False,PAT_809_1528_118.png
4,0,0,POMERANIA,POMERANIA,1.003632,0,FEMALE,0,0,1,...,0.398683,0.360562,ACK,False,False,False,True,False,True,PAT_742_1402_818.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,0,0,POMERANIA,POMERANIA,0.947761,0,FEMALE,0,1,1,...,-0.165121,-0.239489,SEK,False,False,False,False,False,True,PAT_1304_1081_84.png
598,0,0,POMERANIA,POMERANIA,1.059502,0,FEMALE,0,1,1,...,0.030042,0.240552,SEK,True,True,False,False,False,True,PAT_2056_4364_112.png
599,0,0,POMERANIA,POMERANIA,0.389053,0,FEMALE,0,1,1,...,-0.598816,-0.639522,SEK,True,True,False,False,False,True,PAT_2016_4155_600.png
600,0,0,POMERANIA,POMERANIA,-0.002042,0,FEMALE,0,1,1,...,-0.360284,-0.319495,SEK,True,False,True,False,False,False,PAT_1185_679_115.png


In [49]:
# One-hot encode the categorical columns (dropping the first level of each)
# and pass every other column through unchanged.
# sparse_threshold=0 forces a dense result: with the default threshold the
# output switches to a sparse matrix whenever the stacked data is sparse
# enough, which cannot hold the passed-through string columns.
transformer = make_column_transformer(
    (OneHotEncoder(drop='first', dtype=bool), cat_cols),
    remainder='passthrough',
    sparse_threshold=0,
)

transformed = transformer.fit_transform(df)
transformed_df = pd.DataFrame(transformed)

# Output layout is [encoded columns..., remaining columns in original order].
encoded_region_columns = transformer.transformers_[0][1].get_feature_names_out(cat_cols)
remaining_columns = df.columns[~df.columns.isin(cat_cols)].tolist()

all_columns = encoded_region_columns.tolist() + remaining_columns

transformed_df.columns = all_columns

# The stacked array is object-typed; store the encoded indicators as 0/1 integers
transformed_df[encoded_region_columns] = transformed_df[encoded_region_columns].astype(int)

# Restore the numeric dtype of passed-through columns from the source frame.
# Values are copied positionally (to_numpy) so the result does not depend on
# `df` and `transformed_df` sharing the same index labels.
non_cat_cols = df[remaining_columns].select_dtypes(include=np.number).columns
for col in non_cat_cols:
    transformed_df[col] = df[col].to_numpy()

transformed_df

Unnamed: 0,background_father_BRAZIL,background_father_GERMANY,background_father_ITALY,background_father_NETHERLANDS,background_father_POLAND,background_father_POMERANIA,background_father_PORTUGAL,background_father_SPAIN,background_father_UNK,background_mother_FRANCE,...,pesticide,skin_cancer_history,cancer_history,has_piped_water,has_sewage_system,fitspatrick,diameter_1,diameter_2,diagnostic,img_id
0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0.978066,-0.685556,-0.639522,ACK,PAT_753_1427_496.png
1,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,-0.409295,-0.468708,-0.639522,ACK,PAT_419_2767_323.png
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0.978066,-0.793979,-1.039556,ACK,PAT_566_178_625.png
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0.978066,-0.577132,-0.439505,ACK,PAT_809_1528_118.png
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,1,-0.409295,0.398683,0.360562,ACK,PAT_742_1402_818.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,1,0.423122,-0.165121,-0.239489,SEK,PAT_1304_1081_84.png
598,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,1,-0.409295,0.030042,0.240552,SEK,PAT_2056_4364_112.png
599,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,1,-0.409295,-0.598816,-0.639522,SEK,PAT_2016_4155_600.png
600,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,1,-0.131822,-0.360284,-0.319495,SEK,PAT_1185_679_115.png


In [50]:
# Print column dtypes and non-null counts of the encoded frame.
transformed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602 entries, 0 to 601
Data columns (total 58 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   background_father_BRAZIL       602 non-null    int64  
 1   background_father_GERMANY      602 non-null    int64  
 2   background_father_ITALY        602 non-null    int64  
 3   background_father_NETHERLANDS  602 non-null    int64  
 4   background_father_POLAND       602 non-null    int64  
 5   background_father_POMERANIA    602 non-null    int64  
 6   background_father_PORTUGAL     602 non-null    int64  
 7   background_father_SPAIN        602 non-null    int64  
 8   background_father_UNK          602 non-null    int64  
 9   background_mother_FRANCE       602 non-null    int64  
 10  background_mother_GERMANY      602 non-null    int64  
 11  background_mother_ITALY        602 non-null    int64  
 12  background_mother_NETHERLANDS  602 non-null    int

In [51]:
# Persist the encoded table next to the notebook, without the index column.
transformed_df.to_csv(path_or_buf="transformed_df.csv", index=False)