In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import os


In [2]:
df_meta = pd.read_csv("../data/HAM10000_metadata.csv")

In [3]:
df_meta.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [4]:
df_meta.shape

(10015, 7)

In [5]:
df_meta.describe()

Unnamed: 0,age
count,9958.0
mean,51.863828
std,16.968614
min,0.0
25%,40.0
50%,50.0
75%,65.0
max,85.0


In [6]:
df_meta['lesion_id'].value_counts()

HAM_0003789    6
HAM_0000835    6
HAM_0005263    6
HAM_0001863    6
HAM_0007427    5
              ..
HAM_0006000    1
HAM_0002762    1
HAM_0006894    1
HAM_0007132    1
HAM_0003347    1
Name: lesion_id, Length: 7470, dtype: int64

In [7]:
df_meta['dx'].value_counts()

nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: dx, dtype: int64

In [8]:
df_meta['dx_type'].value_counts()

histo        5340
follow_up    3704
consensus     902
confocal       69
Name: dx_type, dtype: int64

In [9]:
fig_0 = px.bar(df_meta.groupby(['dx_type', 'dx']).size().unstack(level=1),width=600, height=400)
fig_0.update_layout(title_text='', title_x=0.5)
fig_0.show()

In [10]:
fig_1 = px.bar(df_meta.groupby(['dx', 'dx_type']).size().unstack(level=1),width=600, height=400)
fig_1.update_layout(title_text='', title_x=0.5)
fig_1.show()

In [11]:
fig_2 = px.bar(df_meta.groupby(['sex', 'dx']).size().unstack(level=1),width=600, height=400)
fig_2.update_layout(title_text='', title_x=0.5)
fig_2.show()

In [12]:
fig_3 = px.bar(df_meta.groupby(['dx','localization']).size().unstack(level=1),width=600, height=400)
fig_3.update_layout(title_text='', title_x=0.5)
fig_3.show()

## Feature Engineering


In [13]:
def age_group(value):
    if value < 18:
        return "young"
    elif 18 <= value < 25:
        return " young adult"
    elif 25 <= value < 40:
        return "adult"
    elif 40 <= value < 60 :
        return "middle age"
    elif value >= 60:
        return "0ld"
 
df_meta['age_group'] = df_meta['age'].map(age_group)
df_meta.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,age_group
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,0ld
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,0ld
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,0ld
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,0ld
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,0ld


In [14]:

""""
akiec" : "Bowen's disease", # very early form of skin cancer 
"bcc" : "basal cell carcinoma" , # basal-cell cancer or white skin cancer
"bkl" : "benign keratosis-like lesions", # non-cancerous skin tumour
"df" : "dermatofibroma", # non-cancerous rounded bumps 
"mel" : "melanoma", # black skin cancer
"nv" : "melanocytic nevi", # mole non-cancerous
"vasc" : "vascular lesions", # skin condition

"""
def dx_class(value):
    if value == 'akiec':
        return 'malignant'
    elif value == 'bcc':
        return 'malignant'
    elif value == 'bkl':
        return 'benign'
    elif value == 'df':
        return 'benign'
    elif value == 'mel':
        return 'malignant'
    elif value == 'nv':
        return 'benign'
    elif value == 'vasc':
        return 'benign'
 
df_meta['class'] = df_meta['dx'].map(dx_class)
df_meta.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,age_group,class
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,0ld,benign
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,0ld,benign
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,0ld,benign
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,0ld,benign
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,0ld,benign


In [15]:
df_meta['class'].value_counts()

benign       8061
malignant    1954
Name: class, dtype: int64

In [16]:
fig_4 = px.bar(df_meta.groupby(['class','age_group']).size().unstack(level=1),width=600, height=400)
fig_4.update_layout(title_text='', title_x=0.5)
fig_4.show()

In [17]:
fig_4 = px.bar(df_meta.groupby(['class','dx']).size().unstack(level=1),width=600, height=400)
fig_4.update_layout(title_text='', title_x=0.5)
fig_4.show()

In [18]:
fig_4 = px.bar(df_meta.groupby(['class','dx_type']).size().unstack(level=1),width=600, height=400)
fig_4.update_layout(title_text='', title_x=0.5)
fig_4.show()

In [20]:
df_meta.to_csv("../data/meta_data.csv")