In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from glob import glob
import seaborn as sns
from sklearn.model_selection import train_test_split


In [2]:
# Root folder of the HAM10000 download. Set the HAM10000_DIR environment variable to point at
# your own copy; the original author's location is kept as the fallback so existing runs still work.
base_skin_dir = os.environ.get("HAM10000_DIR", "/Users/somiseta/Downloads/skin-cancer-mnist-ham10000")
# load the metadata table that ships with the images
skin_df = pd.read_csv(os.path.join(base_skin_dir, 'HAM10000_metadata.csv'))
skin_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [3]:
# Index every JPEG found exactly one folder below the dataset root by its file name
# without the extension, so an image_id can be turned into a path.
imageid_path_dict = {}
for image_path in glob(os.path.join(base_skin_dir, '*', '*.jpg')):
    image_id = os.path.splitext(os.path.basename(image_path))[0]
    imageid_path_dict[image_id] = image_path

# Human-readable name for every diagnosis code found in the `dx` column.
lesion_type_dict = dict([
    ("nv", "Melanocytic_nevi"),
    ("mel", "melanoma"),
    ("bkl", "Benign_keratosis-like_lesions"),
    ("bcc", "Basal_cell_carcinoma"),
    ("akiec", "Actinic_keratoses"),
    ("vasc", "Vascular_lesions"),
    ("df", "Dermatofibroma"),
])

# Binary danger flag per diagnosis code: 1 for malignant, 0 for benign.
malignant_codes = ("mel", "bcc", "akiec")
lesion_danger = {dx_code: int(dx_code in malignant_codes) for dx_code in lesion_type_dict}

In [4]:
# look up the file path of every image_id (None when no matching file was found)
skin_df["path"] = skin_df["image_id"].apply(imageid_path_dict.get)

In [5]:
# The image_id -> path mapping is already stored in skin_df["path"] by the previous cell, so it is
# not recomputed here. Instead, count the metadata rows that have no image on disk
# (this should be 0 when the download is complete).
skin_df["path"].isna().sum()

In [6]:
# translate the short dx code into the readable lesion name
skin_df["cell_type"] = skin_df["dx"].apply(lesion_type_dict.get)

In [8]:
# preview the first rows, now including the path and cell_type columns
skin_df.head(5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Benign_keratosis-like_lesions
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Benign_keratosis-like_lesions
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Benign_keratosis-like_lesions
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Benign_keratosis-like_lesions
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Benign_keratosis-like_lesions


In [9]:
# flag each row as malignant (1) or benign (0) based on its dx code
skin_df["Malignant"] = skin_df["dx"].apply(lesion_danger.get)

In [10]:
# preview the first rows, now including the Malignant flag
skin_df.head(5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,Malignant
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Benign_keratosis-like_lesions,0
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Benign_keratosis-like_lesions,0
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Benign_keratosis-like_lesions,0
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Benign_keratosis-like_lesions,0
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Benign_keratosis-like_lesions,0


In [7]:
# integer class id per lesion name; ids follow the sorted order of the distinct cell_type labels
skin_df["cell_type_idx"] = skin_df["cell_type"].astype("category").cat.codes

In [12]:
# show three randomly drawn rows (unseeded, so the rows differ between runs)
skin_df.sample(n=3)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,Malignant,cell_type_idx
9061,HAM_0003223,ISIC_0024922,nv,histo,60.0,male,upper extremity,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Melanocytic_nevi,0,4
6639,HAM_0005691,ISIC_0032457,nv,follow_up,35.0,female,lower extremity,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Melanocytic_nevi,0,4
9777,HAM_0005448,ISIC_0024707,akiec,histo,60.0,male,face,/Users/somiseta/Downloads/skin-cancer-mnist-ha...,Actinic_keratoses,1,0


In [None]:
# bar chart of how many images are benign (0) versus malignant (1)
malignant_counts = skin_df["Malignant"].value_counts()
malignant_counts.plot.bar(title="Benign vs Malignant")

Most cases in our dataset are benign.

In [None]:
# bar chart with the number of images of each cell type
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
cell_type_counts = skin_df["cell_type"].value_counts()
cell_type_counts.plot.bar(ax=ax1, title="Counts for each type of Lesions")

Our dataset is heavily skewed toward Melanocytic nevi. The cell_type with the second-highest number of samples is the notorious melanoma.

In [None]:
# body sites where the lesions were found, most frequent first
localization_counts = skin_df["localization"].value_counts()
localization_counts.plot.bar(title="Location of Lesions")

In [None]:
# dx_type records how each diagnosis was confirmed (histo, follow_up, consensus, confocal),
# not a treatment, so the chart is titled accordingly.
skin_df["dx_type"].value_counts().plot(kind='bar', title="Ground-truth confirmation method")

Description for each dx_type:

histo: "Histopathologic diagnoses of excised lesions have been performed by specialized dermatopathologists."

follow_up: "If nevi monitored by digital dermatoscopy did not show any changes during 3 follow-up visits or 1.5 years we accepted this as evidence of biologic benignity. Only nevi, but no other benign diagnoses were labeled with this type of ground-truth because dermatologists usually do not monitor dermatofibromas, seborrheic keratoses, or vascular lesions."

consensus: "For typical benign cases without histopathology or follow-up we provide an expert-consensus rating of authors PT and HK. We applied the consensus label only if both authors independently gave the same unequivocal benign diagnosis. Lesions with this type of ground-truth were usually photographed for educational reasons and did not need further follow-up or biopsy for confirmation."

confocal: "Reflectance confocal microscopy is an in-vivo imaging technique with a resolution at near-cellular level, and some facial benign keratoses were verified by this method."

Let's look at some characteristics of our patients

In [None]:
# age distribution over all images
all_ages = skin_df["age"]
all_ages.hist(bins=50)

In [None]:
# age distribution restricted to the malignant cases
skin_df.loc[skin_df["Malignant"] == 1, "age"].hist(bins=40)

We can see that most patients are above 30. For the malignant cases, however, most patients are 50 and above, and patients in their 70s are the most common.

In [None]:
# number of images per recorded sex
sex_counts = skin_df["sex"].value_counts()
sex_counts.plot.bar(title="Male vs Female")

In [None]:
# number of images per recorded sex, malignant cases only
malignant_sex = skin_df.loc[skin_df["Malignant"] == 1, "sex"]
malignant_sex.value_counts().plot.bar(title="Male vs Female. Malignant Cases")

We have more male patients than female patients, both in the general population and among the malignant cases. So far we haven't looked at the images themselves, so let's now turn to what the lesions in our dataset look like.

In [None]:
from skimage.io import imread

In [None]:
# load every image file into a pixel array and keep it alongside its metadata row
skin_df["image"] = skin_df["path"].apply(imread)

In [None]:
# pixel array of the first row, as an example
skin_df["image"].iloc[0]

In [None]:
# count how often each array shape occurs in the image column
image_shapes = skin_df["image"].map(np.shape)
image_shapes.value_counts()

In [None]:
# Draw a grid of example images: one row per cell type, n_samples columns per row,
# then save the figure to category_samples.png.

n_samples = 5 # images shown for each cell type
n_types = 7 # number of grid rows, one per cell type
fig, m_axs = plt.subplots(n_types, n_samples, figsize=(4 * n_samples, 3 * n_types))

rows_by_type = skin_df.sort_values(["cell_type"]).groupby("cell_type")
for row_axes, (type_name, type_rows) in zip(m_axs, rows_by_type):
    # only the left-most panel of each row carries the cell type as its title
    row_axes[0].set_title(type_name)
    # fixed seed so the same examples are drawn on every run
    picked = type_rows.sample(n_samples, random_state=0)
    for panel, picture in zip(row_axes, picked["image"]):
        panel.imshow(picture)
        panel.axis("off")
fig.savefig("category_samples.png", dpi=300)

Based on these images, it is still very hard for non-experts to know which is which.

## Get Average Color Information

Here we get and normalize all of the color channel information

The shape of each image array is (450, 600, 3). The last axis holds the 3 channels: Red, Green and Blue. Taking the mean across axis=(0, 1) gives the mean of each of the 3 channels.

In [None]:
# Build a dataframe with the mean Red, Green and Blue value of every picture.
# The arrays produced by imread keep the channels on the last axis in R, G, B order, so the
# labels must be listed in that same order; otherwise the Green and Blue columns are swapped.
channel_labels = ["{}_mean".format(name) for name in ["Red", "Green", "Blue"]]
rgb_info_df = skin_df.apply(
    lambda row: pd.Series(dict(zip(channel_labels, np.mean(row["image"], (0, 1))))),
    1)

# overall brightness of each picture: the mean of its three channel means
gray_col_vec = rgb_info_df.apply(lambda x: np.mean(x), 1)
# express every channel relative to that brightness, so the columns describe colour balance
for c_col in rgb_info_df.columns:
    rgb_info_df[c_col] = rgb_info_df[c_col]/gray_col_vec
rgb_info_df["Gray_mean"] = gray_col_vec
rgb_info_df.sample(3)

In [None]:
# copy every colour statistic column into skin_df by position (raw values, no index alignment)
for column_name in list(rgb_info_df):
    skin_df[column_name] = rgb_info_df[column_name].values

In [None]:
# pairwise scatter plots of the colour statistics, coloured by cell type
color_columns = ["Red_mean", "Green_mean", "Blue_mean", "Gray_mean"]
pairplot_input = skin_df[color_columns + ["cell_type"]]
sns.pairplot(pairplot_input, hue="cell_type", plot_kws=dict(alpha=0.5))

## Changes in cell type appearance as values in color channel change

In this section, I analyze how each cell type looks as the values of each color channel change. E.g. the first 5 images demonstrate how the appearance of Actinic Keratoses changes as the values in the red channel get bigger.

### Reshape image and get data for classification

In [8]:
from PIL import Image

### Resize image for baseline model

In [None]:
def _load_flat_64(path):
    """Open the image at `path`, resample it to 64x64 with the Lanczos filter, convert it to RGB
    and return the pixels as one flat array."""
    small = Image.open(path).resize((64, 64), resample=Image.LANCZOS)
    return np.asarray(small.convert("RGB")).ravel()

reshaped_image = skin_df["path"].map(_load_flat_64)

In [None]:
# stack the per-image flat arrays into one 2-D array, one row per image
out_vec = np.stack(list(reshaped_image), axis=0)

In [None]:
# wrap the pixel matrix in a dataframe (columns are numbered 0..n-1)
out_df = pd.DataFrame(data=out_vec)

In [None]:
# append the class id of every image as the last column (rows are matched on the index)
out_df["label"] = skin_df.loc[:, "cell_type_idx"]

In [None]:
# preview the flattened pixel table
out_df.head(5)

In [None]:
# The 64x64 flattened images were already computed above and stored in out_df, so the expensive
# second pass over every image file is skipped. Instead, verify the table before it is written
# to disk: one row per metadata row, 64*64*3 pixel columns plus the "label" column.
assert out_df.shape == (len(skin_df), 64 * 64 * 3 + 1), out_df.shape

In [None]:
# write the flattened 64x64 table to CSV without the index column
# NOTE(review): absolute, machine-specific output location — confirm the folder exists before running
out_path = "C:/Users/somiseta/isic2018/hmnist_64_64_RBG.csv"
out_df.to_csv(path_or_buf=out_path, index=False)

### Resize Image for Dense Net Model


In [None]:
def _load_rgb_221(path):
    """Open the image at `path`, resample it to 221x221 with the Lanczos filter and return it
    as an RGB pixel array."""
    resized = Image.open(path).resize((221, 221), resample=Image.LANCZOS)
    return np.asarray(resized.convert("RGB"))

reshaped_image = skin_df["path"].map(_load_rgb_221)

# one array holding all images, stacked along a new first axis
out_vec = np.stack(reshaped_image, axis=0)

out_vec.shape

In [None]:
# convert to float32, then divide by 255 in place (no second full-size copy is allocated)
out_vec = out_vec.astype(np.float32)
out_vec = np.divide(out_vec, 255, out=out_vec)

In [10]:
# class ids as a plain array, in the same row order as the images
labels = skin_df["cell_type_idx"].to_numpy()

In [None]:
# hold out 10% of the images as the test set (fixed seed for a repeatable split)
# NOTE(review): several images share a lesion_id, so a purely random split can place the same
# lesion in both train and test — consider a group-aware split.
X_train_orig, X_test, y_train_orig, y_test = train_test_split(
    out_vec,
    labels,
    random_state=0,
    test_size=0.1,
)

In [None]:
# np.save does not create missing folders, so make sure the output directory exists first
os.makedirs("./221_221", exist_ok=True)
# persist the held-out test images and their labels
np.save("./221_221/221_221_test.npy", X_test)
np.save("./221_221/test_labels.npy", y_test)

In [None]:
# carve a validation set (10%) out of the remaining training data, again with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X_train_orig,
    y_train_orig,
    random_state=1,
    test_size=0.1,
)

In [None]:
# persist the validation images and their labels
val_outputs = {
    "./221_221/221_221_val.npy": X_val,
    "./221_221/val_labels.npy": y_val,
}
for target_file, array in val_outputs.items():
    np.save(target_file, array)

In [None]:
# persist the training images and their labels
train_outputs = {
    "./221_221/221_221_train.npy": X_train,
    "./221_221/train_labels.npy": y_train,
}
for target_file, array in train_outputs.items():
    np.save(target_file, array)

## Resize Image for Inception V3  Model

In [9]:
# PIL's resize expects (width, height). The source photos are landscape (arrays of shape
# (450, 600, 3)), so ask for 256 wide x 192 high: this keeps the 4:3 aspect ratio and yields
# arrays of shape (192, 256, 3). Passing (192, 256) would squeeze them into portrait
# (256, 192, 3) arrays instead.
inception_size_wh = (256, 192)
reshaped_image = skin_df["path"].map(
    lambda x: np.asarray(Image.open(x).resize(inception_size_wh, resample=Image.LANCZOS).convert("RGB")))

# one array holding all images, stacked along a new first axis
out_vec = np.stack(reshaped_image, 0)

# rescale pixel values from 0..255 to 0..1 as float32
out_vec = out_vec.astype("float32")
out_vec /= 255

# shown as the cell output so the resulting dimensions can be checked
out_vec.shape

In [11]:
# hold out 10% of the images as the test set (fixed seed for a repeatable split)
X_train_orig, X_test, y_train_orig, y_test = train_test_split(
    out_vec,
    labels,
    random_state=0,
    test_size=0.1,
)

In [12]:
# np.save does not create missing folders, so make sure the output directory exists first
os.makedirs("./192_256", exist_ok=True)
# persist the held-out test images and their labels
np.save("./192_256/192_256_test.npy", X_test)
np.save("./192_256/test_labels.npy", y_test)

In [13]:
# carve a validation set (10%) out of the remaining training data, again with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X_train_orig,
    y_train_orig,
    random_state=1,
    test_size=0.1,
)

In [14]:
# persist the validation images and their labels
val_outputs = {
    "./192_256/192_256_val.npy": X_val,
    "./192_256/val_labels.npy": y_val,
}
for target_file, array in val_outputs.items():
    np.save(target_file, array)

In [15]:
# persist the training images and their labels
train_outputs = {
    "./192_256/192_256_train.npy": X_train,
    "./192_256/train_labels.npy": y_train,
}
for target_file, array in train_outputs.items():
    np.save(target_file, array)