In [None]:
import os
import zipfile
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the dataset archive stored there can be read.
drive.mount('/content/drive')

In [None]:
# Location of the dataset archive on the mounted Drive.
# NOTE(review): the file name is spelled 'canser' — it must match the actual file on Drive.
zip_file = '/content/drive/My Drive/canser.zip'

In [None]:
# Open the archive in read mode; the handle stays open until it is closed explicitly.
z = zipfile.ZipFile(zip_file,'r')

In [None]:
# Unpack every archive member into the current working directory, then close
# the archive so the underlying file handle is released instead of leaking.
# Re-run the cell that creates `z` before running this cell a second time.
with z:
    z.extractall()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import seaborn as sns
import cv2
from PIL import Image

In [None]:
# Load the HAM10000 metadata table (path is relative to the working directory)
# and preview its first rows.
skin_df = pd.read_csv('HAM10000_metadata.csv')
skin_df.head()

Class meanings:
   - Melanocytic nevi (nv)
   - Melanoma (mel)
   - Benign keratosis-like lesions (bkl)
   - Basal cell carcinoma (bcc)
   - Actinic keratoses (akiec)
   - Vascular lesions (vasc)
   - Dermatofibroma (df)

In [None]:
# Encode the textual diagnosis codes in `dx` as integer class ids
from sklearn.preprocessing import LabelEncoder

# Side length (in pixels) every image is resized to before training
SIZE = 64

# `fit` returns the encoder itself, so creation and fitting can be chained
le = LabelEncoder().fit(skin_df['dx'])

# Position in this list == integer id assigned to the class
print(list(le.classes_))

In [None]:
# Add an integer `label` column holding the encoded `dx` class,
# then display 10 randomly chosen rows as a sanity check.
skin_df['label'] = le.transform(skin_df['dx'])
skin_df.sample(10)

Data distribution visualization

In [None]:
from scipy import stats

# Bar chart with the number of rows per diagnosis class
fig = plt.figure(figsize=(15, 10))
ax1 = fig.add_subplot(2, 2, 1)  # top-left cell of a 2x2 grid

dx_counts = skin_df['dx'].value_counts()
dx_counts.plot(kind='bar', ax=ax1)
ax1.set(ylabel='Count', title='Cell Type')

plt.tight_layout()
plt.show()

# The class counts differ widely, i.e. the dataset is imbalanced

In [None]:
# Show the column names currently present in the metadata frame.
skin_df.columns

In [None]:
# Read one sample image to inspect its dimensions.
# cv2.imread returns None (it does not raise) when the file is missing or
# unreadable, so fail here with a clear message instead of on `img.shape`.
sample_image_path = '/content/HAM10000_images_part_1/ISIC_0024307.jpg'
img = cv2.imread(sample_image_path)
if img is None:
    raise FileNotFoundError(f'Could not read image: {sample_image_path}')
img.shape

In [None]:
# The image was read with OpenCV, so convert BGR -> RGB before handing it to matplotlib.
plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))
plt.show()

In [None]:
# Keep only the rows that have no missing value in any column.
skin_df = skin_df.dropna()

In [None]:
# NOTE(review): this only references the bound method `DataFrame.drop` without calling it,
# so nothing is dropped here; it looks like a leftover and can probably be removed.
skin_df.drop

In [None]:
# loading images into dataframe
from glob import glob  # NOTE(review): unused in this cell; rebinds `glob` from the module to the function


def _load_resized_image(path):
    """Open the image at `path`, resize it to SIZE x SIZE and return it as a NumPy array."""
    # The context manager closes the source file once the resized copy exists.
    with Image.open(path) as image:
        return np.asarray(image.resize((SIZE, SIZE)))


# The images are spread over two folders; index both of them.
image_dirs = ['./HAM10000_images_part_1', './HAM10000_images_part_2']

# Map image id (file name without extension) -> relative file path.
# Keys must be extension-less ids and each path must point at the folder the
# file actually lives in, otherwise the `image_id` lookup below misses the file
# and the row is silently dropped by `dropna()`.
image_path = {}
for image_dir in image_dirs:
    for file_name in os.listdir(image_dir):
        image_id = os.path.splitext(file_name)[0]
        # setdefault keeps the first folder's entry if an id appears in both folders
        image_path.setdefault(image_id, image_dir + '/' + file_name)

skin_df = skin_df.copy()

# Rows whose image file was not found get a missing path ...
skin_df['path'] = skin_df['image_id'].map(image_path.get)
# ... and are removed here (together with any other row that has a missing value).
skin_df = skin_df.dropna()
# Store every resized image as an array in the `image` column.
skin_df['image'] = skin_df['path'].map(_load_resized_image)

In [None]:
# Count the missing values per column.
skin_df.isnull().sum()

In [None]:
# handling imbalanced dataset
from sklearn.utils import  resample

# Number of rows every class is resampled to.
n_samples = 500

# Draw exactly `n_samples` rows per class, with replacement: classes with fewer
# rows end up containing duplicated rows, larger classes are subsampled.
# `groupby` yields the classes in ascending label order, and every class uses
# the same seed, so the result is reproducible and works for any number of
# classes instead of being hard-coded to labels 0..6.
balanced_parts = [
    resample(class_rows, replace=True, n_samples=n_samples, random_state=42)
    for _label, class_rows in skin_df.groupby('label')
]
skin_df_balanced = pd.concat(balanced_parts)

# Every class should now report `n_samples` rows.
skin_df_balanced['label'].value_counts()

In [None]:
# (rows, columns) of the balanced frame.
skin_df_balanced.shape

In [None]:
# Show a grid of example images: one row per diagnosis class, `n_samples` images per row
n_samples = 5
n_rows = 7
fig, m_axs = plt.subplots(n_rows, n_samples, figsize=(4 * n_samples, 3 * n_rows))

groups_by_dx = skin_df_balanced.sort_values(['dx']).groupby('dx')
for axes_row, (dx_name, dx_rows) in zip(m_axs, groups_by_dx):
    # Only the first image of each row carries the class name as its title
    axes_row[0].set_title(dx_name)
    # Fixed seed so the same examples are shown on every run
    examples = dx_rows.sample(n_samples, random_state=1234)
    for axis, image in zip(axes_row, examples['image']):
        axis.imshow(image)
        axis.axis('off')

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Stack the per-row image arrays into one array and scale pixel values to [0, 1].
# float32 halves the memory of the default float64 result.
x = np.asarray(skin_df_balanced['image'].tolist())
x = x.astype('float32') / 255
y = skin_df_balanced['label']
# One-hot encode the integer labels for the categorical cross-entropy loss.
y_cat = to_categorical(y, num_classes=7)

# The balanced frame was built by sampling with replacement, so the same image
# can occur several times. A plain random split would put copies of one image
# into both the training and the test set and inflate the test metrics.
# Splitting by `image_id` group keeps all copies of an image on the same side.
# NOTE(review): if the metadata has a per-lesion id column, grouping by it
# would be stricter still — confirm against the dataset.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_idx, test_idx = next(
    splitter.split(x, y_cat, groups=skin_df_balanced['image_id'])
)

x_train, x_test = x[train_idx], x[test_idx]
y_train, y_test = y_cat[train_idx], y_cat[test_idx]

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Dropout

# The preprocessing layers live directly in `tensorflow.keras.layers` in current
# releases; the `experimental.preprocessing` path only exists in older ones.
try:
    from tensorflow.keras.layers import RandomFlip, RandomRotation, RandomZoom
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import RandomFlip, RandomRotation, RandomZoom


def _conv_block(filters=256):
    """Return one Conv2D -> MaxPool2D -> Dropout stage as a list of layers."""
    return [
        Conv2D(filters=filters, kernel_size=(3, 3), activation='relu'),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.3),
    ]


# Random flips / rotations / zooms applied to the input images.
data_augmentation_layer = Sequential([
    tf.keras.Input(shape=(SIZE, SIZE, 3)),
    RandomFlip('horizontal'),
    RandomRotation(0.1),
    RandomZoom(0.1)
])

# Four identical convolutional stages followed by a small dense classifier.
# The input shape is declared once, up front; `input_shape` arguments on
# layers that are not the first one have no effect and were removed.
model = Sequential([
    tf.keras.Input(shape=(SIZE, SIZE, 3)),

    data_augmentation_layer,

    *_conv_block(),
    *_conv_block(),
    *_conv_block(),
    *_conv_block(),
    Flatten(),

    Dense(32, activation='relu'),
    # One output per class, normalized to probabilities
    Dense(7, activation='softmax')
])
model.summary()

In [None]:
# Adam optimizer with categorical cross-entropy (labels are one-hot encoded).
# The metric is registered under the name 'acc', which is also the key the
# training history uses ('acc' / 'val_acc').
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
batch_size = 16
# Deliberately not called `epochs`: the loss-plot cell rebinds `epochs` to a
# range object, which would break this cell when it is run again afterwards.
num_epochs = 50

# NOTE(review): the test split doubles as validation data here, so the
# validation curves and the final evaluation are computed on the same samples.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(x_test, y_test),
    verbose=2,
)

In [None]:
# Evaluate on the test split. With the compile settings used for this model,
# `score` holds the loss followed by the 'acc' metric, so score[1] is the accuracy.
score = model.evaluate(x_test,y_test)
score[1]

In [None]:
# Training vs. validation loss, one point per completed epoch
loss = history.history['loss']
val_loss = history.history['val_loss']
# 1-based epoch numbers for the x axis (also used by the accuracy plot cell)
epochs = range(1, len(loss) + 1)

for curve, fmt, legend_label in (
    (loss, 'y', 'Training loss'),
    (val_loss, 'r', 'Validation loss'),
):
    plt.plot(epochs, curve, fmt, label=legend_label)

plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
# Relies on `epochs` (the x-axis values) defined by the loss plot cell.
acc = history.history['acc']
val_acc = history.history['val_acc']

for curve, fmt, legend_label in (
    (acc, 'y', 'Training acc'),
    (val_acc, 'r', 'Validation acc'),
):
    plt.plot(epochs, curve, fmt, label=legend_label)

plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Class probabilities predicted for every test image
y_pred = model.predict(x_test)
# Collapse each probability row to the index of its most likely class
y_pred_classes = y_pred.argmax(axis=1)
# Recover the integer class index from each one-hot encoded test label
y_true = y_test.argmax(axis=1)

In [None]:
from sklearn.metrics import classification_report

# Argument order is (y_true, y_pred); passing them the other way round swaps
# the reported precision and recall of every class.
print(classification_report(y_true, y_pred_classes))

In [None]:
from sklearn.metrics import confusion_matrix

# Rows correspond to the true class, columns to the predicted class.
cm = confusion_matrix(y_true, y_pred_classes)

In [None]:
# Heatmap of the confusion matrix (rows: true class, columns: predicted class).
fig, ax = plt.subplots(figsize=(6,6))
sns.set(font_scale=1.6)
# fmt='d' prints the integer counts as-is; the default format would render
# counts of 100 or more in scientific notation (e.g. 1.2e+02).
sns.heatmap(cm, annot=True, fmt='d', linewidths=.5, ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

In [None]:
# Plot, for every true class, the fraction of its test samples that were misclassified
row_totals = cm.sum(axis=1)        # number of test samples per true class
correct_counts = cm.diagonal()     # correctly classified samples per class
incorr_fraction = 1 - correct_counts / row_totals

plt.bar(np.arange(7), incorr_fraction)
plt.xlabel('True Label')
plt.ylabel('Fraction of incorrect predictions')

In [None]:
# Persist the trained model as an HDF5 file at models/skin_model.h5 (relative to the working directory).
# NOTE(review): nothing in this notebook creates the `models` directory — confirm that saving works when it is absent.
model.save(os.path.join('models','skin_model.h5'))