In [40]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
from glob import glob
import numpy as np
from PIL import Image
from keras.utils.np_utils import to_categorical

In [2]:
# Read the HAM10000 metadata table and preview its first five rows.
file = pd.read_csv(filepath_or_buffer='./data/HAM10000_metadata.csv')
file.head(5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [3]:
# Encode the diagnosis strings in `dx` as integer class ids stored in `label`.
# A single fit_transform both learns the classes and encodes the column, so the
# separate fit() pass that preceded it was redundant and has been dropped.
le = LabelEncoder()
file['label'] = le.fit_transform(file['dx'])
# Show the learned classes; a class's position in this array is its integer id.
print(le.classes_)

['akiec' 'bcc' 'bkl' 'df' 'mel' 'nv' 'vasc']


In [4]:
# Preview the metadata again, now including the encoded `label` column.
file.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,label
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,2
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,2
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,2
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,2
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,2


In [5]:
# Print how many rows fall into each encoded class (most frequent first).
print(file.loc[:, 'label'].value_counts())

5    6705
4    1113
2    1099
1     514
0     327
6     142
3     115
Name: label, dtype: int64


# Data balancing (資料平衡)

In [6]:
from sklearn.utils import resample

In [7]:
# Split the metadata into one frame per encoded class id (0 through 6).
df_0, df_1, df_2, df_3, df_4, df_5, df_6 = (
    file[file['label'] == class_id] for class_id in range(7)
)

In [8]:
# Target number of rows per class after balancing.
n_sample = 500

# Draw n_sample rows (with replacement) from every per-class frame, using the
# same fixed seed for each draw.
(
    df_0_balanced,
    df_1_balanced,
    df_2_balanced,
    df_3_balanced,
    df_4_balanced,
    df_5_balanced,
    df_6_balanced,
) = [
    resample(class_frame, replace=True, n_samples=n_sample, random_state=42)
    for class_frame in (df_0, df_1, df_2, df_3, df_4, df_5, df_6)
]

In [9]:
# Stack the seven resampled per-class frames into one balanced table.
file_balance = pd.concat(
    objs=[
        df_0_balanced,
        df_1_balanced,
        df_2_balanced,
        df_3_balanced,
        df_4_balanced,
        df_5_balanced,
        df_6_balanced,
    ]
)

In [10]:
# Print the per-class row counts of the balanced table.
print(file_balance.loc[:, 'label'].value_counts())

0    500
1    500
2    500
3    500
4    500
5    500
6    500
Name: label, dtype: int64


In [20]:
# Map each image's base name (file name without extension) to its path, for
# every .jpg found one directory level below data/HAM10000/.
image_path = dict(
    (os.path.splitext(os.path.basename(jpg_file))[0], jpg_file)
    for jpg_file in glob(os.path.join('data/HAM10000/', '*', '*.jpg'))
)

In [22]:
# Look up each sampled row's image file from that row's own image_id.
# Mapping the balanced frame directly (rather than the full `file` frame)
# avoids depending on implicit index alignment between the two frames and
# skips the lookup for rows that were not sampled.
file_balance['path'] = file_balance['image_id'].map(image_path.get)

# Fail early with a clear message when an image_id has no file on disk;
# otherwise the missing path only surfaces later as an opaque image-loading error.
missing_paths = file_balance['path'].isna()
if missing_paths.any():
    raise FileNotFoundError(
        f"{int(missing_paths.sum())} sampled rows have no matching image under data/HAM10000/"
    )

In [34]:
# Edge length (in pixels) every image is resized to; also used for the model input shape.
SIZE = 32


def _load_image(path):
    """Load the image at ``path`` as a (SIZE, SIZE, 3) array.

    The image is opened in a context manager so its file handle is closed as
    soon as the pixels are copied out, instead of leaking one open file per
    row. It is converted to RGB so the array always has three channels, as the
    model's ``input_shape=(SIZE, SIZE, 3)`` requires.
    """
    with Image.open(path) as img:
        return np.asarray(img.convert('RGB').resize((SIZE, SIZE)))


file_balance['image'] = file_balance['path'].map(_load_image)

In [41]:
# Stack the per-row image arrays into a single array and divide by 255
# (maps 8-bit pixel values into the 0-1 range).
X = np.asarray(file_balance['image'].tolist()) / 255.

# Integer class ids for the balanced rows.
Y = file_balance['label']

# One-hot encode the class ids into 7 columns.
Y_cat = to_categorical(
    Y,
    num_classes=7,
)

# Start training

In [None]:
from keras.models import Sequential
from keras.layers import Dense, MaxPool2D, Conv2D, Dropout, Flatten

In [None]:
# Number of diagnosis classes; sizes the softmax output layer below.
num_classes = 7

# Small CNN: three Conv -> MaxPool -> Dropout stages followed by a dense head.
model = Sequential()
model.add(Conv2D(256, (3, 3), activation="relu", input_shape=(SIZE, SIZE, 3)))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(0.3))

model.add(Conv2D(128, (3, 3), activation="relu"))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(0.3))

model.add(Conv2D(64, (3, 3), activation="relu"))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(0.3))
model.add(Flatten())

# NOTE(review): this Dense layer has no activation, so it is purely linear
# before the softmax layer. Confirm whether activation="relu" was intended.
model.add(Dense(32))
# Output layer: one unit per class. Uses num_classes instead of a hard-coded 7
# so the constant above actually controls the model.
model.add(Dense(num_classes, activation="softmax"))
model.summary()

# categorical_crossentropy expects one-hot targets.
model.compile(loss="categorical_crossentropy", optimizer='Adam', metrics=['acc'])