## Loading data into files and containers

In [28]:
import os
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt
from zipfile import ZipFile
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

### Downloading the data set (should not need to be done now)


In [29]:
# One-time Kaggle download, kept commented out. The commands copy kaggle.json
# into ~/.kaggle (creating the directory if needed) and then download the
# kmader/skin-cancer-mnist-ham10000 dataset archive.
#!chmod 600 kaggle.json && (ls ~/.kaggle 2>/dev/null || mkdir ~/.kaggle) && cp kaggle.json ~/.kaggle/ && echo 'Done'
#!ls
#!kaggle datasets download --force -d kmader/skin-cancer-mnist-ham10000

In [30]:
# Working directory that every dataset path in this notebook is resolved against.
cur_dir = os.getcwd()

# Name of the downloaded Kaggle archive and its full path inside cur_dir.
archive = 'skin-cancer-mnist-ham10000.zip'
a = os.path.join(cur_dir, archive)
print(a)

/content/skin-cancer-mnist-ham10000.zip


In [31]:
# One-time extraction of the downloaded archive into the current directory.
# Kept commented out (presumably because the data is already unpacked).
#with ZipFile(archive , 'r') as zip_ref:
#    zip_ref.extractall('./')

In [32]:
# Names of the two image folders and of the metadata table, relative to cur_dir.
path_1, path_2 = 'HAM10000_images_part_1', 'HAM10000_images_part_2'
path_meta = 'HAM10000_metadata.csv'

### Read data

In [33]:
# Absolute paths of the two image folders (part 1 and part 2).
file_path1, file_path2 = (os.path.join(cur_dir, folder) for folder in (path_1, path_2))

# Number of entries in each folder, shown as the cell output.
tuple(len(os.listdir(folder)) for folder in (file_path1, file_path2))

(5000, 5015)

In [34]:
# Load the HAM10000 metadata table.
metadata_path = os.path.join(cur_dir, path_meta)
df = pd.read_csv(metadata_path)

# Report the number of distinct lesions explicitly: a bare expression in the
# middle of a cell is evaluated and thrown away, so it would never be displayed.
print('unique lesion_id values:', df['lesion_id'].unique().shape)

# Keep only the identifier columns and the diagnosis label, then list the
# distinct diagnosis codes (cell output).
df_ids_img = df[['lesion_id','image_id','dx']]
df_ids_img['dx'].unique()

array(['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec'], dtype=object)

This shows that we have 7470 unique lesions; the remaining images are duplicates of those lesions, either different views or magnified versions.

### First folder processing

In [35]:
# Number of images to use: half of the entries in the first folder (rounded down).
num_1 = len(os.listdir(file_path1)) // 2

In [36]:
# Target (width, height) every image is resized to.
size = (240,240)

# Pre-allocate one zero-filled float32 RGB-shaped array per image to be loaded.
imgs_1 = [np.zeros((*size, 3), 'float32') for _ in range(num_1)]
len(imgs_1)

2500

In [37]:
# String array that will hold one diagnosis label per image
# (converted to numeric classes in a later cell).
target = np.zeros(num_1).astype(str)

(target.shape,)

((2500,),)

In [38]:
# Collect the distinct array shapes currently stored in imgs_1 (cell output).
# No variables named `min` / `max` are created here: they were never read and
# would shadow the Python built-ins for the rest of the kernel session.
sizes = []

for img in imgs_1:
    if img.shape not in sizes:
        sizes.append(img.shape)
sizes

[(240, 240, 3)]

In [39]:
# Load the first num_1 images of the folder together with their diagnosis label.
# The listing is sorted so the same subset is selected on every run
# (os.listdir returns entries in arbitrary order), and sliced so the loop stops
# once num_1 files have been handled.
image_files = sorted(os.listdir(file_path1))[:num_1]

# image_id -> dx lookup built once instead of filtering the whole frame for
# every image. drop_duplicates keeps the first row per image_id, which matches
# taking `.iloc[0]` of the filtered frame.
dx_by_image = df.drop_duplicates('image_id').set_index('image_id')['dx']

for i, img in enumerate(image_files):
    p_id = img.split('.')[0]  # file name up to the first dot is the image_id
    target[i] = dx_by_image[p_id]  # target feature

    img_path = os.path.join(file_path1, img)
    pixels = cv.imread(img_path)
    if pixels is None:
        # cv.imread reports an unreadable file by returning None, not by raising.
        raise ValueError('Could not read image: ' + img_path)

    # OpenCV decodes to BGR channel order; convert to RGB so that plt.imshow
    # (which expects RGB) displays the true colours.
    imgs_1[i] = cv.resize(cv.cvtColor(pixels, cv.COLOR_BGR2RGB), size)

KeyboardInterrupt: ignored

In [None]:
# Visual sanity check: display the first entry of imgs_1.
plt.imshow(imgs_1[0])

### normalize images

In [None]:
# Scale pixel values by 1/255 (assumes 8-bit image data — TODO confirm).
# Casting to float32 first keeps the arrays in the dtype the placeholders were
# allocated with; dividing integer pixels by a Python float would otherwise
# produce float64 arrays and double the memory footprint.
# NOTE: this cell overwrites imgs_1 with its scaled version, so it must be run
# exactly once after loading; re-running it scales the data a second time.
imgs_1 = [ np.asarray(img, dtype='float32') / 255.0 for img in imgs_1 ]
plt.imshow(imgs_1[0])

In [None]:
# Inspect the raw array values of the first image.
imgs_1[0]

### Shuffle data


In [None]:
# Put targets and photos together
group_1 = [0 for i in range(num_1)]
for i in range(num_1):
    group_1[i] = [ imgs_1[i], target[i] ]

In [None]:
# Shuffle
np.random.shuffle(group_1)

# Ungroup
for i in range(num_1):
    imgs_1[i] = group_1[i][0]
    target[i] = group_1[i][1]
    

In [None]:
# Scratch check: first row of a 2x4 zero matrix.
np.zeros((2,4))[0]

### Convert target feature into numerical data

In [None]:
# Diagnosis code -> 1-based class id.
target_dict = {'bkl': 1, 'nv': 2, 'df': 3, 'mel': 4, 'vasc': 5, 'bcc': 6, 'akiec': 7}

# Peek at the first 14 raw labels.
for k in range(14):
    print( target[k] )

# One-hot encode the targets: one row per sample, one column per class.
# Column index is the class id minus one.
target_ohe = np.zeros((num_1, len(target_dict)))
for row, label in zip(target_ohe, target):
    row[target_dict[label] - 1] = 1

# Peek at the first 10 encoded rows.
for k in range(10):
    print(target_ohe[k])

# An integer-encoded variant of `target` (target_dict[t] for each label) was
# tried here and judged not useful; only the one-hot form is kept.



In [None]:
# Count how many samples fall into each class (index = class id - 1).
cnt = [0] * 7
for k in target:
    cnt[target_dict[k] - 1] += 1

# Cell output: (samples in every class except index 1, samples in class index 1).
sum(cnt[:1] + cnt[2:7]), cnt[1]

Notice that data is skewed towards non-cancerous pictures

### Split data

In [None]:
# First 80% of the samples go to training, the remainder to validation.
train_split = int(num_1 * 0.8)

train_x, val_x = np.array(imgs_1[:train_split]), np.array(imgs_1[train_split:])
train_y, val_y = np.array(target_ohe[:train_split]), np.array(target_ohe[train_split:])

In [None]:
# one-hot encode the targets (nothing to do here: target_ohe is already built in the label-conversion cell)

### Model

In [None]:
# Number of filters in the three convolution stages.
filter1 = 16
filter2 = 32
filter3 = 64

input_size = (size[0], size[1], 3)
pool_size = (2,2)

# Build the layer graph: three conv + max-pool stages, then a dense head.
# Every layer is referenced through tf.keras.layers so the cell does not depend
# on bare Conv2D / MaxPool2D / Dense names being imported somewhere else.
input_layer = tf.keras.layers.Input(shape = input_size, name = 'input')
conv1 = tf.keras.layers.Conv2D(filter1, (8,8), activation = 'relu', padding = 'same', name = 'c1')(input_layer)
pool1 = tf.keras.layers.MaxPool2D(pool_size, name = 'p1')(conv1)
conv2 = tf.keras.layers.Conv2D(filter2, (4,4), activation = 'relu', padding = 'same', name = 'c2')(pool1)
pool2 = tf.keras.layers.MaxPool2D(pool_size, name = 'p2')(conv2)
conv3 = tf.keras.layers.Conv2D(filter3, (2,2), activation = 'relu', padding = 'same', name = 'c3')(pool2)
pool3 = tf.keras.layers.MaxPool2D(pool_size, name = 'p3')(conv3)
flatten = tf.keras.layers.Flatten(name = 'flatten')(pool3)
dense1 = tf.keras.layers.Dense(256, activation = 'relu', name = 'd1')(flatten)

# Seven mutually exclusive classes with one-hot targets: softmax yields a
# probability distribution over the classes, which is what
# categorical_crossentropy expects (sigmoid scores each class independently).
output_layer = tf.keras.layers.Dense(7, activation = 'softmax', name = 'out')(dense1)

In [None]:
model1 = tf.keras.Model(input_layer, output_layer, name = 'Model1')

# Pass the configured optimizer object to compile(). Passing the string 'Adam'
# instead builds a fresh default Adam and silently ignores this learning rate.
# NOTE(review): 1e-6 is a very small step size — confirm it is the intended value.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-6)
model1.compile(
    optimizer = optimizer,
    loss = 'categorical_crossentropy', # Multiclass problem with one-hot targets
    metrics = ['accuracy'],            # metrics is given as a list
)

model1.summary()

In [None]:
# Stop when the accuracy on held-out data has not improved for 5 epochs.
# Monitoring the training accuracy cannot detect overfitting, so the validation
# split is passed to fit() and its accuracy is what early stopping watches.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience = 5
    )

EPOCH = 100  # upper bound on epochs; early stopping usually ends training sooner

# Train the model. history.history keeps the per-epoch 'accuracy' values and
# additionally records the 'val_*' metrics computed on (val_x, val_y).
history = model1.fit(
    train_x,
    train_y,
    epochs = EPOCH,
    batch_size = 1028,
    validation_data = (val_x, val_y),
    callbacks = [early_stop]
)

In [None]:
import matplotlib.pyplot as plt

# Training accuracy recorded by fit(), one value per completed epoch.
acc_per_epoch = history.history['accuracy']

plt.title("Accuracy per Epoch")
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.plot(range(len(acc_per_epoch)), acc_per_epoch)


In [None]:
# Compute the validation predictions in this cell so it works on a fresh kernel
# (Restart & Run All) instead of relying on `predictions` from a later cell.
predictions = model1.predict(val_x)
plt.hist(predictions)

In [None]:
# Score the validation images and print the first three output rows.
predictions = model1.predict(val_x)
for k in (0, 1, 2):
  print(predictions[k])


In [None]:
# Predicted class index (0-based, i.e. class id - 1) for every validation
# sample: the position of the largest score in each prediction row, with ties
# resolved to the first position. np.argmax replaces a hand-written scan that
# rebound the built-in name `max`.
preds = np.argmax(predictions, axis=1).tolist()

In [None]:
# Histogram of the model's output scores; with a 2-D input, plt.hist draws one
# series per column.
plt.hist(predictions)

# Trying a different approach: identifying whether a lesion is cancerous or not, disregarding its type