In [107]:
import re
import os
from os import listdir
from os.path import isfile, join
from copy import deepcopy

import numpy as np
import pandas as pd

from keras import applications
from keras import optimizers
from tensorflow.keras import regularizers
import tensorflow.keras as K

from PIL import Image

# Fix the global random seed so that repeated runs of the notebook are comparable.
K.utils.set_random_seed(270219)

We will start with a few helper functions: one to extract the ID of each image, one to find the index of each image that belongs to the training set, and one to find the index of each remaining (unlabeled) image. The images and the CSV file live in a folder named `data`. We store all image file names in the variable `images`; `IDs` holds the ID of each image (the number after `img_`), and `labels` contains each ID together with its label. `trainIndex` then holds the positions in `images` of the images whose IDs appear in `labels.csv`.

In [100]:
def getImageID(imageList):
    """Extract the digit runs from every file name.

    Parameters
    ----------
    imageList : iterable of str
        File names, e.g. ``"img_123.png"``.

    Returns
    -------
    list of list of str
        One entry per file name, holding every run of digits found in it in
        order of appearance (an empty list when the name has no digits).
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    return [re.findall(r"\d+", name) for name in imageList]

def getIndex(IDs,labelsId):
    """Find the positions in ``IDs`` of the images that have a label.

    Parameters
    ----------
    IDs : sequence of list of str
        Digit runs per image; the first run of each entry is the image ID.
    labelsId : iterable
        Labeled IDs; each one is compared through ``str()``.

    Returns
    -------
    list of int
        Positions in ``IDs`` ordered like ``labelsId`` (and by position when
        several images share an ID). Labels without a matching image
        contribute nothing.
    """
    # Index the images once by ID instead of rescanning them for every label.
    positions = {}
    for IDindex, ID in enumerate(IDs):
        if ID:  # a file name without digits has no ID and can never match
            positions.setdefault(ID[0], []).append(IDindex)

    trainIndex = []
    for labeled in labelsId:
        trainIndex.extend(positions.get(str(labeled), ()))
    return trainIndex

def getValidationIndex(IDs, labelsID):
    """Find the positions in ``IDs`` of the images that have no label.

    Parameters
    ----------
    IDs : sequence of list of str
        Digit runs per image; the first run of each entry is the image ID.
    labelsID : iterable
        Labeled IDs; each one is compared through ``str()``.

    Returns
    -------
    list of int
        Positions, in ascending order, of every image whose ID is not among
        the labeled IDs. An entry without any digit run cannot match a label
        and is therefore reported as unlabeled.
    """
    # One set lookup per image instead of a scan over all labels.
    labeled = {str(label) for label in labelsID}
    return [
        IDindex
        for IDindex, ID in enumerate(IDs)
        if not ID or ID[0] not in labeled
    ]
# Resolve the data folder relative to the working directory. The trailing
# separator is kept because other cells build paths as `dataPath + fileName`.
localPath = os.getcwd()
dataPath = join(localPath, "data", "")

# Every regular file in the data folder except the label table is an image.
# The table is excluded by name so the result does not depend on the order in
# which listdir happens to return the entries.
images = [
    f for f in listdir(dataPath)
    if isfile(join(dataPath, f)) and f.lower() != "labels.csv"
]

IDs = getImageID(images)
labels = pd.read_csv(join(dataPath, "labels.csv"))
# Shift the labels up by one; the prediction cell subtracts one again before
# writing the CSV. NOTE(review): presumably the raw labels are -1/0/1, which
# become the class indices 0/1/2 expected by the 3-unit softmax - confirm.
labels.malignant = labels.malignant+1
trainIndex = getIndex(IDs,labels.id)
validationIndex = getValidationIndex(IDs, labels.id)
# IDs (as strings) of the unlabeled images, in the same order as validationIndex.
validationID = [IDs[index][0] for index in validationIndex]


Great, so we have the images. However, not all images are the same size, so let's find the maximum image size to know how much we need to pad each image.

In [23]:
# Largest width and height found over all images, as [width, height].
imageSize = [0,0]
for image in images:
    # The context manager closes the file handle as soon as the size is read.
    with Image.open(dataPath+image) as im:
        width, height = im.size
    imageSize[0] = max(imageSize[0], width)
    imageSize[1] = max(imageSize[1], height)
print("Maximum image size will be: "+ str(imageSize))

Maximum image size will be: [896, 896]


Now that we know the maximum size, let's pad all the images to these dimensions.

In [24]:
def add_margin(pil_img, top, right):
    """Return a new image made of ``pil_img`` plus a margin above and to the right.

    The canvas is ``right`` pixels wider and ``top`` pixels taller than the
    input, uses the input's mode, and is filled with ``(0, 0, 0)``. The
    original pixels are pasted flush with the left edge, ``top`` pixels down.
    """
    src_width, src_height = pil_img.size
    canvas_size = (src_width + right, src_height + top)
    canvas = Image.new(pil_img.mode, canvas_size, (0, 0, 0))
    canvas.paste(pil_img, (0, top))
    return canvas

# Write a padded copy of every image into `data_padded`, creating the folder
# on first use instead of relying on an exception handler.
paddedDir = "data_padded"
os.makedirs(paddedDir, exist_ok=True)

# Skip the work when the folder already holds one file per source image.
if len(listdir(paddedDir)) != len(images):
    for image in images:
        # Keep the source open until the (possibly unpadded) copy is saved.
        with Image.open(dataPath+image) as im:
            dWidth = int(imageSize[0]) - im.size[0]
            dHeight = int(imageSize[1]) - im.size[1]
            # Images that already have the maximum size are copied unchanged.
            padded = add_margin(im, dHeight, dWidth) if (dWidth or dHeight) else im
            padded.save(join(paddedDir, image))

In [84]:
def _loadStandardized(fileNames):
    """Load padded images and standardize each one on its own.

    Parameters
    ----------
    fileNames : iterable of str
        Names of files inside the ``data_padded`` folder.

    Returns
    -------
    numpy.ndarray
        One array per file, stacked along the first axis, each shifted to
        zero mean and scaled by its own standard deviation.
    """
    arrays = []
    for name in fileNames:
        # Close the file handle once the pixel data has been copied out.
        with Image.open(join("data_padded", name)) as im:
            pixels = K.utils.img_to_array(im)
        std = np.std(pixels)
        # A constant image has zero spread; dividing by 1 avoids NaNs.
        arrays.append((pixels - np.mean(pixels)) / (std if std > 0 else 1.0))
    return np.array(arrays)


paddedImages = [f for f in listdir("data_padded") if isfile(join("data_padded", f))]

# trainIndex / validationIndex are positions in `images`, and the padded files
# were saved under those same names, so resolve file names through `images`
# rather than through a second directory listing whose order may differ.
trainData = _loadStandardized(images[i] for i in trainIndex)
validationData = _loadStandardized(images[i] for i in validationIndex)
labelData = np.array(labels.malignant)

assert len(trainData) == len(labelData), "every label needs exactly one training image"

In [117]:
# ImageNet-pretrained ResNet50 without its classification head, used as a
# fixed feature extractor on the padded image size.
resmodel = applications.ResNet50(include_top=False, weights='imagenet', input_shape=(imageSize[0], imageSize[1], 3))
# Freeze the backbone explicitly instead of slicing model.layers by position,
# so adding or removing head layers cannot change what gets frozen.
resmodel.trainable = False

# Classification head: dropout -> flatten -> dropout -> 3-way softmax.
# The Dropout layers infer their input shape from the preceding layer.
model = K.models.Sequential()
model.add(resmodel)
model.add(K.layers.Dropout(.2))
model.add(K.layers.Flatten())
model.add(K.layers.Dropout(.2))
model.add(K.layers.Dense(3, activation='softmax',
                         kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
                         bias_regularizer=regularizers.L2(1e-4),))
# Show which layers will be updated during training.
for i, layer in enumerate(model.layers):
    print(i, layer.name, "-", layer.trainable)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizers.Adam(learning_rate=0.0001), metrics=["accuracy"])
# Folder name under which the trained model is saved and looked up.
modelName = "DoubleDropout5Epoch"

0 resnet50 - False
1 dropout_3 - True
2 flatten_6 - True
3 dropout_4 - True
4 dense_6 - True


In [118]:
# Train for 5 epochs unless a model was already saved under `modelName`.
if not os.path.isdir(modelName):
    history = model.fit(trainData,labelData , verbose = 1, batch_size = 16, epochs = 5)
    model.summary()
    model.save(modelName)
else:
    # No training happens on this path, so there is no history to inspect.
    history = None
    print("Model already exist, not retraining it :)")
    answer = input("Do you want to load the model instead? Y/N")
    # Accept the answer regardless of case or surrounding whitespace.
    if answer.strip().lower() in ("y", "yes"):
        model = K.models.load_model(modelName)
        print("Model successfully loaded")
    else:
        # Make the consequence visible: predictions would otherwise silently
        # come from a classification head that was never trained.
        print("WARNING: saved model not loaded; `model` is still untrained.")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet50 (Functional)       (None, 28, 28, 2048)      23587712  
                                                                 
 dropout_3 (Dropout)         (None, 28, 28, 2048)      0         
                                                                 
 flatten_6 (Flatten)         (None, 1605632)           0         
                                                                 
 dropout_4 (Dropout)         (None, 1605632)           0         
                                                                 
 dense_6 (Dense)             (None, 3)                 4816899   
                                                                 
Total params: 28,404,611
Trainable params: 4,816,899
Non-trainable params: 23,587,712
__________________________________________________



INFO:tensorflow:Assets written to: DoubleDropout5Epoch\assets


INFO:tensorflow:Assets written to: DoubleDropout5Epoch\assets


In [119]:
# Softmax outputs of the model for the unlabeled images held in validationData.
prediction = model.predict(validationData)



In [120]:
# Most probable class per image, shifted down by one to undo the +1 applied
# to the labels before training.
predictedType = [np.argmax(probabilities) - 1 for probabilities in prediction]

# One row per unlabeled image: its ID and the predicted label.
predictions = {"id": validationID, "malignant": predictedType}
predictiondf = pd.DataFrame(data=predictions)
predictiondf.to_csv("prediction.csv", index=False)
print(predictiondf)

         id  malignant
0    103601         -1
1    105480         -1
2    118847         -1
3    125877          1
4    133778         -1
..      ...        ...
119  968389         -1
120   97549         -1
121  976505          1
122  996288         -1
123  997841         -1

[124 rows x 2 columns]


In [114]:
# Going forward, maybe let's try different models?