In [2]:
import os
import re
import glob
import random

import numpy as np
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt
import seaborn as sns 

from skimage import morphology 
from skimage.transform import rotate
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler

from time import time

In [3]:
# Creating the main data set
features = pd.read_csv("../features/feature_set_with_multiple_perimeters.csv", sep = ";")
data = pd.read_csv("../data/ISIC-2017_Training_Part3_GroundTruth.csv")

image_data = pd.merge(features, data, on = "image_id")

image_data = image_data.drop(["image_id", "seborrheic_keratosis", "area" , "perimeter_1", "perimeter_2",
                             "perimeter_3", "perimeter_4", "compactness_1", "compactness_3", "compactness_4"], axis = 1)

feature_list = image_data.columns.tolist()
feature_list.remove('melanoma')

feature_list

['compactness_2',
 'asymmetry',
 'luminance_average',
 'luminance_variance',
 'red_average',
 'green_average',
 'blue_average',
 'color_variance']

In [18]:
# Separating the data into train and test sets\n",

df = image_data

# Creates a random variable between 1 and 100 to facilitate splitting the data into multiple sets
np.random.seed(0)
separator = np.random.randint(1, 101, size = (df.shape[0], 1))

# Creates 20 random noise variable to avoid overfitting
noise = pd.DataFrame(data = np.random.RandomState(23).uniform(0, 0.1, size = (df.shape[0], 20)), columns = [i for i in range(20)])

X = df[feature_list]
X_noisy = pd.merge(X, noise, left_index = True, right_index = True)

X_scaled = StandardScaler().fit_transform(X.values)
X_scaled_df = pd.DataFrame(X_scaled, index=X.index, columns=X.columns)
X_noisy_scaled = StandardScaler().fit_transform(X_noisy.values)
X_noisy_scaled_df = pd.DataFrame(X_noisy_scaled, index=X_noisy.index, columns=X_noisy.columns)
X_noisy_scaled_df["separator"] = separator
y = df['melanoma']


#Separating the dataset into 5 different validation sets and 1 test set
X_val1, y_val1 = X_noisy_scaled_df[X_noisy_scaled_df["separator"] < 16], y[X_noisy_scaled_df["separator"] < 16]
X_val2, y_val2 = X_noisy_scaled_df[X_noisy_scaled_df["separator"] > 15][X_noisy_scaled_df["separator"] < 31], y[X_noisy_scaled_df["separator"] > 15][X_noisy_scaled_df["separator"] < 31]
X_val3, y_val3 = X_noisy_scaled_df[X_noisy_scaled_df["separator"] > 30][X_noisy_scaled_df["separator"] < 46], y[X_noisy_scaled_df["separator"] > 30][X_noisy_scaled_df["separator"] < 46]
X_val4, y_val4 = X_noisy_scaled_df[X_noisy_scaled_df["separator"] > 45][X_noisy_scaled_df["separator"] < 61], y[X_noisy_scaled_df["separator"] > 45][X_noisy_scaled_df["separator"] < 61]
X_val5, y_val5 = X_noisy_scaled_df[X_noisy_scaled_df["separator"] > 60][X_noisy_scaled_df["separator"] < 76], y[X_noisy_scaled_df["separator"] > 60][X_noisy_scaled_df["separator"] < 76]
X_test, y_test= X_noisy_scaled_df[X_noisy_scaled_df["separator"] > 75], y[X_noisy_scaled_df["separator"] > 75]

validation_sets = [(X_val1, y_val1), (X_val2, y_val2), (X_val3, y_val3), (X_val4, y_val4), (X_val5, y_val5)]

#Creating the training sets from the validation sets
X_train1 = pd.concat([X_val2, X_val3, X_val4, X_val5])
X_train2 = pd.concat([X_val1, X_val3, X_val4, X_val5])
X_train3 = pd.concat([X_val1, X_val2, X_val4, X_val5])
X_train4 = pd.concat([X_val1, X_val2, X_val3, X_val5])
X_train5 = pd.concat([X_val1, X_val2, X_val3, X_val4])

y_train1 = pd.concat([y_val2, y_val3, y_val4, y_val5])
y_train2 = pd.concat([y_val1, y_val3, y_val4, y_val5])
y_train3 = pd.concat([y_val1, y_val2, y_val4, y_val5])
y_train4 = pd.concat([y_val1, y_val2, y_val3, y_val5])
y_train5 = pd.concat([y_val1, y_val2, y_val3, y_val4])

training_sets = [(X_train1, y_train1), (X_train2, y_train2), (X_train3, y_train3), (X_train4, y_train4), (X_train5, y_train5)]








  X_val2, y_val2 = X_noisy_scaled_df[X_noisy_scaled_df["separator"] > 15][X_noisy_scaled_df["separator"] < 31], y[X_noisy_scaled_df["separator"] > 15][X_noisy_scaled_df["separator"] < 31]
  X_val3, y_val3 = X_noisy_scaled_df[X_noisy_scaled_df["separator"] > 30][X_noisy_scaled_df["separator"] < 46], y[X_noisy_scaled_df["separator"] > 30][X_noisy_scaled_df["separator"] < 46]
  X_val4, y_val4 = X_noisy_scaled_df[X_noisy_scaled_df["separator"] > 45][X_noisy_scaled_df["separator"] < 61], y[X_noisy_scaled_df["separator"] > 45][X_noisy_scaled_df["separator"] < 61]
  X_val5, y_val5 = X_noisy_scaled_df[X_noisy_scaled_df["separator"] > 60][X_noisy_scaled_df["separator"] < 76], y[X_noisy_scaled_df["separator"] > 60][X_noisy_scaled_df["separator"] < 76]


pandas.core.frame.DataFrame

In [21]:
# Creating the classifiers

classifiers_name = ["KNN_1", "KNN_3", "KNN_5", "KNN_10", "KNN_50", "Tree", "Gaussian"]
classifiers = []
classifiers.append(KNeighborsClassifier(n_neighbors = 1))
classifiers.append(KNeighborsClassifier(n_neighbors = 3))
classifiers.append(KNeighborsClassifier(n_neighbors = 5))
classifiers.append(KNeighborsClassifier(n_neighbors = 10))
classifiers.append(KNeighborsClassifier(n_neighbors = 50))
classifiers.append(DecisionTreeClassifier())
classifiers.append(GaussianProcessClassifier())

accuracy_scores = []
auc_scores = []
f1_scores = []


# Calculating the scores for each classifier for each training set
for i in range(len(training_sets)):
    trained_classifiers = [classifier.fit(training_sets[i][0], training_sets[i][1]) for classifier in classifiers]
    predictions = [trained.predict(validation_sets[i][0]) for trained in trained_classifiers]
    accuracy_scores.append([accuracy_score(validation_sets[i][1], prediction) for prediction in predictions])
    auc_scores.append([roc_auc_score(validation_sets[i][1], prediction) for prediction in predictions])
    f1_scores.append([f1_score(validation_sets[i][1], prediction) for prediction in predictions])


    
    
accuracy_scores_avg = []
auc_scores_avg = []
f1_scores_avg = []

# Calculating the average score for each classifier
for i in range(len(classifiers)):
    accuracy_scores_avg.append(np.mean([accuracy_scores[j][i] for j in range(len(training_sets))]))
    auc_scores_avg.append(np.mean([auc_scores[j][i] for j in range(len(training_sets))]))
    f1_scores_avg.append(np.mean([f1_scores[j][i] for j in range(len(training_sets))]))


columns = ["Classifier", "Accuracy score", "Roc Auc score", "F1 score"]
data = np.array([classifiers_name, np.round(accuracy_scores_avg, decimals = 5), 
                 np.round(auc_scores_avg, decimals = 5), np.round(f1_scores_avg, decimals = 5)]).T
Classifier_evaluation = pd.DataFrame(data = data, columns = columns)
Classifier_evaluation

Unnamed: 0,Classifier,Accuracy score,Roc Auc score,F1 score
0,KNN_1,0.6928,0.49407,0.16718
1,KNN_3,0.75585,0.50516,0.12797
2,KNN_5,0.77727,0.49677,0.05784
3,KNN_10,0.80475,0.50013,0.01301
4,KNN_50,0.80886,0.5,0.0
5,Tree,0.68125,0.50982,0.21503
6,Gaussian,0.7043,0.49482,0.15955


In [None]:
#Evaluating the features
predictions = [trained.predict(X_val) for trained in trained_classifiers]

accuracy_scores = [accuracy_score(y_val, prediction) for prediction in predictions]
auc_scores = [roc_auc_score(y_val, prediction) for prediction in predictions]
f1_scores = [f1_score(y_val, prediction) for prediction in predictions]

columns = ["Classifier", "Accuracy score", "Roc Auc score", "F1 score"]
data = np.array([classifiers_name, np.round(accuracy_scores, decimals = 5), 
                 np.round(auc_scores, decimals = 5), np.round(f1_scores, decimals = 5)]).T\
Classifier_evaluation = pd.DataFrame(data = data, columns = columns)
Classifier_evaluation