In [1]:
import os 
import re
import glob

import numpy as np 
import pandas as pd 
from PIL import Image 
from matplotlib import pyplot as plt 
import seaborn as sns 

from skimage import morphology 
from skimage.transform import rotate
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler

from time import time 

In [3]:
# Creating the main data set
features = pd.read_csv("../features/feature_set_with_multiple_perimeters.csv", sep = ";")
data = pd.read_csv("../data/ISIC-2017_Training_Part3_GroundTruth.csv")

image_data = pd.merge(features, data, on = "image_id")

image_data = image_data.drop(["image_id", "seborrheic_keratosis"], axis = 1)

feature_list = image_data.columns.tolist()
feature_list.remove('melanoma')


In [13]:
# Separating the data into train and test sets

df, df2 = image_data, image_data

noise = pd.DataFrame(data = np.random.RandomState(23).uniform(0, 0.1, size=(df2.shape[0], 20)), columns = [i for i in range(20)])

X = df2[feature_list]
X_noisy = pd.merge(X, noise, left_index = True, right_index = True)

x_scaled = StandardScaler().fit_transform(X.values)
x_scaled_df = pd.DataFrame(x_scaled, index=X.index, columns=X.columns)
x_noisy_scaled = StandardScaler().fit_transform(X_noisy.values)
x_noisy_scaled_df = pd.DataFrame(x_noisy_scaled, index=X_noisy.index, columns=X_noisy.columns)
y = df2['melanoma']


X_dev, X_test, y_dev, y_test = train_test_split(x_scaled_df, y, stratify=y, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, stratify=y_dev)

print(X_dev.describe())


              area  perimeter_1  perimeter_2  perimeter_3  perimeter_4  \
count  1500.000000  1500.000000  1500.000000  1500.000000  1500.000000   
mean     -0.014749    -0.015470    -0.014806    -0.014354    -0.014117   
std       0.958892     0.967143     0.972224     0.974376     0.976028   
min      -0.430947    -0.827480    -0.841699    -0.858442    -0.861356   
25%      -0.380365    -0.532137    -0.540179    -0.545082    -0.549488   
50%      -0.312437    -0.335007    -0.338658    -0.336868    -0.337106   
75%      -0.139213     0.075881     0.081778     0.088849     0.090981   
max       7.628426     8.950688     8.525753     8.538010     8.174256   

       compactness_1  compactness_2  compactness_3  compactness_4  \
count    1500.000000    1500.000000    1500.000000    1500.000000   
mean       -0.010674      -0.009503      -0.007820      -0.007395   
std         0.920334       0.933402       0.941120       0.945100   
min        -0.753550      -0.912473      -1.000981      -

In [8]:
# training the classifiers 
classifiers_name = ["KNN_1", "KNN_3", "KNN_5", "KNN_10", "KNN_50", "Tree", "Gaussian"]
classifiers = []
classifiers.append(KNeighborsClassifier(n_neighbors = 1))
classifiers.append(KNeighborsClassifier(n_neighbors = 3))
classifiers.append(KNeighborsClassifier(n_neighbors = 5))
classifiers.append(KNeighborsClassifier(n_neighbors = 10))
classifiers.append(KNeighborsClassifier(n_neighbors = 50))
classifiers.append(DecisionTreeClassifier())
classifiers.append(GaussianProcessClassifier())

trained_classifiers = [classifier.fit(X_train, y_train) for classifier in classifiers]                   


In [9]:
#Evaluating the features
predictions = [trained.predict(X_val) for trained in trained_classifiers]

accuracy_scores = [accuracy_score(y_val, prediction) for prediction in predictions]
auc_scores = [roc_auc_score(y_val, prediction) for prediction in predictions]
f1_scores = [f1_score(y_val, prediction) for prediction in predictions]

columns = ["Classifier", "Accuracy score", "Roc Auc score", "F1 score"]
data = np.array([classifiers_name, accuracy_scores, auc_scores, f1_scores]).T
Classifier_evaluation = pd.DataFrame(data = data, columns = columns)
Classifier_evaluation

Unnamed: 0,Classifier,Accuracy score,Roc Auc score,F1 score
0,KNN_1,0.6906666666666667,0.496135831381733,0.1830985915492957
1,KNN_3,0.776,0.5375878220140514,0.2075471698113207
2,KNN_5,0.7893333333333333,0.5292740046838408,0.1684210526315789
3,KNN_10,0.8133333333333334,0.5165105386416862,0.0789473684210526
4,KNN_50,0.8133333333333334,0.5,0.0
5,Tree,0.736,0.5515222482435598,0.2666666666666666
6,Gaussian,0.8186666666666667,0.5473067915690867,0.1904761904761904


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1.
 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.