# Imports

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
import matplotlib.pyplot as plt
import time
import multiprocessing

# Import Data

In [2]:
data_1_bin = pd.read_csv("../Data/norm_1_binary.csv")
data_1 = pd.read_csv("../Data/norm_1.csv")
data_2_bin = pd.read_csv("../Data/norm_2_binary.csv")
data_2 = pd.read_csv("../Data/norm_2.csv")

# Separate Data

In [3]:
y_1_bin = data_1_bin['type']
x_1_bin = data_1_bin.drop(['type'], axis='columns')

y_1 = data_1['type']
x_1 = data_1.drop(['type'], axis='columns')

y_2_bin = data_2_bin['type']
x_2_bin = data_2_bin.drop(['type'], axis='columns')

y_2 = data_2['type']
x_2 = data_2.drop(['type'], axis='columns')

# Split Data

In [4]:
x_1_bin_train, x_1_bin_test, y_1_bin_train, y_1_bin_test = train_test_split(x_1_bin, y_1_bin, test_size=0.2, random_state=1, stratify=y_1_bin)
x_1_train, x_1_test, y_1_train, y_1_test = train_test_split(x_1, y_1, test_size=0.2, random_state=1, stratify=y_1)
x_2_bin_train, x_2_bin_test, y_2_bin_train, y_2_bin_test = train_test_split(x_2_bin, y_2_bin, test_size=0.2, random_state=1, stratify=y_2_bin)
x_2_train, x_2_test, y_2_train, y_2_test = train_test_split(x_2, y_2, test_size=0.2, random_state=1, stratify=y_2)

# Build Model (binary with normalization 1)

In [5]:
knn_1_bin = KNeighborsClassifier(n_neighbors = 17)

In [6]:
knn_1_bin.fit(x_1_bin_train, y_1_bin_train);

In [7]:
knn_1_bin.score(x_1_bin_test, y_1_bin_test)

0.9940584538242264

# Build Model (multi classification with normalization 1)

In [8]:
knn_1 = KNeighborsClassifier(n_neighbors = 17)

In [9]:
knn_1.fit(x_1_train, y_1_train);

In [10]:
knn_1.score(x_1_test, y_1_test)

0.9933372256757221

# Build Model (binary with normalization 2)

In [11]:
knn_2_bin = KNeighborsClassifier(n_neighbors = 17)

In [12]:
knn_2_bin.fit(x_2_bin_train, y_2_bin_train);

In [13]:
knn_2_bin.score(x_2_bin_test, y_2_bin_test)

0.9951574681457568

# Build Model (multi classification with normalization 2)

In [14]:
knn_2 = KNeighborsClassifier(n_neighbors = 17)

In [15]:
knn_2.fit(x_2_train, y_2_train);

In [16]:
knn_2.score(x_2_test, y_2_test)

0.9948827145653741

# K Fold Cross Validation

In [17]:
def get_score(model, x_train, y_train, x_test, y_test):
    
    model.fit(x_train, y_train)
    return model.score(x_test, y_test)
    

In [18]:
def train_model(model, x_train, y_train, index):
    
    print("Training model {}!".format(index))
    
    model.fit(x_train, y_train)
    return model

In [24]:
def cross_validation(model, data):
    
    scores = []

    Y = data['type']
    X = data.drop(['type'], axis='columns')
    
    skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    iteration = 1
    index = 0
    
    results = []
    
    for train_index, test_index in skf.split(X,Y):
        
        print("Enter {}. iteration!".format(iteration))
        
        x_train, y_train, x_test, y_test = X.loc[train_index], Y.loc[train_index], X.loc[test_index], Y.loc[test_index]
        
        p = multiprocessing.Process(target=train_model, args=[model, x_train, y_train, iteration])
        p.start()
        # p.join()
        
        scores = scores + [model.score(x_test, y_test)]
        
        index += 1
        iteration += 1
        
    print(scores)
    print("The average is {}".format(sum(scores)/len(scores)))
        
    

In [25]:
cross_validation(knn_1_bin, data_1_bin)



Enter 1. iteration!
Training model 1!
Enter 2. iteration!
Training model 2!
Enter 3. iteration!
Training model 3!
Enter 4. iteration!
Training model 4!
Enter 5. iteration!
Training model 5!
Enter 6. iteration!
Training model 6!
Enter 7. iteration!
Training model 7!
Enter 8. iteration!
Training model 8!
Enter 9. iteration!
Training model 9!
Enter 10. iteration!
Training model 10!
[0.9937495707122742, 0.9944364310735627, 0.995741465760011, 0.9951232914348513, 0.9945051171096916, 0.9959472455007556, 0.9961533177634291, 0.9958098639923066, 0.9949168841873883, 0.994573430416266]
The average is 0.9950956617950537


[0.9937495707122742, 0.9944364310735627, 0.995741465760011, 0.9951232914348513, 0.9945051171096916, 0.9959472455007556, 0.9961533177634291, 0.9958098639923066, 0.9949168841873883, 0.994573430416266]