In [8]:
from sklearn.preprocessing import LabelEncoder
import sklearn
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from scipy.io import arff
import pandas as pd
import math
import os.path
import scipy
import scipy.stats

In [None]:
"""k-nearest neighbours """

In [9]:
def k_nearest_neighbours(X, y, num_iter=5, training_percentage=0.85, testing_percentage=0.15):
     
    num_training_examples = math.floor(X.shape[0]*training_percentage) # number of rows in a single column * our training %
    num_testing_examples = math.ceil(X.shape[0]*testing_percentage)

    X_train = X.iloc[:num_training_examples, :].to_numpy() # isolating 85% of input data for our training
    X_test = X.iloc[:num_testing_examples, :].to_numpy()   # isolating 15% of input data for out testing

    y_train = y.iloc[:num_training_examples].to_numpy().reshape(-1)    # isolating 85% of our target to match with our training input
    y_test = y.iloc[:num_testing_examples].to_numpy().reshape(-1)      # isolating 15% of our target to match with our testing input
    
    # preprocessing training x
    scaler = sklearn.preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    """fitting support vector regression model"""
    # calling hyperparamter search for knn
    random_search_knn = hyperparameterSearch(X_train,y_train,num_iter)
    
    # preprocessing testing x
    X_test = scaler.transform(X_test)
    
    # predicting
    y_pred = random_search_knn.predict(X_test)
    
    accuracy1 = sklearn.metrics.mean_squared_error(y_test, y_pred)
    accuracy2 = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    accuracy3 = sklearn.metrics.max_error(y_test, y_pred)
    accuracy4 = sklearn.metrics.r2_score(y_test, y_pred)
    accuracy6 = sklearn.metrics.explained_variance_score(y_test, y_pred)
    accuracy7 = sklearn.metrics.accuracy_score(y_test, y_pred)
    
    print('                 MINIMIZE: ')
    print("Mean squared error\t", accuracy1)
    print("Mean absolute error\t", accuracy2)
    print("Max error\t\t", accuracy3)
    print('                 MAXIMIZE: ')
    print("r2 Score\t\t", accuracy4)
    print("Explained Variance Score", accuracy6)
    print("accuracy_score\t\t", accuracy7)
    print('---------------------------------------------')

In [16]:
def hyperparameterSearch(X_train, y_train, num_iter):
    knn = sklearn.neighbors.KNeighborsClassifier().fit(X_train, y_train)
    param_distribution = {'n_neighbors': range(1, 100), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'weights': ['uniform', 'distance'], 'leaf_size': scipy.stats.uniform(1, 50)}
    randomized_search_ = sklearn.model_selection.RandomizedSearchCV(knn, param_distribution, n_iter=num_iter, verbose=1, random_state=0).fit(X_train,y_train)
    print('Best Hyperparameters = ' + str(randomized_search_.best_params_))
    return randomized_search_

In [None]:
"""importing datasets""";

In [15]:
"""1. Diabetic Retinopathy"""
# data details:
directory = '/Users/annikatimermanis/Desktop/project/datasets/classification/messidor_features/messidor_features.arff'
data = arff.loadarff(directory)
data = pd.DataFrame(data[0])

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target

binary_values = y.to_numpy().reshape(-1)
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(binary_values)
y = integer_encoded
# y is converted into a numpy array, therefore we ca
# not use illoc to slice, so we convert it back to dataframe for simplicitiy
y = pd.DataFrame(y)

print("""Diabetic Retinopathy""")
k_nearest_neighbours(X, y)

Diabetic Retinopathy
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters = {'algorithm': 'brute', 'leaf_size': 27.444745987645224, 'n_neighbors': 25, 'weights': 'distance'}
                 MINIMIZE: 
Mean squared error	 0.0
Mean absolute error	 0.0
Max error		 0
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
accuracy_score		 1.0
---------------------------------------------


In [17]:
"""2. Default of credit card clients"""
# data details:
directory = '/Users/annikatimermanis/Desktop/project/datasets/classification/credit_card_clients/credit_card_clients.xls'
data = pd.read_excel(directory, skiprows=1) 

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Default of Credit Card Clients""")  
k_nearest_neighbours(X, y)

Default of Credit Card Clients
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters = {'algorithm': 'kd_tree', 'leaf_size': 41.60843643877466, 'n_neighbors': 47, 'weights': 'uniform'}
                 MINIMIZE: 
Mean squared error	 0.19755555555555557
Mean absolute error	 0.19755555555555557
Max error		 1
                 MAXIMIZE: 
r2 Score		 -0.15125615125615122
Explained Variance Score -0.0445247345247346
accuracy_score		 0.8024444444444444
---------------------------------------------


In [18]:
"""3. Breast Cancer Wisconsin"""
directory = '/Users/annikatimermanis/Desktop/project/datasets/classification/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
data = pd.read_csv(directory, delimiter=',', header=None)
data = data[(data != '?').all(axis=1)] # removes all rows that have ?
 
X = data.iloc[:,:-1]
y = data.iloc[:,-1:]

print("""Breast Cancer Wisconsin""")  
k_nearest_neighbours(X, y)

Breast Cancer Wisconsin
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters = {'algorithm': 'brute', 'leaf_size': 28.244159149844844, 'n_neighbors': 10, 'weights': 'distance'}
                 MINIMIZE: 
Mean squared error	 0.0
Mean absolute error	 0.0
Max error		 0
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
accuracy_score		 1.0
---------------------------------------------


In [None]:
"""4. Statlog (German credit data)"""
directory = '/Users/annikatimermanis/Desktop/project/datasets/classification/german_credit_card/german.data-numeric'
data = pd.read_csv(directory, delimiter=',', header=None)



print("""Statlog (German credit data)""")  

In [56]:
"""5. Adult"""
directory = '/Users/annikatimermanis/Desktop/project/datasets/classification/adult/adult.data'
data = pd.read_csv(directory, delimiter=',', header=None)

# loop to iterate through all columns in dataframe and check if the data is string type
for column in data:
    if type(data[column][0]) == str:
        # if data in column is type string, we want to convert it to equivalent numerical labels
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(data[column])
        data[column] = integer_encoded

X = data.iloc[:,:-1]
y = data.iloc[:,-1:]

print("""Adult data""") 
k_nearest_neighbours(X, y)

Adult data
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters = {'algorithm': 'auto', 'leaf_size': 49.18313802505146, 'n_neighbors': 59, 'weights': 'distance'}
                 MINIMIZE: 
Mean squared error	 0.0
Mean absolute error	 0.0
Max error		 0
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
accuracy_score		 1.0
---------------------------------------------


In [61]:
"""6. Yeast"""
directory = '/Users/annikatimermanis/Desktop/project/datasets/classification/yeast/yeast.data'
data = pd.read_csv(directory, delim_whitespace=True, header=None) # delim_whitespace=True because we have a multichar delimeter of whitespaces

# loop to iterate through all columns in dataframe and check if the data is string type
for column in data:
    if type(data[column][0]) == str:
        # if data in column is type string, we want to convert it to equivalent numerical labels
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(data[column])
        data[column] = integer_encoded

X = data.iloc[:,:-1]
y = data.iloc[:,-1:]

print("""Yeast data""") 
k_nearest_neighbours(X, y)

Yeast data
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters = {'algorithm': 'brute', 'leaf_size': 28.244159149844844, 'n_neighbors': 10, 'weights': 'distance'}
                 MINIMIZE: 
Mean squared error	 0.0
Mean absolute error	 0.0
Max error		 0
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
accuracy_score		 1.0
---------------------------------------------


In [71]:
"""7. Thoraric Surgery"""
directory = '/Users/annikatimermanis/Desktop/project/datasets/classification/ThoraricSurgery/ThoraricSurgery.arff'
data = arff.loadarff(directory)
data = pd.DataFrame(data[0])

for column in data:
    if type(data[column][0]) == str:
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(data[column])
        data[column] = integer_encoded

X = data.iloc[:,:-1]
y = data.iloc[:,-1:]

# binary_values = y.to_numpy().reshape(-1)
# label_encoder = LabelEncoder()
# integer_encoded = label_encoder.fit_transform(binary_values)
# y = integer_encoded
# # y is converted into a numpy array, therefore we cannot use illoc to slice, so we convert it back to dataframe for simplicitiy
# y = pd.DataFrame(y)

print(X)

print("""Thoraric Surgery""") 
k_nearest_neighbours(X, y)

         DGN  PRE4  PRE5     PRE6  PRE7  PRE8  PRE9 PRE10 PRE11    PRE14  \
0    b'DGN2'  2.88  2.16  b'PRZ1'  b'F'  b'F'  b'F'  b'T'  b'T'  b'OC14'   
1    b'DGN3'  3.40  1.88  b'PRZ0'  b'F'  b'F'  b'F'  b'F'  b'F'  b'OC12'   
2    b'DGN3'  2.76  2.08  b'PRZ1'  b'F'  b'F'  b'F'  b'T'  b'F'  b'OC11'   
3    b'DGN3'  3.68  3.04  b'PRZ0'  b'F'  b'F'  b'F'  b'F'  b'F'  b'OC11'   
4    b'DGN3'  2.44  0.96  b'PRZ2'  b'F'  b'T'  b'F'  b'T'  b'T'  b'OC11'   
..       ...   ...   ...      ...   ...   ...   ...   ...   ...      ...   
465  b'DGN2'  3.88  2.12  b'PRZ1'  b'F'  b'F'  b'F'  b'T'  b'F'  b'OC13'   
466  b'DGN3'  3.76  3.12  b'PRZ0'  b'F'  b'F'  b'F'  b'F'  b'F'  b'OC11'   
467  b'DGN3'  3.04  2.08  b'PRZ1'  b'F'  b'F'  b'F'  b'T'  b'F'  b'OC13'   
468  b'DGN3'  1.96  1.68  b'PRZ1'  b'F'  b'F'  b'F'  b'T'  b'T'  b'OC12'   
469  b'DGN3'  4.72  3.56  b'PRZ0'  b'F'  b'F'  b'F'  b'F'  b'F'  b'OC12'   

    PRE17 PRE19 PRE25 PRE30 PRE32   AGE  
0    b'F'  b'F'  b'F'  b'T'  b'F'  60.0  
1  

ValueError: could not convert string to float: b'DGN2'