In [34]:
# load modules and functions
import io
import urllib.request

import numpy as np
import pandas as pd
import seaborn as sns
from scipy.io import arff
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


In [35]:
# Load the FaceAll train/test ARFF files from GitHub.

url_train = "https://raw.githubusercontent.com/Carloszone/ALY-6020/master/Week%201/FaceAll/FaceAll_TRAIN.arff"
url_test = "https://raw.githubusercontent.com/Carloszone/ALY-6020/master/Week%201/FaceAll/FaceAll_TEST.arff"

# Each response is opened in a `with` block so the HTTP connection is closed
# as soon as its payload has been read. The bytes are decoded to text and
# wrapped in StringIO before being handed to loadarff.
with urllib.request.urlopen(url_train) as arff_train:
    data_train = arff.loadarff(io.StringIO(arff_train.read().decode('utf-8')))
with urllib.request.urlopen(url_test) as arff_test:
    data_test = arff.loadarff(io.StringIO(arff_test.read().decode('utf-8')))


In [36]:
# Build the training frame from the first element of the loaded ARFF result
# and preview its first five rows.
df_train = pd.DataFrame(data=data_train[0])
df_train.head(n=5)

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att123,att124,att125,att126,att127,att128,att129,att130,att131,target
0,-0.247592,-0.332503,-0.632105,-1.024779,-1.426342,-1.580426,-1.336747,-1.060614,-0.723881,-0.617905,...,-0.273452,-0.077336,-0.119991,-0.350027,-0.490925,-0.142705,0.02771,0.09079,-0.327312,b'1'
1,-0.641577,-0.93942,-1.300174,-1.385949,-1.269212,-1.036537,-0.758694,-0.63115,-0.581992,-0.539002,...,-0.607773,-0.939176,-0.640406,-0.191614,0.062604,0.030992,-0.388804,-0.581837,-0.24863,b'1'
2,-1.672048,-1.881515,-2.090981,-1.697734,-1.29507,-0.822547,-0.347806,0.73743,1.852206,2.448954,...,0.058114,0.370807,0.014227,-0.325895,-0.325895,-0.325895,-0.325895,-0.325895,-0.325895,b'1'
3,-0.463782,-1.111783,-1.747124,-2.112366,-2.409313,-2.011936,-1.557494,-0.735304,0.059425,0.728291,...,-0.061193,-0.06225,-0.062439,-0.062439,-0.370382,-0.72611,-0.760803,-0.76392,-0.76392,b'1'
4,-0.764739,-1.118529,-1.464805,-1.650747,-1.79088,-1.465283,-1.153654,-0.932041,-0.664873,-0.188906,...,-0.372712,-0.489895,-0.414688,-0.297505,-0.178539,-0.059297,-0.265746,-0.50423,-0.50423,b'1'


In [37]:
# Build the test frame from the first element of the loaded ARFF result
# and preview its first five rows.
df_test = pd.DataFrame(data=data_test[0])
df_test.head(n=5)

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att123,att124,att125,att126,att127,att128,att129,att130,att131,target
0,-0.637379,-1.163642,-1.560159,-1.607592,-1.369525,-1.028345,-0.796911,-0.673459,-0.606461,-0.121185,...,-0.387549,-0.793876,-0.731225,-0.333497,-0.168134,-0.059181,-0.370224,-0.578104,-0.296681,b'1'
1,-0.262558,-0.673262,-1.083966,-1.4343,-1.781666,-1.76029,-1.700768,-1.416769,-1.096037,-0.308092,...,-0.269868,-0.337655,-0.234648,-0.175097,-0.381111,-0.558154,-0.455147,-0.455147,-0.455147,b'1'
2,-0.647722,-0.85273,-1.057737,-1.377993,-1.703917,-1.472404,-1.183225,-0.973524,-0.776829,-0.419171,...,-0.253859,-0.374525,-0.580565,-0.743433,-0.642469,-0.541118,-0.436043,-0.436043,-0.436043,b'1'
3,-0.168912,-0.707683,-1.246455,-1.258758,-1.262834,-1.286919,-1.311639,-1.232654,-1.148651,-0.767647,...,-0.336361,-0.329791,-0.223019,-0.126104,-0.232875,-0.333076,-0.226304,-0.226304,-0.226304,b'1'
4,-0.846935,-1.148083,-1.449231,-1.52779,-1.595401,-1.422643,-1.225018,-1.183937,-1.168472,-0.752264,...,-0.254825,-0.750571,-0.646832,-0.527317,-0.311391,-0.116104,-0.120328,-0.120328,-0.120328,b'1'


In [77]:
# Turn the class labels into integers: stringify each label, pull out its
# first run of digits, and cast that to int. Both frames are updated in place.
for frame in (df_train, df_test):
    label_text = frame['target'].astype(str)
    frame['target'] = label_text.str.extract(pat = '([0-9]+)').astype(int)

In [78]:
def KNN(train_x, train_y, test_x, test_y, n: list, weight='uniform',
        test_size=0.2, random_state=2021):
    """Score k-nearest-neighbour classifiers for several neighbour counts.

    A fraction ``test_size`` of the training data is held out once (seeded by
    ``random_state``). For every value in ``n`` a ``KNeighborsClassifier`` is
    fitted on the remaining training rows and scored with ``accuracy_score``
    on both the held-out rows and the separate test set.

    Parameters
    ----------
    train_x, train_y : array-like
        Training features and labels; converted with ``np.array``.
    test_x, test_y : array-like
        Test features and labels; converted with ``np.array``.
    n : iterable of int
        Values used as ``n_neighbors``. Any iterable is accepted; it is
        materialised into a list once.
    weight : str, default 'uniform'
        Forwarded as ``weights`` to ``KNeighborsClassifier``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` for the internal hold-out split.
    random_state : int, default 2021
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    pandas.DataFrame
        One row per neighbour count with the columns ``'n'``,
        ``'train accuracy'`` and ``'test accuracy'``.

        Note that ``'train accuracy'`` is the accuracy on the held-out part
        of the training data, not on the rows the model was fitted on. The
        column name is kept so existing callers keep working.
    """
    # Materialise once: the values are iterated below and reused as a column,
    # which would fail for a generator or other one-shot iterable.
    neighbor_counts = list(n)

    train_x = np.array(train_x)
    train_y = np.array(train_y)
    test_x = np.array(test_x)
    test_y = np.array(test_y)

    # Single hold-out split shared by every neighbour count.
    x_train, x_test, y_train, y_test = train_test_split(
        train_x, train_y, test_size=test_size, random_state=random_state)

    train_accuracy = []
    test_accuracy = []
    for num in neighbor_counts:
        model = KNeighborsClassifier(n_neighbors=num, weights=weight)
        model.fit(x_train, y_train)
        train_accuracy.append(accuracy_score(y_test, model.predict(x_test)))
        test_accuracy.append(accuracy_score(test_y, model.predict(test_x)))

    return pd.DataFrame({
        'n': neighbor_counts,
        'train accuracy': train_accuracy,
        'test accuracy': test_accuracy,
    })

In [79]:
# Features are every column except the last; the last column is the label.
train_x, train_y = df_train.iloc[:, :-1], df_train.iloc[:, -1]
test_x, test_y = df_test.iloc[:, :-1], df_test.iloc[:, -1]

In [80]:
# Baseline run: neighbour counts 1..19 with the default (uniform) weighting.
n = [*range(1, 20)]
no_s_result = KNN(train_x, train_y, test_x, test_y, n=n)

In [81]:
# Standardize the features: the scaler is fitted on the training features
# only and then applied unchanged to the test features.
# StandardScaler is imported with the other modules at the top of the notebook.
#
# NOTE: train_x / test_x are overwritten here, so re-running the earlier
# un-standardized KNN cell after this one would score the scaled data instead.
# NOTE(review): KNN() later holds out part of train_x for validation, and
# those rows have already contributed to the scaler fitted here.
sc = StandardScaler()
train_x = sc.fit_transform(train_x)
test_x = sc.transform(test_x)

In [82]:
# Same sweep (neighbour counts 1..19, uniform weighting) on the current
# train_x / test_x.
n = [*range(1, 20)]
s_result = KNN(train_x, train_y, test_x, test_y, n=n, weight='uniform')

In [83]:
# Same sweep again, this time with distance-based neighbour weighting.
n = [*range(1, 20)]
s_d_result = KNN(train_x, train_y, test_x, test_y, n=n, weight='distance')

In [84]:
# Collect the three sweeps side by side: one train/test accuracy column pair
# per run, indexed by the neighbour count.
summary_columns = {'n': s_d_result['n']}
for run_label, run_frame in (
    ('no standardize', no_s_result),
    ('standardize', s_result),
    ('weight distance', s_d_result),
):
    for split_name in ('train', 'test'):
        summary_columns[f'{run_label} {split_name} accuracy'] = run_frame[f'{split_name} accuracy']

result = pd.DataFrame(summary_columns).set_index('n')

In [85]:
# Display the combined accuracy table (indexed by n).
result

Unnamed: 0_level_0,no standardize train accuracy,no standardize test accuracy,standardize train accuracy,standardize test accuracy,weight distance train accuracy,weight distance test accuracy
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.848214,0.697041,0.857143,0.662722,0.857143,0.662722
2,0.821429,0.66213,0.839286,0.647337,0.857143,0.662722
3,0.839286,0.656213,0.821429,0.61716,0.848214,0.640237
4,0.8125,0.640237,0.767857,0.605917,0.8125,0.627811
5,0.8125,0.628994,0.758929,0.59645,0.785714,0.615385
6,0.776786,0.620118,0.741071,0.585799,0.767857,0.609467
7,0.758929,0.608284,0.723214,0.570414,0.758929,0.597041
8,0.723214,0.597041,0.705357,0.560947,0.732143,0.589941
9,0.714286,0.585207,0.714286,0.54497,0.75,0.576923
10,0.705357,0.578698,0.696429,0.533136,0.741071,0.563905
