# KNN in Python

In [28]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [30]:
# Load the dataset from the pickle file "pickle1" (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df=pd.read_pickle("pickle1")

In [53]:
df.shape

(67639, 711)

In [31]:
df.head()

Unnamed: 0,id,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,gaze_1_z,gaze_angle_x,gaze_angle_y,eye_lmk_x_0,...,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,QoE
0,1,0.08201,0.06294,-0.994642,-0.124776,0.080031,-0.988952,-0.022,0.072,160.5,...,0,0,0,0,0,0,0,0,0,2
1,1,0.078303,0.060465,-0.995094,-0.128854,0.077414,-0.988637,-0.025,0.069,161.0,...,0,0,0,0,0,0,0,0,0,2
2,1,0.080084,0.059067,-0.995036,-0.130403,0.07609,-0.988537,-0.025,0.068,161.1,...,0,0,0,0,0,0,0,0,0,2
3,1,0.081053,0.058107,-0.995014,-0.129042,0.074312,-0.988851,-0.024,0.067,161.1,...,0,0,0,0,0,0,0,0,0,2
4,1,0.082496,0.059614,-0.994807,-0.128192,0.076915,-0.988762,-0.023,0.069,160.9,...,0,0,0,0,0,0,0,0,0,2


## Standardize the variables
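Siccome il KNN classifica ogni punto in base alla distanza dai campioni di training più vicini, le feature con valori numericamente più grandi dominerebbero il calcolo della distanza: si fa quindi un rescaling (standardizzazione) per portare tutte le feature su una scala comparabile.
È quindi buona pratica standardizzare i dati, in modo che ogni feature contribuisca alla distanza in modo uniforme.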

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Scaler with default settings: each feature is centred on its mean and scaled to unit variance.
scaler = StandardScaler()

In [10]:
# Compute the per-column mean and variance that the later transform call will use
# (the target column 'QoE' is excluded).
# NOTE(review): this fits on the full dataset, before the train/test split further down, so
# test-set statistics influence the scaling — consider fitting on the training rows only.
# NOTE(review): 'id' (visible in df.head()) stays in the feature set; confirm it is not a proxy
# for the label.

scaler.fit(df.drop('QoE',axis=1))

StandardScaler()

In [11]:
# Standardize every non-target column using the statistics learned by scaler.fit.
scaled_features = scaler.transform(df.drop('QoE',axis=1))

In [12]:
# Wrap the scaled array back into a labelled DataFrame for inspection.
# Column names are derived by removing 'QoE' by name (matching the frame that was scaled) instead of
# assuming it is the last column, and the original index is kept so rows still line up with df['QoE'].
df_feat = pd.DataFrame(scaled_features,columns=df.columns.drop('QoE'),index=df.index)
df_feat.head()

Unnamed: 0,id,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,gaze_1_z,gaze_angle_x,gaze_angle_y,eye_lmk_x_0,...,AU12_c,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c
0,-1.603384,-1.271694,-0.889513,-1.405812,-1.378494,-0.779758,-0.586249,-1.318922,-0.849503,1.416222,...,-0.128219,-0.688898,-0.338595,-0.50349,-0.277895,-0.773873,-0.369774,-0.253659,-0.047614,-0.408551
1,-1.603384,-1.301956,-0.914407,-1.418809,-1.409756,-0.805934,-0.575568,-1.341865,-0.878817,1.435432,...,-0.128219,-0.688898,-0.338595,-0.50349,-0.277895,-0.773873,-0.369774,-0.253659,-0.047614,-0.408551
2,-1.603384,-1.287417,-0.928468,-1.417141,-1.42163,-0.819177,-0.572177,-1.341865,-0.888589,1.439274,...,-0.128219,-0.688898,-0.338595,-0.50349,-0.277895,-0.773873,-0.369774,-0.253659,-0.047614,-0.408551
3,-1.603384,-1.279507,-0.938124,-1.416508,-1.411197,-0.836961,-0.582824,-1.334217,-0.89836,1.439274,...,-0.128219,-0.688898,-0.338595,-0.50349,-0.277895,-0.773873,-0.369774,-0.253659,-0.047614,-0.408551
4,-1.603384,-1.267727,-0.922966,-1.410556,-1.404681,-0.810925,-0.579806,-1.32657,-0.878817,1.43159,...,-0.128219,-0.688898,-0.338595,-0.50349,-0.277895,-0.773873,-0.369774,-0.253659,-0.047614,-0.408551


## Train Test dataset split

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Split the dataset into a training part and a test part:
#   x_train / y_train - features and labels used to fit the model
#   x_test  / y_test  - features and labels held out for evaluation
# test_size=0.60 holds out 60% of the rows for testing.
# random_state pins the shuffle so that Restart & Run All reproduces the same split and therefore
# the same metrics in the cells below.

x_train, x_test, y_train, y_test = train_test_split(scaled_features,df['QoE'],test_size=0.60,random_state=42)

## KNN

In [36]:
from sklearn.neighbors import KNeighborsClassifier

In [37]:
# k-nearest-neighbours classifier that votes among the 5 closest training samples.
knn = KNeighborsClassifier(n_neighbors=5)

In [38]:
# Fit the model on the training split

knn.fit(x_train,y_train)

KNeighborsClassifier()

In [45]:
# Predict the class of every test sample with the fitted model

pred = knn.predict(x_test)

In [46]:
pred

array([2, 3, 3, ..., 3, 3, 3])

## Predictions and evaluations

In [47]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [48]:
print(confusion_matrix(y_test,pred))

# Confusion matrix (scikit-learn convention): rows are the actual classes (y_test) and columns are
# the predicted classes (pred). The diagonal counts the samples classified correctly (the true
# positives, TP, of each class). For a given class k, the off-diagonal entries of column k are its
# false positives (FP: predicted as k but actually another class) and the off-diagonal entries of
# row k are its false negatives (FN: actually k but predicted as another class).

[[ 2957    21     0     0     0]
 [   32  9213    42    11     2]
 [    2    48 16626    52    11]
 [    0    15    74  7599     0]
 [    0     1    12     0  3866]]


In [49]:
print(classification_report(y_test,pred))

# RECALL: true-positive rate. Of all samples that really belong to a class (TP: found correctly
# + FN: assigned to some other class), how many were found?
# TP/(TP+FN) - e.g. for class 1 in the confusion matrix output recorded in this notebook: 2957/(2957+21)

# PRECISION: of all samples predicted to be a class (TP + FP), how many really belong to it?
# TP/(TP+FP)

# F-beta score: (1+Beta^2)*(Precision*Recall) / (Beta^2*Precision + Recall)
# Choose Beta according to the problem: Beta=1 (used here) weighs false positives and false
# negatives equally; when false positives matter more, lower Beta (typically 0.5 or another value
# between 0 and 1); when false negatives matter more, raise Beta (for example 2).

              precision    recall  f1-score   support

           1       0.99      0.99      0.99      2978
           2       0.99      0.99      0.99      9300
           3       0.99      0.99      0.99     16739
           4       0.99      0.99      0.99      7688
           5       1.00      1.00      1.00      3879

    accuracy                           0.99     40584
   macro avg       0.99      0.99      0.99     40584
weighted avg       0.99      0.99      0.99     40584



In [50]:
# ACCURACY: correctly classified samples / all samples
# (the sum of the confusion-matrix diagonal divided by the total number of test samples)

print(accuracy_score(y_test,pred))

0.9920411985018727
