# K-Nearest Neighbor Classifier

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier

## Data Preprocessing

### glass.csv
#### Classify the types of glass
* Motivatd by criminological investigation <br/>
At the scene of the crime, the glass left can be used as evidence... if it is correctly identified!<br/><br/>

* Features : <br/>
```
RI : regractive index 
Na : Sodium 
Mg: Magnesium 
...
```
* Types of glass : <br/>
```
building_windows_float_processed
building_windows_non_float_processed
vehicle_windows_float_processed
```

In [2]:
df = pd.read_csv("../datasets/glass.csv")
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.51793,12.79,3.5,1.12,73.03,0.64,8.77,0.0,0.0,'build wind float'
1,1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0.0,0.0,'vehic wind float'
2,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.0,0.0,'build wind float'
3,1.51299,14.4,1.74,1.54,74.55,0.0,7.59,0.0,0.0,tableware
4,1.53393,12.3,0.0,1.0,70.16,0.12,16.19,0.0,0.24,'build wind non-float'


In [3]:
X = df.values[:,:-1]
Y = df.values[:,-1]
print(X)
print(Y)

[[1.51793 12.79 3.5 ... 8.77 0.0 0.0]
 [1.51643 12.16 3.52 ... 8.53 0.0 0.0]
 [1.51793 13.21 3.48 ... 8.43 0.0 0.0]
 ...
 [1.51613 13.92 3.52 ... 7.94 0.0 0.14]
 [1.51689 12.67 2.88 ... 8.54 0.0 0.0]
 [1.51852 14.09 2.19 ... 9.32 0.0 0.0]]
["'build wind float'" "'vehic wind float'" "'build wind float'"
 'tableware' "'build wind non-float'" "'build wind non-float'"
 "'vehic wind float'" "'build wind float'" 'headlamps'
 "'build wind non-float'" "'build wind non-float'"
 "'build wind non-float'" "'build wind float'" "'vehic wind float'"
 "'vehic wind float'" "'build wind non-float'" 'headlamps'
 "'build wind non-float'" 'containers' "'build wind non-float'"
 "'build wind float'" "'build wind non-float'" "'build wind non-float'"
 "'build wind float'" 'containers' "'build wind non-float'"
 "'build wind non-float'" 'headlamps' "'build wind non-float'"
 "'vehic wind float'" "'build wind non-float'" "'vehic wind float'"
 'tableware' "'build wind non-float'" "'build wind float'"
 "'build wind 

## Model 

In [4]:
clf = KNeighborsClassifier(
    n_neighbors = 10,
    weights="uniform",
    metric="euclidean"
)

### KNN parameters
## n_neighbors : number of neighbors(k)
## weights : weight function used in prediction
##       'uniform' : all neighbors have same weight
##       'distance' : weights are given according to the distance
##        * note : user defined function can also be called
## metric : the distance metric to use

## K-fold Cross-Validation

In [6]:
cv = KFold(
    n_splits = 10,
    shuffle = True,
    random_state = 0 # random seed
)

cv_results = cross_val_score(clf,X,Y,cv=cv)

print(cv_results.mean())

0.6199134199134199


## Prediction with KNN

In [9]:
clf.fit(X,Y) # Train fit the model to the train data X

pred_y = clf.predict([[1.5,13,1.5,1.5,70,0.5,8.9,0.1,0.2]])
print(pred_y)

["'build wind non-float'"]


## Comparison with Varying k

In [11]:
K = [20,5,1]

for k in K :
    clf = KNeighborsClassifier(n_neighbors = k, weights="uniform", metric="euclidean")
    
    results = cross_val_score(clf,X,Y,cv=cv)
    
    print(f"{k} neighbors : {results.mean()}")

# Note : It is not always a good idea to increase k

20 neighbors : 0.6155844155844156
5 neighbors : 0.648051948051948
1 neighbors : 0.7370129870129871
