# Working with kNN
<p style=font-size:20px;color:rgba(255,255,255,255);> This is the second session of our machine learning journey!</p>
<p style=font-size:14px;color:rgba(255,255,255,255);> Here we apply kNN to our data!</p>
<p style=font-size:20px;color:yellow;> Importing required libraries </p>

In [None]:
import numpy as np
# pip install numpy
import pandas as pd
# pip install pandas
import matplotlib.pyplot as plt 
# pip install matplotlib
from sklearn.preprocessing import StandardScaler
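# pip install scikit-learn   # for sklearn
# (StandardScaler lives in sklearn.preprocessing, which ships with scikit-learn; no separate "preprocessing" package is needed)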
from imblearn.over_sampling import RandomOverSampler
# pip install imblearn

from imblearn.under_sampling import RandomUnderSampler

### ====================================================================================================
<p style=font-size:25px;color:yellow;> Download dataset </p>
https://archive.ics.uci.edu/dataset/159/magic+gamma+telescope


<p style=color:yellow;font-size:20px> Opening the csv file, changing the names of the columns, showing the first five rows </p>

In [None]:
# "magic04.data" has no header row, so the column names (taken from the
# companion file "magic04.names") are passed to read_csv by hand.
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym",
    "fM3Long", "fM3Trans", "fAlpha", "fDist", "class",
]
df = pd.read_csv("magic04.data", names=cols)
# print(df)     # uncomment to inspect the full, 11-column dataset

# From here on only one feature ('fAlpha') and the label column are kept.
# `cols` is narrowed as well so that it keeps describing the columns of `df`.
cols = ["fAlpha", "class"]
df = df.loc[:, cols]

df.head()        # shows the first five rows of df

### ====================================================================================================
<p style=color:yellow;font-size:20px;>Changing the character labels into numbers, looking at histograms</p>

In [None]:
# In this block we want to see which feature separates the two classes best.
# One histogram pair is drawn per feature column; the feature(s) whose two
# histograms overlap least are the most useful for classification.
# Based on the plots below, 'fAlpha' looks like the best feature for our purpose.

# Lists the distinct values of the "class" column (like `unique` in MATLAB).
# It is not the last expression of the cell, so its value is not displayed.
df["class"].unique()

# Encode the labels as integers: "g" (gamma) -> 1, everything else -> 0.
# Re-running this line on already-encoded data turns every label into 0,
# because no entry equals "g" any more.
df["class"] = (df["class"] == "g").astype(int)

# cols[:-1] is every column name except the last one (the label column).
for label in cols[:-1]:
    # Overlay the two per-class distributions; density=True normalises each
    # histogram to unit area so the classes are comparable despite their sizes.
    for class_value, bar_color, class_name in ((1, 'blue', 'gamma'), (0, 'red', 'hadron')):
        plt.hist(
            df.loc[df["class"] == class_value, label],
            color=bar_color,
            label=class_name,
            alpha=0.7,
            density=True,
        )
    plt.title(label)
    plt.ylabel("Probability")
    plt.xlabel(label)
    plt.legend()
    plt.show()

### ====================================================================================================
# Train, validation, test datasets
<p style=color:yellow;font-size:20px> First 60% as train, from 60% to 80% as validation, the rest as test </p>

In [None]:
# Shuffle all rows once (frac=1 without replacement is a random permutation of
# the rows), then cut the shuffled frame at 60% and 80% of its length:
#   first 60%  -> train,   next 20% -> valid,   remaining 20% -> test.
# NOTE(review): the previous np.split(DataFrame, ...) form goes through
# DataFrame.swapaxes, which recent pandas releases deprecate; plain .iloc
# slicing yields the same three DataFrames without relying on it.
shuffled = df.sample(frac=1, replace=False)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))

train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]
# Note:
# Randomly sample 30% of the rows with replacement and a specific random state
# random_sample_with_replace = df.sample(frac=0.3, replace=True, random_state=42)

### ====================================================================================================
<p style=color:yellow;font-size:25px;>Writing a function that takes the data, z-scores each feature column, and balances the classes (by either oversampling or undersampling) so that every class ends up with the same number of elements </p>
<p style=color:rgba(0,255,0,255);font-size:25px;>Balancing is more or less necessary for KNN; otherwise, the larger class can dominate the neighbor votes and bias our classification! </p>

In [None]:
# making a function called 'scale-dataset' for scaling the data
def scale_dataset(dataframe, oversample=False, undersample=False, scaler=None):
    """Z-score the feature columns of *dataframe* and optionally balance the classes.

    The last column of *dataframe* is treated as the label; every other
    column is treated as a feature.

    Args:
        dataframe: pandas DataFrame whose last column holds the class labels.
        oversample: if true, randomly repeat rows of the smaller class(es)
            (RandomOverSampler) until every class is as large as the largest.
        undersample: if true, randomly drop rows of the larger class(es)
            (RandomUnderSampler) until every class is as small as the smallest.
        scaler: optional, already-fitted transformer exposing ``transform``.
            When given, the features are transformed with it instead of
            fitting a new StandardScaler on *dataframe*. Pass the scaler
            fitted on the training split here so that validation/test data
            are standardized with the training mean and std. When omitted,
            a fresh StandardScaler is fitted on *dataframe* itself (the
            original behaviour).

    Returns:
        A tuple ``(data, X, Y)``: ``X`` is the scaled (and possibly resampled)
        feature matrix, ``Y`` the matching label vector, and ``data`` is ``X``
        with ``Y`` appended as the last column.
    """
    X = dataframe[dataframe.columns[:-1]].values    # all columns except the last one
    Y = dataframe[dataframe.columns[-1]].values     # the last column; '.values' turns it into an array

    if scaler is None:
        # StandardScaler + fit_transform is a z-transform: for each column,
        # subtract its mean and divide by its standard deviation.
        X = StandardScaler().fit_transform(X)
    else:
        # Reuse statistics that were learned elsewhere (typically on the training split).
        X = scaler.transform(X)

    if oversample:
        # Randomly repeats samples of the smaller class until all classes
        # have the size of the larger one.
        X, Y = RandomOverSampler().fit_resample(X, Y)

    if undersample:
        # Randomly removes samples of the larger class until all classes
        # have the size of the smaller one.
        X, Y = RandomUnderSampler().fit_resample(X, Y)

    # reshape(-1, 1) turns Y into a single column (the row count is inferred),
    # so it can be stacked horizontally to the right of X.
    data = np.hstack((X, np.reshape(Y, (-1, 1))))
    return data, X, Y

### ====================================================================================================
<p style=color:yellow;font-size:25px;>Testing the function that we wrote in the previous block</p>

### ====================================================================================================
## Using our function to take samples from data for training, validation, and testing

In [None]:
# Turn each split into (stacked array, feature matrix, label vector).
# Only the training split is oversampled; validation and test keep their
# original class proportions.
# NOTE(review): every call below fits its own StandardScaler, so each split is
# standardized with its own mean/std rather than with the training statistics.
# NOTE: the DataFrames `train`, `valid`, `test` are overwritten with NumPy
# arrays here, so this cell cannot be re-run without re-running the split cell.
train, Xtrain, Ytrain = scale_dataset(
    dataframe=train, oversample=True, undersample=False
)
valid, Xvalid, Yvalid = scale_dataset(
    dataframe=valid, oversample=False, undersample=False
)
test, Xtest, Ytest = scale_dataset(
    dataframe=test, oversample=False, undersample=False
)

<p style=color:yellow;font-size:40px;>kNN</p>

In [None]:
# Build a k-nearest-neighbours classifier that looks at the 5 closest training
# points, and fit it on the scaled, oversampled training features and labels.
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(Xtrain, Ytrain)

In [None]:
# Predict a class label for every row of the test feature matrix with the
# classifier fitted in the previous cell.
Ypred = knn_model.predict(Xtest)
# Uncomment to compare predicted and true labels by eye:
# print(Ypred)
# print(Ytest)


In [None]:
from sklearn.metrics import classification_report

# Text summary of the test-set predictions (true labels first, predictions second).
print(classification_report(y_true=Ytest, y_pred=Ypred))

<p style=color:rgba(0,255,0,255);font-size:25px;>Evaluating the accuracy of our classification (just accuracy, not the whole report!)</p>

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
# Being the last expression of the cell, the value is shown as the cell output.
accuracy_score(y_true=Ytest, y_pred=Ypred)

<p style=color:rgba(255,255,0,255);font-size:25px;>Testing different numbers of neighbors for KNN to find the best one!</p>

In [None]:
from sklearn.model_selection import cross_val_score

# Candidate neighbourhood sizes: 3, 5, 7, ..., 19.
Ks = np.arange(3,20,2)

# Running optimum. -1 is lower than any score cross_val_score is expected to
# return here, so the first candidate always replaces it.
best_K = None
best_score = -1

for k in Ks:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    # 5-fold cross-validation on the training data only; one score per fold.
    scores = cross_val_score(knn_model, Xtrain, Ytrain, cv=5)
    average_score = scores.mean()

    # Strict '>' keeps the smallest k when several candidates tie.
    if average_score > best_score:
        best_score, best_K = average_score, k

print("Best score is: ", best_score)
print("Best number of neighbors: ", best_K)

<p style=color:rgba(255,255,0,255);font-size:25px;>Testing different distance metrics (ways of measuring the distance between points) for KNN to find the best one!</p>

In [None]:
from sklearn.model_selection import cross_val_score

# Repeat of the neighbour-count search: score one classifier per candidate k
# (taken from `Ks`, defined in the previous cell) with 5-fold cross-validation
# and keep the k with the highest mean score.

best_score = -1     # lower than any score expected here, so the first k always replaces it
best_K = None

for k in Ks:
    # The model must be built with the loop's k. With a fixed n_neighbors every
    # iteration would score the same model and best_K would always be Ks[0].
    knn_model = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn_model, Xtrain, Ytrain, cv=5)  # 5-fold cross-validation
    average_score = np.mean(scores)

    # Strict '>' keeps the smallest k when several candidates tie.
    if average_score > best_score:
        best_score = average_score
        best_K = k

print("Best score is: ", best_score)
print("Best number of neighbors: ", best_K)

In [None]:
# Compare distance metrics for a 3-neighbour classifier: each metric is scored
# with 5-fold cross-validation on the training data and the one with the
# highest mean score is reported.
#
# Each entry is (metric name, extra keyword arguments for KNeighborsClassifier).
# The order matters: the comparison below is a strict '>', so when two metrics
# tie, the one listed first is kept.
metric_candidates = [
    ('euclidean', {}),
    ('manhattan', {}),
    ('minkowski', {'p': 2}),    # Minkowski with p=2 is the Euclidean distance again
    ('chebyshev', {}),
    ('hamming', {}),            # NOTE(review): hamming/jaccard are usually meant for
    ('canberra', {}),           # categorical/boolean data, not z-scored continuous
    ('cosine', {}),             # features - confirm they are meaningful here
    ('jaccard', {}),
    ('braycurtis', {}),
]
# Deliberately not evaluated (they were disabled experiments in this cell):
#   'mahalanobis' and 'haversine'.
# NOTE(review): mahalanobis presumably needs a covariance matrix passed via
# metric_params, and haversine is defined for 2-D (latitude, longitude) input -
# confirm against the scikit-learn documentation before enabling either.

best_score = -1         # lower than any score expected here, so the first metric always replaces it
best_distance = None

for metric_name, extra_kwargs in metric_candidates:
    knn_model = KNeighborsClassifier(n_neighbors=3, metric=metric_name, **extra_kwargs)
    scores = cross_val_score(knn_model, Xtrain, Ytrain, cv=5)  # 5-fold cross-validation
    average_score = np.mean(scores)

    if average_score > best_score:
        best_score = average_score
        best_distance = metric_name


print(best_distance)
print(best_score)
