# NEAREST NEIGHBORS

#### LOAD THE DEPENDENCIES

In [None]:
import pandas as pd
from pandas import set_option
from pandas.plotting import scatter_matrix
import numpy as np
from numpy import set_printoptions

import matplotlib.pyplot as plt

import os
import sys
sys.path.insert(0, "C:\\Users\\Crystal\\Desktop\\Programs\\my-modules-and-libraries")

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

from scipy import stats
import seaborn as sns
import math



In [None]:
#### KNN Classifier

def KNN(k,X_train,y_train,X_test,y_test):
    """Train and score one k-nearest-neighbours classifier per candidate k.

    For every value in ``k`` a ``KNeighborsClassifier`` (uniform weights,
    Euclidean metric) is fitted on the training data, used to predict
    ``X_test`` and scored through ``metrics``, which also prints a report
    for that model. After the loop the collected F1 and accuracy lists
    are printed.

    Parameters:
        k: iterable of neighbour counts to evaluate.
        X_train, y_train: features and labels the classifier is fitted on.
        X_test, y_test: features and labels the predictions are scored on.

    Returns:
        ``(f1_scores, accur, preci, recall)`` -- four lists holding one
        score per entry of ``k``, in iteration order.
    """
    f1_scores, accur, preci, recall = [], [], [], []

    for n_neighbors in k:
        # Identical estimator settings for every candidate; only the
        # neighbour count changes between iterations.
        model = KNeighborsClassifier(
            n_neighbors=n_neighbors,
            weights='uniform',
            algorithm='auto',
            leaf_size=30,
            p=2,
            metric='euclidean',
            metric_params=None,
        )
        predictions = model.fit(X_train, y_train).predict(X_test)

        score_f1, score_acc, score_prec, score_rec = metrics(y_test, predictions)
        f1_scores.append(score_f1)
        accur.append(score_acc)
        preci.append(score_prec)
        recall.append(score_rec)

    print('\n','f1_scores: ',f1_scores)
    print('accuracy: ',accur)

    return f1_scores,accur,preci,recall
        
        

In [None]:
#### Evaluate Model

def metrics(y_test, y_pred):
    """Print a binary-classification report and return the headline scores.

    The confusion matrix is computed once and reused both for the printed
    matrix and for the tn/fp/fn/tp breakdown. Precision, recall, F1 and
    accuracy come from the corresponding scikit-learn scorers called with
    their default settings.

    Parameters:
        y_test: true labels.
        y_pred: predicted labels, aligned with ``y_test``.

    Returns:
        Tuple ``(f1, accuracy, precision, recall)``.

    Raises:
        ValueError: if the confusion matrix does not hold exactly four
            cells (i.e. it is not 2x2), because the breakdown cannot be
            unpacked.
    """
    matrix = confusion_matrix(y_test, y_pred)
    # Cast to plain ints so the breakdown prints as bare numbers whatever
    # repr the installed NumPy uses for its integer scalars.
    tn, fp, fn, tp = (int(count) for count in matrix.ravel())

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    print('Confusion matrix breakdown:',('tn:',tn,'fp:',fp,'fn:',fn,'tp:',tp),'\n')
    print('Confusion matrix:\n', matrix)
    print('Precision: When it predicts yes, how often is it correct?:',precision)
    print('Recall.True Positive Rate: When it\'s actually yes, how often does it predict yes?:',recall)
    print('F1:score is the harmonic average of the precision and recall,:',f1)
    print('Accuracy.Overall, how often is the classifier correct?: ',accuracy)
    print('Misclassification Rate.Overall, how often is it wrong?: ',(1-accuracy))

    return (f1,accuracy,precision,recall)
    


#### PREPROCESSING: Replacing zeros where zero is not a valid value for that feature.
##### This is done by replacing the zero values with NaN, then replacing each NaN with the mean of the non-zero values in that column.

In [None]:
def replacing_zeros(dataset,the_headers):
    """Replace zeros (and NaNs) in the given columns with the column mean.

    For each header the zeros are first turned into NaN, the mean of the
    remaining values is computed and truncated to an integer, and every
    NaN in the column is then filled with that integer.

    A column without any non-zero, non-NaN value has no usable mean; such
    a column is left untouched instead of failing on ``int(nan)``.

    Parameters:
        dataset: DataFrame to clean. It is modified in place.
        the_headers: iterable of column names in which zero is not a
            valid value.

    Returns:
        The same ``dataset`` object, after modification.
    """
    for header in the_headers:
        without_zeros = dataset[header].replace(0, np.nan)
        mean = without_zeros.mean(skipna=True)

        # Nothing to average (all zeros / all NaN): keep the column as-is.
        if np.isnan(mean):
            continue

        # The mean is deliberately truncated to an int, as before.
        dataset[header] = without_zeros.replace(np.nan, int(mean))

    return dataset


#### PREPROCESSING: Split the dataset

In [None]:
def split_the_dataset(dataset,input_headers,target_header):
    """Split a DataFrame into a feature frame and a target frame.

    Parameters:
        dataset: source DataFrame.
        input_headers: a single column label or a list-like of column
            labels to use as features.
        target_header: a single column label or a list-like of column
            labels to use as the target.

    Returns:
        Tuple ``(X, y)`` of DataFrames holding the selected feature and
        target columns respectively.

    Raises:
        KeyError: if a requested column is not present in ``dataset``.
    """
    def _as_column_list(headers):
        # A tuple can itself be one (MultiIndex) column label, so it is
        # treated as a single label, exactly like a plain string.
        if isinstance(headers, tuple) or not pd.api.types.is_list_like(headers):
            return [headers]
        return list(headers)

    # Always index with a flat list of labels so the result is a DataFrame;
    # wrapping an existing list in another list would make pandas look for
    # one column named after the whole list.
    X = dataset[_as_column_list(input_headers)]
    y = dataset[_as_column_list(target_header)]

    return X,y

#### RETRIEVE THE DATASET

In [None]:
# Absolute Windows path of the CSV file to load; machine-specific, adjust when running elsewhere.
location=r'C:\Users\Crystal\Desktop\Programs\dataset_repo\diabetes.csv'
# location=r'C:\Users\Crystal\Desktop\Programs\dataset_repo\CDH_Train.csv'
# Read the CSV at `location` into the module-level DataFrame `dataset`.
dataset=pd.read_csv(location)

In [None]:
# Print pandas' summary of the loaded DataFrame (DataFrame.info).
dataset.info()

In [None]:
# Preview the first rows of the loaded DataFrame.
dataset.head()

In [None]:
# Same preview of the first rows as the previous cell.
dataset.head()

#### PREPROCESSING: Quick look at the features

In [None]:
# X.hist(bins=50,figsize=(15,15))
# X.plot(kind='hist',subplots=True,layout=(3,3),sharex=False, figsize=(15,15))

# One histogram per column of X, laid out side by side in a single row.
# NOTE(review): X is not assigned at module level in the visible cells --
# confirm it exists (e.g. from split_the_dataset) before running this cell.
headers = list(X.columns)
fig, axes = plt.subplots(nrows=1, ncols=len(headers), figsize=(20, 10))
print(headers)

for i, head in enumerate(headers):
    column_axis = axes[i]
    column_axis.hist(X[head], bins=50, edgecolor='black')
    column_axis.set_title(head)
    column_axis.grid()

plt.show()

In [None]:
# Density plot for the columns of X as separate subplots on a 3x3 layout,
# without a shared x axis.
# NOTE(review): X is not assigned at module level in the visible cells --
# confirm it exists before running this cell.
X.plot(kind='density',subplots=True,layout=(3,3),sharex=False, figsize=(15,15))
plt.show()

#### PREPROCESSING:Target Summary

In [None]:
def target_summary(dataset,target_header):
    """Print the class distribution of the target column.

    Two tables are printed: the number of rows per distinct value of
    ``target_header``, and the same counts as a percentage of all rows in
    ``dataset``.

    Parameters:
        dataset: DataFrame containing the target column.
        target_header: name of the column to group by.

    Returns:
        None. The summary is written to standard output.
    """
    class_counts = dataset.groupby(target_header).size()
    print(class_counts)
    # The percentage is relative to the rows of the frame that was passed
    # in, not to any module-level variable.
    print(class_counts / len(dataset) * 100)


#### PREPROCESSING:Train - Test Split of the data

In [None]:
def train_test_split(X,y,test_size=0.33,random_state=42,shuffle=True):
    """Split features and target into train and test partitions.

    Thin wrapper around scikit-learn's ``train_test_split`` with this
    notebook's defaults.

    Parameters:
        X: feature data.
        y: target data, aligned with ``X``.
        test_size: share of the samples placed in the test partition.
        random_state: seed forwarded to scikit-learn for a reproducible split.
        shuffle: whether the samples are shuffled before splitting.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # This function has the same name as scikit-learn's splitter and
    # therefore hides it at module level; import the library function
    # under an alias so the call below does not recurse into this wrapper.
    from sklearn.model_selection import train_test_split as _sk_train_test_split

    X_train, X_test, y_train, y_test = _sk_train_test_split(
        X,
        y,
        test_size=test_size,
        train_size=None,
        random_state=random_state,
        shuffle=shuffle,
    )

    return X_train,X_test,y_train,y_test

#### PREPROCESSING: Feature Scaling

In [None]:
def feature_scaling(X_train,X_test):
    """Standardise the train and test features with one shared scaler.

    A ``StandardScaler`` is fitted on ``X_train`` only; the statistics
    learned there are then applied to both partitions, so no information
    from the test set influences the scaling.

    Parameters:
        X_train: training features; used to fit the scaler.
        X_test: test features; only transformed.

    Returns:
        Tuple ``(X_train, X_test)`` with the scaled data as returned by
        the scaler.
    """
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X=X_train, y=None)
    # Transform only: re-fitting here would scale the test set with its
    # own statistics instead of the training ones.
    X_test = sc_X.transform(X_test)

    # Show the fitted scaler without fitting it a second time.
    print(sc_X)

    return X_train, X_test



#### Choose a value of k by taking the sqrt of the number of data points

In [None]:
# Number of elements in y_test; the next cell takes its square root to suggest a k.
y_test.size

In [None]:
# Rule-of-thumb neighbour count: rounded square root of the number of elements in y_test.
n_test_labels = y_test.size
k_suggest = round(math.sqrt(n_test_labels))

# Candidate neighbour counts to evaluate: every odd value from 3 to 21.
k = list(range(3, 22, 2))

k_suggest

In [None]:
# Evaluate KNN for every candidate in k; the four per-k score lists are plotted in the next cell.
f1,accur,precision,recall=KNN(k,X_train,y_train,X_test,y_test)

In [None]:
# 2x2 grid with one panel per metric, each plotted against the candidate k values.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))

panels = (
    (axes[0,0], f1, 'F1 Score'),
    (axes[0,1], accur, 'Accuracy'),
    (axes[1,0], precision, 'Precision'),
    (axes[1,1], recall, 'Recall'),
)
for panel_axis, panel_scores, panel_title in panels:
    panel_axis.plot(k, panel_scores, marker='o')
    panel_axis.set(title=panel_title)
    panel_axis.grid()

plt.show()

In [None]:
# Column names for which a zero is not an acceptable value.
# NOTE(review): presumably meant to be passed to replacing_zeros as `the_headers` -- confirm.
zero_not_accepted=['Glucose','BloodPressure','SkinThickness','BMI','Insulin']