In [1]:
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE 
import numpy as np
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


Using TensorFlow backend.


<div class="alert alert-block alert-info">
<b>Loading the data :</b> We loaded the data from the given data source to demonstrate K neighbors Classifier
</div>

In [2]:
path_of_input_file = 'D:\\kaggle_trials\\lower-back-pain-symptoms-dataset\\Dataset_spine.csv'
df                 = pd.read_csv(path_of_input_file,)
df.head(4)

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Col11,Col12,Class_att,Unnamed: 13
0,63.027818,22.552586,39.609117,40.475232,98.672917,-0.2544,0.744503,12.5661,14.5386,15.30468,-28.658501,43.5123,Abnormal,
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,0.415186,12.8874,17.5323,16.78486,-25.530607,16.1102,Abnormal,
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,0.474889,26.8343,17.4861,16.65897,-29.031888,19.2221,Abnormal,Prediction is done by using binary classificat...
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,0.369345,23.5603,12.7074,11.42447,-30.470246,18.8329,Abnormal,


<div class="alert alert-block alert-info">
<b>Data Imbalance:</b> We check the data imbalance here. Clearly we have an imbalanced dataset
</div>

In [3]:
num_labels = df['Class_att'].unique()
print('The number of labels are ',len(num_labels))

The number of labels are  2


In [4]:
for i in range(len(num_labels)):
    print('The number of ', num_labels[i] ,' labels are :- ',len(df[df['Class_att']==num_labels[i]]))

The number of  Abnormal  labels are :-  210
The number of  Normal  labels are :-  100


<div class="alert alert-block alert-info">
<b>Preprocessing:</b> We now preproocess and make the dataset balanced
</div>

In [5]:
lb           = LabelBinarizer()
Y            = lb.fit_transform(df['Class_att'].values)
X            = df[df.columns[:12]].values

sm           = SMOTE(random_state=42)
X_res, Y_res = sm.fit_resample(X, Y)

In [6]:
print('Positive examples before Oversampling is ', sum(Y == [1])[0])
print('Negative examples before Oversampling is ', sum(Y == [0])[0])
print('\n')
print('Positive examples after Oversampling is ', sum(Y_res == [1]))
print('Negative examples after Oversampling is ', sum(Y_res == [0]))
print('\n')

Positive examples before Oversampling is  100
Negative examples before Oversampling is  210


Positive examples after Oversampling is  210
Negative examples after Oversampling is  210




<div class="alert alert-block alert-info">
<b>Train Test Split:</b> We split the data to train and test components.
</div>

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_res, Y_res, test_size=0.33, random_state=42)

<div class="alert alert-block alert-info">
<b>Hyper parameter Grid:</b> We create a grid for different hyper parameters to iterate from 
</div>

In [8]:
kneighbors_grid = {'n_neighbors' : hp.choice('n_neighbors',range(10,20)),
                   'weights'     : hp.choice('weights',['uniform','distance']),
                   'algorithm'   : hp.choice('algorithm',['ball_tree','kd_tree','brute']),
                   'leaf_size'   : hp.choice('leaf_size',range(1,50)),
                   'metric'      : hp.choice('metric',['euclidean','manhattan','chebyshev','minkowski'])
}

In [9]:
def hyperopt_train_test(params):
    clf = KNeighborsClassifier(**params)
    return cross_val_score(clf, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, kneighbors_grid, algo=tpe.suggest, max_evals=500, trials=trials)
best_parameters = space_eval(kneighbors_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████| 500/500 [00:06<00:00, 77.86it/s, best loss: -0.8609696283720053]
The best parameter tuned on training set is given by :-  {'algorithm': 'ball_tree', 'leaf_size': 45, 'metric': 'chebyshev', 'n_neighbors': 10, 'weights': 'distance'}


<div class="alert alert-block alert-info">
<b>Model Fitting and conclusion:</b> We now fit the model and then provide a classification analysis on the model fit
</div>

In [10]:
knnclf = KNeighborsClassifier(**best_parameters)
knnclf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=45, metric='chebyshev',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='distance')

In [11]:
y_hat = knnclf.predict(X_test)

In [12]:
print(classification_report(y_hat,y_test))

              precision    recall  f1-score   support

           0       0.72      0.98      0.83        54
           1       0.98      0.75      0.85        85

    accuracy                           0.84       139
   macro avg       0.85      0.87      0.84       139
weighted avg       0.88      0.84      0.84       139

