In [2]:
#importing the libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import percentile
from numpy.random import rand    

In [3]:
#Loading the mitbih data set
test = pd.read_csv("mitbih_test.csv", header=None)
train = pd.read_csv("mitbih_train.csv", header=None)

In [4]:
#Fining the outliers
continous_features =   train[187].unique()
def outliers(df_out, drop = False):
    for each_feature in df_out.columns:
        feature_data = df_out[each_feature]
        Q1 = np.percentile(feature_data, 25.) # 25th percentile of the data of the given feature
        Q3 = np.percentile(feature_data, 75.) # 75th percentile of the data of the given feature
        IQR = Q3-Q1 
        outlier_step = IQR * 1.5 
        outliers = feature_data[~((feature_data >= Q1 - outlier_step) & (feature_data <= Q3 + outlier_step))].index.tolist()  
        if not drop:
            print('For the feature {}, No of Outliers is {}'.format(each_feature, len(outliers)))
        if drop:
            train.drop(outliers, inplace = True, errors = 'ignore')
            print('Outliers from {} feature removed'.format(each_feature))
outliers(train[continous_features])

For the feature 0, No of Outliers is 12161
For the feature 1, No of Outliers is 5575
For the feature 2, No of Outliers is 0
For the feature 3, No of Outliers is 1786
For the feature 4, No of Outliers is 6596


In [5]:
#Removing the outliers
outliers(train[continous_features], drop=True)

Outliers from 0 feature removed
Outliers from 1 feature removed
Outliers from 2 feature removed
Outliers from 3 feature removed
Outliers from 4 feature removed


###### Resampling and building the KNN model

In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils import resample
import random
import matplotlib.pyplot as plt
import seaborn as sns
train_lbl0 = resample(train[train[187]==0], replace=True, n_samples=15000, random_state=113)
train_lbl1 = resample(train[train[187]==1], replace=True, n_samples=15000, random_state=113)
train_lbl2 = resample(train[train[187]==2], replace=True, n_samples=15000, random_state=113)
train_lbl3 = resample(train[train[187]==3], replace=True, n_samples=15000, random_state=113)
train_lbl4 = resample(train[train[187]==4], replace=True, n_samples=15000, random_state=113)

In [7]:
train= pd.concat([train_lbl0, train_lbl1, train_lbl2, train_lbl3, train_lbl4])
labels = train[187].astype('int64') # last column has the labels
print("Count in each label: ")
print(labels.value_counts())

Count in each label: 
4    15000
3    15000
2    15000
1    15000
0    15000
Name: 187, dtype: int64


In [8]:
train.shape

(75000, 188)

In [9]:
x_train = train.iloc[:, : -1].values
y_train = train.iloc[:, -1].values

In [10]:
x_test = test.iloc[:, : -1].values
y_test = test.iloc[:, -1].values

In [11]:
print(x_train)

[[1.         0.88453609 0.61237115 ... 0.         0.         0.        ]
 [1.         0.8888889  0.52430558 ... 0.         0.         0.        ]
 [0.91666669 0.76234567 0.36419752 ... 0.         0.         0.        ]
 ...
 [1.         0.47955391 0.52788103 ... 0.         0.         0.        ]
 [1.         0.47569445 0.50347221 ... 0.         0.         0.        ]
 [1.         0.6736111  0.4212963  ... 0.         0.         0.        ]]


In [12]:
print(y_train)

[0. 0. 0. ... 4. 4. 4.]


In [13]:
print(x_test)

[[1.         0.75826448 0.11157025 ... 0.         0.         0.        ]
 [0.90842491 0.7838828  0.53113556 ... 0.         0.         0.        ]
 [0.73008847 0.21238938 0.         ... 0.         0.         0.        ]
 ...
 [1.         0.96735907 0.62017804 ... 0.         0.         0.        ]
 [0.98412699 0.5674603  0.60714287 ... 0.         0.         0.        ]
 [0.97396964 0.91323209 0.86550975 ... 0.         0.         0.        ]]


In [14]:
print(y_test)

[0. 0. 0. ... 4. 4. 4.]


In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

In [16]:
print(x_train)

[[ 0.62965712  0.40540237  0.52082393 ... -0.15798999 -0.15650701
  -0.15550876]
 [ 0.62965712  0.43406513  0.10498512 ... -0.15798999 -0.15650701
  -0.15550876]
 [-0.88616344 -0.39920815 -0.6510328  ... -0.15798999 -0.15650701
  -0.15550876]
 ...
 [ 0.62965712 -2.26136095  0.12186809 ... -0.15798999 -0.15650701
  -0.15550876]
 [ 0.62965712 -2.28677508  0.00661154 ... -0.15798999 -0.15650701
  -0.15550876]
 [ 0.62965712 -0.98351556 -0.38141678 ... -0.15798999 -0.15650701
  -0.15550876]]


In [17]:
print(x_test)

[[ 0.45017238 -0.01663683 -1.37839723 ... -0.08819499 -0.08446082
  -0.0835278 ]
 [ 0.05975141  0.10052673  0.45723507 ... -0.08819499 -0.08446082
  -0.0835278 ]
 [-0.70056768 -2.51315818 -1.86652611 ... -0.08819499 -0.08446082
  -0.0835278 ]
 ...
 [ 0.45017238  0.93964253  0.84680317 ... -0.08819499 -0.08446082
  -0.0835278 ]
 [ 0.38249941 -0.88926633  0.78977321 ... -0.08819499 -0.08446082
  -0.0835278 ]
 [ 0.33919461  0.69209661  1.92014929 ... -0.08819499 -0.08446082
  -0.0835278 ]]


In [18]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(x_train, y_train)

KNeighborsClassifier()

In [19]:
print(classifier.predict(sc.transform([[9.60E-01,8.63E-01,4.62E-01,1.97E-01,9.40E-02,1.25E-01,9.97E-02,8.83E-02,7.41E-02,8.26E-02,7.41E-02,6.27E-02,6.55E-02,6.55E-02,6.27E-02,7.69E-02,7.12E-02,8.26E-02,9.12E-02,9.69E-02,8.26E-02,8.26E-02,9.12E-02,1.05E-01,1.23E-01,1.48E-01,1.82E-01,1.94E-01,2.14E-01,2.08E-01,2.22E-01,2.54E-01,2.71E-01,2.88E-01,2.85E-01,2.93E-01,2.56E-01,2.48E-01,1.88E-01,1.45E-01,1.08E-01,8.26E-02,7.98E-02,7.41E-02,1.42E-02,1.14E-02,6.27E-02,5.13E-02,5.70E-02,4.84E-02,2.85E-02,3.13E-02,7.69E-02,2.56E-02,2.85E-02,3.70E-02,9.40E-02,8.55E-02,3.99E-02,5.98E-02,7.41E-02,7.98E-02,9.12E-02,9.97E-02,1.08E-01,8.83E-02,9.12E-02,6.55E-02,8.83E-02,7.69E-02,8.26E-02,9.69E-02,9.97E-02,1.34E-01,1.03E-01,3.99E-02,6.55E-02,7.41E-02,8.26E-02,8.55E-02,5.70E-02,4.56E-02,1.03E-01,3.99E-02,1.14E-02,1.71E-02,3.13E-02,5.70E-03,8.55E-03,3.13E-02,5.13E-02,5.70E-02,8.83E-02,6.55E-02,1.14E-02,5.70E-02,3.99E-02,3.99E-02,2.56E-02,2.85E-03,1.99E-02,2.56E-02,1.14E-02,2.85E-02,1.99E-02,2.28E-02,3.42E-02,1.42E-02,5.13E-02,6.84E-02,1.40E-01,2.88E-01,5.27E-01,7.78E-01,1.00E+00,8.89E-01,4.93E-01,1.91E-01,8.83E-02,6.27E-02,3.42E-02,0.00E+00,3.42E-02,1.71E-02,2.85E-03,0.00E+00,4.84E-02,4.84E-02,5.41E-02,4.27E-02,5.41E-02,5.98E-02,6.27E-02,7.12E-02,7.69E-02,9.97E-02,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00,0.00E+00]])))

[0.]


In [20]:
y_pred = classifier.predict(x_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0. 0.]
 [0. 0.]
 [0. 0.]
 ...
 [4. 4.]
 [4. 4.]
 [4. 4.]]


In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[15306   969  1060   583   200]
 [   90   422    32     7     5]
 [  180   103  1074    46    45]
 [    5     2    18   135     2]
 [   25    11    28     2  1542]]


0.8440983007491321

In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.84      0.91     18118
         1.0       0.28      0.76      0.41       556
         2.0       0.49      0.74      0.59      1448
         3.0       0.17      0.83      0.29       162
         4.0       0.86      0.96      0.91      1608

    accuracy                           0.84     21892
   macro avg       0.56      0.83      0.62     21892
weighted avg       0.92      0.84      0.87     21892

