In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1.read the data

In [2]:
data = pd.read_csv('../input/classified-data/Classified Data')

In [3]:
data.sample(2)

Unnamed: 0.1,Unnamed: 0,WTT,PTI,EQW,SBI,LQE,QWG,FDJ,PJF,HQE,NXJ,TARGET CLASS
798,798,1.264444,0.901345,0.476093,0.636815,1.240612,0.927326,1.324535,0.413136,1.024086,1.021621,1
627,627,0.679125,1.456697,0.725949,0.884633,0.861456,0.803774,1.01548,1.445247,1.187241,1.555708,0


# 2.preprocessing

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1000 non-null   int64  
 1   WTT           1000 non-null   float64
 2   PTI           1000 non-null   float64
 3   EQW           1000 non-null   float64
 4   SBI           1000 non-null   float64
 5   LQE           1000 non-null   float64
 6   QWG           1000 non-null   float64
 7   FDJ           1000 non-null   float64
 8   PJF           1000 non-null   float64
 9   HQE           1000 non-null   float64
 10  NXJ           1000 non-null   float64
 11  TARGET CLASS  1000 non-null   int64  
dtypes: float64(10), int64(2)
memory usage: 93.9 KB


In [5]:
data.isna().sum()

Unnamed: 0      0
WTT             0
PTI             0
EQW             0
SBI             0
LQE             0
QWG             0
FDJ             0
PJF             0
HQE             0
NXJ             0
TARGET CLASS    0
dtype: int64

In [6]:
from sklearn.preprocessing import StandardScaler

st_scaler = StandardScaler()
st_scaler.fit(data.drop('TARGET CLASS', axis = 1))
features = st_scaler.fit_transform(data.drop('TARGET CLASS', axis = 1))


In [7]:
features

array([[-1.73031962, -0.12354188,  0.18590747, ..., -1.48236813,
        -0.9497194 , -0.64331425],
       [-1.72685552, -1.08483602, -0.43034845, ..., -0.20224031,
        -1.82805088,  0.63675862],
       [-1.72339142, -0.78870217,  0.33931821, ...,  0.28570652,
        -0.68249379, -0.37784986],
       ...,
       [ 1.72339142,  0.64177714, -0.51308341, ..., -2.36249443,
        -0.81426092,  0.11159651],
       [ 1.72685552,  0.46707241, -0.98278576, ..., -0.03677699,
         0.40602453, -0.85567   ],
       [ 1.73031962, -0.38765353, -0.59589427, ..., -0.56778932,
         0.3369971 ,  0.01034996]])

In [8]:
features_df = pd.DataFrame(features, columns = data.columns[:-1])
features_df.head()

Unnamed: 0.1,Unnamed: 0,WTT,PTI,EQW,SBI,LQE,QWG,FDJ,PJF,HQE,NXJ
0,-1.73032,-0.123542,0.185907,-0.913431,0.319629,-1.033637,-2.308375,-0.798951,-1.482368,-0.949719,-0.643314
1,-1.726856,-1.084836,-0.430348,-1.025313,0.625388,-0.444847,-1.152706,-1.129797,-0.20224,-1.828051,0.636759
2,-1.723391,-0.788702,0.339318,0.301511,0.755873,2.031693,-0.870156,2.599818,0.285707,-0.682494,-0.37785
3,-1.719927,0.982841,1.060193,-0.621399,0.625299,0.45282,-0.26722,1.750208,1.066491,1.241325,-1.026987
4,-1.716463,1.139275,-0.640392,-0.709819,-0.057175,0.822886,-0.936773,0.596782,-1.472352,1.040772,0.27651


In [9]:
features_df.shape

(1000, 11)

# 3.Corr func

In [10]:
corr = data.corr()
fig = px.imshow(corr, text_auto=True)
fig.show()

# 4.Select Model

In [11]:
X = features
y = data['TARGET CLASS']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [13]:
knn_model = KNeighborsClassifier(n_neighbors = 2)
knn_model.fit(x_train, y_train)
KNeighborsClassifier(n_neighbors=2)
y_pred = knn_model.predict(x_test)

In [14]:
print(confusion_matrix(y_test, y_pred))

[[99  9]
 [11 81]]


In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       108
           1       0.90      0.88      0.89        92

    accuracy                           0.90       200
   macro avg       0.90      0.90      0.90       200
weighted avg       0.90      0.90      0.90       200



In [16]:
model_acc = knn_model.score(x_test, y_test)
print('KNN Model Accuracy:', model_acc * 100, '%')

KNN Model Accuracy: 90.0 %


In [17]:
error_rate = []

for i in range(1, 40):
    knn_model = KNeighborsClassifier(n_neighbors = i)
    knn_model.fit(x_train, y_train)
    knn_y_pred = knn_model.predict(x_test)
    error_rate.append(np.mean(knn_y_pred != y_test))


In [18]:
knn_model = KNeighborsClassifier(n_neighbors = 1)

knn_model.fit(x_train, y_train)
k_pred = knn_model.predict(x_test)

print('WITH K = 1')
print('\n')
print(confusion_matrix(y_test, k_pred))
print('\n')
print(classification_report(y_test, k_pred))

WITH K = 1


[[96 12]
 [ 6 86]]


              precision    recall  f1-score   support

           0       0.94      0.89      0.91       108
           1       0.88      0.93      0.91        92

    accuracy                           0.91       200
   macro avg       0.91      0.91      0.91       200
weighted avg       0.91      0.91      0.91       200



In [19]:
one_k_train_accuracy = knn_model.score(x_train, y_train)
print('K = 1 Training Accuracy:', one_k_train_accuracy * 100, '%')

one_k_test_accuracy = knn_model.score(x_test, y_test)
print('K = 1 Test Accuracy:', one_k_test_accuracy * 100, '%')

K = 1 Training Accuracy: 100.0 %
K = 1 Test Accuracy: 91.0 %


In [20]:
knn_model = KNeighborsClassifier(n_neighbors = 24)

knn_model.fit(x_train, y_train)
k_pred = knn_model.predict(x_test)

print('WITH K = 24')
print('\n')
print(confusion_matrix(y_test, k_pred))
print('\n')
print(classification_report(y_test, k_pred))

WITH K = 24


[[94 14]
 [ 3 89]]


              precision    recall  f1-score   support

           0       0.97      0.87      0.92       108
           1       0.86      0.97      0.91        92

    accuracy                           0.92       200
   macro avg       0.92      0.92      0.91       200
weighted avg       0.92      0.92      0.92       200



In [21]:
twentyfour_k_train_accuracy = knn_model.score(x_train, y_train)
print('K = 10 Training Accuracy:', twentyfour_k_train_accuracy * 100, '%')

twentyfour_k_test_accuracy = knn_model.score(x_test, y_test)
print('K = 24 Test Accuracy:', twentyfour_k_test_accuracy * 100, '%')

K = 10 Training Accuracy: 94.375 %
K = 24 Test Accuracy: 91.5 %
