In [1]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
# read the raisin dataset from the Excel workbook and preview its first rows
data = pd.read_excel('Raisin_Dataset.xlsx')
data.head(n=5)

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen


In [2]:
# count the missing entries in every column
data.isna().sum()

Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64

In [3]:
# number of rows that exactly repeat an earlier row
data.duplicated(keep='first').sum()

0

In [4]:
# Binary encoding for the 'Class' column: 'Kecimen' -> 1, anything else -> 0
def make_categorical(raisin_):
    """Return 1 when ``raisin_`` equals 'Kecimen', otherwise 0."""
    return 1 if raisin_ == 'Kecimen' else 0
    
# Encode the target only while it still holds the raw string labels.
# Re-running this cell on an already-encoded column would send the 0/1 values
# through make_categorical again and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(data['Class']):
    data['Class'] = data['Class'].apply(make_categorical)
# Keep the 'Class_10' copy of the encoded target that this cell has always
# produced, so later cells that expect the column keep working.
data['Class_10'] = data['Class']
data

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class,Class_10
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.040,1,1
1,75166,406.690687,243.032436,0.801805,78789,0.684130,1121.786,1,1
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,1,1
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,1,1
4,79408,352.190770,290.827533,0.564011,81463,0.792772,1073.251,1,1
...,...,...,...,...,...,...,...,...,...
895,83248,430.077308,247.838695,0.817263,85839,0.668793,1129.072,0,0
896,87350,440.735698,259.293149,0.808629,90899,0.636476,1214.252,0,0
897,99657,431.706981,298.837323,0.721684,106264,0.741099,1292.828,0,0
898,93523,476.344094,254.176054,0.845739,97653,0.658798,1258.548,0,0


In [5]:
# 'Class_10' is a redundant copy of the encoded target; remove it if it is
# still present. The membership check keeps this cell re-runnable (an
# unconditional drop raises KeyError once the column is gone).
if 'Class_10' in data.columns:
    data = data.drop(columns='Class_10')

# Independent vs dependent variables
# Every remaining column except 'Class' is a feature; 'Class' is the target
X = data.drop(columns=['Class'])
y = data['Class']

# Split the data into 70-30% ratio.
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [6]:

# Scaling the data: the scaler is fitted on the training split only and then
# applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# KNN classifier with default constructor arguments, trained on the scaled
# training features
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)
knn

In [7]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print the confusion matrix and classification report for both splits.

    Predictions are obtained with ``model.predict`` for the training and the
    test features. The test-set section is printed first, followed by an
    empty line and the train-set section. Nothing is returned.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_train, y_train : training features and their true labels
    X_test, y_test : test features and their true labels
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    sections = (
        ("Test_Set", y_test, test_predictions),
        ("Train_Set", y_train, train_predictions),
    )
    for position, (heading, truth, predicted) in enumerate(sections):
        if position:
            # empty line separating the two sections
            print()
        print(heading)
        print(confusion_matrix(truth, predicted))
        print(classification_report(truth, predicted))

In [8]:
# Hard label predictions and per-class probabilities for the scaled test set
y_pred = knn.predict(X_test_scaled)
y_pred_proba = knn.predict_proba(X_test_scaled)

# Put true labels, predicted labels and both probability columns side by side
results = {
    "Actual": y_test,
    "Pred": y_pred,
    "Proba_1": y_pred_proba[:, 1],
    "Proba_0": y_pred_proba[:, 0],
}
# show ten randomly chosen rows
pd.DataFrame(results).sample(10)

Unnamed: 0,Actual,Pred,Proba_1,Proba_0
182,1,1,0.8,0.2
232,1,0,0.2,0.8
452,0,0,0.2,0.8
257,1,1,1.0,0.0
268,1,1,0.8,0.2
867,0,1,0.6,0.4
298,1,0,0.0,1.0
91,1,1,1.0,0.0
769,0,0,0.0,1.0
840,0,1,1.0,0.0


In [9]:
# report test and train metrics for the fitted KNN model
eval_metric(
    model=knn,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
)

Test_Set
[[114  27]
 [ 18 111]]
              precision    recall  f1-score   support

           0       0.86      0.81      0.84       141
           1       0.80      0.86      0.83       129

    accuracy                           0.83       270
   macro avg       0.83      0.83      0.83       270
weighted avg       0.84      0.83      0.83       270


Train_Set
[[267  42]
 [ 22 299]]
              precision    recall  f1-score   support

           0       0.92      0.86      0.89       309
           1       0.88      0.93      0.90       321

    accuracy                           0.90       630
   macro avg       0.90      0.90      0.90       630
weighted avg       0.90      0.90      0.90       630

