Step 1: Importing the libraries

In [1]:

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,classification_report

Step 2: Importing dataset

In [3]:
# Both files share one layout: comma-separated, no header row. index_col=0
# makes the leading row-id field the index, and the remaining fields are
# the nine measurements followed by the class label.
glass_columns = ['RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','GlassType']
read_options = dict(delimiter=',', header=None, index_col=0, names=glass_columns)

df_train = pd.read_table('trainKNN.txt', **read_options)
df_test = pd.read_table('testKNN.txt', **read_options)

In [4]:
# Show (rows, columns) for the training frame and then the test frame.
print(df_train.shape, df_test.shape)

(196, 10) (18, 10)


In [5]:
# Preview the first rows of the training data to check the column layout.
df_train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,GlassType
1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [6]:
# Number of training samples per glass type (class balance check).
df_train['GlassType'].value_counts()

2    73
1    67
7    26
3    14
5    10
6     6
Name: GlassType, dtype: int64

Step 3: Checking for missing data

In [7]:
# Non-null counts and dtypes per column; a count below the number of
# entries would indicate missing values in the training data.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196 entries, 1 to 211
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   RI         196 non-null    float64
 1   Na         196 non-null    float64
 2   Mg         196 non-null    float64
 3   Al         196 non-null    float64
 4   Si         196 non-null    float64
 5   K          196 non-null    float64
 6   Ca         196 non-null    float64
 7   Ba         196 non-null    float64
 8   Fe         196 non-null    float64
 9   GlassType  196 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 16.8 KB


In [8]:
# Same missing-value / dtype check for the test data.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18 entries, 68 to 214
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   RI         18 non-null     float64
 1   Na         18 non-null     float64
 2   Mg         18 non-null     float64
 3   Al         18 non-null     float64
 4   Si         18 non-null     float64
 5   K          18 non-null     float64
 6   Ca         18 non-null     float64
 7   Ba         18 non-null     float64
 8   Fe         18 non-null     float64
 9   GlassType  18 non-null     int64  
dtypes: float64(9), int64(1)
memory usage: 1.5 KB


Step 4: Separating the features and the target in the training and test sets

In [9]:
# Column positions 0-8 are the nine measurement features; position 9 is
# the GlassType label.
feature_positions = list(range(9))
target_position = 9

x_train, y_train = df_train.iloc[:, feature_positions], df_train.iloc[:, target_position]
x_test, y_test = df_test.iloc[:, feature_positions], df_test.iloc[:, target_position]

Step 5: Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training features only; the same fitted
# scaler is then applied to both the training and the test features.
func = StandardScaler().fit(x_train)
x_train_std, x_test_std = func.transform(x_train), func.transform(x_test)

Step 6: Building models

In [11]:
from scipy.spatial import distance  # kept: other cells reference `distance`

# Sweep k = 1..10 for a squared-Euclidean KNN and record the accuracy of
# each fitted model on the test set.
# NOTE: these scores are computed on the test data, so choosing k from them
# tunes the model on the test set.
num_neighbors = list(range(1, 11))
acc_results = []
for num in num_neighbors:
    # Use the built-in metric name with brute-force search instead of the
    # scipy callable: callable metrics are routed to a ball tree, which
    # assumes a true metric (triangle inequality). Squared Euclidean does not
    # satisfy it, and the per-pair Python callback is also much slower.
    model = KNeighborsClassifier(
        n_neighbors=num, metric='sqeuclidean', algorithm='brute'
    ).fit(x_train_std, y_train)
    acc_results.append(accuracy_score(y_test, model.predict(x_test_std)))

In [12]:
# Test-set accuracies from the sweep above, ordered as in num_neighbors.
acc_results

[0.6111111111111112,
 0.4444444444444444,
 0.6111111111111112,
 0.5,
 0.5555555555555556,
 0.5555555555555556,
 0.5555555555555556,
 0.6111111111111112,
 0.5555555555555556,
 0.5]

In [13]:
from scipy.spatial import distance  # kept: other cells reference `distance`

# Sweep k = 1..10 for a Manhattan-distance KNN and record the accuracy of
# each fitted model on the test set. This overwrites the acc_results list
# from the squared-Euclidean sweep.
num_neighbors = list(range(1, 11))
acc_results = []
for num in num_neighbors:
    # 'manhattan' is the built-in equivalent of scipy's cityblock; passing
    # the name avoids a Python callback for every pair of points.
    model = KNeighborsClassifier(n_neighbors=num, metric='manhattan').fit(x_train_std, y_train)
    acc_results.append(accuracy_score(y_test, model.predict(x_test_std)))

In [14]:
# Test-set accuracies from the Manhattan sweep, ordered as in num_neighbors.
acc_results

[0.6666666666666666,
 0.4444444444444444,
 0.6111111111111112,
 0.5,
 0.5555555555555556,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5]

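From the accuracy lists above we can see the k value at which each model reaches its maximum test accuracy: k = 1 for the Manhattan model, and k = 1, 3 or 8 (tied) for the squared Euclidean model; k = 8 is used for the latter below.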

In [15]:
# Final models, using the k picked from the sweeps for each metric.
# Built-in metric names replace the scipy callables: a callable metric is
# evaluated pair-by-pair in Python and handled by a ball tree, which is only
# valid for true metrics. Squared Euclidean violates the triangle inequality,
# so it is searched by brute force instead.
euclid_model = KNeighborsClassifier(
    n_neighbors=8, metric='sqeuclidean', algorithm='brute'
).fit(x_train_std, y_train)  # Squared Euclidean distance model
manhattan_model = KNeighborsClassifier(
    n_neighbors=1, metric='manhattan'
).fit(x_train_std, y_train)  # Manhattan distance model

In [16]:
# Predict the test set with both fitted models, then put the predictions
# next to the true labels so they can be compared row by row.
euclid_predictions = euclid_model.predict(x_test_std)
manhattan_predictions = manhattan_model.predict(x_test_std)

comparison_columns = {
    'actual': y_test,
    'manhattan': manhattan_predictions,
    'euclid': euclid_predictions,
}
df = pd.DataFrame(comparison_columns)
df.head()

Unnamed: 0,actual,manhattan,euclid
68,1,1,1
69,1,1,1
70,1,1,1
144,2,2,2
145,2,1,1


In [17]:
# Display the full actual-vs-predicted comparison table.
df

Unnamed: 0,actual,manhattan,euclid
68,1,1,1
69,1,1,1
70,1,1,1
144,2,2,2
145,2,1,1
146,2,1,2
161,3,3,2
162,3,2,2
163,3,1,2
174,5,5,5


In [18]:
# Per-class precision / recall / F1 for each model on the test set.
# target_names is deliberately not passed: the report orders its rows by
# sorted label value, so names built from order of appearance in y_test can
# be attached to the wrong rows (or mismatch in count). By default each row
# is named after its own label.
# zero_division=0 keeps the reported value of 0 for classes that are never
# predicted, without emitting an undefined-metric warning.
print(classification_report(y_test, manhattan_predictions, zero_division=0))
print()  # blank line between the two reports
print(classification_report(y_test, euclid_predictions, zero_division=0))

              precision    recall  f1-score   support

           1       0.50      1.00      0.67         3
           2       0.25      0.33      0.29         3
           3       1.00      0.33      0.50         3
           5       1.00      0.67      0.80         3
           6       1.00      0.67      0.80         3
           7       1.00      1.00      1.00         3

    accuracy                           0.67        18
   macro avg       0.79      0.67      0.68        18
weighted avg       0.79      0.67      0.68        18


               precision    recall  f1-score   support

           1       0.75      1.00      0.86         3
           2       0.29      0.67      0.40         3
           3       0.00      0.00      0.00         3
           5       1.00      0.33      0.50         3
           6       1.00      0.67      0.80         3
           7       0.75      1.00      0.86         3

    accuracy                           0.61        18
   macro avg       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Overall test-set accuracy of each final model.
manhattan_accuracy = accuracy_score(y_test, manhattan_predictions)
euclid_accuracy = accuracy_score(y_test, euclid_predictions)

print("Accuracy score for the model with manhattan distance: ", manhattan_accuracy)
print("Accuracy score for the model with squared euclidean distance: ", euclid_accuracy)

Accuracy score for the model with manhattan distance:  0.6666666666666666
Accuracy score for the model with squared euclidean distance:  0.6111111111111112


## **Output label**

Type of glass (the `GlassType` class attribute). The numeric codes 1–7 correspond to the following types, in the order listed:

building_windows_float_processed

building_windows_non_float_processed

vehicle_windows_float_processed

vehicle_windows_non_float_processed (none in this database)

containers

tableware

headlamps