You are provided with a dataset from USA Forensic Science Service
which has description of 6 types of glass; defined in terms of their
oxide content (i.e. Na, Fe, K, etc). Your task is to use K-Nearest
Neighbor (KNN) classifier to classify the glasses.

In [2]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from scipy.spatial import distance


In [8]:
train = pd.read_csv('/content/trainKNN.txt')
train.columns = ['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass']
train = train.drop('ID', axis=1)

train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
1,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
2,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
3,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1
4,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.0,0.26,1


In [9]:
test = pd.read_csv('/content/testKNN.txt')
test.columns = ['ID', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass']
test = test.drop('ID', axis=1)

test.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.52152,13.12,3.58,0.9,72.2,0.23,9.82,0.0,0.16,1
1,1.523,13.31,3.58,0.82,71.99,0.12,10.17,0.0,0.03,1
2,1.51709,13.0,3.47,1.79,72.72,0.66,8.18,0.0,0.0,2
3,1.5166,12.99,3.18,1.23,72.97,0.58,8.81,0.0,0.24,2
4,1.51839,12.85,3.67,1.24,72.57,0.62,8.68,0.0,0.35,2


In [14]:
def standardize(s):
  for i in s.columns:
    if i != 'Type of glass':
      s[i] = (s[i] - s[i].mean())/s[i].std()
  return s

In [16]:
train = standardize(train)
test = standardize(test)

train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,-0.219548,0.65758,0.611149,-0.195318,0.118214,-0.061969,-0.75892,-0.339452,-0.583925,1
1,-0.687312,0.19894,0.575195,0.170317,0.458946,-0.195574,-0.794005,-0.339452,-0.583925,1
2,-0.203193,-0.20874,0.675867,-0.33751,-0.039046,0.071637,-0.485255,-0.339452,-0.583925,1
3,-0.281699,-0.1323,0.625531,-0.439075,0.576892,0.041947,-0.590511,-0.339452,-0.583925,1
4,-0.759276,-0.74382,0.61834,0.332822,0.432736,0.175553,-0.590511,-0.339452,2.421657,1


In [28]:
euclid = KNeighborsClassifier(n_neighbors=8, metric = distance.sqeuclidean)
manhattan = KNeighborsClassifier(n_neighbors=8, metric = distance.cityblock)

x_train = train.drop(['Type of glass'], axis=1)
y_train = train['Type of glass']

euclid.fit(x_train, y_train)
manhattan.fit(x_train, y_train)



KNeighborsClassifier(metric=<function cityblock at 0x7f2024c4e3b0>,
                     n_neighbors=8)

In [30]:
x_test = test.drop(['Type of glass'], axis=1)
y_test = test['Type of glass']

print('Euclid Predictions = ', euclid.predict(x_test))
print('Manhattan Predictions = ', manhattan.predict(x_test))


Euclid Predictions =  [1 1 2 1 2 2 2 1 5 2 5 6 2 1 7 7 7]
Manhattan Predictions =  [1 1 2 2 1 2 2 1 5 2 5 6 6 2 7 7 7]


In [34]:
df = pd.DataFrame({'Actual': y_test, 'Euclid': euclid.predict(x_test), 'Manhattan': manhattan.predict(x_test) })
df.head()

Unnamed: 0,Actual,Euclid,Manhattan
0,1,1,1
1,1,1,1
2,2,2,2
3,2,1,2
4,2,2,1


In [47]:
# Performance Evaluation

manhattan_count = len(df.loc[df['Manhattan'] == df['Actual']])

euclid_count = len(df.loc[df['Euclid'] == df['Actual']])

print('Manhattan Accuracy: ',round(100*manhattan_count/len(df), 2),"\n")

print(classification_report(y_test, manhattan.predict(x_test), target_names=df['Actual'].astype(str).unique()),"\n")



print('Square Euclidean Accuracy: ',round(100*euclid_count/len(df), 2),"\n")

print(classification_report(y_test, euclid.predict(x_test), target_names=df['Actual'].astype(str).unique()))


Manhattan Accuracy:  64.71 

              precision    recall  f1-score   support

           1       0.50      1.00      0.67         2
           2       0.33      0.67      0.44         3
           3       0.00      0.00      0.00         3
           5       1.00      0.67      0.80         3
           6       1.00      0.67      0.80         3
           7       1.00      1.00      1.00         3

    accuracy                           0.65        17
   macro avg       0.64      0.67      0.62        17
weighted avg       0.65      0.65      0.62        17
 

Square Euclidean Accuracy:  58.82 

              precision    recall  f1-score   support

           1       0.40      1.00      0.57         2
           2       0.33      0.67      0.44         3
           3       0.00      0.00      0.00         3
           5       1.00      0.67      0.80         3
           6       1.00      0.33      0.50         3
           7       1.00      1.00      1.00         3

    accura

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
