Prepare a model for glass classification using KNN

Data Description:

RI : refractive index

Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)

Mg: Magnesium

Al: Aluminum

Si: Silicon

K: Potassium

Ca: Calcium

Ba: Barium

Fe: Iron

Type: Type of glass: (class attribute)
1 -- building_windows_float_processed
2 -- building_windows_non_float_processed
3 -- vehicle_windows_float_processed
4 -- vehicle_windows_non_float_processed (none in this database)
5 -- containers
6 -- tableware
7 -- headlamps







In [None]:
#IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

#**Importing Dataset**

In [None]:
# Colab-only step: prompts for a file upload into the runtime so that
# 'glass.csv' is available to the read_csv call in the next cell.
from google.colab import files
uploaded=files.upload()

In [None]:
# Load the raw glass dataset from the CSV in the working directory.
glass = pd.read_csv('glass.csv')

In [None]:
glass 

# EDA & VISUALIZATION

In [None]:
glass.info()

In [None]:
glass.describe()

# Check duplicate rows

In [None]:
glass[glass.duplicated()]

In [None]:
# Work on a de-duplicated copy. The index is not reset here, so `df` keeps
# the original row labels (with gaps wherever a duplicate was removed).
df = glass.drop_duplicates()

In [None]:
df

# Check correlation between features

In [None]:
# Pairwise correlation matrix over all columns of df (features and Type).
corr = df.corr()

In [None]:
corr

In [None]:
sns.heatmap(corr)

#We can notice that Ca and K values don't affect Type that much.

#Also, Ca and RI are highly correlated, which means using only RI is enough.

#So we can go ahead and drop Ca, and also K (performed later).

# Scatter plot of two features

In [None]:
# RI vs. Na, coloured by glass type.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally, so the old positional form fails on current releases.
sns.scatterplot(data=df, x='RI', y='Na', hue='Type')

# pairwise plot

In [None]:
# Scatter-matrix of every feature pair, coloured by glass type
sns.pairplot(data=df, hue='Type')
plt.show()

##The pairplot shows that the classes are not linearly separable, so KNN can be applied to classify the glass types by their nearest neighbors

In [None]:
df

# Feature Scaling

In [None]:
# Feature matrix: the first nine columns of df (every row).
DF = df.iloc[:, :9]

In [None]:
DF

In [None]:
# Raw NumPy view of the feature columns, as input for the scaler.
array = DF.to_numpy()

In [None]:
array

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the features with StandardScaler (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on every row before the train/test split
# further down, so test-set statistics influence the scaling. Consider
# fitting on the training rows only.
stscaler = StandardScaler().fit(array)
X = stscaler.transform(array)

In [None]:
X

In [None]:
# Wrap the scaled array back into a DataFrame with the feature column names.
# Reuse df's index: df keeps its original row labels after drop_duplicates(),
# so a default RangeIndex here would not line up with df['Type'] whenever a
# duplicate row was removed.
df_knn = pd.DataFrame(X,columns=df.columns[:-1],index=df.index)

In [None]:
df_knn

In [None]:
# Model inputs: scaled features as x, glass type labels as y.
x, y = df_knn, df['Type']

In [None]:
x

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is reproducible.
x_train,x_test,y_train,y_test= train_test_split(x,y, test_size=0.3,random_state=45)

In [None]:
x_train

In [None]:
x_test

In [None]:
y_train

In [None]:
y_test

# KNN Model

In [None]:
# Baseline classifier: 3 nearest neighbours, fitted on the training split.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(x_train,y_train)

In [None]:
# Predict glass types for the held-out test rows.
preds = model.predict(x_test)
# Count how many test rows were assigned to each predicted class.
pd.Series(preds).value_counts()

In [None]:
# Two-way table of true labels (rows) against predictions (columns),
# showing where predictions are correct and where they are wrong.
pd.crosstab(y_test,preds)


In [None]:
# Test-set accuracy, reported as a percentage.
accuracy_pct = accuracy_score(y_test, preds) * 100
print("Accuracy", accuracy_pct)

In [None]:
# Accuracy on the training split, for comparison with the test accuracy printed above.
model.score(x_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Per-class precision, recall and F1 for the test-set predictions.
print(classification_report(y_test,preds))

# Grid Search for Algorithm Tuning

In [None]:
# Candidate neighbour counts for the grid search: 1 through 14.
n_neighbors = np.arange(1, 15)
param_grid = {'n_neighbors': n_neighbors}

In [None]:
# Cross-validated search over n_neighbors; no `cv` is passed, so
# GridSearchCV falls back to its default splitting strategy.
# NOTE(review): the search is fit on the full x, y, which includes the rows
# held out as x_test above. Confirm this is intended.
model = KNeighborsClassifier()
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(x, y)

In [None]:
# Best mean cross-validation score and the parameters that achieved it, one per line.
print(grid.best_score_, grid.best_params_, sep='\n')

# Visualizing the CV results

In [None]:
# Sweep the number of neighbours and record accuracy on both splits so the
# two curves can be compared.
k_values = np.arange(1,25)
train_accuracy = []
test_accuracy = []

for k in k_values:
    # k runs from 1 to 24 (np.arange excludes the upper bound 25)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train,y_train)
    # Accuracy on the rows the model was fitted on
    train_accuracy.append(knn.score(x_train, y_train))
    # Accuracy on the held-out rows
    test_accuracy.append(knn.score(x_test, y_test))

# Plot both accuracy curves against k and save the figure to disk
plt.figure(figsize=[13,8])
plt.plot(k_values, test_accuracy, label = 'Testing Accuracy')
plt.plot(k_values, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.title('k-value VS Accuracy')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.xticks(k_values)
plt.savefig('graph.png')
plt.show()

# Look the best k up in k_values instead of assuming k == list position + 1,
# so the report stays correct if the range of k_values is changed.
# np.argmax returns the first maximum, matching list.index(max(...)).
best_idx = int(np.argmax(test_accuracy))
print("Best accuracy is {} with K = {}".format(np.max(test_accuracy),k_values[best_idx]))