# KNN vs PCA+KNN vs K-Means clustering

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time                               #to measure time taken by the algorithm to fit data
print("Libraries imported")

Libraries imported


## Dataset

In [3]:
# Mushroom classification dataset (Kaggle challenge).
# The path is relative, so mushrooms.csv must sit in the working directory.
mush = pd.read_csv("mushrooms.csv")
mush.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [11]:
# (number of rows, number of columns) of the loaded frame
mush.shape

(8124, 23)

In [9]:
# Per-column summary; for these string-valued columns pandas reports
# count / unique / top / freq rather than numeric statistics.
mush.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


### Our class has two unique values, edible and poisonous, and we have to classify each sample into one of these two classes.
#### We can see there are 4208 samples for edible and 3916 samples for poisonous mushrooms, so our dataset is not imbalanced.


In [23]:
# Label-encode every column (the target "class" included): each distinct
# character value is replaced by an integer code.
# NOTE(review): these are integer codes, not one-hot/dummy variables, so the
# codes carry an arbitrary ordering that distance-based models will see.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# DataFrame.apply hands each column to the encoder in turn; the single shared
# encoder is refit per column, so afterwards `le` holds only the last column's
# mapping.
mush = mush.apply(le.fit_transform)

mush.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [24]:
# Train-test split
# Features are every column except the label; the label is the encoded "class" column.

data=mush.drop(["class"],axis=1)
target=mush["class"]

from sklearn.model_selection import train_test_split as tts
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split -- and every accuracy reported below -- is the same on each
# Restart & Run All instead of changing from run to run.
xtr,xte,ytr,yte=tts(data,target,test_size=0.2,random_state=42)

In [25]:
# Report the shape of each split as "label shape" pairs on a single line.
split_by_label = {
    "Shape of x_train": xtr,
    "Shape of x_test": xte,
    "Shape of y_train": ytr,
    "Shape of y_test": yte,
}
print(*(part for label, split in split_by_label.items() for part in (label, split.shape)))

Shape of x_train (6499, 22) Shape of x_test (1625, 22) Shape of y_train (6499,) Shape of y_test (1625,)


## KNN

In [138]:
# K-nearest-neighbours classifier from scikit-learn
from sklearn.neighbors import KNeighborsClassifier as knn

# Time (CPU seconds, via process_time) building and fitting a 3-neighbour
# classifier on the full 22-feature training set.
start=time.process_time()
clf1=knn(n_neighbors=3)
clf1.fit(xtr,ytr)                      # fit() trains in place and returns the estimator itself
knn_fit_seconds=time.process_time()-start
print("Time taken to fit using only KNN is", knn_fit_seconds)

Time taken to fit using only KNN is 0.0297874269999987


In [44]:
# With the model fitted, score it on the held-out test set.
from sklearn.metrics import accuracy_score as asc

knn_test_pred=clf1.predict(xte)
print("Accuracy of our KNN model is",asc(yte,knn_test_pred))

Accuracy of our KNN model is 0.9993846153846154


# PCA + KNN

#### I do not apply mean transformation or standardization to the data since they are categorical variables with no unit which only describe the properties, not measure them.

In [34]:
from sklearn.decomposition import PCA
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (here 99%) of the variance.
pca=PCA(0.99)

# Fit on the training set only, so the test set does not influence the projection.
pca.fit(xtr)
# The original feature count is read from the data rather than hard-coded, so
# the message stays correct if columns are added or dropped upstream.
print("Number of components capturing 99% of variation is", pca.n_components_,"out of",str(xtr.shape[1])+".")

Number of components capturing 99% of variation is 14 out of 22.


In [45]:
# Project both splits onto the principal components retained by the fitted PCA
# (the projection was learned from the training split only).
p_xtr,p_xte=(pca.transform(split) for split in (xtr,xte))

### Applying KNN after PCA transformations to data and observing time it takes

In [139]:
# Same 3-neighbour classifier as before, now trained on the PCA-projected
# training set; timed in CPU seconds via process_time.
start2=time.process_time()
clf2=knn(n_neighbors=3)
clf2.fit(p_xtr,ytr)                    # fit() trains in place and returns the estimator itself
pca_knn_fit_seconds=time.process_time()-start2
print("Time taken to fit PCA components using KNN is", pca_knn_fit_seconds)

Time taken to fit PCA components using KNN is 0.010050101999997452


In [143]:
# Score the PCA+KNN model on the PCA-projected test set.
pca_knn_test_pred=clf2.predict(p_xte)
print("Accuracy of our PCA+KNN model is",asc(yte,pca_knn_test_pred))

Accuracy of our PCA+KNN model is 0.9987692307692307


# K-Means Clustering

In [171]:
from sklearn.cluster import KMeans
# Two clusters, to be compared against the two mushroom classes.
# random_state fixes the centroid initialisation so the clustering is
# reproducible across runs; n_init is given explicitly because its default
# differs between scikit-learn versions.
kmns=KMeans(n_clusters=2,n_init=10,random_state=42)

# K-Means is unsupervised, so it is fitted on the whole feature table (no
# train/test split); fit_predict returns each row's cluster id.
start3=time.process_time()
clusters=kmns.fit_predict(data)
print("Time taken to fit K_Means clustering to data dataset is", (time.process_time()-start3))

Time taken to fit K_Means clustering to data dataset is 0.3875107129999975


In [174]:
# Accuracy
# K-Means cluster ids are arbitrary: nothing ties cluster 0 to class 0, so
# comparing the raw ids with the labels can report either a or 1 - a depending
# on how the centroids happened to be numbered. With two clusters and two
# classes there are only two possible id -> class assignments, so score both
# and report the better-matching one.
acc_as_is=asc(target,clusters)
acc_swapped=asc(target,1-clusters)     # same clustering with the 0/1 ids exchanged
print("Accuracy of our K-Means model is",max(acc_as_is,acc_swapped))

Accuracy of our K-Means model is 0.7090103397341211
