## 1. Importing libraries

In [139]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import imblearn

## 2. Preparing Dataframe 

In [140]:
df = pd.read_csv('datasets_478_974_mushrooms.csv')

In [141]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


## 3. Data Cleaning

**3.1 Using Label Encoder to convert dataset in numerical format**  

In [142]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for col in df.columns:
    df[col] = encoder.fit_transform(df[col])

In [143]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


**3.2 Splitting the data in the form of dependent and independent variables**

In [77]:
X = df.drop('class',axis=1)
y = df['class']

**3.3 Checking for data imbalalnce**

In [78]:
y.value_counts()

0    4208
1    3916
Name: class, dtype: int64

Class '1' indicates Mashroom is Poisionous
Class '0' indicates Mashroom is eatable

**Data has imbalance as class '0' count is higher than class '1'**

**3.4 Correcting data imbalance using undersampling**

In [79]:
from imblearn.under_sampling import NearMiss

In [80]:
nm=NearMiss()
X_res,y_res = nm.fit_sample(X,y)

In [81]:
y_res.value_counts()

1    3916
0    3916
Name: class, dtype: int64

**Now data is in perfect balance for both the classes**

In [82]:
print(X_res.shape,y_res.shape)

(7832, 22) (7832,)


**3.5 Feature Scalling** 

In [83]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_res= sc.fit_transform(X_res)

**3.6 Applying Principle component analysis with n_components = 4**

In [85]:
from sklearn.decomposition import PCA
pca = PCA(n_components=4)

X_res = pca.fit_transform(X_res)

In [86]:
X_res.shape

(7832, 4)

## 4. Model training with KNN algorithm

**4.1. Splitting The data into training and testing dataset**

In [88]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=101)

**4.2 fitting the model on test dataset with n_neightbour = 1 for instance**

In [89]:
from sklearn.neighbors import KNeighborsClassifier

In [147]:
model = KNeighborsClassifier(n_neighbors=2)

In [148]:
model.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=2)

## 5. Prediction and Evaluation

**5.1 Predicting output with Test dataset**

In [149]:
y_predict = model.predict(X_test)

In [150]:
y_predict

array([1, 1, 1, ..., 1, 1, 0])

**5.2 Evaluating the model by creating a function to find traing and testing accyracy**

In [151]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [152]:
def print_score(model,X_train,y_train,X_test,y_test,train=True):
    if train == True:
        print("Training results:\n")
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_train,model.predict(X_train))))
        print('Classification Report:\n{}\n'.format(classification_report(y_train,model.predict(X_train))))
        print('Confusion Matrix:\n{}\n'.format(confusion_matrix(y_train,model.predict(X_train))))
        res = cross_val_score(model, X_train, y_train, cv=10, n_jobs=-1, scoring='accuracy')
       
    elif train == False:
        print("Test results:\n")
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test,model.predict(X_test))))
        print('Classification Report:\n{}\n'.format(classification_report(y_test,model.predict(X_test))))
        print('Confusion Matrix:\n{}\n'.format(confusion_matrix(y_test,model.predict(X_test))))

In [153]:
print_score(model,X_train,y_train,X_test,y_test,train=True)

Training results:

Accuracy Score: 0.9962

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2742
           1       1.00      0.99      1.00      2740

    accuracy                           1.00      5482
   macro avg       1.00      1.00      1.00      5482
weighted avg       1.00      1.00      1.00      5482


Confusion Matrix:
[[2742    0]
 [  21 2719]]



In [154]:
print_score(model,X_train,y_train,X_test,y_test,train=False)

Test results:

Accuracy Score: 0.9932

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1174
           1       1.00      0.99      0.99      1176

    accuracy                           0.99      2350
   macro avg       0.99      0.99      0.99      2350
weighted avg       0.99      0.99      0.99      2350


Confusion Matrix:
[[1173    1]
 [  15 1161]]

