In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier



In [2]:
# Load the mushroom dataset from its CSV file and preview the frame
DATA_PATH = '/kaggle/input/mushroom-classification/mushrooms.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [3]:
# The shape of the dataset: number of rows and number of columns.
# Note: the column count reported as "Features" includes the `class` label column.
f'Records: {df.shape[0]} & Features: {df.shape[1]}'

'Records: 8124 & Features: 23'

In [4]:
# Exploratory analysis: per-column summary statistics
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [5]:
# Count the distinct values of "veil-type": a column holding the same value
# in every record carries no information and is therefore irrelevant.
df['veil-type'].value_counts()

p    8124
Name: veil-type, dtype: int64

In [6]:
# Removing the irrelevant constant column.
# The result is rebound instead of mutated in place, and errors='ignore'
# makes the drop a no-op when the column is already gone, so this cell can
# be re-run without raising KeyError.
df = df.drop(columns=['veil-type'], errors='ignore')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,s,w,w,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,s,w,w,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,s,w,w,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,s,w,w,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,s,w,w,w,o,e,n,a,g


In [7]:
# Column names, non-null counts and dtypes of the current frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [8]:
# Checking for null values: number of missing entries per column
# NOTE(review): isnull() only counts NaN/None; a placeholder category used to
# mark missing data would not be detected here - confirm against the data dictionary.
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [9]:
# Checking for class imbalance: number of records per target class
df['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [10]:
# Separate the predictor columns from the target, then one-hot encode
# every categorical predictor into indicator columns.
features = df.drop(columns='class')
x = pd.get_dummies(features)
x.shape

(8124, 116)

In [11]:
# Preview of the one-hot encoded feature matrix
x

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8120,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
8121,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0


In [12]:
# Seperating labels and one hot encoding them
y = df['class']
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y.shape

(8124,)

In [13]:
# Preview of the integer-encoded label array
y

array([1, 0, 0, ..., 0, 1, 0])

In [14]:
# Split the data: 80% for training, 20% held out for testing.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [15]:
# Sanity check: shapes of the training and testing features and labels
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((6499, 116), (6499,), (1625, 116), (1625,))

In [16]:
# Containers for the model comparison: each model cell appends its display
# name and its test accuracy; `results` is the (initially empty) summary frame.
Name, Accuracy = [], []
results = pd.DataFrame()

In [17]:
# Logistic Regression: fit on the training split, evaluate on the held-out split
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_pred = lr.predict(x_test)
# Score once and reuse the value for both the summary list and the printout
lr_acc = accuracy_score(y_test, lr_pred)
Name.append('Logistic Regression')
Accuracy.append(lr_acc)
print("Accuracy score using Logistic Regression is: {}%".format(lr_acc*100))
print(classification_report(y_test, lr_pred))

Accuracy score using Logistic Regression is: 99.81538461538462%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [18]:
# Gaussian Naive Bayes: fit on the training split, evaluate on the held-out split
# NOTE(review): the inputs are one-hot indicator columns; a Bernoulli or
# categorical Naive Bayes may suit them better than a Gaussian likelihood - confirm.
gnb = GaussianNB()
gnb.fit(x_train, y_train)
gnb_pred = gnb.predict(x_test)
# Score once and reuse the value for both the summary list and the printout
gnb_acc = accuracy_score(y_test, gnb_pred)
Name.append('Naive Bayes')
Accuracy.append(gnb_acc)
print("Accuracy score using Naive Bayes is: {}%".format(gnb_acc*100))
print(classification_report(y_test, gnb_pred))

Accuracy score using Naive Bayes is: 96.55384615384615%
              precision    recall  f1-score   support

           0       1.00      0.93      0.96       820
           1       0.94      1.00      0.97       805

    accuracy                           0.97      1625
   macro avg       0.97      0.97      0.97      1625
weighted avg       0.97      0.97      0.97      1625



In [19]:
# AdaBoost with default settings: fit on the training split, evaluate on the held-out split
ada = AdaBoostClassifier()
ada.fit(x_train, y_train)
ada_pred = ada.predict(x_test)
# Score once and reuse the value for both the summary list and the printout
ada_acc = accuracy_score(y_test, ada_pred)
Name.append('AdaBoost')
Accuracy.append(ada_acc)
print("Accuracy score using AdaBoost is: {}%".format(ada_acc*100))
print(classification_report(y_test, ada_pred))

Accuracy score using AdaBoost is: 100.0%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [20]:
# Applying & fine tuning K-Nearest Neighbor
# k is selected by 5-fold cross-validation on the TRAINING split only.
# Selecting it by accuracy on x_test/y_test would leak the test set into model
# selection and make the reported test accuracy optimistically biased.
k_values = range(1, 51)
cv_folds = 5
best_acc = 0
best_k = 0
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Mean validation accuracy across the folds for this k
    acc_value = cross_val_score(knn, x_train, y_train, cv=cv_folds, scoring='accuracy').mean()
    # Strict '>' keeps the smallest k among equally good candidates
    if acc_value > best_acc:
        best_acc = acc_value
        best_k = k
# "Max accuracy" is the best mean cross-validated accuracy, not a test-set score
print("Max accuracy:", best_acc)
print("Best k:", best_k)

Max accuracy: 1.0
Best k: 1


In [21]:
# Refit K-Nearest Neighbor with the selected k and report held-out performance
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
# Score once and reuse the value for both the summary list and the printout
knn_acc = accuracy_score(y_test, knn_pred)
Name.append('K-Nearest Neighbor')
Accuracy.append(knn_acc)
print("Accuracy score using K-Nearest Neighbor is: {}%".format(knn_acc*100))
print(classification_report(y_test, knn_pred))

Accuracy score using K-Nearest Neighbor is: 100.0%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [22]:
# Applying & fine tuning Support Vectors Classifier
# The (kernel, gamma, C) combination is selected by 5-fold cross-validation on
# the TRAINING split only. Selecting it by accuracy on x_test/y_test would leak
# the test set into model selection and bias the reported test accuracy.
kernels = ['linear', 'rbf', 'poly']
gammas = [0.1, 1, 10, 100]
cs = [0.1, 1, 10, 100, 1000]
cv_folds = 5
best_acc = 0
best_kernel = ''
best_gamma = 0
best_c = 0
for kernel in kernels:
    # gamma has no effect on the linear kernel, so evaluating one gamma value
    # is enough there; with the strict '>' below, the skipped duplicates could
    # never have replaced the first result anyway.
    kernel_gammas = gammas[:1] if kernel == 'linear' else gammas
    for gamma in kernel_gammas:
        for c in cs:
            svc = SVC(kernel=kernel, gamma=gamma, C=c)
            # Mean validation accuracy across the folds, kept as a percentage
            acc_value = cross_val_score(svc, x_train, y_train, cv=cv_folds, scoring='accuracy').mean() * 100
            # Strict '>' keeps the first combination among equally good ones
            if acc_value > best_acc:
                best_acc = acc_value
                best_kernel = kernel
                best_gamma = gamma
                best_c = c
# "Max accuracy" is the best mean cross-validated accuracy, not a test-set score
print("Max accuracy:", best_acc)
print("Best kernel:", best_kernel)
print("Best gamma:", best_gamma)
print("Best C:", best_c)

Max accuracy: 100.0
Best kernel: linear
Best gamma: 0.1
Best C: 1


In [23]:
# Refit the Support Vectors Classifier with the selected hyperparameters
# and report held-out performance
svc = SVC(kernel=best_kernel, gamma=best_gamma, C=best_c)
svc.fit(x_train, y_train)
svc_pred = svc.predict(x_test)
# Score once and reuse the value for both the summary list and the printout
svc_acc = accuracy_score(y_test, svc_pred)
Name.append('Support Vectors Classifier')
Accuracy.append(svc_acc)
print("Accuracy score using Support Vectors Classifier is: {}%".format(svc_acc*100))
print(classification_report(y_test, svc_pred))

Accuracy score using Support Vectors Classifier is: 100.0%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [24]:
# Applying & fine tuning Decision Trees
# max_depth is selected by 5-fold cross-validation on the TRAINING split only.
# Selecting it by accuracy on x_test/y_test would leak the test set into model
# selection and bias the reported test accuracy.
max_depths = [None] + list(range(5, 51, 5))
cv_folds = 5
best_acc = 0
best_max_depth = 0
for max_depth in max_depths:
    # A fixed random_state makes tree construction, and so the selection, repeatable
    dt = DecisionTreeClassifier(max_depth=max_depth, random_state=1)
    # Mean validation accuracy across the folds for this depth
    acc_value = cross_val_score(dt, x_train, y_train, cv=cv_folds, scoring='accuracy').mean()
    # Strict '>' keeps the first depth among equally good candidates
    if acc_value > best_acc:
        best_acc = acc_value
        best_max_depth = max_depth
# "Max accuracy" is the best mean cross-validated accuracy, not a test-set score
print("Max accuracy:", best_acc)
print("Best maximum depth:", best_max_depth)

Max accuracy: 1.0
Best maximum depth: None


In [25]:
# Refit the Decision Tree with the selected depth and report held-out performance.
# A fixed random_state makes the fitted tree, and therefore the reported
# score, repeatable across runs.
dst = DecisionTreeClassifier(max_depth=best_max_depth, random_state=1)
dst.fit(x_train, y_train)
dst_pred = dst.predict(x_test)
# Score once and reuse the value for both the summary list and the printout
dst_acc = accuracy_score(y_test, dst_pred)
Name.append('Decision Trees')
Accuracy.append(dst_acc)
print("Accuracy score using Decision Trees is: {}%".format(dst_acc*100))
print(classification_report(y_test, dst_pred))

Accuracy score using Decision Trees is: 100.0%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [26]:
# Applying & fine tuning Random Forest
# (n_estimators, max_depth) is selected by 5-fold cross-validation on the
# TRAINING split only. Selecting it by accuracy on x_test/y_test would leak the
# test set into model selection and bias the reported test accuracy.
n_estimators = range(10, 201, 10)
max_depths = [None] + list(range(5, 51, 5))
cv_folds = 5
best_acc = 0
best_n_estimators = 0
best_max_depth = 0
for n_estimator in n_estimators:
    for max_depth in max_depths:
        # A fixed random_state makes the forest, and so the selection, repeatable
        rf = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, random_state=1)
        # Mean validation accuracy across the folds; the folds run in parallel
        # (n_jobs=-1) because the grid is large. Parallelism does not change scores.
        acc_value = cross_val_score(
            rf, x_train, y_train, cv=cv_folds, scoring='accuracy', n_jobs=-1
        ).mean()
        # Strict '>' keeps the first (smallest) configuration among equally good ones
        if acc_value > best_acc:
            best_acc = acc_value
            best_n_estimators = n_estimator
            best_max_depth = max_depth
# "Max accuracy" is the best mean cross-validated accuracy, not a test-set score
print("Max accuracy:", best_acc)
print("Best number of estimators:", best_n_estimators)
print("Best maximum depth:", best_max_depth)

Max accuracy: 1.0
Best number of estimators: 10
Best maximum depth: None


In [27]:
# Refit the Random Forest with the selected hyperparameters and report
# held-out performance. A fixed random_state makes the bootstrapped forest,
# and therefore the reported score, repeatable across runs.
rf = RandomForestClassifier(max_depth=best_max_depth, n_estimators=best_n_estimators, random_state=1)
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)
# Score once and reuse the value for both the summary list and the printout
rf_acc = accuracy_score(y_test, rf_pred)
Name.append('Random Forest')
Accuracy.append(rf_acc)
print("Accuracy score using Random Forest is: {}%".format(rf_acc*100))
print(classification_report(y_test, rf_pred))

Accuracy score using Random Forest is: 100.0%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [28]:
# Showing the results
# The summary frame is built fresh from the two accumulator lists. Assigning
# columns one at a time into an already-populated frame raises a
# length-mismatch ValueError when this cell is re-run after more models
# have been appended.
results = pd.DataFrame({'Name': Name, 'Accuracy': Accuracy})
results

Unnamed: 0,Name,Accuracy
0,Logistic Regression,0.998154
1,Naive Bayes,0.965538
2,AdaBoost,1.0
3,K-Nearest Neighbor,1.0
4,Support Vectors Classifier,1.0
5,Decision Trees,1.0
6,Random Forest,1.0
