In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


# X and y

In [3]:
input_data=df.drop('class',axis=1)
X=pd.get_dummies(input_data,drop_first=True)

In [4]:
y=df['class']

# Train Test Split

In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101)

# Gradient Boost Classifier Model with default parameters

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
gb_model=GradientBoostingClassifier()
gb_model.fit(X_train,y_train)

ypred_train=gb_model.predict(X_train)
ypred_test=gb_model.predict(X_test)

from sklearn.metrics import accuracy_score
print('Train accuracy:',accuracy_score(y_train,ypred_train))
print('Test accuracy:',accuracy_score(y_test,ypred_test))

from sklearn.model_selection import cross_val_score
scores=cross_val_score(gb_model,X,y,cv=5)
print('Cross Validation Score:',scores.mean())

Train accuracy: 0.9996922603477458
Test accuracy: 0.9993846153846154
Cross Validation Score: 0.9192312239484653


# Identifying best parameters for Gradient Boost model using hyperparameter tuning 

In [7]:
from sklearn.model_selection import GridSearchCV

estimator=GradientBoostingClassifier()

param_grid={'n_estimators':[1,5,10,40,100],
            'learning_rate':[0.1,0.2,0.3,0.5,0.8,1]}

grid=GridSearchCV(estimator,param_grid,cv=5,scoring='accuracy')

grid.fit(X_train,y_train)

print(grid.best_params_)

grid.best_estimator_

{'learning_rate': 0.5, 'n_estimators': 40}


GradientBoostingClassifier(learning_rate=0.5, n_estimators=40)

# Feature Importances

In [8]:
grid.best_estimator_.feature_importances_

array([8.13838970e-05, 1.29538823e-16, 5.57836439e-17, 0.00000000e+00,
       0.00000000e+00, 1.27225590e-03, 8.68144564e-06, 3.98393828e-05,
       2.15279989e-07, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.70115553e-17, 0.00000000e+00, 0.00000000e+00, 2.72861667e-06,
       2.18578349e-03, 4.89948888e-02, 1.87061481e-04, 3.47255979e-03,
       1.85991628e-02, 3.67075387e-06, 6.29386732e-01, 1.09716739e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.12015332e-03,
       2.71941999e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       7.56940816e-07, 2.62758426e-03, 1.36173284e-01, 9.81872022e-03,
       5.76927007e-02, 1.23115324e-02, 2.50388870e-06, 1.14190376e-05,
       3.31278750e-08, 0.00000000e+00, 1.35354114e-02, 7.23131416e-06,
       0.00000000e+00, 0.00000000e+00, 6.73917104e-04, 0.00000000e+00,
      

In [9]:
feats=pd.DataFrame(index=X.columns,
                  data=grid.best_estimator_.feature_importances_,
                  columns=['Importance'])

important_features=feats[feats['Importance']>0.01]
important_features

Unnamed: 0,Importance
bruises_t,0.048995
odor_l,0.018599
odor_n,0.629387
stalk-root_c,0.136173
stalk-root_r,0.057693
stalk-surface-above-ring_k,0.012312
stalk-surface-below-ring_y,0.013535
spore-print-color_h,0.016548
spore-print-color_r,0.028946


In [10]:
imp_features_list=feats[feats['Importance']>0.01].index.to_list()
imp_features_list

['bruises_t',
 'odor_l',
 'odor_n',
 'stalk-root_c',
 'stalk-root_r',
 'stalk-surface-above-ring_k',
 'stalk-surface-below-ring_y',
 'spore-print-color_h',
 'spore-print-color_r']

# Gradient Boost with best hyperparameters

In [11]:
X=X[imp_features_list]
y=df['class']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=101)

gb_bhp=GradientBoostingClassifier(n_estimators=40,learning_rate=0.5)
gb_bhp.fit(X_train,y_train)

ypred_train=gb_bhp.predict(X_train)
predictions=gb_bhp.predict(X_test)

from sklearn.metrics import accuracy_score
print('Train accuracy:',accuracy_score(y_train,ypred_train))
print('Test accuracy:',accuracy_score(y_test,predictions))

from sklearn.model_selection import cross_val_score
scores=cross_val_score(gb_bhp,X,y,cv=5)
print('Cross Validation Score:',scores.mean())

Train accuracy: 0.9927893070699965
Test accuracy: 0.9938474159146842
Cross Validation Score: 0.966276923076923


# Confusion Matrix

In [12]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,predictions)

array([[1262,   12],
       [   3, 1161]])

# Classification Report

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           e       1.00      0.99      0.99      1274
           p       0.99      1.00      0.99      1164

    accuracy                           0.99      2438
   macro avg       0.99      0.99      0.99      2438
weighted avg       0.99      0.99      0.99      2438

