In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

In [2]:
# Location of the mushrooms dataset. Set the MUSHROOMS_CSV environment variable
# to run this notebook on another machine; when it is unset, the original
# local path is used so existing behaviour is unchanged.
data_path = os.environ.get("MUSHROOMS_CSV", r"C:\Users\gagan soni\Downloads\mushrooms.csv")
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
df.shape

(8124, 23)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [5]:
# Separate the target column from the predictor columns.
target_col = 'class'
out = df[target_col]
inp = df.drop(columns=[target_col])

In [6]:
def _one_hot(data):
    """Return `data` dummy-encoded with the first level dropped, cast to int."""
    encoded = pd.get_dummies(data, drop_first=True)
    return encoded.astype(int)


# Encode predictors and target with the same scheme.
X = _one_hot(inp)
y = _one_hot(out)

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

In [8]:
# Base model; the fixed random_state keeps the search repeatable.
estimator = XGBClassifier(random_state=42)

# Hyper-parameter grid explored by the search.
# learning_rate starts at 0.1: a rate of 0 scales every tree's contribution to
# zero, so those candidates could never win and only added wasted fits.
param_grid = {
    'n_estimators': list(range(1, 20)),
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 4, 5],
    'gamma': [0, 0.15],
}

# 5-fold cross-validated grid search scored on accuracy.
grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# Best XGBoost hyper-parameters found by the search.
grid.best_params_

{'gamma': 0, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 16}

In [9]:
# Importance scores of the best estimator found by the grid search.
best_model = grid.best_estimator_
best_model.feature_importances_

array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 5.8482545e-03,
       0.0000000e+00, 2.1757150e-02, 0.0000000e+00, 1.0373301e-02,
       3.4237947e-02, 0.0000000e+00, 4.9518293e-01, 1.6374875e-02,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 2.2504633e-02,
       2.6238766e-03, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 1.7728975e-01, 0.0000000e+00,
       8.7993741e-02, 4.1577610e-04, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 1.3759161e-02, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e

In [10]:
# Tabulate the importance scores against the encoded feature names.
importance_scores = grid.best_estimator_.feature_importances_
features = pd.DataFrame({'Importance': importance_scores}, index=X.columns)
features.head()

Unnamed: 0,Importance
cap-shape_c,0.0
cap-shape_f,0.0
cap-shape_k,0.0
cap-shape_s,0.0
cap-shape_x,0.0


In [11]:
# Names of the features whose importance score is above zero.
has_importance = features['Importance'] > 0
imp_features = features.loc[has_importance].index.tolist()
imp_features

['cap-color_w',
 'bruises_t',
 'odor_f',
 'odor_l',
 'odor_n',
 'odor_p',
 'gill-spacing_w',
 'gill-size_n',
 'stalk-root_c',
 'stalk-root_r',
 'stalk-surface-above-ring_k',
 'stalk-surface-below-ring_y',
 'stalk-color-below-ring_n',
 'spore-print-color_r',
 'spore-print-color_u',
 'spore-print-color_w']

In [12]:
# Restrict the encoded predictors to the selected feature columns.
X_imp = X.loc[:, imp_features]

In [13]:
# Re-split using only the selected columns (same seed and test size as before).
# Note: this overwrites the earlier X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_imp,
    y,
    test_size=0.2,
    random_state=9,
)

In [14]:
# Final model: reuse the hyper-parameters chosen by the grid search instead of
# hand-copying them, and pin the same seed as the search estimator so the
# two models are configured consistently.
xg_boost = XGBClassifier(random_state=42, **grid.best_params_)
xg_boost.fit(X_train, y_train)

# Training accuracy
y_pred_train = xg_boost.predict(X_train)
print('train aaccuracy', accuracy_score(y_train, y_pred_train))

# Mean 5-fold cross-validation score on the training split
cv_scores = cross_val_score(xg_boost, X_train, y_train, cv=5)
print('cross validation score is :', cv_scores.mean())

# Test accuracy
y_pred_test = xg_boost.predict(X_test)
print('test accuracy is :', accuracy_score(y_test, y_pred_test))

train aaccuracy 1.0
cross validation score is : 0.9993843785160182
test accuracy is : 1.0
