In [83]:
# Imports
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import recall_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder

In [84]:
# Reading in the dataset
df = pd.read_csv('../data/cleaned_data_1.csv')
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df

Unnamed: 0,class,cap_diameter,cap_shape,cap_surface,cap_color,does_bruise_or_bleed,gill_attachment,gill_spacing,gill_color,stem_height,...,stem_root,stem_surface,stem_color,veil_type,veil_color,has_ring,ring_type,spore_print_color,habitat,season
0,1,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,1,16.60,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,1,14.07,x,g,o,f,e,,w,17.80,...,s,y,w,u,w,t,g,,d,w
3,1,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,1,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,1,1.18,s,s,y,f,f,f,f,3.93,...,,,y,,,f,f,,d,a
61065,1,1.27,f,s,y,f,f,f,f,3.18,...,,,y,,,f,f,,d,a
61066,1,1.27,s,s,y,f,f,f,f,3.86,...,,,y,,,f,f,,d,u
61067,1,1.24,f,s,y,f,f,f,f,3.56,...,,,y,,,f,f,,d,u


# Additional Cleaning and Preprocessing

In [85]:
# Previously empty cells are now represented as NaNs, which will be addressed below
# Values that are unknown will be labeled as 'unknown.'
df.fillna('x', axis = 1, inplace = True)

In [86]:
df.head()

Unnamed: 0,class,cap_diameter,cap_shape,cap_surface,cap_color,does_bruise_or_bleed,gill_attachment,gill_spacing,gill_color,stem_height,...,stem_root,stem_surface,stem_color,veil_type,veil_color,has_ring,ring_type,spore_print_color,habitat,season
0,1,15.26,x,g,o,f,e,x,w,16.95,...,s,y,w,u,w,t,g,x,d,w
1,1,16.6,x,g,o,f,e,x,w,17.99,...,s,y,w,u,w,t,g,x,d,u
2,1,14.07,x,g,o,f,e,x,w,17.8,...,s,y,w,u,w,t,g,x,d,w
3,1,14.17,f,h,e,f,e,x,w,15.77,...,s,y,w,u,w,t,p,x,d,w
4,1,14.64,x,h,o,f,e,x,w,16.53,...,s,y,w,u,w,t,p,x,d,w


In [87]:
df['class'] = df['class'].apply(lambda x: 'p' if x == 1 else 'e')

In [88]:
# Train test split for model evaluation purposes
X = df.iloc[:, 1:]
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [89]:
X_train.isnull().sum()

cap_diameter            0
cap_shape               0
cap_surface             0
cap_color               0
does_bruise_or_bleed    0
gill_attachment         0
gill_spacing            0
gill_color              0
stem_height             0
stem_width              0
stem_root               0
stem_surface            0
stem_color              0
veil_type               0
veil_color              0
has_ring                0
ring_type               0
spore_print_color       0
habitat                 0
season                  0
dtype: int64

# Model Selection and Implementation

Given my previous findings mentioned in cleaning_eda_1.ipynb, I've decided to implement Random Forests as the algorithm is robust to outliers, doesn't require feature scaling, and doesn't make any formal assumptions regarding feature distributions. It also handles skewed data and multi-modal data as well.

I'll be using RandomizedSearchCV for hyperparameter tuning.

In [91]:
# One Hot Encoding categorical variables and passing through continuous variables
cat_col = ['cap_shape', 'cap_surface', 'cap_color', 'does_bruise_or_bleed', 'gill_attachment', 
           'gill_spacing', 'gill_color', 'stem_root', 'stem_surface', 'stem_color', 'veil_type', 'veil_color', 
           'has_ring', 'ring_type', 'spore_print_color', 'habitat', 'season']

ct = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown = 'ignore'), cat_col)
], remainder = 'passthrough')

pipeline = Pipeline([
    ('ct', ct),
    ('rf', RandomForestClassifier(random_state = 42))
])

rf_params = [{
    'rf__n_estimators': [100, 200, 400, 600],
    'rf__max_depth': [10, 50, 100, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}]

rf_rs = RandomizedSearchCV(pipeline, rf_params, n_iter = 100, cv = 5, verbose = 1, random_state = 42, n_jobs = -1,
                          error_score = 'raise')

rf_rs.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, error_score='raise',
                   estimator=Pipeline(steps=[('ct',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('ohe',
                                                                               OneHotEncoder(handle_unknown='ignore'),
                                                                               ['cap_shape',
                                                                                'cap_surface',
                                                                                'cap_color',
                                                                                'does_bruise_or_bleed',
                                                                                'gill_attachment',
                                                                                'gill_spacing',
                            

# Model Evaluation

For the sake of predicting a mushroom's edibility, I decided to utilize both accuracy and sensitivity as my metrics for evaluating model performance. This is to balance between maximizing the total number of right predictions as well as reduce the number of false negatives (predicting poisonous mushrooms as edible) predicted by the model.

In [94]:
# Train and Test Accuracy
print(f'Train Accuracy: {rf_rs.best_score_}, Test Accuracy: {accuracy_score(y_test, rf_rs.predict(X_test))}')

Train Accuracy: 1.0, Test Accuracy: 1.0


In [113]:
# Train and Test Sensitivity / Recall
train_sens = recall_score(y_train, rf_rs.predict(X_train), pos_label = 'p')
test_sens = recall_score(y_test, rf_rs.predict(X_test), pos_label = 'p')
print(f'Train Sensitivity: {train_sens}, Test Sensitivity: {test_sens}')

Train Sensitivity: 1.0, Test Sensitivity: 1.0


In [136]:
# Train Confusion Matrix 
confusion_matrix(y_train, rf_rs.predict(X_train))

array([[20400,     0],
       [    0, 25401]])

In [137]:
# Test Confusion Matrix
confusion_matrix(y_test, rf_rs.predict(X_test))

array([[6781,    0],
       [   0, 8487]])

There is no evidence of overfitting given the lack of difference between performance for both metrics across the training and testing dataset. However, I would say given the extensive computation time (> 2 hours) alongside the 100% scores suggest high bias in my model.

In [131]:
# Feature Importances
rf_rs.best_estimator_['rf'].feature_importances_

array([1.39494955e-02, 8.72334517e-03, 7.89149736e-03, 3.25741655e-03,
       2.74514662e-03, 5.56206528e-03, 1.36435428e-02, 7.07478513e-03,
       8.18656904e-03, 1.01593181e-02, 6.37131038e-03, 5.04951219e-03,
       1.04498064e-02, 3.61629154e-03, 1.28287096e-02, 9.01891750e-03,
       4.79102113e-03, 1.44879515e-02, 7.94593546e-03, 2.18314547e-03,
       9.39816927e-03, 4.97178857e-03, 1.28979025e-03, 1.03944328e-03,
       1.05868569e-02, 3.64546963e-03, 3.41884937e-03, 8.26510444e-03,
       3.68913649e-03, 6.67126135e-03, 6.50091218e-03, 1.38262719e-02,
       1.46659081e-02, 1.08869810e-02, 1.40044181e-02, 1.11796945e-02,
       4.81877725e-03, 2.00177157e-02, 8.65787036e-03, 1.82759165e-02,
       1.95763386e-02, 2.49338333e-02, 4.83032602e-03, 1.77321783e-02,
       1.36110089e-03, 3.00927987e-03, 5.47279279e-03, 4.84739347e-03,
       1.89845407e-03, 8.82432999e-03, 6.55073969e-03, 9.21284534e-03,
       3.39878477e-03, 1.22220845e-03, 1.92974072e-02, 1.22158630e-02,
      

In [132]:
rf_rs.best_estimator_['rf'].n_features_in_

127

It's apparent that out of 127 features utilized in my model, there are none that are particularly important in classifying a mushroom's edibility. Rather, it is a combination of all the features that produce strong classification results.  This is supported by my EDA found in my cleaning_eda_1.ipynb notebook, which found a lack of strong correlation between any of the features present in the dataset and the target (class) variable. 