# Mushroom Classifier

In [73]:
import pandas as pd

## Loading and Preprocessing Dataset

In [74]:
data_df = pd.read_csv('./datasets/mushrooms.csv')
data_df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [75]:
print(data_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [76]:
print("Dataset shape", data_df.shape)

Dataset shape (8124, 23)


In [77]:
from sklearn.preprocessing import LabelEncoder

In [78]:
encoders = []
def preprocess_data(df, test=False):
    processed_df = df.copy()

    for idx, fe in enumerate(df.columns):
        if not test:
            encoder = LabelEncoder()
            processed_df[fe] = encoder.fit_transform(df[fe])
            encoders.append((fe, encoder))

        else:
            processed_df[fe] = encoders[idx][1].transform(df[fe])
    

    return processed_df

processed_df = preprocess_data(data_df)
display(processed_df)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


## Training a classifier

In [79]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [80]:
def process_model(model):


    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("F1 Score", f1_score(y_pred, y_test))
    print("Classification Report")
    print(classification_report(y_pred, y_test))
    print("Confusion Matrix")
    print(confusion_matrix(y_pred, y_test))

    return model
    

In [81]:
X_train, X_test, y_train, y_test = train_test_split(
processed_df.drop(['class', 'veil-type' ], axis=1),
processed_df['class'],
test_size=0.2, 
random_state=0
)

In [82]:
svc  = process_model(model=SVC())

F1 Score 0.990228013029316
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       863
           1       0.98      1.00      0.99       762

    accuracy                           0.99      1625
   macro avg       0.99      0.99      0.99      1625
weighted avg       0.99      0.99      0.99      1625

Confusion Matrix
[[850  13]
 [  2 760]]


## Evaluating model's Performance

In [83]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix

y_pred = svc.predict(X_test)
print("F1 Score", f1_score(y_pred, y_test))
print("Classification Report")
print(classification_report(y_pred, y_test))
print("Confusion Matrix")
print(confusion_matrix(y_pred, y_test))

F1 Score 0.990228013029316
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       863
           1       0.98      1.00      0.99       762

    accuracy                           0.99      1625
   macro avg       0.99      0.99      0.99      1625
weighted avg       0.99      0.99      0.99      1625

Confusion Matrix
[[850  13]
 [  2 760]]


## Saving the model for deployment

In [84]:
import joblib
joblib.dump(svc, 'models/classifier-v2')

['models/classifier-v2']