In [84]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_recall_fscore_support
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

## Load the prepared dataset 

In [85]:
# Read the prepared mushroom sample CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="prepared_mushrooms_sample.csv")

In [86]:
# Lift pandas' column-display cap so no column of the wide frame is elided,
# then preview the first row.
pd.set_option("display.max_columns", None)
df.head(n=1)

Unnamed: 0.1,Unnamed: 0,ring-number,bruises_true,cap-color_buff,cap-color_cinammon,cap-color_gray,cap-color_green,cap-color_pink,cap-color_purple,cap-color_red,cap-color_white,cap-color_yellow,cap-shape_conical,cap-shape_convex,cap-shape_flat,cap-shape_knobbed,cap-shape_sunken,cap-surface_grooves,cap-surface_scaly,cap-surface_smooth,class_poisonous,gill-attachment_free,gill-color_brown,gill-color_buff,gill-color_chocolate,gill-color_gray,gill-color_green,gill-color_orange,gill-color_pink,gill-color_purple,gill-color_red,gill-color_white,gill-color_yellow,gill-size_narrow,gill-spacing_crowded,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods,odor_anise,odor_creosote,odor_fishy,odor_foul,odor_musty,odor_none,odor_pungent,odor_spicy,population_clustered,population_numerous,population_scattered,population_several,population_solitary,ring-type_flaring,ring-type_large,ring-type_none,ring-type_pendant,spore-print-color_brown,spore-print-color_buff,spore-print-color_chocolate,spore-print-color_green,spore-print-color_orange,spore-print-color_purple,spore-print-color_white,spore-print-color_yellow,stalk-color-above-ring_buff,stalk-color-above-ring_cinammon,stalk-color-above-ring_gray,stalk-color-above-ring_orange,stalk-color-above-ring_pink,stalk-color-above-ring_red,stalk-color-above-ring_white,stalk-color-above-ring_yellow,stalk-color-below-ring_buff,stalk-color-below-ring_cinammon,stalk-color-below-ring_gray,stalk-color-below-ring_orange,stalk-color-below-ring_pink,stalk-color-below-ring_red,stalk-color-below-ring_white,stalk-color-below-ring_yellow,stalk-root_club_1,stalk-root_equal_1,stalk-root_nan_1,stalk-root_rooted_1,stalk-shape_tapering,stalk-surface-above-ring_scaly,stalk-surface-above-ring_silky,stalk-surface-above-ring_smooth,stalk-surface-below-ring_scaly,stalk-surface-below-ring_silky,stalk-surface-below-ring_smooth,veil-color_orange,veil-color_white,veil-color_yellow
0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0


In [87]:
# Remove the 'Unnamed: 0' column (presumably an index that was saved with the
# CSV — it is not a mushroom attribute).
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

## The label is class_poisonous 

In [88]:
# Class balance of the target column.
df.loc[:, "class_poisonous"].value_counts()

0    2000
1    2000
Name: class_poisonous, dtype: int64

###### The classes are balanced: equal numbers of poisonous and edible samples

## Split the data into train and test sets

In [89]:
# Separate the target (class_poisonous) from the feature columns.
y = df['class_poisonous']
X = df.drop('class_poisonous', axis=1).values

# Stratified 80/20 split. A fixed random_state makes the split — and every
# score computed from it — reproducible across kernel restarts; without it
# each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

In [90]:
# Shapes of the train and test feature matrices, in that order.
tuple(split.shape for split in (X_train, X_test))

((3200, 94), (800, 94))

### Start by Training Logistic Regression

### Create a pipeline 

In [91]:
# Single-step pipeline; the step name 'lg' is the prefix used by the parameter grid.
lg_steps = [('lg', LogisticRegression())]
pipe = Pipeline(steps=lg_steps)

In [92]:
# Hyper-parameter grid for the 'lg' pipeline step: every solver is tried with
# every iteration cap.
param_grid = dict(
    lg__solver=['lbfgs', 'liblinear', 'newton-cholesky', 'sag', 'saga'],
    lg__max_iter=[50, 100, 200, 300],
)

In [None]:
# 5-fold grid search over the logistic-regression pipeline, using all cores;
# refit=True retrains the best candidate on the full training set.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    refit=True,
)
search.fit(X_train, y_train)

In [94]:
# Keep the winning hyper-parameters and the refitted pipeline for later use.
LG_best_params = search.best_params_
LG_best_model = search.best_estimator_

print("Best CV score = %0.3f:" % search.best_score_)
print("Best parameters: ", LG_best_params)

Best CV score = 0.999:
Best parameters:  {'lg__max_iter': 50, 'lg__solver': 'lbfgs'}


###### We can see that the highest cross-validation accuracy of 99.9% was achieved by using the lbfgs solver and a max_iter of 50


## Now Train SVM

In [95]:
# Grid-search an SVC (created with probability=True) over C, gamma and kernel
# using 5-fold cross-validation on all cores.
pipe = Pipeline([('svm', svm.SVC(probability=True))])
param_grid = dict(
    svm__C=[0.1, 1, 10, 100],
    svm__gamma=[1, 0.1, 0.01, 0.001],
    svm__kernel=['rbf', 'linear', 'poly'],
)
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    refit=True,
)
search.fit(X_train, y_train)

# Keep the winning hyper-parameters and the refitted pipeline for later use.
SVM_best_params = search.best_params_
SVM_best_model = search.best_estimator_

print("Best CV score = %0.3f:" % search.best_score_)
print("Best parameters: ", SVM_best_params)

Best CV score = 1.000:
Best parameters:  {'svm__C': 0.1, 'svm__gamma': 1, 'svm__kernel': 'poly'}


###### We can see that the highest accuracy of 100% was achieved by using a C value of 0.1, a gamma of 1 
and the poly kernel


## Now train Random Forest

In [96]:
# Grid-search a random forest with 5-fold cross-validation.
# The forest is seeded (random_state) so that its bootstrap samples and feature
# draws — and therefore the CV scores and the selected parameters — are the
# same on every run; an unseeded forest can report different "best" settings
# each time the notebook is executed.
pipe = Pipeline(steps=[('rf', RandomForestClassifier(random_state=42))])
param_grid = {
    'rf__n_estimators': [100, 200, 300], # Number of trees in the forest
    'rf__max_depth': [None, 5, 10], # Maximum depth of the trees
    'rf__min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
    'rf__min_samples_leaf': [1, 2, 4] # Minimum number of samples required at each leaf node
}
search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=5, refit=True)
search.fit(X_train, y_train)
print("Best CV score = %0.3f:" % search.best_score_)
print("Best parameters: ", search.best_params_)

# store the best params and best model for later use
RF_best_params = search.best_params_
RF_best_model = search.best_estimator_

Best CV score = 1.000:
Best parameters:  {'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}


We can see the Random Forest performed best using a max_depth of None, min_samples_leaf of 1, 
min_samples_split of 2 and n_estimators set to 100

## Now train naive bayes 

In [97]:
from sklearn.naive_bayes import GaussianNB

#### using an article we found online https://medium.com/analytics-vidhya/how-to-improve-naive-bayes-9fa698e14cba 
#### we applied param tuning 

In [99]:
# Tune GaussianNB's var_smoothing over 100 log-spaced values from 1e0 down to
# 1e-9, using 5-fold cross-validation on all cores.
param_grid_nb = dict(var_smoothing=np.logspace(start=0, stop=-9, num=100))
search = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid_nb,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train)

# Keep the winning hyper-parameters and the refitted model for later use.
NB_best_params = search.best_params_
NB_best_model = search.best_estimator_

print(NB_best_model)
print("Best CV score = %0.3f:" % search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
GaussianNB(var_smoothing=0.02848035868435802)
Best CV score = 0.996:


###### Using a var_smoothing of 0.0285 we achieved an accuracy of 99.6%

## Use each model to make predictions on the test data 

#### Calculating the accuracy, precision, recall and f1-score for each model using sklearn's classification_report()

In [100]:
from sklearn.metrics import classification_report

In [101]:
def evaluate(X_test, y_test, model):
    """Print a classification report for ``model`` on a held-out set.

    Parameters
    ----------
    X_test : array-like
        Feature matrix handed to ``model.predict``.
    y_test : array-like
        True labels, encoded as 0 (edible) / 1 (poisonous).
    model : fitted estimator
        Any object exposing a ``predict`` method.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    y_pred = model.predict(X_test)
    # Pin the label order explicitly so "Edible"/"Poisonous" are always tied to
    # 0/1 rather than to whichever labels happen to be present in the data.
    print(classification_report(y_test, y_pred, labels=[0, 1],
                                target_names=["Edible", "Poisonous"]))

# Logistic Regression

In [102]:
# Test-set report for the tuned logistic-regression pipeline.
evaluate(X_test=X_test, y_test=y_test, model=LG_best_model)

              precision    recall  f1-score   support

      Edible       1.00      1.00      1.00       400
   Poisonous       1.00      1.00      1.00       400

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800



# SVM

In [103]:
# Test-set report for the tuned SVM pipeline.
evaluate(X_test=X_test, y_test=y_test, model=SVM_best_model)

              precision    recall  f1-score   support

      Edible       1.00      1.00      1.00       400
   Poisonous       1.00      1.00      1.00       400

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800



# Random Forest

In [104]:
# Test-set report for the tuned random-forest pipeline.
evaluate(X_test=X_test, y_test=y_test, model=RF_best_model)

              precision    recall  f1-score   support

      Edible       1.00      1.00      1.00       400
   Poisonous       1.00      1.00      1.00       400

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800



# Naive Bayes

In [105]:
# Test-set report for the tuned Gaussian naive Bayes model.
evaluate(X_test=X_test, y_test=y_test, model=NB_best_model)

              precision    recall  f1-score   support

      Edible       0.99      0.99      0.99       400
   Poisonous       0.99      0.99      0.99       400

    accuracy                           0.99       800
   macro avg       0.99      0.99      0.99       800
weighted avg       0.99      0.99      0.99       800



#### Multiple models achieving an accuracy of 100% may indicate that this classification problem is too simple for machine learning algorithms; it may also be due to the size of the training and test sets, which are quite small


In [None]:
### We tried changing the size of the train:test split. This had no impact and the accuracy stayed at 100%

In [111]:
# Repeat the experiment with a 60/40 stratified split. Both the split and the
# forest are seeded so the "no impact" observation can be reproduced exactly;
# unseeded, every run would use a different split and a different forest.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=0.4, shuffle=True, stratify=y, random_state=42
)

# random forest using best params found
rf = RandomForestClassifier(max_depth=None, min_samples_leaf=1,
                            min_samples_split=2, n_estimators=100,
                            random_state=42)
rf.fit(X_train2, y_train2)

In [112]:
# Report for the forest trained on the 60/40 split.
evaluate(X_test=X_test2, y_test=y_test2, model=rf)

              precision    recall  f1-score   support

      Edible       1.00      1.00      1.00       800
   Poisonous       1.00      1.00      1.00       800

    accuracy                           1.00      1600
   macro avg       1.00      1.00      1.00      1600
weighted avg       1.00      1.00      1.00      1600

