In [3]:
#import all dependencies
import numpy as np
from pymongo import MongoClient
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
from pprint import pprint

## Import all records from DB

In [4]:
#create db cursor
collection = MongoClient(port = 27017)["proj_4"]['mushrooms']

In [5]:
#check db connection
print(f"Count of documents in db: {collection.count_documents({})}")
print("Sample document: ")
pprint(collection.find_one())

Count of documents in db: 61069
Sample document: 
{'_id': ObjectId('67ce59daef8581886d4f0416'),
 'cap-color': 'o',
 'cap-diameter': '15.26',
 'cap-shape': 'x',
 'cap-surface': 'g',
 'class': 'p',
 'does-bruise-or-bleed': 'f',
 'gill-attachment': 'e',
 'gill-color': 'w',
 'gill-spacing': '',
 'habitat': 'd',
 'has-ring': 't',
 'ring-type': 'g',
 'season': 'w',
 'spore-print-color': '',
 'stem-color': 'w',
 'stem-height': '16.95',
 'stem-root': 's',
 'stem-surface': 'y',
 'stem-width': '17.09',
 'veil-color': 'w',
 'veil-type': 'u'}


In [6]:
#generate reference dataframe with all unmodified data
data = collection.find({})
raw_data_df = pd.DataFrame(list(data))
raw_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   _id                   61069 non-null  object
 1   class                 61069 non-null  object
 2   cap-diameter          61069 non-null  object
 3   cap-shape             61069 non-null  object
 4   cap-surface           61069 non-null  object
 5   cap-color             61069 non-null  object
 6   does-bruise-or-bleed  61069 non-null  object
 7   gill-attachment       61069 non-null  object
 8   gill-spacing          61069 non-null  object
 9   gill-color            61069 non-null  object
 10  stem-height           61069 non-null  object
 11  stem-width            61069 non-null  object
 12  stem-root             61069 non-null  object
 13  stem-surface          61069 non-null  object
 14  stem-color            61069 non-null  object
 15  veil-type             61069 non-null

In [7]:
#cast numerical features as float datatype (data was stored as string dtype during initial db insertion).
raw_data_df = raw_data_df.astype({
    "cap-diameter" : 'float64',
    'stem-height': 'float64',
    'stem-width': 'float64'
})


In [8]:
#replace 'p' and 'e' classification labels with 1 and 0
raw_data_df["class"] = raw_data_df["class"].replace({"p":0, "e":1})

  raw_data_df["class"] = raw_data_df["class"].replace({"p":0, "e":1})


## Fit model with reduced features
### All features (of original dataset) with blank string values will be dropped

In [9]:
#https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas
# convert all empty strings into NaN values
drop_na_df = raw_data_df.replace(r'^\s*$',np.nan, regex=True)
drop_na_df = drop_na_df.dropna(axis=1)
drop_na_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   _id                   61069 non-null  object 
 1   class                 61069 non-null  int64  
 2   cap-diameter          61069 non-null  float64
 3   cap-shape             61069 non-null  object 
 4   cap-color             61069 non-null  object 
 5   does-bruise-or-bleed  61069 non-null  object 
 6   gill-color            61069 non-null  object 
 7   stem-height           61069 non-null  float64
 8   stem-width            61069 non-null  float64
 9   stem-color            61069 non-null  object 
 10  has-ring              61069 non-null  object 
 11  habitat               61069 non-null  object 
 12  season                61069 non-null  object 
dtypes: float64(3), int64(1), object(9)
memory usage: 6.1+ MB


In [10]:
#isolate target variable
target_variable_dropNaN_df = pd.DataFrame(drop_na_df["class"])

#offload numerical features into standalone df
dropNaN_numerical_df = pd.DataFrame(drop_na_df[["cap-diameter", "stem-height", "stem-width"]])

#isolate categorical features to next create dummies, and remove "_id" mongoDB column.
dropNaN_categorical_df = drop_na_df.drop(columns = ["_id", "class", "cap-diameter", "stem-height", "stem-width"])

In [11]:
#convert categorical features into binary dummy categories
dummies_1 = pd.get_dummies(dropNaN_categorical_df, dtype = int)
dummies_1.shape

(61069, 60)

In [12]:
# combine classification (target variable), numerical features and dummies into unified dataframe.
dropNaN_all_cols_df = pd.concat([target_variable_dropNaN_df, dropNaN_numerical_df, dummies_1], axis = 1)
dropNaN_all_cols_df.shape

(61069, 64)

In [13]:
#isolate features (X) and target variable (y)
X_data_1 = dropNaN_all_cols_df.drop(columns = ["class"])
y_data_1 = dropNaN_all_cols_df['class'].to_numpy()

In [14]:
#split data into training and testing pools
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_data_1, y_data_1)

In [17]:
#scale the data
rf_scaler_fit_1 = StandardScaler().fit(X_train_1)
X_train_scaled_1 = rf_scaler_fit_1.transform(X_train_1)
X_test_scaled_1 = rf_scaler_fit_1.transform(X_test_1)

In [18]:
#instantiate first model
rf_model_1 = RandomForestClassifier(n_estimators = 500, random_state = 78)

In [19]:
#train model
rf_model_1.fit(X_train_scaled_1, y_train_1)

In [21]:
#generate predictions based on testing features
predictions_1 = rf_model_1.predict(X_test_scaled_1)

In [22]:
acc_score_1 = accuracy_score(y_test_1, predictions_1)
print(f"Accuracy score: {acc_score_1}")

Accuracy score: 0.9963976945244957


In [23]:
cf_matrix_1 = confusion_matrix(y_test_1, predictions_1)
cf_matrix_1_df = pd.DataFrame(cf_matrix_1,
                              index = ["actual 0", "actual 1"],
                              columns = ["predicted 0", "predicted 1"]
                             )
cf_matrix_1_df

Unnamed: 0,predicted 0,predicted 1
actual 0,8519,20
actual 1,35,6694


In [24]:
classification_report_1 = classification_report(y_test_1, predictions_1)
print(classification_report_1)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8539
           1       1.00      0.99      1.00      6729

    accuracy                           1.00     15268
   macro avg       1.00      1.00      1.00     15268
weighted avg       1.00      1.00      1.00     15268



## Fit Model with All Features
#### Empty values ('blanks') in records are categorized as 'n/a', and factored into model fit.

In [25]:
#https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas
#convert empty string values into unique value 'n/a', to later be converted into dummy category.
blanks_filled_df = raw_data_df.replace(r'^\s*$','n/a', regex=True)
blanks_filled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   _id                   61069 non-null  object 
 1   class                 61069 non-null  int64  
 2   cap-diameter          61069 non-null  float64
 3   cap-shape             61069 non-null  object 
 4   cap-surface           61069 non-null  object 
 5   cap-color             61069 non-null  object 
 6   does-bruise-or-bleed  61069 non-null  object 
 7   gill-attachment       61069 non-null  object 
 8   gill-spacing          61069 non-null  object 
 9   gill-color            61069 non-null  object 
 10  stem-height           61069 non-null  float64
 11  stem-width            61069 non-null  float64
 12  stem-root             61069 non-null  object 
 13  stem-surface          61069 non-null  object 
 14  stem-color            61069 non-null  object 
 15  veil-type          

In [26]:
#verify original df shape has been maintained
blanks_filled_df.shape

(61069, 22)

In [27]:
#isolate target variable
blanks_filled_target_variable_df = pd.DataFrame(blanks_filled_df["class"])

#isolate numerical features
blanks_filled_numerical_df = pd.DataFrame(blanks_filled_df[["cap-diameter", "stem-height", "stem-width"]])

#isolate categorical features to next create dummies, and remove "_id" mongoDB column.
blanks_filled_categorical_df = blanks_filled_df.drop(columns = ["_id", "class", "cap-diameter", "stem-height", "stem-width"])

In [28]:
#convert categorical features into dummies
dummies_2 = pd.get_dummies(blanks_filled_categorical_df, dtype = int)

In [29]:
dummies_2.shape

(61069, 125)

In [30]:
#concat all feautres with target variable
final_blanks_filled_df = pd.concat([blanks_filled_target_variable_df,blanks_filled_numerical_df, dummies_2], axis = 1)

In [31]:
#check that all columns (target and features) are accounted for.
final_blanks_filled_df.shape

(61069, 129)

In [32]:
#isolate features (X) and target variable (y)
X_data_2 = final_blanks_filled_df.drop(columns = ["class"])
y_data_2 = final_blanks_filled_df["class"].to_numpy()

In [33]:
#split data into training and testing pools
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_data_2, y_data_2)

In [34]:
#scale the data
rf_scaler_fit_2 = StandardScaler().fit(X_train_2)
X_train_scaled_2 = rf_scaler_fit_2.transform(X_train_2)
X_test_scaled_2 = rf_scaler_fit_2.transform(X_test_2)

In [35]:
#train the second KNN model on the newly prepprocessed data
rf_model_2 = RandomForestClassifier(n_estimators = 500, random_state = 78)

In [36]:
#fit the model
rf_model_2.fit(X_train_scaled_2, y_train_2)

In [37]:
predictions_2 = rf_model_2.predict(X_test_scaled_2)

In [38]:
acc_score_2 = accuracy_score(y_test_2, predictions_2)
print(f"Accuracy score: {acc_score_2}")

Accuracy score: 1.0


In [39]:
conf_matrix_2 = confusion_matrix(y_test_2, predictions_2)
cm_df_2 = pd.DataFrame(conf_matrix_2,
                     index = ["actual 0", "actual 1"],
                     columns = ["predicted 0", "predicted 1"]
                    )
cm_df_2

Unnamed: 0,predicted 0,predicted 1
actual 0,8485,0
actual 1,0,6783


In [40]:
cl_report_2 = classification_report(y_test_2, predictions_2)
print(cl_report_2)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8485
           1       1.00      1.00      1.00      6783

    accuracy                           1.00     15268
   macro avg       1.00      1.00      1.00     15268
weighted avg       1.00      1.00      1.00     15268



In [18]:
#store working KNN model
filename = "trained_models/rf_model_2.pkl"

with open(filename, 'wb') as file:
    pickle.dump(rf_model_2, file)

In [19]:
#save fit scaler, to be used when KNN model is loaded for future use.
scaler_filename = "trained_scalers/rf_scaler_fit_2.pkl"

with open(scaler_filename, 'wb') as file:
    pickle.dump(rf_scaler_fit_2, file)