In [70]:
from pymongo import MongoClient
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
from pprint import pprint

In [79]:
collection = MongoClient(port = 27017)["proj_4"]['mushrooms']

In [80]:
print(f"Count of documents in db: {collection.count_documents({})}")
print("Sample document: ")
pprint(collection.find_one())

Count of documents in db: 61069
Sample document: 
{'_id': ObjectId('67ce59daef8581886d4f0416'),
 'cap-color': 'o',
 'cap-diameter': '15.26',
 'cap-shape': 'x',
 'cap-surface': 'g',
 'class': 'p',
 'does-bruise-or-bleed': 'f',
 'gill-attachment': 'e',
 'gill-color': 'w',
 'gill-spacing': '',
 'habitat': 'd',
 'has-ring': 't',
 'ring-type': 'g',
 'season': 'w',
 'spore-print-color': '',
 'stem-color': 'w',
 'stem-height': '16.95',
 'stem-root': 's',
 'stem-surface': 'y',
 'stem-width': '17.09',
 'veil-color': 'w',
 'veil-type': 'u'}


In [4]:
data = collection.find({})
raw_data_df = pd.DataFrame(list(data))
raw_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   _id                   61069 non-null  object
 1   class                 61069 non-null  object
 2   cap-diameter          61069 non-null  object
 3   cap-shape             61069 non-null  object
 4   cap-surface           61069 non-null  object
 5   cap-color             61069 non-null  object
 6   does-bruise-or-bleed  61069 non-null  object
 7   gill-attachment       61069 non-null  object
 8   gill-spacing          61069 non-null  object
 9   gill-color            61069 non-null  object
 10  stem-height           61069 non-null  object
 11  stem-width            61069 non-null  object
 12  stem-root             61069 non-null  object
 13  stem-surface          61069 non-null  object
 14  stem-color            61069 non-null  object
 15  veil-type             61069 non-null

In [15]:
#cast measure columns as float datatype, after conversion to string following db insertion.
raw_data_df = raw_data_df.astype({
    "cap-diameter" : 'float64',
    'stem-height': 'float64',
    'stem-width': 'float64'
})


In [7]:
#replace 'p' and 'e' classification labels with 1 and 0
raw_data_df["class"] = raw_data_df["class"].replace({"p":0, "e":1})

  raw_data_df["class"] = raw_data_df["class"].replace({"p":0, "e":1})


## Fit Model with All Features
#### Empty values in records are categorized as 'n/a', and factored into model fit.

In [17]:
#convert empty db records into unique category 'n/a'
values = {"gill-attachment": "n/a",
         "cap-surface":"n/a",
         "gill-spacing": "n/a",
         "stem-root":"n/a",
         "stem-surface": "n/a",
         "veil-type": "n/a",
         "veil-color":"n/a",
         "ring-type":"n/a",
         "spore-print-color":"n/a"}
blanks_filled_df = raw_data_df.fillna(value = values)

In [44]:
#https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas
blanks_filled_df = raw_data_df.replace(r'^\s*$','n/a', regex=True)

In [45]:
blanks_filled_df.shape

(61069, 22)

In [46]:
#isolate target variable
blanks_filled_target_variable_df = pd.DataFrame(blanks_filled_df["class"])

#isolate numerical features
blanks_filled_numerical_df = pd.DataFrame(blanks_filled_df[["cap-diameter", "stem-height", "stem-width"]])

#isolate categorical features to next create dummies, and remove "_id" mongoDB column.
blanks_filled_categorical_df = blanks_filled_df.drop(columns = ["_id", "class", "cap-diameter", "stem-height", "stem-width"])

In [47]:
blanks_filled_categorical_df.nunique()

cap-shape                7
cap-surface             12
cap-color               12
does-bruise-or-bleed     2
gill-attachment          8
gill-spacing             4
gill-color              12
stem-root                6
stem-surface             9
stem-color              13
veil-type                2
veil-color               7
has-ring                 2
ring-type                9
spore-print-color        8
habitat                  8
season                   4
dtype: int64

In [48]:
#convert categorical features into dummies
dummies = pd.get_dummies(blanks_filled_categorical_df, dtype = int)

In [49]:
dummies.shape

(61069, 125)

In [50]:
#concat numerical features with dummy features
blanks_filled_all_features_df = pd.concat([blanks_filled_numerical_df, dummies], axis = 1)

In [54]:
#concat all feautres with target variable
final_blanks_filled_nonScaled_df = pd.concat([blanks_filled_target_variable_df,blanks_filled_all_features_df], axis = 1)

In [55]:
final_blanks_filled_nonScaled_df.shape

(61069, 129)

In [56]:
X_data = final_blanks_filled_nonScaled_df.drop(columns = ["class"])
y_data = final_blanks_filled_nonScaled_df["class"].to_numpy()

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data)


In [58]:
scaler = StandardScaler()
scaler_fit = scaler.fit(X_train)
X_train_scaled = scaler_fit.transform(X_train)
X_test_scaled = scaler_fit.transform(X_test)

In [59]:
knn_model_1 = KNeighborsClassifier(n_neighbors = 3)

In [60]:
knn_model_1.fit(X_train_scaled, y_train)

In [61]:
predictions = knn_model_1.predict(X_test_scaled)

In [62]:
acc_score = accuracy_score(y_test, predictions)

In [63]:
print(f"Accuracy score: {acc_score}")

Accuracy score: 1.0


In [64]:
conf_matrix = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(conf_matrix,
                     index = ["actual 0", "actual 1"],
                     columns = ["predicted 0", "predicted 1"]
                    )

In [65]:
cm_df

Unnamed: 0,predicted 0,predicted 1
actual 0,8429,0
actual 1,0,6839


In [66]:
cl_report = classification_report(y_test, predictions)


In [67]:
print(cl_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8429
           1       1.00      1.00      1.00      6839

    accuracy                           1.00     15268
   macro avg       1.00      1.00      1.00     15268
weighted avg       1.00      1.00      1.00     15268



In [18]:
filename = "trained_models/knn_model_1.pkl"

with open(filename, 'wb') as file:
    pickle.dump(knn_model_1, file)

In [19]:
scaler_filename = "trained_scalers/knn_scaler_1.pkl"

with open(scaler_filename, 'wb') as file:
    pickle.dump(scaler_fit, file)