In [148]:
import json
import os
import pickle
import shutil

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC
# Show every column when a wide DataFrame is displayed. `pd.option_context`
# only takes effect inside a `with` block, so calling it bare changed nothing;
# `pd.set_option` applies the setting for the rest of the session.
pd.set_option('display.max_columns', None)

<pandas._config.config.option_context at 0x79cbbc234520>

In [31]:
# Colab-only import: `google.colab` is not available in other environments.
# NOTE(review): presumably used to upload the Kaggle API token
# (`kaggle.json`) needed by the download cell further down; confirm.
from google.colab import files
uploaded = files.upload()

In [33]:
# Put the uploaded Kaggle API token where the `kaggle` CLI looks for it.
# The existence check keeps the cell re-runnable once the token has been moved.
if os.path.exists('/content/kaggle.json'):
    os.makedirs('/root/.kaggle', exist_ok=True)
    shutil.move('/content/kaggle.json', '/root/.kaggle/kaggle.json')
    # Restrict the token to the current user; it is a credential.
    os.chmod('/root/.kaggle/kaggle.json', 0o600)

In [34]:
!pip install kaggle



In [35]:
# Download the `uciml/mushroom-classification` dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires a Kaggle API token to be configured first; confirm.
! kaggle datasets download -d uciml/mushroom-classification

mushroom-classification.zip: Skipping, found more recently modified local copy (use --force to force download)


In [17]:
! unzip mushroom-classification.zip

Archive:  mushroom-classification.zip
  inflating: mushrooms.csv           


In [156]:
# Load the extracted dataset.
df = pd.read_csv('mushrooms.csv')
# `pd.option_context` is a context manager and has no effect unless used in a
# `with` block; `pd.set_option` actually lifts the column-display limit.
pd.set_option('display.max_columns', None)


<pandas._config.config.option_context at 0x79cbbc237c70>

In [160]:
# Peek at the first five rows (five is the default for `head`).
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [103]:
# Per-column overview: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [104]:
# Summary statistics. The columns are non-numeric, so the summary consists of
# count / unique / top / freq rather than mean and quantiles.
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [105]:
# List the column labels (the target `class` plus the feature columns).
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

## Domain Knowledge


1. `class`: Whether the mushroom is edible (`e`) or poisonous (`p`). This is the target variable that the models in this notebook predict.

2. `cap-shape`: Describes the shape of the mushroom cap, which can take values like "bell," "conical," "convex," etc.

3. `cap-surface`: Indicates the texture of the mushroom cap, such as "fibrous," "grooves," "smooth," etc.

4. `cap-color`: Represents the color of the mushroom cap, which could be various shades like "brown," "yellow," "white," etc.

5. `bruises`: Refers to whether the mushroom bruises when damaged, with possible values "bruises" or "no."

6. `odor`: Describes the smell of the mushroom, which can be categorized as "almond," "anise," "none," "foul," etc.

7. `gill-attachment`: Indicates how the gills of the mushroom are attached to the stem, with options "free" or "attached."

8. `gill-spacing`: Describes the spacing between the gills, with values "close" or "crowded."

9. `gill-size`: Represents the size of the gills, with options "broad" or "narrow."

10. `gill-color`: Indicates the color of the mushroom gills, such as "buff," "green," "pink," etc.

11. `stalk-shape`: Describes the shape of the mushroom stalk, with possible values "enlarging" or "tapering."

12. `stalk-root`: Represents the type of root the mushroom has, which can be "bulbous," "club," "equal," "rooted," etc.

13. `stalk-surface-above-ring`: Indicates the surface texture of the mushroom stalk above the ring, such as "fibrous," "scaly," "smooth," etc.

14. `stalk-surface-below-ring`: Describes the surface texture of the mushroom stalk below the ring, with options "fibrous," "scaly," "smooth," etc.

15. `stalk-color-above-ring`: Represents the color of the mushroom stalk above the ring, like "buff," "cinnamon," "gray," etc.

16. `stalk-color-below-ring`: Describes the color of the mushroom stalk below the ring, such as "buff," "cinnamon," "gray," etc.

17. `veil-type`: The type of veil the mushroom has. Every row in this dataset has the same value (`p`), so this column carries no information for classification.

18. `veil-color`: Describes the color of the mushroom veil, such as "brown," "orange," "white," etc.

19. `ring-number`: Indicates the number of rings on the mushroom, with values "none," "one," or "two."

20. `ring-type`: Describes the type of ring on the mushroom, with options "cobwebby," "evanescent," "flaring," "large," etc.

21. `spore-print-color`: Represents the color of the mushroom spore print, which can be "black," "brown," "purple," etc.

22. `population`: Describes the population of mushrooms, such as "abundant," "clustered," "numerous," etc.

23. `habitat`: Indicates the habitat where the mushroom is typically found, with options like "grasses," "leaves," "woods," etc.

## Feature Relation to class

1. **Odor (odor):**
   Some mushrooms with specific odors, such as "almond" or "anise," are more likely to be edible, while others with foul or pungent odors may be poisonous.

2. **Bruising (bruises):**
   Edible mushrooms often bruise or change color when damaged, whereas many poisonous species do not exhibit this characteristic.

3. **Gill Color (gill-color):**
   Certain gill colors are associated with specific mushroom types. For example, white gills are common in edible mushrooms, while other colors like green or purple might indicate potential toxicity.

4. **Spore Print Color (spore-print-color):**
   The color of the spore print, obtained by placing the mushroom cap on a piece of paper, can be indicative of edibility. Different colors may suggest specific mushroom families, some of which might be toxic.

5. **Cap Color (cap-color):**
   The color of the mushroom cap is another important feature. Some mushroom colors are typical of edible varieties, while other colors might indicate poisonous species.

6. **Stalk Root (stalk-root):**
   The type of root on the mushroom stalk can be informative. Certain root types, like "club," are more common in edible mushrooms, while others may indicate potential toxicity.

7. **Ring Type (ring-type):**
   The presence and type of ring on the mushroom stalk can be a distinguishing feature. Specific ring types may be associated with edible or inedible mushrooms.

8. **Habitat (habitat):**
   Different mushrooms thrive in different habitats. Understanding the preferred habitat of a mushroom can provide insight into its edibility.

## Data Cleaning

In [106]:
# Number of missing values in each column.
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [107]:
# Remove rows that are exact duplicates of an earlier row (rebinds `df`).
df = df.drop_duplicates()

In [108]:
# Row/column count after de-duplication.
df.shape

(8124, 23)

In [109]:
# LabelEncoder maps each distinct value of a column to an integer code.
label_encoder = LabelEncoder()

In [112]:
# Fit one encoder per column and keep them. Reusing a single LabelEncoder
# across columns refits it each time, so afterwards it only remembers the
# classes of the last column and cannot encode or decode the others.
encoders = {col: LabelEncoder().fit(df[col]) for col in df.columns}
encoded_df = df.apply(lambda col: encoders[col.name].transform(col))

In [122]:
# Distinct raw category letters of one example column.
df['cap-shape'].unique()

array(['x', 'b', 's', 'f', 'k', 'c'], dtype=object)

In [121]:
# Distinct integer codes of the same column after encoding.
encoded_df['cap-shape'].unique()

array([5, 0, 4, 2, 3, 1])

In [144]:
# def get_ready_dict(df, encoded_df):
#     real_dict = {}
#     columns = df.columns
#     for col in columns:
#         unique_df = df[col].unique()
#         encoded_col = encoded_df[col].unique()  # Access the column by its name
#         for i, j in zip(unique_df, encoded_col):
#             real_dict[i] = str(j)
#     return real_dict


In [145]:
# json_dict = get_ready_dict(df, encoded_df)

In [146]:
# json_dict

{'p': '4',
 'e': '0',
 'x': '5',
 'b': '0',
 's': '3',
 'f': '1',
 'k': '2',
 'c': '1',
 'y': '5',
 'g': '1',
 'n': '2',
 'w': '6',
 'u': '5',
 'r': '5',
 't': '2',
 'a': '0',
 'l': '2',
 'm': '3',
 'h': '1',
 'o': '4',
 '?': '0',
 'v': '4',
 'd': '0'}

In [147]:
file_path = 'encoded_dict.json'
by_column_path = 'encoded_dict_by_column.json'

# Build the mappings in this cell so it runs on a fresh kernel (the cells that
# used to define `json_dict` are commented out). `unique()` on the raw and the
# encoded column yields values in the same row order, so zipping them pairs
# every category letter with its integer code.
column_maps = {
    col: {
        label: str(code)
        for label, code in zip(df[col].unique(), encoded_df[col].unique())
    }
    for col in df.columns
}

# Legacy flat mapping, kept in its existing format for current consumers.
# NOTE(review): the same letter means different things in different columns,
# so later columns overwrite earlier ones here; prefer the per-column file.
json_dict = {}
for mapping in column_maps.values():
    json_dict.update(mapping)

with open(file_path, 'w') as json_file:
    json.dump(json_dict, json_file)

# Collision-free mapping: {column name: {category letter: code}}.
with open(by_column_path, 'w') as json_file:
    json.dump(column_maps, json_file)

In [60]:
# Correlation of every encoded column with the target, largest first.
encoded_df.corr().loc[:, 'class'].sort_values(ascending=False)

class                       1.000000
gill-size                   0.540024
population                  0.298686
habitat                     0.217179
cap-surface                 0.178446
spore-print-color           0.171961
veil-color                  0.145142
gill-attachment             0.129200
cap-shape                   0.052951
cap-color                  -0.031384
odor                       -0.093552
stalk-shape                -0.102019
stalk-color-below-ring     -0.146730
stalk-color-above-ring     -0.154003
ring-number                -0.214366
stalk-surface-below-ring   -0.298801
stalk-surface-above-ring   -0.334593
gill-spacing               -0.348387
stalk-root                 -0.379361
ring-type                  -0.411771
bruises                    -0.501530
gill-color                 -0.530566
veil-type                        NaN
Name: class, dtype: float64

In [61]:
# Separate the target column from the feature columns.
y = encoded_df['class'].copy()
X = encoded_df.drop(columns='class')

In [62]:
# Seeded so the displayed rows are the same on every run.
X.sample(2, random_state=42)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
1916,2,3,8,1,6,1,0,1,10,0,...,2,7,7,0,2,1,4,2,4,5
1151,2,3,4,1,0,1,0,0,5,0,...,3,7,7,0,2,1,4,2,5,4


In [63]:
# Keep the feature names; they are used later to label the scaled array.
columns = X.columns

In [64]:
# Scaler used to standardize the encoded features.
scaler = StandardScaler()

In [65]:
# Fit the scaler on the feature matrix `X`.
# NOTE(review): this uses every row of X, i.e. it runs before any train/test split.
scaler.fit(X)

In [67]:
# Apply the fitted scaler to all feature rows.
X_scaled = scaler.transform(X)

In [68]:
# Wrap the scaled array with the feature names for display; seeded so the
# displayed rows are the same on every run.
pd.DataFrame(X_scaled, columns=columns).sample(3, random_state=42)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
2073,1.029712,-1.486157,-0.19825,1.185917,0.406562,0.162896,-0.438864,-0.669038,1.183375,0.873511,...,0.586385,0.096577,-1.465353,0.0,0.142037,-0.256132,0.948081,-0.670195,1.083049,-0.877169
6869,1.029712,0.140128,-0.19825,-0.84323,-1.019565,0.162896,-0.438864,1.494683,-1.358896,0.873511,...,-0.893053,0.096577,0.107655,0.0,0.142037,-0.256132,-1.272216,1.428426,0.28433,1.448589
3363,1.029712,-1.486157,-0.591075,-0.84323,-1.019565,0.162896,-0.438864,-0.669038,-0.511472,-1.144806,...,-0.893053,-0.955152,-0.941017,0.0,0.142037,-0.256132,-0.162067,-1.089919,1.083049,-0.29573


In [69]:
# Split first, then fit the scaler on the training rows only, so the held-out
# rows do not influence the scaling statistics. The split parameters are the
# same as before (20% test, random_state=42).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [70]:
# Sanity check: shapes of the test and train partitions.
X_test.shape, y_test.shape, y_train.shape, X_train.shape

((1625, 22), (1625,), (6499,), (6499, 22))

In [71]:
# Candidate classifiers; each gets the same seed so fits are repeatable.
log_reg = LogisticRegression(random_state=42)
svc = SVC(random_state=42)
mlp = MLPClassifier(random_state=42)

In [72]:
# Train logistic regression, keep its test predictions, and show test accuracy.
# `fit` returns the estimator itself, so the calls can be chained.
log_pred = log_reg.fit(X_train, y_train).predict(X_test)
log_reg.score(X_test, y_test)

0.952

In [73]:
# Train the SVC, keep its test predictions, and show test accuracy.
# `fit` returns the estimator itself, so the calls can be chained.
svc_pred = svc.fit(X_train, y_train).predict(X_test)
svc.score(X_test, y_test)

1.0

In [74]:
# Train the MLP, keep its test predictions, and show test accuracy.
# `fit` returns the estimator itself, so the calls can be chained.
mlp_pred = mlp.fit(X_train, y_train).predict(X_test)
mlp.score(X_test, y_test)

1.0

In [75]:
# Confusion matrix for logistic regression on the test set.
confusion_matrix(y_true=y_test, y_pred=log_pred)

array([[799,  44],
       [ 34, 748]])

In [76]:
# Confusion matrix for the SVC on the test set.
confusion_matrix(y_true=y_test, y_pred=svc_pred)

array([[843,   0],
       [  0, 782]])

In [77]:
# Confusion matrix for the MLP on the test set.
confusion_matrix(y_true=y_test, y_pred=mlp_pred)

array([[843,   0],
       [  0, 782]])

In [81]:
# Persist the trained SVC. The `with` block closes the file handle, which a
# bare `open(...)` passed straight to `pickle.dump` never does.
with open('svc_model_predictor_mushrooms.pkl', 'wb') as model_file:
    pickle.dump(svc, model_file)

# The SVC was trained on standardized features, so the fitted scaler is needed
# to prepare inputs at prediction time; save it next to the model.
with open('scaler_mushrooms.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [79]:
# Column labels of the raw frame, shown again for reference.
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')