In [None]:
import os
import shutil

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC

In [None]:
# Install the Kaggle API token where the Kaggle CLI looks for it.
# - create the target directory first: shutil.move fails if it is missing
# - only move when the uploaded file is still there, so re-running the cell
#   after a successful move does not raise
# - restrict permissions to the owner; the CLI warns about keys that are
#   readable by other users
kaggle_src = '/content/kaggle.json'
kaggle_dst = '/root/.kaggle/kaggle.json'
os.makedirs(os.path.dirname(kaggle_dst), exist_ok=True)
if os.path.exists(kaggle_src):
    shutil.move(kaggle_src, kaggle_dst)
# Raises FileNotFoundError if the token is in neither location.
os.chmod(kaggle_dst, 0o600)
kaggle_dst

'/root/.kaggle/kaggle.json'

In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel (a bare `!pip` may resolve to a different interpreter);
# -q keeps the install log out of the notebook.
%pip install -q kaggle



In [None]:
# Download the uciml/mushroom-classification dataset archive with the Kaggle
# CLI into the current working directory (the API token must already be
# installed for the CLI to authenticate).
! kaggle datasets download -d uciml/mushroom-classification

Downloading mushroom-classification.zip to /content
  0% 0.00/34.2k [00:00<?, ?B/s]
100% 34.2k/34.2k [00:00<00:00, 44.0MB/s]


In [None]:
# Extract mushrooms.csv from the downloaded archive. -o overwrites an existing
# extraction without prompting, so the cell can be re-run safely.
! unzip -o mushroom-classification.zip

Archive:  mushroom-classification.zip
  inflating: mushrooms.csv           


In [None]:
# Load the extracted CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='mushrooms.csv')

In [None]:
# Preview a few random rows; the fixed seed keeps the preview identical
# across Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
1892,e,x,s,n,f,n,f,w,b,h,...,s,w,w,p,w,o,e,n,a,g
4573,p,x,y,g,f,f,f,c,b,g,...,k,b,n,p,w,o,l,h,y,p
3428,e,f,y,n,t,n,f,c,b,p,...,s,g,g,p,w,o,p,n,y,d
2609,e,x,f,e,t,n,f,c,b,n,...,s,w,g,p,w,o,p,n,v,d
5815,p,x,y,n,f,s,f,c,n,b,...,k,p,p,p,w,o,e,w,v,l


In [None]:
# Per-column overview: name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [None]:
# Summary per column; for these string-valued columns pandas reports
# count / unique / top / freq instead of numeric statistics.
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [None]:
# Full list of column names (the wide-frame previews elide the middle ones).
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

## Domain Knowledge


1. `class`: This attribute refers to the classification of mushrooms as either "edible" or "poisonous," which is typically the target variable in a classification task.

2. `cap-shape`: Describes the shape of the mushroom cap, which can take values like "bell," "conical," "convex," etc.

3. `cap-surface`: Indicates the texture of the mushroom cap, such as "fibrous," "grooves," "smooth," etc.

4. `cap-color`: Represents the color of the mushroom cap, which could be various shades like "brown," "yellow," "white," etc.

5. `bruises`: Refers to whether the mushroom bruises when damaged, with possible values "bruises" or "no."

6. `odor`: Describes the smell of the mushroom, which can be categorized as "almond," "anise," "none," "foul," etc.

7. `gill-attachment`: Indicates how the gills of the mushroom are attached to the stem, with options "free" or "attached."

8. `gill-spacing`: Describes the spacing between the gills, with values "close" or "crowded."

9. `gill-size`: Represents the size of the gills, with options "broad" or "narrow."

10. `gill-color`: Indicates the color of the mushroom gills, such as "buff," "green," "pink," etc.

11. `stalk-shape`: Describes the shape of the mushroom stalk, with possible values "enlarging" or "tapering."

12. `stalk-root`: Represents the type of root the mushroom has, which can be "bulbous," "club," "equal," "rooted," etc. Missing values in this column are recorded as "?" rather than left empty.

13. `stalk-surface-above-ring`: Indicates the surface texture of the mushroom stalk above the ring, such as "fibrous," "scaly," "smooth," etc.

14. `stalk-surface-below-ring`: Describes the surface texture of the mushroom stalk below the ring, with options "fibrous," "scaly," "smooth," etc.

15. `stalk-color-above-ring`: Represents the color of the mushroom stalk above the ring, like "buff," "cinnamon," "gray," etc.

16. `stalk-color-below-ring`: Describes the color of the mushroom stalk below the ring, such as "buff," "cinnamon," "gray," etc.

17. `veil-type`: This attribute represents the type of veil the mushroom has, which has only one value for all instances in this dataset.

18. `veil-color`: Describes the color of the mushroom veil, such as "brown," "orange," "white," etc.

19. `ring-number`: Indicates the number of rings on the mushroom, with values "none," "one," or "two."

20. `ring-type`: Describes the type of ring on the mushroom, with options "cobwebby," "evanescent," "flaring," "large," etc.

21. `spore-print-color`: Represents the color of the mushroom spore print, which can be "black," "brown," "purple," etc.

22. `population`: Describes the population of mushrooms, such as "abundant," "clustered," "numerous," etc.

23. `habitat`: Indicates the habitat where the mushroom is typically found, with options like "grasses," "leaves," "woods," etc.

## Feature Relation to class

1. **Odor (odor):**
   Some mushrooms with specific odors, such as "almond" or "anise," are more likely to be edible, while others with foul or pungent odors may be poisonous.

2. **Bruising (bruises):**
   Edible mushrooms often bruise or change color when damaged, whereas many poisonous species do not exhibit this characteristic.

3. **Gill Color (gill-color):**
   Certain gill colors are associated with specific mushroom types. For example, white gills are common in edible mushrooms, while other colors like green or purple might indicate potential toxicity.

4. **Spore Print Color (spore-print-color):**
   The color of the spore print, obtained by placing the mushroom cap on a piece of paper, can be indicative of edibility. Different colors may suggest specific mushroom families, some of which might be toxic.

5. **Cap Color (cap-color):**
   The color of the mushroom cap is another important feature. Some mushroom colors are typical of edible varieties, while other colors might indicate poisonous species.

6. **Stalk Root (stalk-root):**
   The type of root on the mushroom stalk can be informative. Certain root types, like "club," are more common in edible mushrooms, while others may indicate potential toxicity.

7. **Ring Type (ring-type):**
   The presence and type of ring on the mushroom stalk can be a distinguishing feature. Specific ring types may be associated with edible or inedible mushrooms.

8. **Habitat (habitat):**
   Different mushrooms thrive in different habitats. Understanding the preferred habitat of a mushroom can provide insight into its edibility.

## Data Cleaning

In [None]:
# Count missing entries per column. Besides real NaNs, this dataset marks
# missing values with the literal string '?' (used in `stalk-root`), which
# `isna()` alone cannot detect, so both are counted here.
(df.isna() | df.eq('?')).sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df[~df.duplicated()]

In [None]:
# Row/column count after de-duplication.
df.shape

(8124, 23)

In [None]:
# Single encoder instance that the next step re-fits on every column in turn;
# afterwards it only holds the mapping of the last column it was fitted on.
label_encoder = LabelEncoder()

In [None]:
# Turn every categorical column (target included) into integer codes by
# re-fitting the shared encoder on each column.
encoded_df = df.apply(lambda column: label_encoder.fit_transform(column))

In [None]:
# First rows of the integer-coded frame.
encoded_df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [None]:
# Correlation of every encoded column with the encoded target, strongest
# positive first. The inputs are integer label codes, so the values depend on
# how the categories happened to be numbered.
class_corr = encoded_df.corr()['class']
class_corr.sort_values(ascending=False)

class                       1.000000
gill-size                   0.540024
population                  0.298686
habitat                     0.217179
cap-surface                 0.178446
spore-print-color           0.171961
veil-color                  0.145142
gill-attachment             0.129200
cap-shape                   0.052951
cap-color                  -0.031384
odor                       -0.093552
stalk-shape                -0.102019
stalk-color-below-ring     -0.146730
stalk-color-above-ring     -0.154003
ring-number                -0.214366
stalk-surface-below-ring   -0.298801
stalk-surface-above-ring   -0.334593
gill-spacing               -0.348387
stalk-root                 -0.379361
ring-type                  -0.411771
bruises                    -0.501530
gill-color                 -0.530566
veil-type                        NaN
Name: class, dtype: float64

In [None]:
# Target vector: an independent copy of the encoded `class` column.
y = encoded_df['class'].copy()
# Feature matrix: every encoded column except the target.
X = encoded_df.drop(columns='class')

In [None]:
# Spot-check the feature matrix; seeded so the preview is reproducible.
X.sample(2, random_state=42)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
6875,2,2,4,0,2,1,0,1,0,1,...,2,6,7,0,2,1,0,7,4,2
3657,2,0,2,1,5,1,0,0,5,1,...,2,6,6,0,2,1,4,3,4,0


In [None]:
# Remember the feature names: the scaled data is later wrapped back into a
# DataFrame for display and needs its column labels re-attached.
columns = X.columns

In [None]:
# Standardizer with default settings (centres each feature and scales it to
# unit variance).
scaler = StandardScaler()

In [None]:
# Learn per-feature mean/std from all rows of X, then apply the scaling to
# those same rows.
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Preview the standardized features with their column names restored;
# seeded so the same rows are shown on every run.
pd.DataFrame(X_scaled, columns=columns).sample(3, random_state=42)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
934,1.029712,-1.486157,-0.19825,-0.84323,0.406562,0.162896,-0.438864,1.494683,0.053477,-1.144806,...,0.586385,0.622441,0.631991,0.0,0.142037,-0.256132,0.948081,-0.670195,1.083049,2.030028
7,-2.087047,0.95327,1.373049,1.185917,-0.544189,0.162896,-0.438864,-0.669038,0.053477,-1.144806,...,0.586385,0.622441,0.631991,0.0,0.142037,-0.256132,0.948081,-0.250471,-0.514389,0.867149
3953,1.029712,0.140128,0.194575,-0.84323,-1.494941,0.162896,2.278612,1.494683,0.053477,-1.144806,...,0.586385,0.622441,0.631991,0.0,0.142037,-0.256132,0.948081,-0.250471,-0.514389,-0.877169


In [None]:
# Split the unscaled features first, then fit the scaler on the training rows
# only, so no test-set statistics leak into preprocessing. With the same
# random_state and row count the train/test membership is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

In [None]:
# Sanity-check the split: features and labels must have matching row counts
# in both the test and the training partition.
X_test.shape, y_test.shape, y_train.shape, X_train.shape

((1625, 22), (1625,), (6499,), (6499, 22))

In [None]:
# Three classifiers to compare, all with default hyperparameters and the same
# seed.
log_reg = LogisticRegression(random_state=42)
svc = SVC(random_state=42)
mlp = MLPClassifier(random_state=42)

In [None]:
# Logistic regression: train, keep the test predictions for the confusion
# matrix, and display test accuracy.
log_pred = log_reg.fit(X_train, y_train).predict(X_test)
log_reg.score(X_test, y_test)

0.952

In [None]:
# Support vector classifier: train, keep the test predictions for the
# confusion matrix, and display test accuracy.
svc_pred = svc.fit(X_train, y_train).predict(X_test)
svc.score(X_test, y_test)

1.0

In [None]:
# Multi-layer perceptron: train, keep the test predictions for the confusion
# matrix, and display test accuracy.
mlp_pred = mlp.fit(X_train, y_train).predict(X_test)
mlp.score(X_test, y_test)

1.0

In [None]:
# Logistic regression errors by class: rows are true labels, columns are
# predicted labels (0/1 are the encoded values of `class`).
confusion_matrix(y_true=y_test, y_pred=log_pred)

array([[799,  44],
       [ 34, 748]])

In [None]:
# SVC errors by class: rows are true labels, columns are predicted labels
# (0/1 are the encoded values of `class`).
confusion_matrix(y_true=y_test, y_pred=svc_pred)

array([[843,   0],
       [  0, 782]])

In [None]:
# MLP errors by class: rows are true labels, columns are predicted labels
# (0/1 are the encoded values of `class`).
confusion_matrix(y_true=y_test, y_pred=mlp_pred)

array([[843,   0],
       [  0, 782]])