# Binary Prediction of Poisonous Mushrooms


## Importing Packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

## Loading Dataset

In [2]:
# Stack both CSV files into a single frame so that every preprocessing step
# below is applied to all rows at once.
# NOTE: concat keeps each file's own row labels, so index values repeat
# between the two parts of the combined frame.
csv_paths = ["train.csv", "test.csv"]
df = pd.concat([pd.read_csv(path) for path in csv_paths], axis=0)

# Flag rows by whether they carry a label: 1 = has a 'class' value,
# 0 = no 'class' value (presumably the rows that came from test.csv).
is_labelled = df['class'].notna()
df.loc[~is_labelled, "train"] = 0
df.loc[is_labelled, "train"] = 1

In [3]:
# Preview the combined frame; pandas abbreviates long frames to their
# leading and trailing rows.
df

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,train
0,0,e,8.80,f,s,u,f,a,c,w,...,,w,,,f,f,,d,a,1.0
1,1,p,4.51,x,h,o,f,a,c,n,...,y,o,,,t,z,,d,w,1.0
2,2,e,6.94,f,s,b,f,x,c,w,...,s,n,,,f,f,,l,w,1.0
3,3,e,3.88,f,y,g,f,s,,g,...,,w,,,f,f,,d,u,1.0
4,4,e,5.85,x,l,w,f,d,,w,...,,w,,,f,f,,g,a,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2077959,5194904,,0.88,x,g,w,f,a,d,w,...,,e,,,f,f,,d,u,0.0
2077960,5194905,,3.12,x,s,w,f,d,c,w,...,,w,,,f,f,,g,a,0.0
2077961,5194906,,5.73,x,e,e,f,a,,w,...,,y,,w,t,z,,d,a,0.0
2077962,5194907,,5.03,b,g,n,f,a,d,g,...,s,g,,,f,f,,d,a,0.0


In [4]:
# List each column's dtype to tell the float-valued features apart from the
# object (string-coded) ones.
df.dtypes

id                        int64
class                    object
cap-diameter            float64
cap-shape                object
cap-surface              object
cap-color                object
does-bruise-or-bleed     object
gill-attachment          object
gill-spacing             object
gill-color               object
stem-height             float64
stem-width              float64
stem-root                object
stem-surface             object
stem-color               object
veil-type                object
veil-color               object
has-ring                 object
ring-type                object
spore-print-color        object
habitat                  object
season                   object
train                   float64
dtype: object

### Check for null values

In [5]:
# Summarise missingness per column: absolute null count and the share of all
# rows it represents, ordered from most to least missing.
null_counts = df.isna().sum()
total_rows = len(df)
df_null = (
    pd.DataFrame({'null_count': null_counts})
    .sort_values(by='null_count', ascending=False)
)
df_null['null_percentage'] = df_null['null_count'] / total_rows * 100

In [6]:
# Show the per-column null counts and percentages, most-missing first.
df_null

Unnamed: 0,null_count,null_percentage
veil-type,4929038,94.882086
spore-print-color,4749299,91.422179
stem-root,4595035,88.452656
veil-color,4567071,87.91436
stem-surface,3302349,63.568948
gill-spacing,2098030,40.38627
class,2077964,40.000008
cap-surface,1117927,21.519665
gill-attachment,873757,16.819486
ring-type,215075,4.140111


The following features are unlikely to provide significant information or to be imputed reliably: the first five are more than 60% null, and `id` is only a row identifier.
- veil-type
- spore-print-color
- stem-root
- veil-color
- stem-surface
- id

## Data Preprocessing

### Feature removal

In [7]:
# Remove the columns listed above as uninformative or too sparse to impute.
# NOTE: this reassigns `df`, so re-running the cell without reloading the data
# raises a KeyError because the columns are already gone.
dropped_features = [
    'veil-type',
    'spore-print-color',
    'stem-root',
    'veil-color',
    'stem-surface',
    'id',
]
df = df.drop(columns=dropped_features)

### Feature encoding

In [8]:
# Split the columns by dtype: float64 columns are treated as numerical and
# everything else as categorical. Going by the dtypes listed earlier, the
# float64 "train" flag lands in the numerical group and the object-typed
# 'class' target lands in the categorical group.
FLOAT_DTYPE = 'float64'
numerical_columns = df.select_dtypes(include=[FLOAT_DTYPE]).columns
categorical_columns = df.select_dtypes(exclude=[FLOAT_DTYPE]).columns

In [9]:
# Label-encode every categorical column in place.
# Each distinct string value is replaced by its position in the column's
# `unique()` order (order of first appearance). Non-string entries (i.e. NaN)
# get no code and therefore stay NaN after `.map`, so the codes may contain a
# gap at the position where NaN first appeared.
# The lookup dict is deliberately NOT called `map`: that name would shadow the
# builtin `map` for every later cell executed in the same kernel.
for feature in categorical_columns:
    value_to_code = {
        value: code
        for code, value in enumerate(df[feature].unique())
        if isinstance(value, str)
    }
    df[feature] = df[feature].map(value_to_code)

In [10]:
# Simple imputation pass.
# Categorical columns get their most frequent code, except:
#   - 'class'        : the target; its gaps must stay missing
#   - 'gill-spacing' : left untouched here so the iterative imputer in the
#                      next step can fill it
mode_fill_exempt = ('class', 'gill-spacing')
for feature in categorical_columns:
    if feature in mode_fill_exempt:
        continue
    most_frequent = df[feature].mode()[0]
    df[feature] = df[feature].fillna(most_frequent)

# Numerical columns get their median.
for feature in numerical_columns:
    median_value = df[feature].median()
    df[feature] = df[feature].fillna(median_value)

In [11]:
# Model-based imputation of whatever is still missing in the feature columns
# (the target 'class' is excluded from the imputer's input).
# add_indicator=True appends missing-indicator column(s) to the output; after
# the simple fill above only 'gill-spacing' was left unfilled, so a single
# indicator column is expected.
imp = IterativeImputer(
    random_state=42,
    add_indicator=True,
)
imputed_data = imp.fit_transform(df.drop(columns=['class']))

In [None]:
# Inspect the array returned by the imputer's fit_transform.
imputed_data

In [None]:
# Check the (rows, columns) of the imputed array.
imputed_data.shape

In [12]:
# Wrap the imputed array in a DataFrame: the original feature names (every
# column of `df` except 'class', in order) followed by one 'missing' column
# for the imputer's appended indicator.
feature_names = [column for column in df.columns if column != 'class']
df_imputed = pd.DataFrame(imputed_data, columns=feature_names + ['missing'])

In [177]:
# Confirm the frame's shape: the feature columns plus the one 'missing'
# indicator column.
df_imputed.shape

(5194909, 17)

In [14]:
# Re-attach the target column to the imputed features, as the first column.
#
# The labels are attached *positionally* via `.to_numpy()`:
#   * `pd.DataFrame(df_imputed, df['class'])` would treat the labels as the new
#     index and reindex `df_imputed` by label value, so every row with class 0
#     becomes a copy of row 0, every row with class 1 a copy of row 1, and
#     unlabelled rows turn into all-NaN rows, destroying the imputed features.
#   * `df` still carries the repeated row labels left over from the concat,
#     so index-aligned assignment is not safe either.
# `df_imputed` was built from `df` row by row, so row order is identical.
class_labels = df['class'].to_numpy()
assert len(class_labels) == len(df_imputed), "feature/label row counts differ"

# Dropping a pre-existing 'class' column first keeps the cell re-runnable.
df_imputed = df_imputed.drop(columns='class', errors='ignore')
df_imputed.insert(0, 'class', class_labels)

### Data balancing

In [17]:
# Relative frequency of each encoded class label; rows with a missing label
# are left out because value_counts drops NaN by default.
df_imputed['class'].value_counts(normalize=True)

class
1.0    0.547137
0.0    0.452863
Name: proportion, dtype: float64

The classes are fairly balanced (roughly 55% vs. 45%), so no resampling strategy needs to be applied.