In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
from matplotlib import pyplot as plt # data visualization

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load train.csv of the playground-series-s4e8 input dataset into the working frame `mdf`.
mdf = pd.read_csv("/kaggle/input/playground-series-s4e8/train.csv")

## **Knowing the Data:~**

In [None]:
mdf

In [None]:
# Random preview of 10 rows; the fixed seed keeps the displayed rows stable across runs.
mdf.sample(10, random_state=42)

In [None]:
# Drop the `id` column.
# Re-binding (instead of inplace=True) together with errors="ignore" keeps the
# cell safe to re-run: a second execution no longer raises KeyError.
mdf = mdf.drop(columns="id", errors="ignore")

In [None]:
mdf.columns

In [None]:
# Normalise column names: hyphens become underscores, and the target column
# "class" is renamed to "e_label".
mdf.columns = [name.replace("-", "_") for name in mdf.columns]
mdf = mdf.rename(columns={"class": "e_label"})

In [None]:
mdf.isnull().sum()

In [None]:
mdf[mdf.duplicated()]

#### //- SUMMARY from above:
* ##### The dataset has **22 columns**, including the "id" column, which has been dropped. So effectively 21.
* ##### There are **3116945 rows. No duplicate rows** in the dataset.
* ##### Some of the columns have **a lot of null** values.
* ##### **Target column name is "e_label"** meaning Edibility-label.

### **Details of the Numerical Columns:~**

In [None]:
mdf.describe()

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(12, 4), sharey=True)
fig.suptitle('Box Plots of --')

# One box plot per numerical feature, laid out left to right.
box_panels = [
    ("cap_diameter", "Cap Diameter"),
    ("stem_height", "Stem Height"),
    ("stem_width", "Stem Width"),
]
for ax, (feature, heading) in zip(axes, box_panels):
    sns.boxplot(ax=ax, x=mdf[feature])
    ax.set_title(heading)

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(10, 4), sharey=True)
fig.suptitle('Histplot of --')

# One histogram per numerical feature, laid out left to right.
hist_panels = [
    ("cap_diameter", "Cap Diameter"),
    ("stem_height", "Stem Height"),
    ("stem_width", "Stem Width"),
]
for ax, (feature, heading) in zip(axes, hist_panels):
    sns.histplot(x=mdf[feature], ax=ax)
    ax.set_title(heading)

### **Details of the Categorical Columns:~**

In [None]:
mdf.describe(include="object")

In [None]:
# Class balance of the target: counts table followed by a pie chart.
# The counts are computed once and reused for both outputs. No `y=` is passed:
# it has no effect when plotting a Series.
e_label_counts = mdf.e_label.value_counts()
display(e_label_counts)
e_label_counts.plot(kind="pie", title="Proportion of e & p", ylabel="",
                    figsize=(4,4), autopct="%1.01f%%", explode=(0.01,0.02))

In [None]:
mdf.shape

In [None]:
mdf.cap_shape.value_counts()

In [None]:
mdf.cap_shape.unique()

##### In cap_shape there are some garbage values, which were probably entered wrongly. Let's check how many rows are affected.

In [None]:
# Shape of the subset whose cap_shape is one of the listed single-letter codes.
# np.nan is appended to the allowed values, presumably so that rows with a
# missing cap_shape are not counted as garbage here.
mdf[mdf.cap_shape.isin(list("fxpbocsdenwkltgzaruyimh")+[np.nan])].shape

##### So there are **(3116945-3116890) = 55** such rows. We can remove these.

In [None]:
# Keep rows whose cap_shape is a listed single-letter code (np.nan is part of
# the allowed values), then plot the frequency of each remaining category.
allowed_cap_shapes = list("fxpbocsdenwkltgzaruyimh") + [np.nan]
cap_shape_ok = mdf["cap_shape"].isin(allowed_cap_shapes)
mdf = mdf[cap_shape_ok]
sns.catplot(mdf, kind="count", x="cap_shape", height=3, aspect=3)

##### Proportion wise only 7 entries like **f,x,p,b,o,c,s have visible spread over the cap_shape series.**

In [None]:
mdf.shape

In [None]:
mdf.cap_surface.value_counts()

##### The same holds for cap_surface, so the same operation is applied and the garbage entries are dropped.

In [None]:
# Keep rows whose cap_surface is a listed single-letter code (np.nan is part of
# the allowed values), then plot the frequency of each remaining category.
allowed_cap_surfaces = list("shyltegdiwkfnroauzpbmxc") + [np.nan]
cap_surface_ok = mdf["cap_surface"].isin(allowed_cap_surfaces)
mdf = mdf[cap_surface_ok]
sns.catplot(mdf, kind="count", x="cap_surface", height=3, aspect=3)

In [None]:
mdf.shape #73 rows dropped in this process.

In [None]:
mdf.cap_color.value_counts()

In [None]:
# Keep rows whose cap_color is a listed single-letter code (np.nan is part of
# the allowed values), then plot the frequency of each remaining category.
allowed_cap_colors = list("uobgwneyrpklihdsafcxmzt") + [np.nan]
cap_color_ok = mdf["cap_color"].isin(allowed_cap_colors)
mdf = mdf[cap_color_ok]
sns.catplot(mdf, kind="count", x="cap_color", height=3, aspect=3)

In [None]:
mdf.shape #72 records removed

In [None]:
mdf.does_bruise_or_bleed.value_counts()

##### The column name suggests that it should be a binary column, and the official website confirms this. Hence only t & f, as in True and False, are kept.

In [None]:
# Keep only the two codes "t" and "f". np.nan is NOT among the allowed values
# here, so rows with a missing does_bruise_or_bleed are removed as well.
binary_codes = list("tf")
bruise_ok = mdf["does_bruise_or_bleed"].isin(binary_codes)
mdf = mdf[bruise_ok]
sns.catplot(data=mdf, x='does_bruise_or_bleed', kind="count", height=3, aspect=1)

In [None]:
mdf.shape #117 records dropped

In [None]:
# The 26 lowercase ASCII letters, one list element per letter.
atoz = list('abcdefghijklmnopqrstuvwxyz')

def non_alpha_categories_removal(df, col, atoz):
    """Keep only the rows of `df` whose `col` value is in `atoz` (or np.nan).

    Prints how many rows were filtered out and draws a count plot of the
    remaining categories of `col`.

    Parameters
    ----------
    df : DataFrame to filter; a filtered frame is returned, `df` is not reassigned.
    col : name of the column to check.
    atoz : list of accepted category values; np.nan is appended to it before
        the membership test.

    Returns
    -------
    The filtered DataFrame.
    """
    accepted = atoz + [np.nan]
    rows_in = df.shape[0]
    kept = df[df[col].isin(accepted)]
    rows_out = kept.shape[0]
    print(f"'{col}' processed. {rows_in-rows_out} records removed")
    sns.catplot(data=kept, kind="count", x=col, height=2, aspect=3)
    return kept


# Run the same filter over the remaining categorical columns, one at a time;
# each pass prints its removal count and draws its own count plot.
remaining_cat_cols = [
    'gill_attachment', 'gill_spacing', 'gill_color',
    'stem_root', 'stem_surface', 'stem_color', 'veil_type',
    'veil_color', 'has_ring', 'ring_type', 'spore_print_color', 'habitat',
]
for col in remaining_cat_cols:
    mdf = non_alpha_categories_removal(mdf, col, atoz)

In [None]:
mdf.shape

In [None]:
# Season distribution: counts table followed by a pie chart.
# The counts are computed once and reused for both outputs. No `y=` is passed:
# it has no effect when plotting a Series.
season_counts = mdf.season.value_counts()
display(season_counts)
season_counts.plot(figsize=(7,4), kind="pie", autopct = "%1.01f%%",
                   title="Distribution of Season", ylabel="")

#### //- SUMMARY from above:
1. ##### The **numerical columns have a lot of outliers.** Stem height column is normally distributed, while the others are right-skewed.
2. ##### The **target column is almost equally distributed** among Edible and Poisonous
3. ##### Most of the other **categorical columns had some garbage values** like numerical records or class name and so. So, have removed these garbage records **(about 700 rows dropped).** Still, *not all the alphabetical values are having significant spreads across respective columns.* So need to verify their correctness to decide whether to keep them or not.
4. ##### About **4 Mushrooms in every 5 don't have bruising or bleeding.**
5. ##### There are **4 different seasons** spreading across the dataset, which are most probably **"Autumn", "Summer", "Winter", and "Spring".** --> Spring occurs the least among all.

## **Data Cleaning & Processing:~**

In [None]:
# Checkpoint: keep an independent copy of the frame as it stands at this point.
temp = mdf.copy()

In [None]:
# Restore the working frame from the checkpoint. A copy is taken so that later
# in-place edits to `mdf` cannot leak into `temp` through a shared object.
mdf = temp.copy()

In [None]:
mdf

### **Outlier Removal:**

#### Previously we had concluded that the numerical columns had a good amount of outliers. Let's try to fix those now.

In [None]:
# Names of the three numerical feature columns used by the plots and the outlier filter below.
num_cols = ['cap_diameter', 'stem_height', 'stem_width']

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(10, 6), sharey=True)
fig.suptitle('-- Spread in Numerical columns -- ')  # title typo ("Nuemrical") fixed
color_map = ['#ec407a','#16a085', '#7986cb']

# One row per numerical column: box plot on the left, violin plot on the right.
for (box_ax, violin_ax), col, shade in zip(axes, num_cols, color_map):
    sns.boxplot(ax=box_ax, x=mdf[col], color=shade)
    box_ax.set_title(f"Box Plot of {col}", backgroundcolor='lightgrey')
    box_ax.set_xlabel("")

    sns.violinplot(ax=violin_ax, x=mdf[col], color=shade)
    violin_ax.set_title(f"Violin Plot of {col}", backgroundcolor='lightgrey')
    violin_ax.set_xlabel("")

fig.tight_layout()

In [None]:
# Outlier removal using the IQR method.
# Columns are processed one after another, so the quartiles of each column are
# computed on the rows that survived the filters of the previous columns.
IQR_FENCE = 1.5  # multiplier applied to the IQR to obtain the lower/upper limits

for col in num_cols:
    Q1 = mdf[col].quantile(.25)
    Q3 = mdf[col].quantile(.75)
    IQR = Q3-Q1

    LL = Q1-IQR_FENCE*IQR
    UL = Q3+IQR_FENCE*IQR

    rows_before = len(mdf)
    # NaN compares False on both sides, so rows with a missing value in `col`
    # are dropped by this mask as well.
    mdf = mdf[(mdf[col]>=LL) & (mdf[col]<=UL)]
    # Report what was actually removed (out-of-range and missing rows alike),
    # so the printed number always matches the change in row count.
    print(f"{rows_before - len(mdf)} rows are removed:")


In [None]:
mdf.shape

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(10, 6), sharey=True)
fig.suptitle('-- Spread in Numerical columns -- ')  # title typo ("Nuemrical") fixed
color_map = ['#ec407a','#16a085', '#7986cb']

# One row per numerical column: box plot on the left, violin plot on the right.
for (box_ax, violin_ax), col, shade in zip(axes, num_cols, color_map):
    sns.boxplot(ax=box_ax, x=mdf[col], color=shade)
    box_ax.set_title(f"Box Plot of {col}", backgroundcolor='lightgrey')
    box_ax.set_xlabel("")

    sns.violinplot(ax=violin_ax, x=mdf[col], color=shade)
    violin_ax.set_title(f"Violin Plot of {col}", backgroundcolor='lightgrey')
    violin_ax.set_xlabel("")

fig.tight_layout()

### **Handling Null/ Missing Values:**

In [None]:
mdf.isnull().mean().round(4) * 100

In [None]:
# Share of missing values per column, in percent, drawn as a bar chart.
null_pct = mdf.isnull().mean().round(4) * 100
null_pct.plot.bar(title="Percentage of null values in each columns", figsize=(12,3), color='brown')

#### So, from the above diagram, it's clear that a few columns like `"stem_root"`, `"stem_surface"`, `"veil_type"`, `"veil_color"`, and `"spore_print_color"` **have 60-96% of their data missing.** 

#### And, **we should not impute these many records**. 
> Logically, when more than 50% of a column's data is missing and there is no known reason for it, we should not fill in any values but drop the column instead, because more than half of what we would then be working with would have no conclusive evidence behind it.
#### So we will remove these columns.


In [None]:
# Drop every column in which more than 50% of the values are null.
# `thresh` is the minimum number of non-null values a column needs in order to
# be kept; it is documented as an integer, so half the row count is rounded up
# to a whole number (equivalent to comparing against len(mdf) * 0.50).
threshold_na = (len(mdf) + 1) // 2

mdf = mdf.dropna(thresh=threshold_na, axis=1)

In [None]:
# Random preview of 2 rows; the fixed seed keeps the displayed rows stable across runs.
mdf.sample(2, random_state=42)

#### Four other columns, namely `"cap_surface"`, `"gill_attachment"`, `"gill_spacing"` & `"ring_type"`, have a noticeable share of null values. For now we can impute all missing values in these columns as "unk", as in "Unknown". 
#### Later on these columns can be judged based on correlation with target. 

In [None]:
# Replace missing values in these four columns with the placeholder category "unk".
unk_cols = ["cap_surface", "gill_attachment", "gill_spacing", "ring_type"]
mdf[unk_cols] = mdf[unk_cols].fillna("unk")

In [None]:
mdf[["cap_surface", "gill_attachment", "gill_spacing", "ring_type"]].isnull().sum()

In [None]:
mdf.ring_type.value_counts()

In [None]:
mdf.isnull().sum()

#### As a result of the above operation, we have successfully handled most of the null values. Rest of the columns have very few null entries (near to 0%). We can drop these.

In [None]:
# Remove every row that still contains a missing value, then show the
# per-column null counts of the result.
mdf = mdf.dropna()
mdf.isnull().sum()

### **Removing unwanted/rare data:**

In [None]:
mdf.nunique()

#### Earlier we had seen that all these categorical columns have many unique values in them, and most of those values have a very low spread in the dataset. 
#### The above output shows that apart from the target column, three numerical columns & 2 other categorical columns, all the others have more than 20 unique values. This much variation will confuse both us and our ML algorithm. 
####  

#### What are we gonna do then? 
> Let's consider only those unique values that have at least a 5% spread in the respective columns. The rest of them might have been wrongly entered or are rare. We can change those to `Other`

In [None]:
mdf.cap_shape.value_counts()/mdf.cap_shape.count()*100 #will consider till cap_shape='b', rest-->'other'

In [None]:
# Percentage share of each cap_shape category among the non-null values.
cap_shape_pct = mdf.cap_shape.value_counts() / mdf.cap_shape.count() * 100
# Categories holding less than 5% of the rows.
rare_shapes = cap_shape_pct[cap_shape_pct < 5].index
print(rare_shapes)

# Collapse the rare categories into a single "other" label and show the new shares.
is_rare_shape = mdf.cap_shape.isin(rare_shapes)
mdf.loc[is_rare_shape, 'cap_shape'] = "other"
print(mdf.cap_shape.value_counts()/mdf.cap_shape.count()*100)

#### This makes the change we intended. But **if, in any case, the percentage of 'other' is also less than 5%,** we can drop those rows.

In [None]:
RARE_PCT = 5  # a category holding less than this share (in %) of a column is "rare"

for col in ['cap_surface', 'cap_color', 'gill_attachment', 'gill_spacing', 'gill_color',
            'stem_color', 'has_ring', 'ring_type', 'habitat']:

    # Percentage share of each category among the non-null values of the column.
    percent_counts = mdf[col].value_counts()/mdf[col].count()*100
    rare = percent_counts[percent_counts<RARE_PCT]
    idx = rare.index

    if rare.empty:
        # Nothing is rare in this column: report that, rather than announcing
        # a drop that never happened.
        print(f"No rare categories in '{col}' column")

    elif rare.sum()>=RARE_PCT:
        # Together the rare categories are large enough to form their own class.
        mdf.loc[mdf[col].isin(idx), col] = "other"
        print(f"Rare categories in '{col}' column are classed as 'other'")

    else:
        # Even combined they stay below the threshold, so the rows are removed.
        mdf = mdf.loc[~mdf[col].isin(idx)]
        print(f"Rare categories in '{col}' column are dropped")
    

In [None]:
mdf.nunique()

#### Here's information on the most frequent values of each categorical column

* **cap-shape (n):** bell=b, convex=x, flat=f, sunken=s,
* **cap-color (n):** brown=n, gray=g, white=w, yellow=y, orange=o, red=e,
* **does-bruise-or-bleed (n):** true=t,false=f,
* **gill-spacing (n):** close=c, distant=d,
* **gill-color (n):**  brown=n, white=w, gray=g, pink=p, yellow=y, orange=o,
* **stem-color (n):**  brown=n, white=w, yellow=y, 
* **has-ring (n):** true=t, false=f,
* **ring-type (n):** flaring=f,
* **habitat (n):** grasses=g, leaves=l, meadows=m, woods=d,
* **season (n):** spring=s, summer=u, autumn=a, winter=w

In [None]:
# Category name mapping: single-letter codes -> readable labels, per column.
# Values without an entry in a column's mapping are left unchanged by replace().
# NOTE(review): the UCI secondary-mushroom data dictionary appears to list
# ring-type "f" as "none" (with "r" for flaring) - confirm the 'f' label below.
label_maps = {
    "cap_shape": {'b':"bell", 'x':"convex", 'f':"flat", 's':"sunken"},
    "cap_color": {'n':"brown", 'g':"gray", 'w':"white", 'y':"yellow", 'o':"orange", 'e':"red"},
    "gill_spacing": {'c':"close", 'd':"distant"},
    "gill_color": {'n':"brown", 'g':"gray", 'w':"white", 'p':"pink", 'y':"yellow", 'o':"orange"},
    "stem_color": {'n':"brown", 'w':"white", 'y':"yellow"},
    "ring_type": {'f':"flaring"},
    "habitat": {'g':"grasses", 'l':"leaves", 'm':"meadows", 'd':"woods"},
    "season": {'s':"spring", 'u':"summer", 'w':"winter", 'a':"autumn"},
}

for column, code_to_name in label_maps.items():
    mdf[column] = mdf[column].replace(code_to_name)

In [None]:
# Random preview of 2 rows; the fixed seed keeps the displayed rows stable across runs.
mdf.sample(2, random_state=42)

In [None]:
categorical_cols= ['cap_shape', 'cap_surface', 'cap_color', 'gill_attachment', 'gill_spacing', 'gill_color',
                   'stem_color', 'has_ring', 'ring_type', 'does_bruise_or_bleed', 'habitat', 'season' ]
# Arrange the 12 column names as a 4x3 grid that mirrors the subplot layout.
categorical_cols = np.reshape(categorical_cols, (4,3))

f, axs = plt.subplots(4, 3, figsize=(18, 15))

# Walk the axes grid and the name grid in lockstep: one count plot per cell.
for axes_row, names_row in zip(axs, categorical_cols):
    for ax, name in zip(axes_row, names_row):
        sns.countplot(ax=ax, data=mdf, x=name)
        ax.set_title(f"Freq. of each category in {name}")
        ax.set_xlabel("")
        ax.set_ylabel("")

f.tight_layout()

In [None]:
mdf.shape

#### //- SUMMARY from above:
1. ##### Outliers from the numerical columns are removed. **Around 2.5 lac records (~ 8% of whole) dropped.**
2. ##### **"stem_root", "stem_surface", "veil_type", "veil_color", and "spore_print_color"** columns had more than 50% null values. Hence those columns were *dropped.* 
3. ##### Null values in **"cap_surface", "gill_attachment", "gill_spacing" & "ring_type"** columns were *imputed with "unk" as in "Unknown".*
4. ##### In other categorical columns... *only those categories are kept that had a **spread of 5% or more** in the respective columns. If not, all are combined to form a new category as "Other".*
5. ##### **Categories were replaced with their more convenient names** with reference from original website.
6. ##### As of now there are **16 columns** *(1 target column, 3 Numerical ones, 3 binary & rest of those are nominal ones)*. Around **27.5 lac rows present.**