# Mushroom Growth prediction model

## Import packages

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import export_graphviz

### read the dataset using read_CSV method from pandas

#### am changing the working directory to make it easier for navigation

In [2]:
os.chdir("C:/Users/ammar/Desktop/paamy/courses/ML/Ammar Mohammed ETS0191")

In [3]:
df = pd.read_csv("data/mushrooms.csv")

#### check the shape of the data (number of rows and columns)

In [4]:
df.shape

(8124, 23)

### for this assignment i want to use just 10 columns excluding the class which we will drop later and the growth stage column we will be adding

In [5]:
print(df.columns.to_list())

['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']


In [6]:
selected_columns = ['class','cap-shape', 'cap-surface', 'cap-color', 'odor', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'habitat', 'population']
df = df[selected_columns]

### Since we are predicting growth stage. we need to synthesis growth stage for the data itself for mocking

In [7]:

def simulate_growth_stage(row):
    if row['cap-shape'] == 5 and row['gill-size'] == 1:
        return 2  # overgrown
    elif row['odor'] in [6, 5]:
        return 1  # mature
    else:
        return np.random.choice([0, 1], p=[0.7, 0.3])  # mostly young

df['growth_stage'] = df.apply(simulate_growth_stage, axis=1)


#### check the columns agains

In [8]:
df.shape

(8124, 12)

### Now let us explore the data more

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   class         8124 non-null   object
 1   cap-shape     8124 non-null   object
 2   cap-surface   8124 non-null   object
 3   cap-color     8124 non-null   object
 4   odor          8124 non-null   object
 5   gill-size     8124 non-null   object
 6   gill-color    8124 non-null   object
 7   stalk-shape   8124 non-null   object
 8   stalk-root    8124 non-null   object
 9   habitat       8124 non-null   object
 10  population    8124 non-null   object
 11  growth_stage  8124 non-null   int64 
dtypes: int64(1), object(11)
memory usage: 761.8+ KB


#### we can see from the above the columns, number of non null items and their data types

#### so the data overview looks like this now

In [10]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,odor,gill-size,gill-color,stalk-shape,stalk-root,habitat,population,growth_stage
0,p,x,s,n,p,n,k,e,e,u,s,0
1,e,x,s,y,a,b,k,e,c,g,n,0
2,e,b,s,w,l,b,n,e,c,m,n,0
3,p,x,y,w,p,n,n,e,e,u,s,0
4,e,x,s,g,n,b,k,t,e,g,a,0


### Now some information like mean, median and other description using describe() method

In [11]:
df.describe()

Unnamed: 0,growth_stage
count,8124.0
mean,0.297267
std,0.457083
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


### Now let us check if there is any missing data

In [12]:
df.isnull().sum()

class           0
cap-shape       0
cap-surface     0
cap-color       0
odor            0
gill-size       0
gill-color      0
stalk-shape     0
stalk-root      0
habitat         0
population      0
growth_stage    0
dtype: int64

#### from the above we can see that there is no any null values. if there were any we need to clean and drop those values

## Data Manipulation
#### all the rows hence the data is categorical we will use a proxy label encoder.
#### When describing our data we have seen that all our columns are of type object. so to use the label encoder we need to change their data types to category

In [13]:
df = df.astype('category')

In [14]:
df.dtypes

class           category
cap-shape       category
cap-surface     category
cap-color       category
odor            category
gill-size       category
gill-color      category
stalk-shape     category
stalk-root      category
habitat         category
population      category
growth_stage    category
dtype: object

In [15]:
for column in df.columns:
    labelencoder = LabelEncoder()
    df[column] = labelencoder.fit_transform(df[column])


#### now we can check and see if the dataset is transformed

In [16]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,odor,gill-size,gill-color,stalk-shape,stalk-root,habitat,population,growth_stage
0,1,5,2,4,6,1,4,0,3,5,3,0
1,0,5,2,9,0,0,4,0,2,1,2,0
2,0,0,2,8,3,0,5,0,2,3,2,0
3,1,5,3,8,6,1,5,0,3,5,3,0
4,0,5,2,3,5,0,4,1,3,1,0,0


# Prepare the data

### let as set the x and y axis. In the mean time we need to make the growth_stage the target variable. Hence we should drop growth_stage before training. and also we should drop class since this is classification it has no effect in growth_stage

In [17]:
X = df.drop(columns=['growth_stage', 'class'], axis=1)
Y = df['growth_stage']

## Split the data into training and testing dataset

In [18]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

# Train the model

### Now that we have set up the training and tesing datasets. we can Train the model

## I will use Random forest for better accuracy.

In [19]:
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(x_train, y_train)

### Now that we have trained the model let as test the accuracy

In [20]:
print("Test accuracy: {}%".format(round(rf.score(x_test,y_test)*100, 2)))

Test accuracy: 64.31%


In [21]:
prediction = rf.predict(x_test)
print("Random forest insight data: \n\n", classification_report(y_test, prediction) )

Random forest insight data: 

               precision    recall  f1-score   support

           0       0.70      0.85      0.77      1125
           1       0.34      0.17      0.23       500

    accuracy                           0.64      1625
   macro avg       0.52      0.51      0.50      1625
weighted avg       0.59      0.64      0.60      1625

