# **LIBRARIES**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

# **READING AND DISPLAYING DATASET**

In [2]:
df=pd.read_csv("//kaggle//input//mushrooms//Mushrooms.csv")
df

Unnamed: 0,class;cap-diameter;cap-shape;cap-surface;cap-color;does-bruise-or-bleed;gill-attachment;gill-spacing;gill-color;stem-height;stem-width;stem-root;stem-surface;stem-color;veil-type;veil-color;has-ring;ring-type;spore-print-color;habitat;season
0,p;15.26;x;g;o;f;e;;w;16.95;17.09;s;y;w;u;w;t;g...
1,p;16.6;x;g;o;f;e;;w;17.99;18.19;s;y;w;u;w;t;g;...
2,p;14.07;x;g;o;f;e;;w;17.8;17.74;s;y;w;u;w;t;g;...
3,p;14.17;f;h;e;f;e;;w;15.77;15.98;s;y;w;u;w;t;p...
4,p;14.64;x;h;o;f;e;;w;16.53;17.2;s;y;w;u;w;t;p;...
...,...
61064,p;1.18;s;s;y;f;f;f;f;3.93;6.22;;;y;;;f;f;;d;a
61065,p;1.27;f;s;y;f;f;f;f;3.18;5.43;;;y;;;f;f;;d;a
61066,p;1.27;s;s;y;f;f;f;f;3.86;6.37;;;y;;;f;f;;d;u
61067,p;1.24;f;s;y;f;f;f;f;3.56;5.44;;;y;;;f;f;;d;u


# **DATA CLEANING**

**spliting 1 column**

In [3]:
df[['class', 'cap-diameter','cap-shape','cap-surface','cap-color','does-bruise-or-bleed','gill-attachment','gill-spacing','gill-color','stem-height','stem-width','stem-root','stem-surface','stem-color','veil-type','veil-color','has-ring','ring-type','spore-print-color','habitat','season']] = df['class;cap-diameter;cap-shape;cap-surface;cap-color;does-bruise-or-bleed;gill-attachment;gill-spacing;gill-color;stem-height;stem-width;stem-root;stem-surface;stem-color;veil-type;veil-color;has-ring;ring-type;spore-print-color;habitat;season'].str.split(';', expand=True)

**after spliting drop the original column**

In [4]:
df.drop('class;cap-diameter;cap-shape;cap-surface;cap-color;does-bruise-or-bleed;gill-attachment;gill-spacing;gill-color;stem-height;stem-width;stem-root;stem-surface;stem-color;veil-type;veil-color;has-ring;ring-type;spore-print-color;habitat;season',axis=1,inplace = True)

In [5]:
df

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,1.18,s,s,y,f,f,f,f,3.93,...,,,y,,,f,f,,d,a
61065,p,1.27,f,s,y,f,f,f,f,3.18,...,,,y,,,f,f,,d,a
61066,p,1.27,s,s,y,f,f,f,f,3.86,...,,,y,,,f,f,,d,u
61067,p,1.24,f,s,y,f,f,f,f,3.56,...,,,y,,,f,f,,d,u


**checking NULL values**

In [6]:
df.isnull().sum() #Checking NULL values

class                   0
cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             0
stem-width              0
stem-root               0
stem-surface            0
stem-color              0
veil-type               0
veil-color              0
has-ring                0
ring-type               0
spore-print-color       0
habitat                 0
season                  0
dtype: int64

In [7]:
categorical_columns = ['cap-shape', 'gill-attachment','gill-color','stem-color','season']
for column in categorical_columns:
    unique_values = df[column].unique()
    print(f"Unique values in {column}: {unique_values}")

Unique values in cap-shape: ['x' 'f' 'p' 'b' 'c' 's' 'o']
Unique values in gill-attachment: ['e' '' 'a' 'd' 's' 'x' 'p' 'f']
Unique values in gill-color: ['w' 'n' 'p' 'u' 'b' 'g' 'y' 'r' 'e' 'o' 'k' 'f']
Unique values in stem-color: ['w' 'y' 'n' 'u' 'b' 'l' 'r' 'p' 'e' 'k' 'g' 'o' 'f']
Unique values in season: ['w' 'u' 'a' 's']


**Class:** This column represents whether a mushroom is poisonous or edible. The unique values are 'p' and 'e', where 'p' indicates a poisonous mushroom and 'e' indicates an edible mushroom.

**Cap Shape:** This column describes the shape of the mushroom's cap. The unique values are 'x', 'p', 'f', 'b', 'c', 's', and 'o'. Each value represents a different cap shape, such as 'b' for bell, 'c' for conical, 'x' for convex, 'f' for flat, 's' for sunken, 'p' for spherical,'o' for others.

**Gill Attachment:** This column indicates how the gills of the mushroom are attached to the stem. The unique values are 'e', '', 'a', 'd', 's', 'x', 'p', and 'f'. Each value represents a different type of gill attachment, such as 'a' for adnate, 'x' for adnexed, 'd' for decurrent, 'e' for free,'s' for sinuate, 'p' for pores, 'f' for none, '' for unknown.

**Gill Color:** This column represents the color of the mushroom's gills. The unique values are 'w', 'n', 'p', 'u', 'b', 'g', 'y', 'r', 'e', 'o', 'k', and 'f'. Each value corresponds to a different gill color, such as 'w' for white, 'n' for brown, 'p' for pink, 'u' for purple, 'b' for buff, 'g' for gray, 'y' for yellow, 'r' for red, 'e' for orange, 'o' for brown-orange, 'k' for black, and 'f' for none.

**Stem Color:** This column describes the color of the mushroom's stem. The unique values are 'w', 'y', 'n', 'u', 'b', 'l', 'r', 'p', 'e', 'k', 'g', and 'o'. Each value represents a different stem color, such as 'w' for white, 'y' for yellow, 'n' for brown, 'u' for purple, 'b' for buff, 'l' for gray, 'r' for red, 'p' for pink, 'e' for brown, 'k' for black, 'g' for green, and 'o' for orange.

**Season:** This column represents the season in which the mushroom was observed. The unique values are 'a', 'u', 'w', and 's'. Each value corresponds to a different season, such as 'a' for autumn, 'u' for summer, 'w' for winter, and 's' for spring.

In [8]:
cols = ['gill-attachment','gill-color']
for column in cols:
    df = df[df[column] != '']
    df = df[~df[column].str.contains('f')]

In [9]:
categorical_columns = ['cap-shape', 'gill-attachment','gill-color','stem-color','season']
for column in categorical_columns:
    unique_values = df[column].unique()
    print(f"Unique values in {column}: {unique_values}")

Unique values in cap-shape: ['x' 'f' 'p' 'b' 'c' 's' 'o']
Unique values in gill-attachment: ['e' 'a' 'd' 's' 'x' 'p']
Unique values in gill-color: ['w' 'n' 'p' 'b' 'u' 'g' 'y' 'r' 'e' 'o' 'k']
Unique values in stem-color: ['w' 'y' 'n' 'u' 'b' 'l' 'r' 'p' 'e' 'k' 'g' 'o']
Unique values in season: ['w' 'u' 'a' 's']


**drop extra(Unnecessary) variables**

In [10]:
df.drop(['cap-surface','cap-color','does-bruise-or-bleed','gill-spacing','stem-root','stem-surface','veil-type','veil-color','has-ring','ring-type','spore-print-color','habitat'],axis=1,inplace=True)

**check and drop duplicates**

In [11]:
num_duplicates = df.duplicated().sum() #Checking Duplicates
print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 1


In [12]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

**checking data type of each column**

In [13]:
for column in df.columns:
    data_type = df[column].dtype
    print(f"Data type of {column}: {data_type}")

Data type of class: object
Data type of cap-diameter: object
Data type of cap-shape: object
Data type of gill-attachment: object
Data type of gill-color: object
Data type of stem-height: object
Data type of stem-width: object
Data type of stem-color: object
Data type of season: object


**change type of numerical columns from object to float**

In [14]:
df['cap-diameter'] = df['cap-diameter'].astype(float)
df['stem-height'] = df['stem-height'].astype(float)
df['stem-width'] = df['stem-width'].astype(float)

**checking and drop outliers**

In [15]:
df.shape

(47654, 9)

In [16]:
for column in df.select_dtypes(include='number').columns:
    fig = go.Figure()
    fig.add_trace(go.Box(y=df[column], name=column))
    fig.update_layout(title=f"Boxplot for {column}")
    fig.show()

    # Identify outliers based on the box plot
    column_values = df[column]
    q1 = column_values.quantile(0.25)
    q3 = column_values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = column_values[(column_values < lower_bound) | (column_values > upper_bound)]
    
# Remove outliers from the column
df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

In [17]:
df

Unnamed: 0,class,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season
0,p,15.26,x,e,w,16.95,17.09,w,w
1,p,16.60,x,e,w,17.99,18.19,w,u
2,p,14.07,x,e,w,17.80,17.74,w,w
3,p,14.17,f,e,w,15.77,15.98,w,w
4,p,14.64,x,e,w,16.53,17.20,w,w
...,...,...,...,...,...,...,...,...,...
59299,p,6.53,f,p,n,3.83,5.94,n,u
59300,p,5.79,s,p,n,4.32,7.18,n,u
59301,p,4.46,f,p,n,3.72,5.76,n,u
59302,p,5.53,s,p,n,4.41,5.35,n,u


**normalization of numerical columns**

In [18]:
numerical_vars = ['cap-diameter', 'stem-height', 'stem-width']
# Min-max normalization
normalizer = MinMaxScaler()
df[numerical_vars] = normalizer.fit_transform(df[numerical_vars])

# **DATA VISUALIZATION**

In [19]:
# Pie chart
fig = px.pie(df, names='class', title='Class Distribution')
fig.update_traces(pull=[0.1, 0])  # Pull sectors out from the center
fig.show()

**This pie chart illustrates the distribution of classes in the dataset. Each slice of the pie represents a class label, and the size of each slice corresponds to the proportion of that class in the dataset (to know data is balanced or not)**

In [20]:
# Area plot
fig = px.area(df, x='season', y='stem-height', color='class')
fig.update_layout(title='Area Plot - Stem Height by Season',xaxis_title='Season', yaxis_title='Stem Height')
fig.show()

**This area plot shows the variation in 'stem-height' values over different seasons. Each area represents the range of stem heights for a particular season(to know the amount of poisoned and edible mushrooms in each season)**

In [21]:
#Histogram
fig = px.histogram(df, x='gill-attachment', color='class', title='Gill Attachment by Class',labels={'gill-attachment': 'Gill Attachment'}, nbins=10)
fig.show()

**This histogram represents the distribution of 'gill-attachment' values. The x-axis represents the Gill Attachment values, and the y-axis represents the count of occurrences for each gill attachment range**

**encoding of categorical columns**

In [22]:
categorical_columns = ['cap-shape', 'gill-attachment', 'gill-color', 'stem-color', 'season']

# Initialize LabelEncoder
encoder = LabelEncoder()

# Encode each categorical column
for column in categorical_columns:
    df[column] = encoder.fit_transform(df[column])

# Define the encoding dictionary for 'class' column
encoding = {'e': 0, 'p': 1}

# Encode 'class' column using replace function
df['class'] = df['class'].replace(encoding)


**Rearrangment of columns**

In [23]:
# Column to be moved to the end
column_name = 'class'

# Get the list of column names excluding the target column
other_columns = [col for col in df.columns if col != column_name]

# Reorder the columns
new_columns = other_columns + [column_name]
df = df[new_columns]

In [24]:
df

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,0.265591,6,2,9,0.590994,0.518554,10,3,1
1,0.289606,6,2,9,0.630019,0.553441,10,2,1
2,0.244265,6,2,9,0.622889,0.539169,10,3,1
3,0.246057,2,2,9,0.546717,0.483349,10,3,1
4,0.254480,6,2,9,0.575235,0.522042,10,3,1
...,...,...,...,...,...,...,...,...,...
59299,0.109140,2,3,4,0.098687,0.164922,5,2,1
59300,0.095878,5,3,4,0.117073,0.204250,5,2,1
59301,0.072043,2,3,4,0.094559,0.159213,5,2,1
59302,0.091219,5,3,4,0.120450,0.146210,5,2,1


# **BUILD ML MODEL**

**spliting dataset into x and y**

In [25]:
x = df.drop(["class"], axis=1)
y = df["class"]
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.30)

In [26]:
print('x_train shape: ', X_train.shape)
print('x_test shape: ', X_test.shape)
print('y_train shape: ', y_train.shape)
print('y_test shape: ', y_test.shape)

x_train shape:  (32013, 8)
x_test shape:  (13721, 8)
y_train shape:  (32013,)
y_test shape:  (13721,)


**Importing and Traning Model**

In [27]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)

**Predict x-test**

In [28]:
y_predict = knn.predict(X_test)
y_predict

array([0, 0, 0, ..., 1, 0, 0])

**Classification Report**

In [29]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6424
           1       0.99      0.99      0.99      7297

    accuracy                           0.99     13721
   macro avg       0.99      0.99      0.99     13721
weighted avg       0.99      0.99      0.99     13721



# **CLUSTERING**

**Using k-means for clustering**

In [30]:
kmeans = KMeans(n_clusters=2)

# Fit the model to the data
kmeans.fit(x)

# Retrieve the cluster labels assigned to each sample
cluster_labels = kmeans.labels_

# Print the cluster labels
print("Cluster Labels:")
print(cluster_labels)

Cluster Labels:
[0 0 0 ... 1 1 1]


**Adding clustering result to dataframe**

In [31]:
df['clustering_column'] = cluster_labels

In [32]:
df

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class,clustering_column
0,0.265591,6,2,9,0.590994,0.518554,10,3,1,0
1,0.289606,6,2,9,0.630019,0.553441,10,2,1,0
2,0.244265,6,2,9,0.622889,0.539169,10,3,1,0
3,0.246057,2,2,9,0.546717,0.483349,10,3,1,0
4,0.254480,6,2,9,0.575235,0.522042,10,3,1,0
...,...,...,...,...,...,...,...,...,...,...
59299,0.109140,2,3,4,0.098687,0.164922,5,2,1,1
59300,0.095878,5,3,4,0.117073,0.204250,5,2,1,1
59301,0.072043,2,3,4,0.094559,0.159213,5,2,1,1
59302,0.091219,5,3,4,0.120450,0.146210,5,2,1,1


In [33]:
# Scatter plot for 'class'
fig_class = px.scatter(df, x='cap-diameter', y='stem-height', color='class',title='Scatter Plot: Class')
fig_class.show()

# Scatter plot for 'clustering_column'
fig_clustering = px.scatter(df, x='cap-diameter', y='stem-height', color='clustering_column',title='Scatter Plot: Clustering Column')
fig_clustering.show()

**This scatter matrix shows pairwise scatter plots for the numerical variables ('cap-diameter', 'stem-height', and 'stem-width') to identify any patterns,or clusters between the variables.**