In [29]:
import pandas as pd
from pathlib import Path

## Raw DataFrame

In [30]:
# Location of the raw CSV, relative to the notebook's working directory.
path = Path('MushroomDataset/secondary_data.csv')

# The file separates fields with ';' rather than the default ','.
df = pd.read_csv(path, sep=";")

# Print the column names, non-null counts and dtypes to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 61069 non-null  object 
 1   cap-diameter          61069 non-null  float64
 2   cap-shape             61069 non-null  object 
 3   cap-surface           46949 non-null  object 
 4   cap-color             61069 non-null  object 
 5   does-bruise-or-bleed  61069 non-null  object 
 6   gill-attachment       51185 non-null  object 
 7   gill-spacing          36006 non-null  object 
 8   gill-color            61069 non-null  object 
 9   stem-height           61069 non-null  float64
 10  stem-width            61069 non-null  float64
 11  stem-root             9531 non-null   object 
 12  stem-surface          22945 non-null  object 
 13  stem-color            61069 non-null  object 
 14  veil-type             3177 non-null   object 
 15  veil-color         

In [31]:
# Encode the classification labels as integers: 'p' -> 0 and 'e' -> 1.
# Series.map is used instead of Series.replace so the object -> int conversion
# is explicit and does not rely on replace's deprecated silent downcasting.
# astype raises if any label is missing from the mapping (for example when this
# cell is run a second time) instead of silently leaving NaN in the target.
df["class"] = df["class"].map({"p": 0, "e": 1}).astype("int64")

  df["class"] = df["class"].replace({"p":0, "e":1})


In [32]:
# Count how many rows carry each encoded class label.
df.loc[:, "class"].value_counts()

class
0    33888
1    27181
Name: count, dtype: int64

## New DataFrame with blank values filled

In [34]:
# Fill the blanks in gill-attachment with an explicit "unknown" category.
# The result is bound to its own name rather than overwriting `df`: the
# column-dropping step further down reads `df` and relies on gill-attachment
# still containing NaN, so reassigning `df` here would make a top-to-bottom
# run keep that column and disagree with the recorded outputs.
values = {"gill-attachment": "unknown"}
filled_df = df.fillna(value=values)

In [36]:
# Tally the categories present in gill-spacing (missing entries are not counted).
df.loc[:, "gill-spacing"].value_counts()

gill-spacing
c    24710
d     7766
f     3530
Name: count, dtype: int64

## New DataFrame with columns containing blank values removed

In [33]:
# Reduce dimensionality by dropping every column that contains a NaN value.
# dropna returns a new DataFrame, so no explicit copy of `df` is needed first.
reduced_features_df = df.dropna(axis="columns")

In [34]:
# Number of distinct values in each remaining column.
reduced_features_df.nunique(axis=0)

class                      2
cap-diameter            2571
cap-shape                  7
cap-color                 12
does-bruise-or-bleed       2
gill-color                12
stem-height             2226
stem-width              4630
stem-color                13
has-ring                   2
habitat                    8
season                     4
dtype: int64

In [35]:
# Keep the target column on its own, as a single-column DataFrame.
target_variable_df = reduced_features_df["class"].to_frame()

In [36]:
# Collect the numerical features in a standalone DataFrame.
numerical_columns = ["cap-diameter", "stem-height", "stem-width"]
reduced_features_numerical_df = reduced_features_df.loc[:, numerical_columns]

In [37]:
# Whatever is neither the target nor a numerical feature is treated as
# categorical; these columns are turned into dummy variables next.
non_categorical_columns = ["class", "cap-diameter", "stem-height", "stem-width"]
reduced_features_categorical_df = reduced_features_df.drop(
    columns=non_categorical_columns
)

In [38]:
# One-hot encode the categorical features into integer indicator columns.
dummies = pd.get_dummies(
    data=reduced_features_categorical_df,
    dtype=int,
)


In [39]:
# Place the numerical features and the dummy columns side by side.
feature_blocks = [reduced_features_numerical_df, dummies]
all_features_df = pd.concat(feature_blocks, axis="columns")


In [40]:
# Target column first, followed by every (non-scaled) feature column.
final_perprocessed_nonScaled_df = pd.concat(
    (target_variable_df, all_features_df),
    axis="columns",
)

In [42]:
# Write the preprocessed, non-scaled table to disk.
# The frame assembled in the preceding cell is final_perprocessed_nonScaled_df;
# no visible cell defines a `final_perprocessed_df`, so referencing that name
# raises NameError on a fresh kernel.
# NOTE(review): if a scaled frame was meant to be exported here, the cell that
# builds it is missing from the notebook — confirm.
out_path = Path('MushroomDataset/preprocessed_mushroom_data_1.csv')
# Column names are written as the header row by default; the row index is omitted.
final_perprocessed_nonScaled_df.to_csv(out_path, index=False)