# Mushroom Classification Notebook

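In this notebook, we perform data cleaning, preprocessing, and model training for the mushroom classification problem. We filter invalid category values, drop columns that are mostly missing, cap numeric outliers, impute the remaining missing values, and one-hot encode the categorical features. We then train an XGBoost classifier with stratified cross-validation and a CatBoost classifier evaluated on a hold-out split. Finally, we generate predictions with the XGBoost models and write the submission file.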

## Data Cleaning


In [5]:
import numpy as np
import pandas as pd
from scipy.stats import mode, norm, skew, kurtosis
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')
import time 
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import torch

In [37]:
# Read the competition files and the additional original_data.csv.
# Forward slashes are used so the relative paths resolve on every OS, not only on Windows
# (a backslash is an ordinary filename character on Linux/macOS).
DATA_DIR = "Input Data"
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
# original_data.csv is semicolon-delimited, unlike the other files.
original_df = pd.read_csv(f"{DATA_DIR}/original_data.csv", sep=";")

In [7]:
# Remove the 'id' column from the training frame.
train_df = train_df.drop(columns=['id'])

In [8]:
# Stack the original dataset under the training rows with a fresh index,
# then discard rows that are exact duplicates.
train_df = (
    pd.concat((train_df, original_df), ignore_index=True)
    .drop_duplicates()
)
train_df

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,8.80,f,s,u,f,a,c,w,4.51,...,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,...,,y,o,,,t,z,,d,w
2,e,6.94,f,s,b,f,x,c,w,6.85,...,,s,n,,,f,f,,l,w
3,e,3.88,f,y,g,f,s,,g,4.16,...,,,w,,,f,f,,d,u
4,e,5.85,x,l,w,f,d,,w,3.37,...,,,w,,,f,f,,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3178009,p,1.18,s,s,y,f,f,f,f,3.93,...,,,y,,,f,f,,d,a
3178010,p,1.27,f,s,y,f,f,f,f,3.18,...,,,y,,,f,f,,d,a
3178011,p,1.27,s,s,y,f,f,f,f,3.86,...,,,y,,,f,f,,d,u
3178012,p,1.24,f,s,y,f,f,f,f,3.56,...,,,y,,,f,f,,d,u


In [9]:
# Lower-case ASCII letters 'a'..'z' (code points 97-122): the only category codes that are kept.
alphabets = {chr(code) for code in range(97, 123)}


def filter_alpha(value):
    """Return `value` when it is a single lower-case ASCII letter, otherwise NaN."""
    if not isinstance(value, str):
        return np.nan
    if len(value) != 1 or value not in alphabets:
        return np.nan
    return value


# Clean every object-typed column of the training frame value by value.
object_columns = [name for name, dtype in train_df.dtypes.items() if dtype == 'object']
for col in object_columns:
    train_df[col] = train_df[col].apply(filter_alpha)

In [10]:
def converting_datatypes(df):
    """Return a copy of `df` with more compact column dtypes.

    object -> category, int64 -> int32 and float64 -> float32. Columns of any
    other dtype are copied unchanged, and the input frame is not modified.
    """
    compact_dtype = {'object': 'category', 'int64': 'int32', 'float64': 'float32'}

    # Decide every cast from the dtypes of the incoming frame before converting anything.
    planned_casts = {}
    for name in df.columns:
        for source, target in compact_dtype.items():
            if df[name].dtype == source:
                planned_casts[name] = target
                break

    converted = df.copy()
    for name, target in planned_casts.items():
        converted[name] = converted[name].astype(target)
    return converted

In [11]:
# Switch the training frame to the compact dtypes (category / int32 / float32).
train_df = converting_datatypes(train_df)


In [12]:
# Missing values per training column: absolute count and share of all rows,
# listed with the most affected columns first.
n_miss = train_df.isna().sum()
null_percentage = (n_miss / len(train_df)) * 100
missing_df = pd.DataFrame(
    {'n_miss': n_miss, 'Null Percentage': null_percentage}
).sort_values(by='n_miss', ascending=False)
missing_df

Unnamed: 0,n_miss,Null Percentage
veil-type,3015242,94.882544
spore-print-color,2904290,91.391147
stem-root,2808574,88.37919
veil-color,2794460,87.935056
stem-surface,2019030,63.534105
gill-spacing,1283528,40.389595
cap-surface,685216,21.562129
gill-attachment,533851,16.79903
ring-type,131378,4.134155
gill-color,124,0.003902


In [13]:
# Drop the four columns that top the missing-value summary (each is more than 87% empty).
train_df = train_df.drop(
    columns=['veil-type', 'spore-print-color', 'stem-root', 'veil-color']
)

In [14]:
# Names of the float32 (numeric) columns of the training frame.
numerical_columns = train_df.select_dtypes(include=['float32']).columns.tolist()

In [15]:
# Apply the same single-letter cleaning to the test set. The `filter_alpha`
# helper (and its `alphabets` set) defined for the training data is reused
# instead of being redefined, so both sets are guaranteed to share one rule.
object_columns_test = [col for col in test_df.columns if test_df[col].dtype == 'object']
for col in object_columns_test:
    test_df[col] = test_df[col].apply(filter_alpha)


In [16]:
# Switch the test frame to the compact dtypes (category / int32 / float32).
test_df = converting_datatypes(df=test_df)

In [17]:
# Drop from the test frame the same four columns that were removed from the training frame.
test_df = test_df.drop(
    columns=['veil-type', 'spore-print-color', 'stem-root', 'veil-color']
)

In [18]:
# Missing values per test column: absolute count and share of all rows,
# listed with the most affected columns first.
n_miss = test_df.isna().sum()
null_percentage_test = (n_miss / len(test_df)) * 100
test_missing_df = pd.DataFrame(
    {'n_miss': n_miss, 'Null Percentage': null_percentage_test}
).sort_values(by='n_miss', ascending=False)
test_missing_df

Unnamed: 0,n_miss,Null Percentage
stem-surface,1321528,63.597252
gill-spacing,839613,40.40556
cap-surface,446945,21.508794
gill-attachment,349867,16.83701
ring-type,86211,4.148821
gill-color,93,0.004476
cap-shape,71,0.003417
habitat,56,0.002695
cap-color,55,0.002647
stem-color,55,0.002647


In [19]:
def outlier_thresholds(train_df, col_name, q1=0.01, q3=0.99):
    """Return the `(low_limit, up_limit)` outlier bounds of one column.

    The bounds are the `q1` and `q3` quantiles of `train_df[col_name]`, each
    pushed outwards by 1.5 times the distance between those two quantiles.
    """
    quartile1 = train_df[col_name].quantile(q1)
    quartile3 = train_df[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(train_df, col_name):
    """Return True when any value of `train_df[col_name]` lies outside its outlier bounds."""
    low_limit, up_limit = outlier_thresholds(train_df, col_name)
    values = train_df[col_name]
    # Test the mask itself rather than the truthiness of the selected rows:
    # an outlier row whose values are all zero would otherwise be reported as "no outlier".
    return bool(((values > up_limit) | (values < low_limit)).any())


def replace_with_thresholds(train_df, variable):
    """Cap `train_df[variable]` in place at its own outlier bounds.

    Values below the lower bound are set to the lower bound and values above
    the upper bound to the upper bound; missing values are left untouched.
    """
    low_limit, up_limit = outlier_thresholds(train_df, variable)
    train_df.loc[train_df[variable] < low_limit, variable] = low_limit
    train_df.loc[train_df[variable] > up_limit, variable] = up_limit

In [20]:
# Cap outliers in both sets using limits learned from the training data only,
# so the same raw value is treated identically in train and test.
for col in numerical_columns:
    # Read the limits before the training column is modified.
    low_limit, up_limit = outlier_thresholds(train_df, col)
    replace_with_thresholds(train_df, col)
    test_df[col] = test_df[col].clip(lower=low_limit, upper=up_limit)

In [21]:

# Impute on a dtype-normalised copy; train_df itself keeps its missing values.
filled_df = converting_datatypes(train_df.copy())
num_cols = filled_df.select_dtypes(include='float32').columns
cat_cols = filled_df.select_dtypes(include='category').columns

# Numeric gaps take the column median, categorical gaps the most frequent value.
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')
filled_df[num_cols] = numeric_imputer.fit_transform(filled_df[num_cols])
filled_df[cat_cols] = categorical_imputer.fit_transform(filled_df[cat_cols])

In [22]:
# Re-apply the compact dtypes to the imputed training frame.
filled_df = converting_datatypes(df=filled_df)


In [23]:

# Impute the test set with statistics learned from the training data, so a
# missing value is filled with the same median / most frequent value in both sets.
test_filled_df = converting_datatypes(test_df)
num_cols = test_filled_df.select_dtypes(include=['float32']).columns
cat_cols = test_filled_df.select_dtypes(include=['category']).columns

# Fit on the training columns only; the test set is transformed, never fitted on.
numeric_imputer = SimpleImputer(strategy='median').fit(train_df[num_cols])
test_filled_df[num_cols] = numeric_imputer.transform(test_filled_df[num_cols])

categorical_imputer = SimpleImputer(strategy='most_frequent').fit(train_df[cat_cols])
test_filled_df[cat_cols] = categorical_imputer.transform(test_filled_df[cat_cols])

In [24]:
# Names of the categorical predictors; the target column is excluded first.
categorical_features = (
    filled_df.drop(columns='class')
    .select_dtypes(include='category')
    .columns
)
categorical_features

Index(['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed',
       'gill-attachment', 'gill-spacing', 'gill-color', 'stem-surface',
       'stem-color', 'has-ring', 'ring-type', 'habitat', 'season'],
      dtype='object')

In [31]:

# Separate the predictors from the target and encode the class labels as integers.
y_train = filled_df[['class']]
X_train = filled_df.drop(columns=['class'])
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
X_train.head()

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,8.8,f,s,u,f,a,c,w,4.51,15.39,s,w,f,f,d,a
1,4.51,x,h,o,f,a,c,n,4.79,6.48,y,o,t,z,d,w
2,6.94,f,s,b,f,x,c,w,6.85,9.93,s,n,f,f,l,w
3,3.88,f,y,g,f,s,c,g,4.16,6.53,s,w,f,f,d,u
4,5.85,x,l,w,f,d,c,w,3.37,8.36,s,w,f,f,g,a


In [26]:
# Test predictors: the imputed test frame without its 'id' column.
X_test = test_filled_df.drop(columns='id')
X_test.head()

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,8.64,x,t,n,t,a,c,w,11.13,17.120001,s,w,t,g,d,a
1,6.9,o,t,o,f,a,c,y,1.27,10.75,s,n,f,f,d,a
2,2.0,b,g,n,f,a,c,n,6.18,3.14,s,n,f,f,d,s
3,3.47,x,t,n,f,s,c,n,4.98,8.51,s,w,t,z,d,u
4,6.17,x,h,y,f,p,c,y,6.73,13.7,s,y,t,f,d,u


## Model Training

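In this section, we one-hot encode the categorical features and standardize the numeric ones, train an XGBoost model with stratified 5-fold cross-validation, and evaluate each fold with the Matthews correlation coefficient (MCC). The test-set probabilities of the fold models are averaged to produce the predictions written to the submission file.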


### XGBoost

In [33]:


# Train on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare data for training.
# The feature lists are derived from X_train itself, so this cell does not
# depend on whichever earlier cell last assigned `num_cols`.
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')
categorical_features = X_train.select_dtypes(include=['category']).columns
numeric_features = X_train.select_dtypes(include=['number']).columns

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, categorical_features),
        ("StandardScaler", numeric_transformer, numeric_features),
    ]
)

# Fit the encoder/scaler on the training predictors only, then apply them to both sets.
preprocessor.fit(X_train)
X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# XGBoost parameters
params_xgb = {
    'device': device.type,  # 'cuda' or 'cpu', following the detection above
    'enable_categorical': True,
    'tree_method': 'hist',
    'n_estimators': 360,
    'learning_rate': 0.05,
    'max_depth': 14,
    'colsample_bytree': 0.4,
    'min_child_weight': 2,
    'reg_lambda': 67,
    'subsample': 0.98,
    'num_parallel_tree': 4,
}

NUM_FOLDS = 5
val_scores = []      # validation MCC of each fold
test_preds_xgb = []  # test-set class probabilities predicted by each fold's model
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=1)

for fold, (train_index, val_index) in enumerate(skf.split(X_train_preprocessed, y_train_encoded)):
    X_train_fold, X_val_fold = X_train_preprocessed[train_index], X_train_preprocessed[val_index]
    y_train_fold, y_val_fold = y_train_encoded[train_index], y_train_encoded[val_index]

    xgb = XGBClassifier(**params_xgb)
    xgb.fit(X_train_fold, y_train_fold)

    val_pred = xgb.predict(X_val_fold)
    mcc = matthews_corrcoef(y_val_fold, val_pred)
    print(f'Fold {fold}: MCC = {mcc:.5f}')
    val_scores.append(mcc)

    test_preds_xgb.append(xgb.predict_proba(X_test_preprocessed))

# Average the fold models' probabilities before picking a class.
test_preds_xgb_av = sum(test_preds_xgb) / len(test_preds_xgb)
print(f'Mean Validation MCC= {np.mean(val_scores):.5f}')
print(f'Standard Deviation Validation MCC= {np.std(val_scores):.5f}')

# Prepare submission
pred = np.argmax(test_preds_xgb_av, axis=1)
# Reuse the sample submission already loaded from the input directory rather
# than re-reading it from a different path.
submission = submission.copy()
assert len(submission) == len(pred), "prediction count does not match the sample submission"
# LabelEncoder sorts its classes, so 'e' is encoded as 0 and 'p' as 1. Decoding
# through the fitted encoder avoids hard-coding that mapping (the previous
# hand-written dict had it inverted).
submission['class'] = le.inverse_transform(pred)
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully: submission.csv")


Fold 0: MCC = 0.98293
Fold 1: MCC = 0.98292
Fold 2: MCC = 0.98314
Fold 3: MCC = 0.98319
Fold 4: MCC = 0.98294
Mean Validation MCC= 0.98302
Standard Deviation Validation MCC= 0.00012
Submission file created successfully: submission.csv


### CatBoost Classifier

In [34]:


# Hold out 25% of the training rows to monitor MCC and drive early stopping.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_preprocessed, y_train_encoded, test_size=0.25, random_state=42
)

# Use the first GPU only when torch can see one; otherwise train on the CPU
# instead of failing on a machine without CUDA.
if torch.cuda.is_available():
    catboost_device_params = {'task_type': 'GPU', 'devices': '0'}
else:
    catboost_device_params = {'task_type': 'CPU'}

modelCatBoost = CatBoostClassifier(
    iterations=2500,  # Number of boosting iterations
    learning_rate=0.02,  # Learning rate for finer updates
    depth=15,  # Depth of the trees to capture more complex patterns
    l2_leaf_reg=4,  # L2 regularization parameter to control overfitting
    loss_function='Logloss',  # Loss function
    eval_metric='MCC',  # Metric reported on the eval set and used for early stopping
    verbose=100,  # Print progress every 100 iterations
    **catboost_device_params,
)

# Train the model
modelCatBoost.fit(
    X_train_split,  # Training features
    y_train_split,  # Training target
    eval_set=[(X_val_split, y_val_split)],  # Validation set for early stopping
    early_stopping_rounds=100  # Stop early if no improvement
)




0:	learn: 0.8830510	test: 0.8825798	best: 0.8825798 (0)	total: 547ms	remaining: 22m 46s
100:	learn: 0.9756822	test: 0.9753201	best: 0.9753201 (100)	total: 46.6s	remaining: 18m 26s
200:	learn: 0.9796961	test: 0.9791361	best: 0.9791361 (200)	total: 1m 32s	remaining: 17m 38s
300:	learn: 0.9816025	test: 0.9807711	best: 0.9807711 (300)	total: 2m 18s	remaining: 16m 55s
400:	learn: 0.9826648	test: 0.9815943	best: 0.9815994 (398)	total: 3m 6s	remaining: 16m 16s
500:	learn: 0.9833733	test: 0.9820695	best: 0.9820720 (499)	total: 3m 54s	remaining: 15m 36s
600:	learn: 0.9838172	test: 0.9823720	best: 0.9823821 (599)	total: 4m 43s	remaining: 14m 54s
700:	learn: 0.9840881	test: 0.9825856	best: 0.9825856 (700)	total: 5m 31s	remaining: 14m 12s
800:	learn: 0.9842871	test: 0.9827379	best: 0.9827379 (799)	total: 6m 20s	remaining: 13m 28s
900:	learn: 0.9844615	test: 0.9828241	best: 0.9828241 (900)	total: 7m 10s	remaining: 12m 43s
1000:	learn: 0.9846477	test: 0.9828648	best: 0.9828673 (997)	total: 7m 59s	re

<catboost.core.CatBoostClassifier at 0x17cc3be4620>

In [35]:

# Score the hold-out split: compare the model's class predictions with the
# true labels using the Matthews correlation coefficient (MCC).
y_val_pred = modelCatBoost.predict(X_val_split)
mcc = matthews_corrcoef(y_val_split, y_val_pred)
print(f'MCC on validation set: {mcc:.4f}')

MCC on validation set: 0.9833
