In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

#Load the compact dataset
df = pd.read_parquet("mushroom_compact.parquet", engine="fastparquet")

cols_to_drop = [
    "cap-surface",
    "gill-attachment",
    "gill-spacing",
    "stem-root",
    "stem-surface",
    "veil-type",
    "veil-color",
    "spore-print-color",
]
df_model = df.drop(columns=cols_to_drop).copy()
df_chi = df_model.copy()
#remove numeric features for chi-square
df_chi = df_chi.drop(columns=["cap-diameter", "stem-height", "stem-width"])

categorical_features = [
    "stem-color",
    "cap-color",
    "ring-type",
    "gill-color",
    "habitat",
    "cap-shape",
    "season",
    "has-ring",
    "does-bruise-or-bleed",
]
df_chi["class_binary"] = df_model["class"].map({"e": 1, "p":0})
#Target
target_col = "class_binary"

In [5]:
df_chi.head()

Unnamed: 0,class,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-color,has-ring,ring-type,habitat,season,class_binary
0,p,f,e,f,w,w,t,g,d,a,0
1,p,x,e,f,w,w,t,g,d,u,0
2,p,f,o,f,w,w,t,p,d,a,0
3,p,f,e,f,w,w,t,g,d,u,0
4,p,x,e,f,w,w,t,p,d,w,0


In [8]:
#chi-square feature selection on categorical features only


from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

#separate features + target
X_chi = df_chi.drop(columns=["class", "class_binary"])
y_chi = df_chi["class_binary"]

#replace nan with missing
X_chi = X_chi.astype("object").fillna("missing")

#encode categorical predictors as ordinal integers
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_encoded = encoder.fit_transform(X_chi)

#run Chi-square
chi_scores, p_values = chi2(X_encoded, y_chi)




In [9]:
# Build a results dataframe
chi_results = pd.DataFrame({
    "feature": X_chi.columns,
    "chi2_score": chi_scores,
    "p_value": p_values
}).sort_values("chi2_score", ascending=False)

chi_results

Unnamed: 0,feature,chi2_score,p_value
0,cap-shape,87056.022023,0.0
6,ring-type,71062.140828,0.0
4,stem-color,36558.174062,0.0
8,season,32988.247363,0.0
7,habitat,26297.147159,0.0
3,gill-color,23680.357443,0.0
5,has-ring,16381.48736,0.0
1,cap-color,10238.998625,0.0
2,does-bruise-or-bleed,940.117877,1.8658850000000001e-206
