In [1]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

RAW   = Path("../results/outputs/feature_engineered.csv")
PLOTS = Path("../results/eda_visualizations"); PLOTS.mkdir(parents=True, exist_ok=True)
OUT   = Path("../results/outputs"); OUT.mkdir(parents=True, exist_ok=True)

assert RAW.exists() and RAW.stat().st_size > 0, f"Dataset missing/empty at {RAW.resolve()}"

df = pd.read_csv(RAW)


In [2]:
df_num = df.copy()
for c in df_num.select_dtypes(include="object").columns:
    df_num[c] = df_num[c].astype("category").cat.codes

target = "Status" if "Status" in df_num.columns else None
X = df_num.drop(columns=[target]) if target else df_num
y = df_num[target] if target else None


In [3]:
corr = X.corr(numeric_only=True)
plt.figure(figsize=(8, 6))
plt.imshow(corr.values, aspect="auto", cmap="coolwarm")
plt.title("Correlation Heatmap (numeric)")
plt.colorbar()
plt.tight_layout()
plt.savefig(PLOTS / "05_correlation_heatmap.png")
plt.close()

In [4]:
vt = VarianceThreshold(threshold=1e-8)
X_vt = vt.fit_transform(X)
X_vt_df = pd.DataFrame(X_vt, columns=X.columns[vt.get_support()], index=X.index)


In [5]:
corr_vt = X_vt_df.corr(numeric_only=True).abs()
upper = corr_vt.where(np.triu(np.ones(corr_vt.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if any(upper[c] >= 0.90)]
X_sel = X_vt_df.drop(columns=to_drop)


In [6]:
pca = PCA(n_components=0.95, random_state=42).fit(X_sel)
print("PCA components for 95% variance:", pca.n_components_)
print("Dropped (low variance):", [c for c in X.columns if c not in X_vt_df.columns])
print("Dropped (high corr):", to_drop)

PCA components for 95% variance: 2
Dropped (low variance): ['Age_Group_Middle', 'Age_Group_Senior', 'Age_Group_Elderly']
Dropped (high corr): ['Bilirubin_Albumin_Ratio']


In [7]:
out_df = X_sel.copy()
if target:
    out_df[target] = y
out_df.to_csv(OUT / "feature_selected.csv", index=False)
print("Saved:", (OUT / "feature_selected.csv").as_posix(), "| shape:", out_df.shape)

Saved: ../results/outputs/feature_selected.csv | shape: (15361, 19)
