In [1]:
from pathlib import Path
import pandas as pd, numpy as np
import matplotlib.pyplot as plt

# --- Configuration: input file and output locations (relative to the notebook's working dir) ---
RAW = Path("../results/outputs/encoded.csv")
PLOTS = Path("../results/eda_visualizations")
OUT = Path("../results/outputs")
OUT_CSV = OUT / "feature_engineered.csv"

# Create the output directories up front so later savefig/to_csv calls cannot fail on a missing folder.
PLOTS.mkdir(parents=True, exist_ok=True)
OUT.mkdir(parents=True, exist_ok=True)

# Fail fast if the input from the previous stage is absent or zero bytes.
assert (
    RAW.exists() and RAW.stat().st_size > 0
), f"Missing/empty: {RAW.resolve()}"

In [2]:
# Load the CSV at RAW into the working frame that every later cell mutates.
df = pd.read_csv(filepath_or_buffer=RAW)

In [3]:
# Derive Age_Years and a categorical Age_Group from the raw "Age" column, then
# save a bar chart of the group counts. Skipped entirely when "Age" is absent.
if "Age" in df.columns:
    # Non-numeric entries become NaN instead of raising.
    df["Age"] = pd.to_numeric(df["Age"], errors="coerce")

    # Unit heuristic: a median above 150 is taken to mean the column is in
    # days (presumably no cohort has a median age over 150 years), so it is
    # converted with 365.25 days/year; otherwise the values are used as years.
    med = df["Age"].median()
    age_in_days = pd.notna(med) and med > 150
    df["Age_Years"] = df["Age"] / 365.25 if age_in_days else df["Age"]

    labels = ["Young", "Middle", "Senior", "Elderly"]
    # Left-closed bins: [0,40) [40,60) [60,80) [80,inf). The last edge is
    # open-ended so that an age of exactly 120 -- which the validity filter
    # below accepts -- lands in "Elderly" rather than falling outside the
    # half-open [80,120) bin and becoming NaN.
    bins = [0, 40, 60, 80, np.inf]

    # Ages outside 0..120 (inclusive) are treated as invalid and left unbinned (NaN).
    age_for_bins = df["Age_Years"].where(df["Age_Years"].between(0, 120))
    df["Age_Group"] = pd.cut(age_for_bins, bins=bins, labels=labels, right=False)

    # Only draw the chart when at least one row received a group.
    if df["Age_Group"].notna().any():
        # reindex fixes the bar order to the label order and shows empty groups as 0.
        group_counts = df["Age_Group"].value_counts().reindex(labels, fill_value=0)
        fig, ax = plt.subplots()
        group_counts.plot(kind="bar", ax=ax)
        ax.set_title("Age Group Distribution")
        ax.set_xlabel("Age_Group")
        ax.set_ylabel("Count")
        fig.tight_layout()
        fig.savefig(PLOTS / "03_age_group_distribution.png")
        plt.close(fig)

In [4]:
# Build the Bilirubin/Albumin ratio feature and save its histogram.
lab_cols = ["Bilirubin", "Albumin"]

# Coerce whichever of the two lab columns exist to numeric (bad entries -> NaN).
for name in lab_cols:
    if name in df.columns:
        df[name] = pd.to_numeric(df[name], errors="coerce")

# The ratio needs both columns.
if set(lab_cols) <= set(df.columns):
    # Keep only albumin values above 1e-6; zero, negative or near-zero
    # denominators become NaN so the division cannot blow up.
    albumin_pos = df["Albumin"].where(df["Albumin"] > 1e-6)

    # Cap the ratio at 50 to limit the influence of extreme values.
    df["Bilirubin_Albumin_Ratio"] = df["Bilirubin"].div(albumin_pos).clip(upper=50)

    ratio_valid = pd.to_numeric(df["Bilirubin_Albumin_Ratio"], errors="coerce").dropna()
    if len(ratio_valid) > 0:
        fig, ax = plt.subplots()
        ratio_valid.hist(bins=40, ax=ax)
        ax.set_title("Bilirubin/Albumin Ratio")
        ax.set_xlabel("Ratio")
        ax.set_ylabel("Frequency")
        fig.tight_layout()
        fig.savefig(PLOTS / "03_ba_ratio_hist.png")
        plt.close(fig)


In [5]:
# One-hot encode Age_Group (when present) and write the engineered frame to OUT_CSV.
# drop_first=True removes the first category, so "Young" is the implicit baseline.
# NOTE(review): get_dummies defaults to dummy_na=False, so a row whose Age_Group
# is missing gets 0 in every dummy column and is indistinguishable from the
# "Young" baseline -- confirm this is acceptable downstream.
if {"Age_Group"} <= set(df.columns):
    df = pd.get_dummies(data=df, columns=["Age_Group"], drop_first=True)

# index=False: the row index is not written to the file.
df.to_csv(path_or_buf=OUT_CSV, index=False)

In [6]:
# Report where the result was written plus quick sanity statistics for the new features.
print(f"Saved -> {OUT_CSV.resolve()}")

if "Age_Years" in df.columns:
    age_years_median = pd.to_numeric(df["Age_Years"], errors="coerce").median()
    print(f"Age_Years median: {age_years_median:.2f}")

if "Bilirubin_Albumin_Ratio" in df.columns:
    ba_ratio = pd.to_numeric(df["Bilirubin_Albumin_Ratio"], errors="coerce").dropna()
    # quantile() returns the values in the order requested: median first, then the 90th percentile.
    ba_median, ba_p90 = ba_ratio.quantile([0.5, 0.9])
    print(f"BA ratio median={ba_median:.3f}, p90={ba_p90:.3f}")

Saved -> F:\BioLivera3\results\outputs\feature_engineered.csv
Age_Years median: 50.25
BA ratio median=0.308, p90=1.003
