In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

RAW   = Path("../results/outputs/encoded.csv")
PLOTS = Path("../results/eda_visualizations"); PLOTS.mkdir(parents=True, exist_ok=True)
OUT   = Path("../results/outputs");           OUT.mkdir(parents=True, exist_ok=True)

assert RAW.exists() and RAW.stat().st_size > 0, f"Dataset missing/empty at {RAW.resolve()}"

In [3]:
df = pd.read_csv(RAW)

In [4]:
if 'Age' in df.columns:
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

In [5]:
labels = ['Young', 'Middle', 'Senior', 'Elderly']
if 'Age' in df.columns:
    upper = max(120, int(np.nanmax(df['Age'])) + 1) if df['Age'].notna().any() else 120
    bins = [0, 40, 60, 80, upper]
    df['Age_Group'] = pd.cut(
        df['Age'],
        bins=bins,
        labels=labels,
        right=False,
        include_lowest=True
    )

In [6]:
if {'Bilirubin', 'Albumin'}.issubset(df.columns):
    df['Bilirubin_Albumin_Ratio'] = df['Bilirubin'] / (df['Albumin'] + 1e-6)

In [7]:
if 'Age_Group' in df.columns:
    counts = df['Age_Group'].value_counts().reindex(labels, fill_value=0)
    plt.figure()
    counts.plot(kind='bar')
    plt.title("Age Group Distribution")
    plt.xlabel("Age_Group"); plt.ylabel("Count")
    plt.tight_layout(); plt.savefig(PLOTS / "03_age_group_distribution.png"); plt.close()

if 'Bilirubin_Albumin_Ratio' in df.columns:
    plt.figure()
    pd.to_numeric(df['Bilirubin_Albumin_Ratio'], errors='coerce').dropna().hist(bins=40)
    plt.title("Bilirubin/Albumin Ratio")
    plt.xlabel("Ratio"); plt.ylabel("Frequency")
    plt.tight_layout(); plt.savefig(PLOTS / "03_ba_ratio_hist.png"); plt.close()


In [8]:
if 'Age_Group' in df.columns:
    df = pd.get_dummies(df, columns=['Age_Group'], drop_first=True)

In [9]:
out_path = OUT / "feature_engineered.csv"
df.to_csv(out_path, index=False)
print("Saved feature-engineered dataset ->", out_path.resolve())

Saved feature-engineered dataset -> F:\BioLivera\results\outputs\feature_engineered.csv


In [10]:
df.head()

Unnamed: 0,N_Days,Status,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,...,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Drug_Placebo,Bilirubin_Albumin_Ratio,Age_Group_Middle,Age_Group_Senior,Age_Group_Elderly
0,1230,C,19724,1,1,0,1,0,0.5,219.0,...,45.0,75.0,220.0,10.8,2,True,0.127226,False,False,True
1,4184,C,11839,0,0,0,0,0,0.5,320.0,...,122.45,80.0,225.0,10.0,2,True,0.141243,False,False,True
2,2090,D,16467,0,0,0,0,0,0.7,255.0,...,77.5,58.0,151.0,10.2,2,True,0.187166,False,False,True
3,2105,D,21699,0,0,1,0,0,1.9,486.0,...,108.5,109.0,151.0,11.5,1,True,0.536723,False,False,True
4,2504,C,15265,0,0,0,0,0,2.3,369.510563,...,133.3,124.702128,474.0,10.9,1,True,0.585242,False,False,True
