In [1]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Input dataset and the two output folders, all relative to the notebook's working directory.
RAW = Path("../data/raw/liver_cirrhosis.csv")
PLOTS = Path("../results/eda_visualizations")
OUT = Path("../results/outputs")

# Create the output folders up front so later writes cannot fail on a missing directory.
PLOTS.mkdir(parents=True, exist_ok=True)
OUT.mkdir(parents=True, exist_ok=True)

# Fail fast, reporting the resolved path, if the raw CSV is absent or zero bytes long.
assert RAW.exists() and RAW.stat().st_size > 0, f"Dataset missing/empty at {RAW.resolve()}"

df = pd.read_csv(RAW)

# Numeric columns of interest, restricted to the ones actually present in the loaded frame.
# Names are matched against the CSV headers verbatim (including the 'Tryglicerides' spelling).
numeric = [
    name
    for name in (
        'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
        'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
    )
    if name in df.columns
]

In [2]:
# Box plot of Bilirubin on the unfiltered frame; written to disk and closed (nothing shown inline).
fig, ax = plt.subplots()
ax.boxplot(df['Bilirubin'].dropna(), vert=True)  # drop NaNs before handing the values to boxplot
ax.set_title("Bilirubin (before outlier handling)")
fig.tight_layout()
fig.savefig(PLOTS / "02_bilirubin_box_before.png")
plt.close(fig)


In [3]:
def iqr_mask(s, k=1.5):
    """Return a boolean mask that is True for the values of ``s`` to keep.

    A value is kept when it lies inside the inclusive range
    ``[Q1 - k*IQR, Q3 + k*IQR]``, where Q1/Q3 are the 25th/75th percentiles
    of ``s`` and ``IQR = Q3 - Q1``.

    Missing values are kept as well. Every comparison against NaN evaluates
    to False, so without the explicit ``isna`` term a missing value would be
    reported as an outlier, and any caller that ANDs the masks of several
    columns would drop the whole row just because one measurement is absent.

    Parameters
    ----------
    s : pandas.Series
        Numeric series to test.
    k : float, default 1.5
        Multiplier applied to the IQR when building the lower/upper bounds.

    Returns
    -------
    pandas.Series
        Boolean series aligned to ``s.index``; False marks values outside
        the bounds.
    """
    # quantile() skips NaNs, so the bounds are derived from observed values only.
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - k * iqr, q3 + k * iqr
    within_bounds = (s >= lower) & (s <= upper)
    # NaN is "unknown", not "extreme": do not flag it as an outlier.
    return within_bounds | s.isna()

# Start from "keep every row" and AND-in each numeric column's IQR mask:
# a row survives only if it passes the check in all of the listed columns.
mask = pd.Series(True, index=df.index)
for c in numeric:
    mask = mask & iqr_mask(df[c])

# Keep the surviving rows and renumber them from 0.
df_clean = df.loc[mask].reset_index(drop=True)

# Rows dropped = total rows minus rows still flagged True.
removed = int(len(mask) - mask.sum())
print("Rows removed:", removed)


Rows removed: 9639


In [4]:
# Box plot of Bilirubin on the filtered frame; written to disk and closed (nothing shown inline).
fig, ax = plt.subplots()
ax.boxplot(df_clean['Bilirubin'].dropna(), vert=True)  # drop NaNs before handing the values to boxplot
ax.set_title("Bilirubin (after outlier handling)")
fig.tight_layout()
fig.savefig(PLOTS / "02_bilirubin_box_after.png")
plt.close(fig)


In [5]:
# Write the filtered frame to the outputs folder (row index omitted), then preview the first rows.
df_clean.to_csv(index=False, path_or_buf=OUT / "outlier_cleaned.csv")
df_clean.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1230,C,Placebo,19724,M,Y,N,Y,N,0.5,219.0,3.93,22.0,663.0,45.0,75.0,220.0,10.8,2
1,4184,C,Placebo,11839,F,N,N,N,N,0.5,320.0,3.54,51.0,1243.0,122.45,80.0,225.0,10.0,2
2,2090,D,Placebo,16467,F,N,N,N,N,0.7,255.0,3.74,23.0,1024.0,77.5,58.0,151.0,10.2,2
3,2105,D,Placebo,21699,F,N,Y,N,N,1.9,486.0,3.54,74.0,1052.0,108.5,109.0,151.0,11.5,1
4,2504,C,Placebo,15265,F,N,N,N,N,2.3,369.510563,3.93,24.0,1828.0,133.3,124.702128,474.0,10.9,1
