In [6]:
# ===============================
# Member 1 – Encoding Categorical Variables(2)
# ===============================
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Input file for this step and the two folders this notebook writes into.
RAW = Path("../results/outputs/outlier_cleaned.csv")
PLOTS = Path("../results/eda_visualizations")
OUT = Path("../results/outputs")

# Create both output folders up front; existing folders are left as they are.
for out_dir in (PLOTS, OUT):
    out_dir.mkdir(parents=True, exist_ok=True)

# Fail fast, with the resolved absolute path in the message, if the input
# file is absent or zero bytes.
assert RAW.exists() and RAW.stat().st_size > 0, f"Dataset missing/empty at {RAW.resolve()}"

In [7]:
# Load the CSV once. `df_raw` is the pristine snapshot used for the
# "before encoding" plots; `df` is an independent working copy that the
# encoding cells modify.
df_raw = pd.read_csv(RAW)
df = df_raw.copy()

In [8]:
# EDA (before encoding): one bar chart of category counts per column, saved
# under PLOTS and closed straight away so no figure stays open.
def _save_count_bar(column, title, filename):
    """Plot the value counts of df_raw[column] as a bar chart and save it as PLOTS / filename."""
    fig, ax = plt.subplots()
    df_raw[column].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(column)
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(PLOTS / filename)
    plt.close(fig)

_save_count_bar('Drug', "Drug distribution (before encoding)", "01_drug_counts_before_encoding.png")
_save_count_bar('Sex', "Sex distribution (before encoding)", "01_sex_counts_before_encoding.png")


In [9]:
# Label-encode the low-cardinality categoricals as integer codes.
# NOTE: despite the list name, 'Edema' is not binary - the recorded mapping
# has three levels (N, S, Y), coded 0/1/2 in sorted order.
binary_cols = ['Sex','Ascites','Hepatomegaly','Spiders','Edema']
label_maps = {}
for col in binary_cols:
    if col not in df.columns:
        continue
    # Read the source values from the untouched df_raw so that re-running this
    # cell rebuilds the same codes and the same mapping, instead of
    # re-encoding the already-encoded integers left in df.
    observed = df_raw[col].dropna().astype(str)
    le = LabelEncoder().fit(observed)
    # classes_ is sorted, so position == code. Plain ints keep the printed
    # mapping readable and JSON-serialisable (no np.int64 wrappers).
    label_maps[col] = {label: int(code) for code, label in enumerate(le.classes_)}
    # Missing values stay missing instead of becoming a spurious "nan" class;
    # with no missing values the result is the same integer column as before.
    df[col] = observed.map(label_maps[col]).reindex(df.index)
print("Label mappings:", label_maps)


Label mappings: {'Sex': {'F': np.int64(0), 'M': np.int64(1)}, 'Ascites': {'N': np.int64(0), 'Y': np.int64(1)}, 'Hepatomegaly': {'N': np.int64(0), 'Y': np.int64(1)}, 'Spiders': {'N': np.int64(0), 'Y': np.int64(1)}, 'Edema': {'N': np.int64(0), 'S': np.int64(1), 'Y': np.int64(2)}}


In [10]:
# One-hot encode Drug.
# drop_first=True omits the first dummy level, so k categories yield k-1
# indicator columns. The guard makes a second run of this cell a no-op,
# because 'Drug' no longer exists after the first run.
if 'Drug' in df:
    df = pd.get_dummies(data=df, columns=['Drug'], drop_first=True)

In [11]:
# Save the encoded frame (without the row index) and preview the first rows.
encoded_path = OUT.joinpath("encoded.csv")
df.to_csv(path_or_buf=encoded_path, index=False)
print("Saved:", encoded_path.resolve())
df.head()

Saved: F:\BioLivera\results\outputs\encoded.csv


Unnamed: 0,N_Days,Status,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Drug_Placebo
0,1230,C,19724,1,1,0,1,0,0.5,219.0,3.93,22.0,663.0,45.0,75.0,220.0,10.8,2,True
1,4184,C,11839,0,0,0,0,0,0.5,320.0,3.54,51.0,1243.0,122.45,80.0,225.0,10.0,2,True
2,2090,D,16467,0,0,0,0,0,0.7,255.0,3.74,23.0,1024.0,77.5,58.0,151.0,10.2,2,True
3,2105,D,21699,0,0,1,0,0,1.9,486.0,3.54,74.0,1052.0,108.5,109.0,151.0,11.5,1,True
4,2504,C,15265,0,0,0,0,0,2.3,369.510563,3.93,24.0,1828.0,133.3,124.702128,474.0,10.9,1,True
