In [1]:
import warnings
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Input table and the two output locations used by later cells.
RAW = Path("../results/outputs/feature_selected.csv")
PLOTS = Path("../results/eda_visualizations")
OUT = Path("../results/outputs")

# Create the output directories up front (no error if they already exist).
PLOTS.mkdir(parents=True, exist_ok=True)
OUT.mkdir(parents=True, exist_ok=True)

# Fail fast when the input file is absent or zero bytes long.
assert RAW.exists() and RAW.stat().st_size > 0, f"Dataset missing/empty at {RAW.resolve()}"


In [2]:
# Read the CSV at RAW into a DataFrame with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=RAW)

In [3]:
# Separate the target column ('Status') from the feature matrix.
assert 'Status' in df.columns, "Target 'Status' not found."

# astype(str) below would turn a missing target into the literal string 'nan',
# which would then be treated as a class of its own by anything that consumes
# y_str. Reject missing targets explicitly instead of letting that happen.
assert df['Status'].notna().all(), "Target 'Status' contains missing values."

# Target as strings; features are every remaining column.
y_str = df['Status'].astype(str)
X = df.drop(columns=['Status'])

In [4]:
# One-hot encode the text-like feature columns (object / category / string),
# dropping the first level of each so the dummies are not redundant.
cat_cols = list(X.select_dtypes(include=['object', 'category', 'string']).columns)
if len(cat_cols) > 0:
    X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [5]:
# Convert boolean columns to 0/1 integers before the numeric coercion below.
for col_name, col_dtype in X.dtypes.items():
    if col_dtype == bool:
        X[col_name] = X[col_name].astype(int)

# Clean-up chain, in order:
#   1. coerce every column to numeric (unparseable values become NaN),
#   2. treat +/-inf as missing,
#   3. impute missing values with the column median,
#   4. fall back to 0 for anything still missing after the median fill,
#   5. store everything as float.
X = (
    X.apply(pd.to_numeric, errors='coerce')
     .replace([np.inf, -np.inf], np.nan)
     .pipe(lambda frame: frame.fillna(frame.median(numeric_only=True)))
     .fillna(0)
     .astype(float)
)

In [6]:
# Fit the encoder on the string labels, then map them to integer class indices.
# y_le is kept so the index -> class mapping can be reported later.
y_le = LabelEncoder().fit(y_str)
y = y_le.transform(y_str)

In [7]:
# Pre-flight checks before resampling: every feature value must be finite and
# the target must contain at least two distinct classes.
arr = X.to_numpy()
assert np.all(np.isfinite(arr)), "Non-finite values remain in X."
assert np.unique(y).size >= 2, "SMOTE needs at least two classes."


In [8]:
# Pick SMOTE's k_neighbors from the size of the smallest class.
class_counts = Counter(y)
min_class_count = min(class_counts.values())

# SMOTE interpolates between a sample and one of its k nearest same-class
# neighbours, so the smallest class needs at least 2 samples. Check this here
# with a clear message rather than failing inside the neighbour search later.
assert min_class_count >= 2, (
    f"SMOTE needs at least 2 samples in the smallest class (found {min_class_count})."
)

# Use up to 5 neighbours, but never more than the smallest class can provide
# (a sample cannot be its own neighbour, hence the -1).
k = min(5, min_class_count - 1)
print(f"Class counts BEFORE SMOTE: {class_counts} | k_neighbors={k}")

Class counts BEFORE SMOTE: Counter({np.int64(0): 10142, np.int64(2): 3829, np.int64(1): 1390}) | k_neighbors=5


In [9]:
# Bar chart of the class distribution before resampling, written to PLOTS.
fig, ax = plt.subplots()
pd.Series(y).value_counts().sort_index().plot(kind='bar', ax=ax)
ax.set_title("Target distribution – BEFORE SMOTE")
ax.set_xlabel("Class index")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(PLOTS / "06_target_before_smote.png")
# Close the figure so it is neither rendered inline nor kept in memory.
plt.close(fig)


In [10]:
# Number of samples every class should have after resampling.
TARGET_PER_CLASS = 5000

# Derive both sampling strategies from the observed class counts instead of
# hard-coding them, so the cell keeps working when the input data changes:
#   - under-sampling caps any class above the target at the target and leaves
#     smaller classes at their current size;
#   - over-sampling (SMOTE) raises every class below the target up to it.
counts_before = Counter(y)
under_strategy = {
    int(label): min(int(count), TARGET_PER_CLASS)
    for label, count in sorted(counts_before.items())
}
over_strategy = {
    int(label): TARGET_PER_CLASS
    for label, count in sorted(counts_before.items())
    if count < TARGET_PER_CLASS
}

rus = RandomUnderSampler(random_state=42, sampling_strategy=under_strategy)

sm = SMOTE(random_state=42, k_neighbors=k, sampling_strategy=over_strategy)

# Under-sample first so SMOTE only has to synthesise up to the target size.
pipeline = Pipeline(steps=[('under', rus), ('smote', sm)])
X_sm, y_sm = pipeline.fit_resample(X, y)

# Every class should now sit exactly at the target size.
assert set(Counter(y_sm).values()) == {TARGET_PER_CLASS}, "Resampling did not balance the classes."


In [11]:
# Bar chart of the class distribution after resampling, written to PLOTS.
fig, ax = plt.subplots()
pd.Series(y_sm).value_counts().sort_index().plot(kind='bar', ax=ax)
ax.set_title("Target distribution – AFTER SMOTE (≈5k each)")
ax.set_xlabel("Class index")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(PLOTS / "06_target_after_smote.png")
# Close the figure so it is neither rendered inline nor kept in memory.
plt.close(fig)

In [12]:
# Reassemble features and the encoded target into one frame and write it out.
# Note: 'Status' holds the integer class index; the mapping is printed below.
balanced = pd.DataFrame(X_sm, columns=X.columns).assign(Status=y_sm)
out_path = OUT / "smote_balanced.csv"
balanced.to_csv(out_path, index=False)

# Report how class indices map back to the original labels, the class
# counts after resampling, and where the file was written.
label_mapping = dict(enumerate(y_le.classes_))
counts_after = Counter(y_sm)
print("Label mapping (index -> class):", label_mapping)
print("Class counts AFTER SMOTE:", counts_after)
print("✅ Saved balanced dataset ->", out_path.resolve(), "| shape:", balanced.shape)

# Preview the first rows as the cell's rich output.
balanced.head()

Label mapping (index -> class): {0: '0', 1: '1', 2: '2'}
Class counts AFTER SMOTE: Counter({np.int64(0): 5000, np.int64(1): 5000, np.int64(2): 5000})
✅ Saved balanced dataset -> F:\BioLivera\results\outputs\smote_balanced.csv | shape: (15000, 19)


Unnamed: 0,N_Days,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Drug_Placebo,Status
0,3820.0,14161.0,0.0,1.0,0.0,0.0,0.0,1.8,460.0,3.85,148.0,1472.0,108.5,118.0,388.0,10.1,1.0,0.0,0
1,1874.0,24257.0,0.0,0.0,0.0,0.0,1.0,0.6,280.0,3.35,97.648387,1044.0,128.65,89.0,295.0,9.8,1.0,1.0,0
2,2772.0,17897.0,0.0,1.0,0.0,1.0,1.0,0.6,217.0,3.62,13.0,414.0,75.95,119.0,225.0,9.9,1.0,1.0,0
3,1328.0,20597.0,0.0,0.0,1.0,0.0,0.0,0.5,369.510563,3.45,97.648387,1982.655769,122.556346,124.702128,227.0,10.7,1.0,1.0,0
4,1408.0,13918.0,0.0,0.0,1.0,1.0,0.0,2.0,310.0,3.36,70.0,1257.0,122.0,118.0,136.0,10.9,2.0,1.0,0
