In [5]:
import pandas as pd
import joblib
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# --------------------------------------------
# STEP 1: Load and clean the raw dataset
# --------------------------------------------
df = pd.read_csv('../data/cirrhosis.csv')

# Drop ID column if exists
if 'ID' in df.columns:
    df.drop(columns=['ID'], inplace=True)

# Handle missing numeric values
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col] = df[col].fillna(df[col].median())

# Handle missing categorical values and encode
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])
    df[col] = LabelEncoder().fit_transform(df[col])

# Make sure Stage is integer
df['Stage'] = df['Stage'].astype(int)

# --------------------------------------------
# STEP 2: Define features (15 only) and target
# --------------------------------------------

# These are the 15 features your FastAPI is expecting
feature_cols = [
    'Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders',
    'Edema', 'Bilirubin', 'Cholesterol', 'Albumin',
    'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides',
    'Platelets', 'Prothrombin'
]

X = df[feature_cols]
y = df['Stage'] - 1  # Shift to 0-based labels: [0, 1, 2, 3]

# --------------------------------------------
# STEP 3: Standardize and split
# --------------------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# --------------------------------------------
# STEP 4: Define models
# --------------------------------------------
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=150, max_depth=12, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', max_depth=8, n_estimators=150)
}

mlflow.set_experiment("Cirrhosis-Prediction-Experiment")

best_model = None
best_name = None
best_f1 = 0

# --------------------------------------------
# STEP 5: Train and evaluate each model
# --------------------------------------------
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        acc = accuracy_score(y_test, preds)
        prec = precision_score(y_test, preds, average='macro', zero_division=0)
        rec = recall_score(y_test, preds, average='macro', zero_division=0)
        f1 = f1_score(y_test, preds, average='macro', zero_division=0)

        mlflow.log_param("model_name", name)
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1
        })

        mlflow.sklearn.log_model(model, name + "_model")
        print(f"{name} → Accuracy: {acc:.2f}, Precision: {prec:.2f}, Recall: {rec:.2f}, F1: {f1:.2f}")

        if f1 > best_f1:
            best_f1 = f1
            best_model = model
            best_name = name

# --------------------------------------------
# STEP 6: Save the best model
# --------------------------------------------
if best_model:
    print(f"\n✅ Best model: {best_name} | Features: {best_model.n_features_in_}")
    joblib.dump(best_model, '../models/best_model.pkl')
    print("✅ Saved to ../models/best_model.pkl")




LogisticRegression → Accuracy: 0.50, Precision: 0.44, Recall: 0.43, F1: 0.43




RandomForest → Accuracy: 0.46, Precision: 0.29, Recall: 0.31, F1: 0.29


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost → Accuracy: 0.48, Precision: 0.57, Recall: 0.41, F1: 0.44

✅ Best model: XGBoost | Features: 15
✅ Saved to ../models/best_model.pkl
