In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import joblib
import os

# Read the feature-selected heart-disease table
df = pd.read_csv("../data/heart_disease_selected_features.csv")

# Binary label: any "num" value above zero counts as the positive class
y = df["num"].gt(0).astype(int)
X = df.drop("num", axis=1)

# Hold out 20% of the rows; stratifying on y keeps the class ratio
# comparable between the train and test partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [2]:
# Scaler and Random Forest are bundled into a single estimator so the same
# preprocessing is applied both when fitting and when predicting
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", RandomForestClassifier(n_estimators=200, random_state=42)),
    ]
).fit(X_train, y_train)  # fit() returns the pipeline itself

print("Pipeline trained successfully!")


Pipeline trained successfully!


In [3]:
# Create the output folder on first run (no error if it is already there)
os.makedirs(name="../models", exist_ok=True)

# Serialize the fitted pipeline (scaler + model) to disk
joblib.dump(value=pipeline, filename="../models/final_model.pkl")
print("Saved pipeline model → ../models/final_model.pkl")


Saved pipeline model → ../models/final_model.pkl


In [4]:
# Reload the persisted pipeline from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
loaded_model = joblib.load("../models/final_model.pkl")

# Sanity check: predict on the first five rows of the held-out test split
sample_pred = loaded_model.predict(X_test.iloc[:5])
print("Sample predictions:", sample_pred.tolist())


Sample predictions: [0, 1, 0, 0, 0]
