In [None]:
# 📦 Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import os


In [None]:
# 📊 Load the cleaned dataset
# Keep the CSV location in one named variable so it is easy to spot and change.
cleaned_csv_path = "../data/diabetic_data_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)

# Bare last expression: the notebook renders the first rows as the cell output.
df.head()


In [None]:
# 🎯 Create binary target variable
# readmitted_flag is 1 where `readmitted` equals the string "<30" and 0 for
# every other value (including missing values, which never compare equal).
# The comparison is vectorised instead of calling a Python lambda per row;
# the explicit int64 cast keeps the same integer dtype on every platform.
df['readmitted_flag'] = df['readmitted'].eq("<30").astype("int64")


In [None]:
# 🧹 Drop unused columns (tweak if needed)
# errors='ignore' means any listed column that is not present is skipped
# rather than raising, so this cell tolerates an already-trimmed frame.
unused_columns = ['readmitted', 'encounter_id', 'patient_nbr']
df = df.drop(labels=unused_columns, axis="columns", errors='ignore')


In [None]:
# 🔀 One-hot encode categorical variables
# drop_first=True omits the first level of every encoded column, so a column
# with k categories becomes k-1 indicator columns.
df = pd.get_dummies(data=df, drop_first=True)


In [None]:
# 📂 Split into train and test
# Features are every column except the target; the target is readmitted_flag.
X = df.drop(columns="readmitted_flag")
y = df["readmitted_flag"]

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
# stratify=y keeps the 0/1 ratio of readmitted_flag the same in both
# partitions, so the test metrics are not distorted by an unlucky draw when
# one class is rare. (Stratification requires at least two rows per class.)
# NOTE(review): this is a row-level split. If a patient can appear in several
# rows, the same patient may land in both partitions — consider a group-aware
# split keyed on patient_nbr (before that column is dropped). TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# 🧠 Train Random Forest model
# All other hyperparameters stay at the library defaults; only the seed is
# pinned so that training is repeatable.
forest_params = {"random_state": 42}

# fit() returns the fitted estimator itself, so construction and training chain.
clf = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Bare last expression: show the fitted estimator as the cell output.
clf


In [None]:
# 🧾 Evaluate performance
# Predict labels for the held-out rows, then print the per-class
# precision / recall / F1 summary as plain text.
y_pred = clf.predict(X_test)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)


In [None]:
# 💾 Save model to 'models/' folder
# Single source of truth for the artifact location; the directory to create
# is derived from it.
model_path = "../models/readmission_model.pkl"

# exist_ok=True makes the cell safe to re-run when the folder already exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Serialise the fitted classifier; the call's return value is the cell output.
joblib.dump(clf, model_path)
