In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Load the cleaned dataset produced by the preprocessing step.
df = pd.read_csv("/Users/omanand/Auralis/data/processed/cleaned_data.csv")

# Leakage sanity check: rank every numeric column by its correlation with the
# label so that suspiciously predictive features stand out at the top.
numeric_corr = df.corr(numeric_only=True)
corr_with_target = numeric_corr["stability_label"].sort_values(ascending=False)
print("Correlation with target:\n", corr_with_target, "\n")

# Split the frame into the label vector and the remaining feature columns.
y = df["stability_label"]
X_raw = df.drop("stability_label", axis="columns")

# One-row frame of whole-dataset summary statistics, five columns per feature:
# <col>_mean, <col>_std, <col>_min, <col>_max and <col>_range (max - min).
summary = {}
for name in X_raw.columns:
    series = X_raw[name]
    lowest = series.min()
    highest = series.max()
    summary[f"{name}_mean"] = [series.mean()]
    summary[f"{name}_std"] = [series.std()]
    summary[f"{name}_min"] = [lowest]
    summary[f"{name}_max"] = [highest]
    summary[f"{name}_range"] = [highest - lowest]
X_stats = pd.DataFrame(summary)

# Trailing-window statistics over the last `window` rows (fewer at the start
# of the frame, because min_periods=1 lets partial windows through).
# NOTE(review): windows follow row order and are computed before the shuffled
# train/test split further down, so a row's rolling features can include values
# from rows that end up in the other split -- confirm this is acceptable.
window = 10
roller = X_raw.rolling(window=window, min_periods=1)
rolling_features = roller.agg(["mean", "std", "min", "max"])
# Flatten the (feature, stat) column MultiIndex into "<feature>_<stat>" names.
rolling_features.columns = rolling_features.columns.map("_".join)

# Raw features side by side with their rolling statistics; undefined entries
# (e.g. the std of a one-row window) are replaced with 0.
X = pd.concat((X_raw, rolling_features), axis="columns").fillna(0)


# Hold out 25% of the rows for evaluation; stratifying on y keeps the class
# balance the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Baseline model: every feature, with class 1 weighted twice as heavily as
# class 0.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight={0: 1, 1: 2},
)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
# Column 1 of predict_proba is the score of the second entry in rf.classes_
# (label 1, assuming the labels are 0/1 as the class_weight keys imply).
y_prob = rf.predict_proba(X_test)[:, 1]

print("Baseline Model Performance")
print(classification_report(y_test, y_pred))
# ROC-AUC measures how well the scores rank positives above negatives, so it
# needs the continuous probabilities. Passing the hard 0/1 predictions would
# collapse the curve to a single operating point and misreport the metric.
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_prob)}\n")

# Print every feature with its importance, most important first.
importances = rf.feature_importances_
idxs = np.argsort(importances)[::-1]
for i in idxs:
    print(f"{X.columns[i]} : {importances[i]:.4f}")


# Ablation: retrain without the raw 'stability_index' column to see how much
# the model depends on it.
# NOTE(review): only the raw column is removed. If X still carries the rolling
# statistics derived from it (stability_index_mean/_std/_min/_max), the model
# can recover most of the same signal and this test understates the
# dependence. Dropping them too would change the feature set of the model that
# gets persisted, so confirm with downstream consumers before doing so.
print("\nDropping 'stability_index' to test feature dependence...")
X = X.drop(columns=["stability_index"])

# Same split parameters as the baseline, so the rows in each split match.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Unlike the baseline's fixed {0: 1, 1: 2} weights, class weights here are
# recomputed from each tree's bootstrap sample.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight="balanced_subsample",
)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
# Score of the second entry in rf.classes_ (label 1 for 0/1 labels).
y_prob = rf.predict_proba(X_test)[:, 1]

print("\n=== Without stability_index ===")
print(classification_report(y_test, y_pred))
# ROC-AUC must be computed from the continuous scores, not the thresholded
# predictions; otherwise it is not comparable with the "roc_auc" scorer used
# for cross-validation on the same model.
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_prob)}\n")


# 5-fold stratified cross-validation of the current model configuration on the
# full feature matrix, scored by ROC-AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(estimator=rf, X=X, y=y, scoring="roc_auc", cv=cv)

print("CV ROC-AUC scores:", cv_scores)
print("CV Mean ROC-AUC:", np.mean(cv_scores))

# cross_val_score fits clones of the estimator, so the object persisted here is
# still the forest fitted on the single train split above.
model_path = "/Users/omanand/Auralis/models/random_forest_instability.pkl"
joblib.dump(rf, model_path)






