In [None]:
import inspect
import os
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, plot_importance as lgbm_plot_importance, early_stopping, log_evaluation
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from xgboost import XGBClassifier, plot_importance

from generate_features import generate_features


In [None]:
# Output folders: pickled models, per-model prediction CSVs, and the run log.
for output_dir in ("models", "results", "logs"):
    os.makedirs(output_dir, exist_ok=True)

# Seed the run log with its header row the first time the notebook runs;
# an existing log is left untouched.
log_file = "logs/model_log.csv"
log_is_missing = not os.path.exists(log_file)
if log_is_missing:
    with open(log_file, "w") as log_handle:
        log_handle.write("model_id,model_type,AUC,date,params,notes\n")


In [None]:
# Load the raw train/test CSVs from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# generate_features (project module) hands back the training matrix, the
# labels, the transformed test matrix and, because return_feature_names=True,
# the list of feature names. `test` is rebound to the transformed matrix.
X, y, test, feature_names = generate_features(train, test, return_feature_names=True)

# Stratified subsample for the slow models (SVM / KNN).
# Draw WITHOUT replacement: sklearn's resample defaults to replace=True (a
# bootstrap), which duplicates rows. Duplicates that end up in both the
# training and the validation fold leak labels and inflate the CV AUC.
# n_samples is capped because sampling more rows than exist without
# replacement raises a ValueError.
X_small, y_small = resample(
    X, y,
    replace=False,
    n_samples=min(20000, len(y)),
    stratify=y,
    random_state=42,
)

# TEMP (debug only): unstratified head-of-file subsample for speed
# X = X[:20000]
# y = y[:20000]



In [None]:
def run_knn_model(model_id, params, notes, X=None, y=None, test=None):
    """Cross-validate a KNeighborsClassifier and persist its artefacts.

    Runs 5-fold stratified cross-validation (shuffled, seed 42). In every fold
    the classifier is refit, the validation AUC is recorded, and column 1 of
    ``predict_proba`` on ``test`` is added to a running average. Afterwards it

    * prints the mean validation AUC,
    * pickles the classifier as fitted on the last fold to
      ``models/{model_id}.pkl``,
    * writes the averaged test predictions to ``results/{model_id}.csv``,
    * appends one row to ``logs/model_log.csv``.

    Args:
        model_id: Name used for the output files and the log row.
        params: Keyword arguments forwarded to ``KNeighborsClassifier``.
        notes: Free-text note stored in the log row.
        X: Training features; must support indexing with the integer index
            arrays yielded by ``StratifiedKFold.split``. Defaults to the
            notebook-level ``X``.
        y: Training labels, indexed the same way. Defaults to the
            notebook-level ``y``.
        test: Features to predict on; must expose ``.shape``. Defaults to the
            notebook-level ``test``.

    Returns:
        None.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working, while callers can hand in a subsample (the same X / y / test
    # parameter names run_xgb_model uses).
    notebook_vars = globals()
    if X is None:
        X = notebook_vars["X"]
    if y is None:
        y = notebook_vars["y"]
    if test is None:
        test = notebook_vars["test"]

    model_type = "KNN"
    model = KNeighborsClassifier(**params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    test_preds = np.zeros(test.shape[0])
    aucs = []

    for train_idx, val_idx in skf.split(X, y):
        model.fit(X[train_idx], y[train_idx])
        val_probs = model.predict_proba(X[val_idx])[:, 1]
        aucs.append(roc_auc_score(y[val_idx], val_probs))
        # Each fold contributes 1/n_splits of the final test prediction.
        test_preds += model.predict_proba(test)[:, 1] / skf.n_splits

    mean_auc = np.mean(aucs)
    print(f"‚úÖ AUC ({model_id}):", mean_auc)

    # Note: the pickled estimator is the one fitted on the final fold only.
    joblib.dump(model, f"models/{model_id}.pkl")
    pd.DataFrame({"Id": range(len(test_preds)), "Prediction": test_preds}).to_csv(f"results/{model_id}.csv", index=False)

    # Append this run to the CSV log (read, concatenate, rewrite).
    entry = pd.DataFrame([{ "model_id": model_id, "model_type": model_type, "AUC": mean_auc, "date": datetime.now().strftime("%Y-%m-%d"), "params": str(params), "notes": notes }])
    log_path = "logs/model_log.csv"
    log = pd.read_csv(log_path) if os.path.exists(log_path) else pd.DataFrame()
    pd.concat([log, entry], ignore_index=True).to_csv(log_path, index=False)



In [None]:
def run_logreg_model(model_id, params, notes, feature_names=None):
    """Cross-validate a LogisticRegression on the notebook-level X / y / test.

    Uses 5-fold stratified CV (shuffled, seed 42). Per fold: refit, score the
    validation AUC, and add column 1 of ``predict_proba(test)`` divided by the
    fold count to the running test prediction. Then prints the mean AUC,
    pickles the classifier (as fitted on the last fold) to
    ``models/{model_id}.pkl``, writes ``results/{model_id}.csv`` and appends a
    row to ``logs/model_log.csv``.

    Args:
        model_id: Name used for the output files and the log row.
        params: Keyword arguments forwarded to ``LogisticRegression``.
        notes: Free-text note stored in the log row.
        feature_names: Optional sequence of names, indexed by coefficient
            position. When given, the ten largest absolute coefficients are
            plotted as a horizontal bar chart.

    Returns:
        None.
    """
    model_type = "LogReg"
    clf = LogisticRegression(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    n_folds = cv.n_splits
    fold_aucs = []
    test_preds = np.zeros(test.shape[0])

    for fit_rows, holdout_rows in cv.split(X, y):
        clf.fit(X[fit_rows], y[fit_rows])
        holdout_scores = clf.predict_proba(X[holdout_rows])[:, 1]
        fold_aucs.append(roc_auc_score(y[holdout_rows], holdout_scores))
        test_preds += clf.predict_proba(test)[:, 1] / n_folds

    mean_auc = np.mean(fold_aucs)
    print(f"‚úÖ AUC ({model_id}):", mean_auc)

    # Persist the estimator from the final fold and the averaged predictions.
    joblib.dump(clf, f"models/{model_id}.pkl")
    submission = pd.DataFrame({"Id": range(len(test_preds)), "Prediction": test_preds})
    submission.to_csv(f"results/{model_id}.csv", index=False)

    # Append one row to the run log (read, concatenate, rewrite).
    log_row = {
        "model_id": model_id,
        "model_type": model_type,
        "AUC": mean_auc,
        "date": datetime.now().strftime("%Y-%m-%d"),
        "params": str(params),
        "notes": notes,
    }
    log_path = "logs/model_log.csv"
    if os.path.exists(log_path):
        history = pd.read_csv(log_path)
    else:
        history = pd.DataFrame()
    pd.concat([history, pd.DataFrame([log_row])], ignore_index=True).to_csv(log_path, index=False)

    # Nothing to plot without names or fitted coefficients.
    if feature_names is None or not hasattr(clf, 'coef_'):
        return

    weights = np.abs(clf.coef_[0])
    strongest = np.argsort(weights)[::-1][:10]  # indices, largest |coef| first
    labels = [feature_names[i] for i in strongest]
    plt.barh(labels, weights[strongest])
    plt.xlabel("Feature importance (abs coef)")
    plt.title(f"Top 10 Feature Importances ({model_id})")
    plt.tight_layout()
    plt.show()



In [None]:
def run_svm_model(model_id, params, notes):
    """Cross-validate an SVC on a stratified subsample of the notebook X / y.

    A subsample of at most 5000 rows is drawn from the notebook-level ``X`` /
    ``y`` and used for 5-fold stratified CV (shuffled, seed 42). Scores come
    from ``decision_function``, so the values written to
    ``results/{model_id}.csv`` are fold-averaged decision scores, not
    probabilities. The function also prints the mean validation AUC, pickles
    the classifier as fitted on the last fold to ``models/{model_id}.pkl`` and
    appends one row to ``logs/model_log.csv``.

    Args:
        model_id: Name used for the output files and the log row.
        params: Keyword arguments forwarded to ``SVC``.
        notes: Free-text note stored in the log row.

    Returns:
        None.
    """
    model_type = "SVM"
    model = SVC(**params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    test_preds = np.zeros(test.shape[0])
    aucs = []

    # Subsample WITHOUT replacement. resample defaults to replace=True, which
    # duplicates rows; a duplicated row can then sit in both the training and
    # the validation fold and inflate the reported AUC. The size is capped so
    # datasets smaller than 5000 rows do not raise. Local names differ from the
    # notebook-level X_small / y_small to avoid shadowing them.
    X_sub, y_sub = resample(
        X, y,
        replace=False,
        n_samples=min(5000, len(y)),
        stratify=y,
        random_state=42,
    )

    for train_idx, val_idx in skf.split(X_sub, y_sub):
        model.fit(X_sub[train_idx], y_sub[train_idx])
        val_scores = model.decision_function(X_sub[val_idx])
        aucs.append(roc_auc_score(y_sub[val_idx], val_scores))
        # Each fold contributes 1/n_splits of the final test score.
        test_preds += model.decision_function(test) / skf.n_splits

    mean_auc = np.mean(aucs)
    print(f"‚úÖ AUC ({model_id}):", mean_auc)

    # Note: the pickled estimator is the one fitted on the final fold only.
    joblib.dump(model, f"models/{model_id}.pkl")
    pd.DataFrame({"Id": range(len(test_preds)), "Prediction": test_preds}).to_csv(f"results/{model_id}.csv", index=False)

    # Append this run to the CSV log (read, concatenate, rewrite).
    entry = pd.DataFrame([{ "model_id": model_id, "model_type": model_type, "AUC": mean_auc, "date": datetime.now().strftime("%Y-%m-%d"), "params": str(params), "notes": notes }])
    log_path = "logs/model_log.csv"
    log = pd.read_csv(log_path) if os.path.exists(log_path) else pd.DataFrame()
    pd.concat([log, entry], ignore_index=True).to_csv(log_path, index=False)

In [None]:
def run_adaboost_model(model_id, params, notes, feature_names):
    """Cross-validate AdaBoost over depth-2 trees on the notebook X / y / test.

    Uses 5-fold stratified CV (shuffled, seed 42) and prints the train and
    validation AUC of every fold. The averaged column-1 probabilities for
    ``test`` go to ``results/{model_id}.csv``, the classifier as fitted on the
    last fold is pickled to ``models/{model_id}.pkl``, one row is appended to
    ``logs/model_log.csv``, and the ten largest feature importances are shown
    as a horizontal bar chart.

    Args:
        model_id: Name used for the output files, the log row and the plot title.
        params: Keyword arguments forwarded to ``AdaBoostClassifier``
            (in addition to the fixed base estimator).
        notes: Free-text note stored in the log row.
        feature_names: Sequence of names, indexed by feature position.

    Returns:
        None.
    """
    model_type = "AdaBoost"
    weak_learner = DecisionTreeClassifier(max_depth=2)
    booster = AdaBoostClassifier(estimator=weak_learner, **params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []
    test_preds = np.zeros(test.shape[0])

    for fold, rows in enumerate(cv.split(X, y)):
        fit_rows, holdout_rows = rows
        print(f"\nüü¢ Fold {fold + 1}")

        X_fit, y_fit = X[fit_rows], y[fit_rows]
        X_holdout, y_holdout = X[holdout_rows], y[holdout_rows]
        booster.fit(X_fit, y_fit)

        train_auc = roc_auc_score(y_fit, booster.predict_proba(X_fit)[:, 1])
        val_auc = roc_auc_score(y_holdout, booster.predict_proba(X_holdout)[:, 1])
        print(f"Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f}")

        fold_scores.append(val_auc)
        test_preds += booster.predict_proba(test)[:, 1] / cv.n_splits

    mean_auc = np.mean(fold_scores)
    print(f"\n‚úÖ AUC ({model_id}): {mean_auc:.6f}")

    # Persist the estimator from the final fold and the averaged predictions.
    joblib.dump(booster, f"models/{model_id}.pkl")
    submission = pd.DataFrame({"Id": range(len(test_preds)), "Prediction": test_preds})
    submission.to_csv(f"results/{model_id}.csv", index=False)

    # Append one row to the run log (read, concatenate, rewrite).
    log_path = "logs/model_log.csv"
    log_row = {
        "model_id": model_id,
        "model_type": model_type,
        "AUC": mean_auc,
        "date": datetime.now().strftime("%Y-%m-%d"),
        "params": str(params),
        "notes": notes,
    }
    if os.path.exists(log_path):
        history = pd.read_csv(log_path)
    else:
        history = pd.DataFrame()
    pd.concat([history, pd.DataFrame([log_row])], ignore_index=True).to_csv(log_path, index=False)

    # Bar chart of the ten highest importances, largest first.
    importances = booster.feature_importances_
    strongest = np.argsort(importances)[::-1][:10]
    labels = [feature_names[i] for i in strongest]
    plt.barh(labels, importances[strongest])
    plt.xlabel("Feature importance")
    plt.title(f"Top 10 Feature Importances ({model_id})")
    plt.tight_layout()
    plt.show()

In [None]:
def run_xgb_model(model_id, params, notes, X, y, test, feature_names):
    """Cross-validate an XGBClassifier with early stopping and persist outputs.

    Runs 5-fold stratified CV (shuffled, seed 42). A fresh classifier is built
    per fold and fitted with the validation fold as ``eval_set``. Train and
    validation AUC are printed per fold, column 1 of ``predict_proba(test)`` is
    averaged over folds, and afterwards the function

    * pickles the last fold's model to ``models/{model_id}.pkl``,
    * writes the averaged predictions to ``results/{model_id}.csv``,
    * appends one row to ``logs/model_log.csv`` (logging ``params`` as passed),
    * plots the ten largest feature importances of the last fold's model.

    Args:
        model_id: Name used for the output files, the log row and the plot title.
        params: Keyword arguments for ``XGBClassifier``. ``n_estimators`` and
            ``early_stopping_rounds`` default to 10000 and 30 when absent.
        notes: Free-text note stored in the log row.
        X: Training features; must support indexing with integer index arrays.
        y: Training labels, indexed the same way.
        test: Features to predict on; must expose ``.shape``.
        feature_names: Sequence of names, indexed by feature position.

    Returns:
        None.
    """
    # Already imported at notebook level; kept local so this cell is self-contained.
    from xgboost import XGBClassifier

    model_type = "XGBoost"
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    test_preds = np.zeros(test.shape[0])
    aucs = []
    models = []

    # Merge defaults with the caller's params instead of passing both as
    # keywords: XGBClassifier(**params, n_estimators=10000) raises
    # "got multiple values for keyword argument" when params already carries
    # n_estimators. early_stopping_rounds is configured on the estimator
    # because passing it to fit() was deprecated in XGBoost 1.6 and later
    # removed.
    model_params = {"n_estimators": 10000, "early_stopping_rounds": 30, **params}

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\nüü¢ Fold {fold + 1}")
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        model = XGBClassifier(**model_params)

        # The validation fold drives early stopping; progress is logged every
        # 100 boosting rounds.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=100
        )

        train_probs = model.predict_proba(X_train)[:, 1]
        val_probs = model.predict_proba(X_val)[:, 1]
        train_auc = roc_auc_score(y_train, train_probs)
        val_auc = roc_auc_score(y_val, val_probs)
        print(f"Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f}")

        aucs.append(val_auc)
        test_preds += model.predict_proba(test)[:, 1] / skf.n_splits
        models.append(model)

    mean_auc = np.mean(aucs)
    print(f"\n‚úÖ AUC ({model_id}): {mean_auc:.6f}")

    final_model = models[-1]  # only the last fold's model is persisted / plotted
    joblib.dump(final_model, f"models/{model_id}.pkl")
    pd.DataFrame({"Id": range(len(test_preds)), "Prediction": test_preds}).to_csv(f"results/{model_id}.csv", index=False)

    # Append this run to the CSV log (read, concatenate, rewrite).
    log_path = "logs/model_log.csv"
    entry = pd.DataFrame([{
        "model_id": model_id, "model_type": model_type, "AUC": mean_auc,
        "date": datetime.now().strftime("%Y-%m-%d"), "params": str(params), "notes": notes
    }])
    log = pd.read_csv(log_path) if os.path.exists(log_path) else pd.DataFrame()
    pd.concat([log, entry], ignore_index=True).to_csv(log_path, index=False)

    # Plot top 10 feature importances
    importances = final_model.feature_importances_
    indices = np.argsort(importances)[-10:][::-1]
    plt.barh([feature_names[i] for i in indices], importances[indices])
    plt.xlabel("Feature importance")
    plt.title(f"Top 10 Feature Importances ({model_id})")
    plt.tight_layout()
    plt.show()


In [None]:
def run_catboost_model(model_id, params, notes, feature_names=None):
    """Cross-validate a CatBoostClassifier on the notebook-level X / y / test.

    Uses 5-fold stratified CV (shuffled, seed 42) and prints the train and
    validation AUC of every fold. The averaged column-1 probabilities for
    ``test`` go to ``results/{model_id}.csv``, the classifier as fitted on the
    last fold is pickled to ``models/{model_id}.pkl`` and one row is appended
    to ``logs/model_log.csv``.

    Args:
        model_id: Name used for the output files, the log row and the plot title.
        params: Keyword arguments forwarded to ``CatBoostClassifier``.
        notes: Free-text note stored in the log row.
        feature_names: Optional sequence of names, indexed by feature
            position. When given, the ten largest feature importances are
            plotted. It is optional (as in ``run_logreg_model``) because the
            notebook's driver loop calls this function with three arguments;
            with a required parameter every CatBoost run failed with a
            TypeError.

    Returns:
        None.
    """
    model_type = "CatBoost"
    model = CatBoostClassifier(**params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    test_preds = np.zeros(test.shape[0])
    aucs = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\nüü¢ Fold {fold + 1}")
        model.fit(X[train_idx], y[train_idx])
        train_probs = model.predict_proba(X[train_idx])[:, 1]
        val_probs = model.predict_proba(X[val_idx])[:, 1]
        train_auc = roc_auc_score(y[train_idx], train_probs)
        val_auc = roc_auc_score(y[val_idx], val_probs)
        print(f"Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f}")
        aucs.append(val_auc)
        # Each fold contributes 1/n_splits of the final test prediction.
        test_preds += model.predict_proba(test)[:, 1] / skf.n_splits

    mean_auc = np.mean(aucs)
    print(f"\n‚úÖ AUC ({model_id}): {mean_auc:.6f}")

    # Note: the pickled estimator is the one fitted on the final fold only.
    joblib.dump(model, f"models/{model_id}.pkl")
    pd.DataFrame({"Id": range(len(test_preds)), "Prediction": test_preds}).to_csv(f"results/{model_id}.csv", index=False)

    # Append this run to the CSV log (read, concatenate, rewrite).
    log_path = "logs/model_log.csv"
    entry = pd.DataFrame([{ "model_id": model_id, "model_type": model_type, "AUC": mean_auc, "date": datetime.now().strftime("%Y-%m-%d"), "params": str(params), "notes": notes }])
    log = pd.read_csv(log_path) if os.path.exists(log_path) else pd.DataFrame()
    pd.concat([log, entry], ignore_index=True).to_csv(log_path, index=False)

    # The importance plot needs names for its bar labels; skip it otherwise.
    if feature_names is not None:
        importances = model.get_feature_importance()
        indices = np.argsort(importances)[-10:][::-1]
        plt.barh([feature_names[i] for i in indices], importances[indices])
        plt.xlabel("Feature importance")
        plt.title(f"Top 10 Feature Importances ({model_id})")
        plt.tight_layout()
        plt.show()


In [None]:
def run_lgbm_model(model_id, params, notes, feature_names):
    """Cross-validate an LGBMClassifier with early stopping and persist outputs.

    Runs 5-fold stratified CV (shuffled, seed 42) on the notebook-level
    ``X`` / ``y``. A fresh classifier is built per fold and fitted with the
    validation fold as ``eval_set``, using the ``early_stopping`` (30 rounds)
    and ``log_evaluation`` (every 100 rounds) callbacks. Train and validation
    AUC are printed per fold and column 1 of ``predict_proba(test)`` is
    averaged over folds. Afterwards the function pickles the last fold's model
    to ``models/{model_id}.pkl``, writes ``results/{model_id}.csv``, appends a
    row to ``logs/model_log.csv`` and plots the largest feature importances
    (at most ten) of the last fold's model.

    Args:
        model_id: Name used for the output files, the log row and the plot title.
        params: Keyword arguments for ``LGBMClassifier``. ``n_estimators``
            defaults to 1000 when absent.
        notes: Free-text note stored in the log row.
        feature_names: Sequence of names, indexed by feature position.

    Returns:
        None.
    """
    model_type = "LightGBM"
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    test_preds = np.zeros(test.shape[0])
    aucs = []
    models = []

    # Merge instead of passing both as keywords: LGBMClassifier(**params,
    # n_estimators=1000) raises a TypeError if params also sets n_estimators.
    model_params = {"n_estimators": 1000, **params}

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\nüü¢ Fold {fold + 1}")
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        model = LGBMClassifier(**model_params)

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[
                early_stopping(stopping_rounds=30),
                log_evaluation(period=100)
            ]
        )

        train_probs = model.predict_proba(X_train)[:, 1]
        val_probs = model.predict_proba(X_val)[:, 1]

        train_auc = roc_auc_score(y_train, train_probs)
        val_auc = roc_auc_score(y_val, val_probs)
        print(f"Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f}")

        aucs.append(val_auc)
        test_preds += model.predict_proba(test)[:, 1] / skf.n_splits
        models.append(model)

    mean_auc = np.mean(aucs)
    print(f"\n‚úÖ AUC ({model_id}): {mean_auc:.6f}")

    final_model = models[-1]  # only the last fold's model is persisted / plotted
    joblib.dump(final_model, f"models/{model_id}.pkl")
    pd.DataFrame({"Id": range(len(test_preds)), "Prediction": test_preds}).to_csv(f"results/{model_id}.csv", index=False)

    # Append this run to the CSV log (read, concatenate, rewrite).
    log_path = "logs/model_log.csv"
    entry = pd.DataFrame([{
        "model_id": model_id,
        "model_type": model_type,
        "AUC": mean_auc,
        "date": datetime.now().strftime("%Y-%m-%d"),
        "params": str(params),
        "notes": notes
    }])
    log = pd.read_csv(log_path) if os.path.exists(log_path) else pd.DataFrame()
    pd.concat([log, entry], ignore_index=True).to_csv(log_path, index=False)

    # Plot the top feature importances with real feature names. The bar count
    # follows the data: the previous hard-coded range(10) raised whenever the
    # model had fewer than ten features.
    importances = final_model.feature_importances_
    indices = np.argsort(importances)[::-1][:10]  # largest first
    bottom_up = indices[::-1]  # barh draws position 0 at the bottom
    positions = range(len(bottom_up))

    plt.figure(figsize=(8, 5))
    plt.barh(positions, importances[bottom_up])
    plt.yticks(positions, [feature_names[i] for i in bottom_up])
    plt.xlabel("Feature importance")
    plt.title(f"Top 10 Feature Importances ({model_id})")
    plt.tight_layout()
    plt.show()


In [None]:
def run_rf_model(model_id, params, notes, feature_names):
    """Cross-validate a RandomForestClassifier on the notebook X / y / test.

    Uses 5-fold stratified CV (shuffled, seed 42) and prints the train and
    validation AUC of every fold. The averaged column-1 probabilities for
    ``test`` go to ``results/{model_id}.csv``, the forest as fitted on the last
    fold is pickled to ``models/{model_id}.pkl``, one row is appended to
    ``logs/model_log.csv``, and the ten largest feature importances are shown
    as a horizontal bar chart.

    Args:
        model_id: Name used for the output files, the log row and the plot title.
        params: Keyword arguments forwarded to ``RandomForestClassifier``.
        notes: Free-text note stored in the log row.
        feature_names: Sequence of names, indexed by feature position.

    Returns:
        None.
    """
    model_type = "RandomForest"
    forest = RandomForestClassifier(**params)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    validation_aucs = []
    test_preds = np.zeros(test.shape[0])

    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        print(f"\nüü¢ Fold {fold + 1}")
        forest.fit(X[fit_rows], y[fit_rows])

        fit_probs = forest.predict_proba(X[fit_rows])[:, 1]
        holdout_probs = forest.predict_proba(X[holdout_rows])[:, 1]
        train_auc = roc_auc_score(y[fit_rows], fit_probs)
        val_auc = roc_auc_score(y[holdout_rows], holdout_probs)
        print(f"Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f}")

        validation_aucs.append(val_auc)
        fold_test_probs = forest.predict_proba(test)[:, 1]
        test_preds += fold_test_probs / splitter.n_splits

    mean_auc = np.mean(validation_aucs)
    print(f"\n‚úÖ AUC ({model_id}): {mean_auc:.6f}")

    # Persist the estimator from the final fold and the averaged predictions.
    joblib.dump(forest, f"models/{model_id}.pkl")
    predictions = pd.DataFrame({"Id": range(len(test_preds)), "Prediction": test_preds})
    predictions.to_csv(f"results/{model_id}.csv", index=False)

    # Append one row to the run log (read, concatenate, rewrite).
    log_path = "logs/model_log.csv"
    record = {
        "model_id": model_id,
        "model_type": model_type,
        "AUC": mean_auc,
        "date": datetime.now().strftime("%Y-%m-%d"),
        "params": str(params),
        "notes": notes,
    }
    previous_runs = pd.read_csv(log_path) if os.path.exists(log_path) else pd.DataFrame()
    updated_log = pd.concat([previous_runs, pd.DataFrame([record])], ignore_index=True)
    updated_log.to_csv(log_path, index=False)

    # Bar chart of the ten highest importances, largest first.
    importances = forest.feature_importances_
    ranked = np.argsort(importances)[::-1][:10]
    bar_labels = [feature_names[i] for i in ranked]
    plt.barh(bar_labels, importances[ranked])
    plt.xlabel("Feature importance")
    plt.title(f"Top 10 Feature Importances ({model_id})")
    plt.tight_layout()
    plt.show()


In [None]:
# Parameter grids: one dict of constructor keyword arguments per run.
lgbm_param_grid = [
    {"num_leaves": 60, "max_depth": 8, "learning_rate": 0.04, "subsample": 0.85, "colsample_bytree": 0.9, "random_state": 42},
    {"num_leaves": 80, "max_depth": 10, "learning_rate": 0.03, "subsample": 0.9, "colsample_bytree": 0.85, "random_state": 42},
    {"num_leaves": 100, "max_depth": 12, "learning_rate": 0.025, "subsample": 0.95, "colsample_bytree": 0.9, "random_state": 42},
    {"num_leaves": 40, "max_depth": 6, "learning_rate": 0.02, "subsample": 0.85, "colsample_bytree": 0.95, "random_state": 42},
    {"num_leaves": 30, "max_depth": 5, "learning_rate": 0.01, "subsample": 0.9, "colsample_bytree": 1.0, "random_state": 42}
]

catboost_param_grid = [
    {"iterations": 500, "depth": 6, "learning_rate": 0.03, "loss_function": "Logloss", "verbose": 0, "random_seed": 42},
    {"iterations": 600, "depth": 8, "learning_rate": 0.025, "loss_function": "Logloss", "verbose": 0, "random_seed": 42},
    {"iterations": 700, "depth": 7, "learning_rate": 0.02, "loss_function": "Logloss", "verbose": 0, "random_seed": 42}
]

# run_xgb_model sets the tree budget itself (10000 rounds with early
# stopping), so the grid does not repeat n_estimators.
xgb_param_grid = [
    {"max_depth": 6, "learning_rate": 0.035, "subsample": 0.9, "colsample_bytree": 0.9, "eval_metric": "logloss", "random_state": 42, "use_label_encoder": False},
    {"max_depth": 5, "learning_rate": 0.03, "subsample": 0.85, "colsample_bytree": 0.8, "eval_metric": "logloss", "random_state": 42, "use_label_encoder": False},
    {"max_depth": 4, "learning_rate": 0.02, "subsample": 1.0, "colsample_bytree": 1.0, "eval_metric": "logloss", "random_state": 42, "use_label_encoder": False},
    {"max_depth": 5, "learning_rate": 0.015, "subsample": 0.9, "colsample_bytree": 0.85, "eval_metric": "logloss", "random_state": 42, "use_label_encoder": False},
    {"max_depth": 3, "learning_rate": 0.01, "subsample": 0.95, "colsample_bytree": 0.95, "eval_metric": "logloss", "random_state": 42, "use_label_encoder": False}
]

rf_param_grid = [
    {"n_estimators": 300, "max_depth": 12, "min_samples_leaf": 2, "max_features": "sqrt", "bootstrap": True, "random_state": 42, "n_jobs": -1},
    {"n_estimators": 400, "max_depth": 15, "min_samples_leaf": 2, "max_features": 0.8, "bootstrap": True, "random_state": 42, "n_jobs": -1}
]
adaboost_param_grid = [
    {"n_estimators": 300, "learning_rate": 0.1, "random_state": 42},
    {"n_estimators": 500, "learning_rate": 0.05, "random_state": 42}
]
logreg_param_grid = [
    {"solver": "liblinear", "C": 1.0, "max_iter": 200},
    {"solver": "liblinear", "C": 0.5, "max_iter": 200}
]
svm_param_grid = [
    {"C": 1.0, "kernel": "linear", "probability": False, "random_state": 42},
    {"C": 0.5, "kernel": "linear", "probability": False, "random_state": 42}
]
knn_param_grid = [
    {"n_neighbors": 5},
    {"n_neighbors": 7}
]

# Each entry: (id prefix, runner, parameter grid, X, y, pass feature_names?).
# The data and feature_names are only forwarded to runners whose signature
# declares the matching parameter (see the loop below).
model_configs = [
    # ("lgbm", run_lgbm_model, lgbm_param_grid, X, y, True),
    ("xgb", run_xgb_model, xgb_param_grid, X, y, True),
    ("rf", run_rf_model, rf_param_grid, X, y, True),
    # True: run_catboost_model takes feature_names for its importance plot.
    ("catboost", run_catboost_model, catboost_param_grid, X, y, True),
    ("adaboost", run_adaboost_model, adaboost_param_grid, X, y, True),
    ("logreg", run_logreg_model, logreg_param_grid, X, y, True),
    ("svm", run_svm_model, svm_param_grid, X_small, y_small, False),
    ("knn", run_knn_model, knn_param_grid, X_small, y_small, False)
]

# Run all model configs with all param sets
for model_prefix, func, param_list, X_used, y_used, pass_features in model_configs:
    # The runners do not share one signature (some read the notebook globals,
    # run_xgb_model takes X / y / test explicitly), so offer every optional
    # argument by name and keep only those the runner actually declares.
    # Previously X_used / y_used were unpacked but never passed on, and
    # positional calls handed run_xgb_model the wrong arguments.
    offered_kwargs = {"X": X_used, "y": y_used, "test": test}
    if pass_features:
        offered_kwargs["feature_names"] = feature_names
    declared = inspect.signature(func).parameters
    call_kwargs = {name: value for name, value in offered_kwargs.items() if name in declared}

    for i, params in enumerate(param_list):
        model_id = f"{model_prefix}_v{i+1}"
        print(f"\nüöÄ Running {model_id}...")

        try:
            func(model_id, params, f"{model_prefix.upper()} config #{i+1}", **call_kwargs)
        except Exception as e:
            # Best effort: report the failure (with its type, so signature
            # errors are recognisable) and move on to the next config.
            print(f"‚ùå ERROR in {model_id}: {type(e).__name__}: {e}")

