In [1]:
import  pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import (classification_report, roc_auc_score, average_precision_score, precision_recall_fscore_support)
import xgboost as xgb, lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
import json
import warnings
warnings.filterwarnings('ignore')

In [2]:
X_train = joblib.load("../data/models/X_train_pre.joblib")
X_val   = joblib.load("../data/models/X_val_pre.joblib")
X_test  = joblib.load("../data/models/X_test_pre.joblib")


In [3]:

y_train = pd.read_parquet("../data/Gold/y_train.parquet").iloc[:,0].to_numpy()
y_val   = pd.read_parquet("../data/Gold/y_val.parquet").iloc[:,0].to_numpy()
y_test  = pd.read_parquet("../data/Gold/y_test.parquet").iloc[:,0].to_numpy()


In [4]:
preprocessor  = joblib.load("../data/models/preprocessor.joblib")
NUMERIC       = joblib.load("../data/models/numeric_cols.joblib")
CATEGORICAL   = joblib.load("../data/models/categorical_cols.joblib")
feat_names    = joblib.load("../data/models/prepared_feature_names.joblib")
# class_weights = joblib.load("../data/models/class_weights.joblib")

In [5]:
if hasattr(X_train, "tocsr"):
    print("Converting sparse matrix to dense array for modeling...")
else:
    print("Data is already in dense array format.")

Data is already in dense array format.


In [6]:
print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Positives in train/val/test:", int(y_train.sum()), int(y_val.sum()), int(y_test.sum()))
print("Class ratio train:", y_train.mean())


Shapes: (19139453, 49) (6379894, 49) (6378891, 49)
Positives in train/val/test: 15536 9054 10640
Class ratio train: 0.00081172643753194


Class weights & imbalance parameters

In [7]:

pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
class_weights = {0: neg/(neg+pos), 1: pos/(neg+pos)}  # normalized
scale_pos_weight = max(1.0, neg / max(1, pos))

json.dump({"class_weights": class_weights, "scale_pos_weight": scale_pos_weight},
          open("../data/models/imbalance_meta.json", "w"))
print("class_weights:", class_weights)
print("scale_pos_weight:", scale_pos_weight)


class_weights: {0: np.float64(0.9991882735624681), 1: np.float64(0.00081172643753194)}
scale_pos_weight: 1230.9421343975284


Evaluation block

In [8]:
def evaluate_block(model_name, y_true, y_prob, thresh=0.5):
    y_pred = (y_prob >= thresh).astype(int)
    roc = roc_auc_score(y_true, y_prob)
    ap  = average_precision_score(y_true, y_prob) 
    p,r,f1,_ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    row = {"model": model_name, "roc_auc": roc, "pr_auc": ap, "precision": p, "recall": r, "f1": f1, "threshold": thresh}
    print(f"[{model_name}] ROC-AUC:{roc:.4f} | PR-AUC:{ap:.4f} | P:{p:.4f} | R:{r:.4f} | F1:{f1:.4f}")
    return row

def save_metrics_table(rows, fname):
    df = pd.DataFrame(rows)
    df.to_csv(fname, index=False)
    print(f" Saved metrics → {fname}")
    return df


In [10]:
X_test.dtype, X_val.dtype, X_train.dtype

(dtype('float64'), dtype('float64'), dtype('float64'))

In [11]:
def to_float32(X):
    try:
        return X.astype(np.float32, copy=False)
    except Exception:
        return np.array(X, dtype=np.float32)

X_train = to_float32(X_train)
X_val   = to_float32(X_val)
X_test  = to_float32(X_test)

XGBoost

In [12]:

dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val,   label=y_val)
dtest  = xgb.DMatrix(X_test,  label=y_test)

In [13]:

params = {
    "objective": "binary:logistic",
    "eval_metric": ["aucpr","auc"],
    "tree_method": "hist",        
    "max_depth": 8,
    "eta": 0.08,              
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 10,
    "scale_pos_weight": scale_pos_weight,
    "lambda": 1.0


}

In [None]:

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=2000,
    evals=[(dtrain,"train"), (dval,"val")],
    early_stopping_rounds=100,
    verbose_eval=50
)

[0]	train-aucpr:0.08878	train-auc:0.97478	val-aucpr:0.17054	val-auc:0.97475
[50]	train-aucpr:0.35587	train-auc:0.98501	val-aucpr:0.46223	val-auc:0.98192


In [None]:
# threshold 0.5
val_prob  = bst.predict(dval,  iteration_range=(0, bst.best_iteration+1))
test_prob = bst.predict(dtest, iteration_range=(0, bst.best_iteration+1))

results = []
results.append(evaluate_block("XGB [VAL]",  y_val,  val_prob,  thresh=0.5))
results.append(evaluate_block("XGB [TEST]", y_test, test_prob, thresh=0.5))

# bst.save_model("..dta/models/xgb_full.json")
joblib.dump(bst, "../data/models/xgb_model.joblib")
print(" XGBoost model → ../models/xgb_full.json")