In [1]:
import numpy as np
import pandas as pd

from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
from catboost import CatBoostClassifier


In [2]:
# Column roles used when splitting the loaded frames.
TARGET_COL = "RiskFlag"
ID_COL = "ProfileID"

# Read the train / test CSVs from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/arisdata/train_processed.csv")
test  = pd.read_csv("/kaggle/input/arisdata/test_processed.csv")

# Features are every column except the identifier (and, for train, the target).
X = train.drop(columns=[TARGET_COL, ID_COL])
# Target cast to int so downstream metrics see integer class labels.
y = train[TARGET_COL].astype(int)
test_X = test.drop(columns=[ID_COL])


In [3]:
# Number of mixture components (one gmm_p_<i> feature is later built per component).
K = 8

# Fit a full-covariance Gaussian mixture on the training features only;
# labels are not involved. fit() returns the estimator itself.
gmm = GaussianMixture(
    n_components=K,
    covariance_type="full",
    random_state=42,
).fit(X)

# Soft assignments: per-row posterior probability of each component.
train_resp, test_resp = gmm.predict_proba(X), gmm.predict_proba(test_X)

# Hard assignments: per-row component label predicted by the mixture.
train_cluster, test_cluster = gmm.predict(X), gmm.predict(test_X)


In [4]:
def _add_gmm_features(frame, resp, cluster_ids):
    """Return a re-indexed copy of `frame` with the GMM feature columns appended.

    Appends one ``gmm_p_<i>`` column for each of the ``K`` mixture components
    (column ``i`` of `resp`), followed by a ``gmm_cluster_id`` column holding
    `cluster_ids`. The input frame is not modified.
    """
    extra = {f"gmm_p_{i}": resp[:, i] for i in range(K)}
    extra["gmm_cluster_id"] = cluster_ids
    # reset_index(drop=True) yields a fresh 0..n-1 index; assign() returns a new frame.
    return frame.reset_index(drop=True).assign(**extra)


X_gmm = _add_gmm_features(X, train_resp, train_cluster)
test_gmm = _add_gmm_features(test_X, test_resp, test_cluster)


In [5]:
# Shuffled, seeded 5-fold stratified splitter used for the cross-validation loop.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold positive-class probabilities, one slot per training row.
n_train = len(X)
oof_lr, oof_xgb, oof_cb = (np.zeros(n_train) for _ in range(3))

# Per-fold fitted models, kept so test predictions can be averaged later.
lr_models, xgb_models, cb_models = [], [], []

# Ratio of (rows - label sum) to label sum; with 0/1 labels this is the
# negative-to-positive count ratio used to up-weight the positive class.
n_pos = y.sum()
scale_pos = (len(y) - n_pos) / n_pos

def train_lr(Xtr, Xval, ytr):
    """Fit a class-balanced logistic regression on one training fold.

    Parameters
    ----------
    Xtr : training features for the fold.
    Xval : validation features; accepted but not used by this function.
    ytr : training labels aligned with `Xtr`.

    Returns
    -------
    The fitted ``LogisticRegression`` estimator.
    """
    settings = {
        "class_weight": "balanced",
        "solver": "lbfgs",
        "max_iter": 2000,
    }
    # fit() hands back the estimator, so the fitted model is returned directly.
    return LogisticRegression(**settings).fit(Xtr, ytr)

# XGBoost settings, unpacked into XGBClassifier for every fold.
xgb_params = dict(
    n_estimators=700,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=scale_pos,  # positive-class weight taken from the training labels
    objective="binary:logistic",
    tree_method="hist",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)

# CatBoost settings, unpacked into CatBoostClassifier for every fold.
cb_params = dict(
    iterations=1200,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=4,
    loss_function="Logloss",
    eval_metric="F1",
    class_weights=[1.0, scale_pos],  # [weight for class 0, weight for class 1]
    random_seed=42,
    verbose=False,
)


In [6]:
# Start from empty model lists every time this cell runs. The lists are only
# appended to below, so re-running the cell without this reset would stack a
# second set of fold models on top of the previous run's and silently change
# the test-time averages computed from them.
lr_models, xgb_models, cb_models = [], [], []

for tr, val in kf.split(X_gmm, y):
    # `tr` / `val` are positional row indices for this fold.
    Xtr, ytr = X_gmm.iloc[tr], y.iloc[tr]
    Xval = X_gmm.iloc[val]

    # ---- Logistic Regression ----
    lr = train_lr(Xtr, Xval, ytr)
    # Store the positive-class probability for the held-out rows only.
    oof_lr[val] = lr.predict_proba(Xval)[:, 1]
    lr_models.append(lr)

    # ---- XGBoost ----
    xgb = XGBClassifier(**xgb_params)
    xgb.fit(Xtr, ytr)
    oof_xgb[val] = xgb.predict_proba(Xval)[:, 1]
    xgb_models.append(xgb)

    # ---- CatBoost ----
    cb = CatBoostClassifier(**cb_params)
    cb.fit(Xtr, ytr)
    oof_cb[val] = cb.predict_proba(Xval)[:, 1]
    cb_models.append(cb)


In [7]:
# 3-model blend: LR + XGB + CATBOOST
oof_blend = 0.25 * oof_lr + 0.35 * oof_xgb + 0.40 * oof_cb

# Score every candidate cut-off on the out-of-fold blend.
thresholds = np.arange(0.05, 0.95, 0.01)
f1_by_threshold = [
    f1_score(y, (oof_blend > cutoff).astype(int)) for cutoff in thresholds
]

# Fallback values, kept when no candidate reaches a positive F1.
best_f1, best_t = 0, 0.5

# max() returns the first index among ties, i.e. the lowest threshold that
# attains the top score.
top = max(range(len(thresholds)), key=f1_by_threshold.__getitem__)
if f1_by_threshold[top] > best_f1:
    best_f1, best_t = f1_by_threshold[top], thresholds[top]

print("Best Threshold:", best_t)
print("OOF F1:", best_f1)


Best Threshold: 0.5900000000000002
OOF F1: 0.36108582026639685


In [8]:
def _mean_positive_proba(models, features):
    """Average the positive-class probability of `features` over fitted `models`."""
    per_model = [model.predict_proba(features)[:, 1] for model in models]
    return np.mean(per_model, axis=0)


# Fold-averaged test probabilities for each model family.
test_lr = _mean_positive_proba(lr_models, test_gmm)
test_xgb = _mean_positive_proba(xgb_models, test_gmm)
test_cb = _mean_positive_proba(cb_models, test_gmm)

# Same blend weights as the out-of-fold blend, cut at the threshold tuned on it.
test_blend = 0.25 * test_lr + 0.35 * test_xgb + 0.40 * test_cb
test_pred = (test_blend > best_t).astype(int)

submission = pd.DataFrame(
    {"ProfileID": test["ProfileID"], "RiskFlag": test_pred}
)

submission.to_csv("/kaggle/working/submission.csv", index=False)
print("Saved submission.csv")


Saved submission.csv
