In [1]:
# ============================
# 1. IMPORTS
# ============================
import numpy as np
import pandas as pd

from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

In [2]:
# ============================
# 2. LOAD DATA
# ============================
# Read the train and test CSVs from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/arisdata/train_processed.csv",
        "/kaggle/input/arisdata/test_processed.csv",
    )
)

# Target column, plus feature frames with the target / ProfileID columns removed.
y = train["RiskFlag"]
X = train.drop(["RiskFlag", "ProfileID"], axis=1)
test_X = test.drop("ProfileID", axis=1)

In [3]:
# ============================
# 3. GMM CLUSTERING
# ============================
# Number of mixture components; candidate range worth trying: 3–12.
BEST_K = 8


def _without_cluster_cols(frame):
    """Return a copy of `frame` minus any Cluster / ClusterProb_<i> columns.

    Those are the columns this cell itself generates. Stripping them first
    makes the cell safe to re-run: the mixture is never fitted on its own
    earlier outputs, and columns left over from a different BEST_K disappear.
    """
    generated = frame.columns.astype(str).str.match(r"Cluster(?:Prob_\d+)?$")
    return frame.drop(columns=frame.columns[generated])


# Start from frames that carry no previously generated cluster features.
train = _without_cluster_cols(train)
test = _without_cluster_cols(test)
gmm_train_X = _without_cluster_cols(X)
gmm_test_X = _without_cluster_cols(test_X)

gmm = GaussianMixture(
    n_components=BEST_K,
    covariance_type="full",
    random_state=42
)

# Unsupervised fit on the training features only (no labels involved).
gmm.fit(gmm_train_X)

# Add the most likely component ID as a feature
train["Cluster"] = gmm.predict(gmm_train_X)
test["Cluster"] = gmm.predict(gmm_test_X)

# Add the per-component membership probabilities as features
cluster_probs_train = gmm.predict_proba(gmm_train_X)
cluster_probs_test  = gmm.predict_proba(gmm_test_X)

for i in range(BEST_K):
    train[f"ClusterProb_{i}"] = cluster_probs_train[:, i]
    test[f"ClusterProb_{i}"]  = cluster_probs_test[:, i]

In [4]:
# ============================
# 4. FINAL DATA FOR MODEL
# ============================
# Rebuild the model inputs from train/test so the cluster columns that the
# clustering cell attached to those frames become part of the feature set.
X = train.drop(["RiskFlag", "ProfileID"], axis=1)
test_X = test.drop("ProfileID", axis=1)

In [5]:
# ============================
# 5. TRAIN A STRONG CLASSIFIER
# ============================
# Hyperparameters shared by every fold's XGBoost model.
xgb_params = dict(
    n_estimators=600,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)
# NOTE(review): the recorded run of this cell scored ~0.12 OOF F1; class
# imbalance may be the cause (scale_pos_weight / threshold tuning) — TODO confirm.

# Cross-validation for reliable F1 evaluation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(X))   # out-of-fold class predictions, filled fold by fold
models = []              # one independently fitted model per fold

for train_index, val_index in kf.split(X, y):
    X_tr, X_val = X.iloc[train_index], X.iloc[val_index]
    y_tr = y.iloc[train_index]

    # Build a fresh estimator for every fold. Re-fitting one shared object and
    # appending it each time would leave `models` holding five references to
    # the same estimator, i.e. only the last fold's fit.
    model = XGBClassifier(**xgb_params)
    model.fit(X_tr, y_tr)
    pred = model.predict(X_val)

    oof[val_index] = pred
    models.append(model)

print("OOF F1 Score:", f1_score(y, oof))

OOF F1 Score: 0.12134933212587729


In [6]:
# ============================
# 6. PREDICT ON TEST SET
# ============================
# Soft-vote across the fold models: average each model's positive-class
# probability, then threshold once. Averaging hard 0/1 labels instead would
# throw away every model's confidence before the vote.
test_proba = np.mean([m.predict_proba(test_X)[:, 1] for m in models], axis=0)
test_preds = (test_proba > 0.5).astype(int)

# Submission frame: one predicted RiskFlag per test ProfileID.
submission = pd.DataFrame({
    "ProfileID": test["ProfileID"],
    "RiskFlag": test_preds
})

submission.to_csv("/kaggle/working/submission.csv", index=False)
print("Saved submission.csv")

Saved submission.csv
