In [1]:
# =============================
# 1. IMPORTS
# =============================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from catboost import CatBoostClassifier

In [2]:
# =============================
# 2. LOAD PREPROCESSED DATA
# =============================
train = pd.read_csv("/kaggle/input/arisdata/train_processed.csv")
test  = pd.read_csv("/kaggle/input/arisdata/test_processed.csv")

TARGET = "RiskFlag"
IDCOL = "ProfileID"

X = train.drop(columns=[TARGET, IDCOL])
y = train[TARGET].astype(int)

X_test = test.drop(columns=[IDCOL])
test_ID = test[IDCOL]

print("Train:", X.shape)
print("Test :", X_test.shape)

Train: (204277, 25)
Test : (51070, 25)


In [3]:
# =============================
# 3. TRAIN / VAL SPLIT
# =============================
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("X_train:", X_train.shape)
print("X_val:", X_val.shape)

X_train: (163421, 25)
X_val: (40856, 25)


In [4]:
# =============================
# 4. TUNED CATBOOST MODEL
# =============================
model = CatBoostClassifier(
    iterations=1200,
    depth=10,
    learning_rate=0.03,
    l2_leaf_reg=5,
    border_count=128,
    random_state=42,
    loss_function="Logloss",
    eval_metric="F1",
    class_weights=[1, 4.2],
    boosting_type="Plain",
    bootstrap_type="Bayesian",
    verbose=200
)

model.fit(X_train, y_train, eval_set=(X_val, y_val))

# Validation probs
val_proba = model.predict_proba(X_val)[:, 1]

0:	learn: 0.5008989	test: 0.4928687	best: 0.4928687 (0)	total: 168ms	remaining: 3m 21s
200:	learn: 0.5933573	test: 0.5079780	best: 0.5091568 (155)	total: 20.1s	remaining: 1m 40s
400:	learn: 0.6653280	test: 0.4986599	best: 0.5104624 (219)	total: 40.1s	remaining: 1m 19s
600:	learn: 0.7243135	test: 0.4807110	best: 0.5104624 (219)	total: 59.6s	remaining: 59.4s
800:	learn: 0.7757733	test: 0.4651453	best: 0.5104624 (219)	total: 1m 19s	remaining: 39.8s
1000:	learn: 0.8196568	test: 0.4498199	best: 0.5104624 (219)	total: 1m 39s	remaining: 19.8s
1199:	learn: 0.8568638	test: 0.4378552	best: 0.5104624 (219)	total: 1m 59s	remaining: 0us

bestTest = 0.5104624098
bestIteration = 219

Shrink model to first 220 iterations.


In [5]:
# =============================
# 5. THRESHOLD TUNING
# =============================

best_t = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.01):
    preds = (val_proba >= t).astype(int)
    f1 = f1_score(y_val, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_t = t

print("BEST THRESHOLD =", best_t)
print("BEST VALIDATION F1 =", best_f1)

BEST THRESHOLD = 0.44999999999999984
BEST VALIDATION F1 = 0.3647472124637238


In [6]:
# =============================
# 6. FINAL MODEL TRAINING
# =============================
final_model = CatBoostClassifier(
    iterations=1200,
    depth=10,
    learning_rate=0.03,
    l2_leaf_reg=5,
    border_count=128,
    random_state=42,
    loss_function="Logloss",
    class_weights=[1, 4.2],
    boosting_type="Plain",
    bootstrap_type="Bayesian",
    verbose=False
)

final_model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x7b518bb556d0>

In [7]:
# =============================
# 7. PREDICT ON TEST DATA
# =============================
test_proba = final_model.predict_proba(X_test)[:, 1]
test_pred  = (test_proba >= best_t).astype(int)

In [8]:
# =============================
# 8. SAVE SUBMISSION
# =============================
submission = pd.DataFrame({
    "ProfileID": test_ID,
    "RiskFlag": test_pred
})

submission.to_csv("/kaggle/working/catboost_tuned_submission.csv", index=False)

print("Saved:", "/kaggle/working/catboost_tuned_submission.csv")

Saved: /kaggle/working/catboost_tuned_submission.csv
