# More robust submission for Kaggle

Kaggle ranking: 258/2276

In [1]:
# FAST & STRONG CatBoost model (≈95% of max AUC, 4x faster)
# Competition-safe configuration

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# =====================
# Load data
# =====================
# Name of the label column that is split off from the training frame.
TARGET = 'diagnosed_diabetes'

# The three CSV files are read from the local `data/` directory.
sample_sub = pd.read_csv('data/sample_submission.csv')
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')

# y is the label column; X is every remaining training column.
# NOTE(review): nothing besides TARGET is dropped, so any row-identifier
# column present in train.csv stays in X as a feature -- confirm intended.
y = train[TARGET]
X = train.drop(TARGET, axis=1)

# =====================
# Categorical features
# =====================
# Object-dtype columns of X are the ones handed to CatBoost as categorical.
cat_features = list(X.select_dtypes(include='object').columns)

# CatBoost is given column positions (within X) rather than column names.
cat_feature_indices = list(map(X.columns.get_loc, cat_features))

# =====================
# CV setup (FAST)
# =====================
# 5-fold stratified CV: each fold trains a fresh CatBoost model on four fifths
# of the rows, is scored (ROC AUC) on the held-out fifth, and contributes an
# equal 1/n_splits share to the averaged test prediction.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameters are the same for every fold, so they are defined once
# instead of being re-spelled inside the loop.
CATBOOST_PARAMS = dict(
    iterations=3000,
    learning_rate=0.02,
    depth=8,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    l2_leaf_reg=7,
    boosting_type='Ordered',
    bootstrap_type='Bayesian',
    bagging_temperature=0.3,
    min_data_in_leaf=40,
    early_stopping_rounds=200,
    verbose=200,
)

# The test set does not change between folds: convert it to a Pool once rather
# than letting predict_proba() redo the DataFrame conversion in every fold.
# Columns are selected in X's order because cat_feature_indices are positions
# within X; a column missing from `test` fails here with a KeyError.
test_pool = Pool(test[X.columns], cat_features=cat_feature_indices)

test_preds = np.zeros(len(test))
auc_scores = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_feature_indices)
    val_pool = Pool(X_val, y_val, cat_features=cat_feature_indices)

    model = CatBoostClassifier(**CATBOOST_PARAMS)

    # The validation fold doubles as the eval set that drives early stopping
    # and best-iteration selection.
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    # Reuse the already-built validation Pool; column 1 is the probability of
    # the positive class.
    val_pred = model.predict_proba(val_pool)[:, 1]
    auc = roc_auc_score(y_val, val_pred)
    auc_scores.append(auc)

    # Equal-weight average of the per-fold test probabilities.
    test_preds += model.predict_proba(test_pool)[:, 1] / skf.n_splits

    print(f"Fold {fold + 1} AUC: {auc:.6f}")

print("\nMean CV AUC:", np.mean(auc_scores))

# =====================
# Submission
# =====================
# Start from the sample submission and overwrite/add the TARGET column with
# the fold-averaged test probabilities; sample_sub itself is left untouched.
submission = sample_sub.assign(**{TARGET: test_preds})

# Written without the DataFrame index.
submission.to_csv('submission6.csv', index=False)

print("FAST submission saved ✔")


0:	test: 0.6838246	best: 0.6838246 (0)	total: 650ms	remaining: 32m 28s
200:	test: 0.7060894	best: 0.7060894 (200)	total: 1m 46s	remaining: 24m 42s
400:	test: 0.7111712	best: 0.7111712 (400)	total: 3m 31s	remaining: 22m 50s
600:	test: 0.7146712	best: 0.7146712 (600)	total: 5m 35s	remaining: 22m 18s
800:	test: 0.7191796	best: 0.7191796 (800)	total: 7m 49s	remaining: 21m 29s
1000:	test: 0.7221317	best: 0.7221317 (1000)	total: 10m 9s	remaining: 20m 16s
1200:	test: 0.7236959	best: 0.7236959 (1200)	total: 12m 40s	remaining: 18m 59s
1400:	test: 0.7247804	best: 0.7247804 (1400)	total: 14m 53s	remaining: 16m 59s
1600:	test: 0.7254806	best: 0.7254806 (1600)	total: 17m 7s	remaining: 14m 57s
1800:	test: 0.7258524	best: 0.7258528 (1796)	total: 19m 39s	remaining: 13m 5s
2000:	test: 0.7260254	best: 0.7260254 (1999)	total: 22m 19s	remaining: 11m 8s
2200:	test: 0.7262516	best: 0.7262516 (2200)	total: 24m 49s	remaining: 9m
2400:	test: 0.7264332	best: 0.7264332 (2400)	total: 27m 15s	remaining: 6m 48s
260