In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score


In [2]:
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")


In [11]:
# Display the column index of the training frame (the cell's last expression is its output).
train.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [3]:
# Name of the label column in `train`.
TARGET = "diagnosed_diabetes"

# Labels, cast to integers, and the feature matrix without the id and label columns.
y = train[TARGET].astype(int)
X = train.drop(["id", TARGET], axis=1)

# Test features without the id column; the ids are kept aside for the submission file.
X_test = test.drop(["id"], axis=1)
test_ids = test["id"]


In [4]:
def feature_engineering(df):
    """Return a copy of ``df`` with four derived ratio columns appended.

    The input frame is not modified. Columns added:

    * ``bmi_age_ratio``        = bmi / age
    * ``activity_sleep_ratio`` = physical_activity_minutes_per_week / (sleep_hours_per_day + 1)
    * ``bp_ratio``             = systolic_bp / (diastolic_bp + 1)
    * ``lipid_ratio``          = cholesterol_total / (triglycerides + 1)

    A denominator that is exactly zero produces NaN (missing) instead of
    +/-inf, so a single degenerate row (e.g. ``age == 0``) cannot inject
    infinite values into the feature matrix. For every non-zero denominator
    the result is the plain quotient.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the eight source columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the four ratios.

    Raises
    ------
    KeyError
        If any of the required source columns is missing.
    """
    def _safe_ratio(numerator, denominator):
        # Mask zero denominators to NaN so the division cannot yield inf.
        return numerator / denominator.where(denominator != 0)

    df = df.copy()

    df["bmi_age_ratio"] = _safe_ratio(df["bmi"], df["age"])
    df["activity_sleep_ratio"] = _safe_ratio(
        df["physical_activity_minutes_per_week"], df["sleep_hours_per_day"] + 1
    )
    df["bp_ratio"] = _safe_ratio(df["systolic_bp"], df["diastolic_bp"] + 1)
    df["lipid_ratio"] = _safe_ratio(df["cholesterol_total"], df["triglycerides"] + 1)

    return df

# Add the same engineered columns to both the training and the test feature matrices.
X, X_test = (feature_engineering(frame) for frame in (X, X_test))


In [5]:
# Names of the object-dtype columns of X; passed to CatBoost as its categorical features.
cat_features = list(X.select_dtypes(include=["object"]).columns)


In [6]:
# Buffers: one out-of-fold prediction per training row, one averaged prediction per test row.
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])

# 5-fold stratified splitter, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)


In [7]:
model = CatBoostClassifier(
    # Objective and the metric reported on the eval set
    loss_function="Logloss",
    eval_metric="AUC",
    # Boosting schedule and tree size
    iterations=800,
    learning_rate=0.05,
    depth=7,
    # Regularisation and row/feature subsampling
    l2_leaf_reg=6,
    subsample=0.8,
    colsample_bylevel=0.8,
    # Per-class weights: 1 for the first class, 1.3 for the second
    class_weights=[1, 1.3],
    # Reproducibility and logging
    random_seed=42,
    verbose=200,
)


In [8]:
# Cross-validated training: the single `model` instance is re-fitted on every fold.
# Note that each validation fold serves both as the early-stopping eval set and as
# the data the out-of-fold predictions are made on.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold+1}")

    # Positional row selection for this fold.
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]

    model.fit(
        X_train,
        y_train,
        cat_features=cat_features,
        eval_set=(X_val, y_val),
        use_best_model=True,
        early_stopping_rounds=100,
    )

    # Column 1 of predict_proba is kept as the prediction for each row.
    fold_val_proba = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = fold_val_proba

    # Accumulate this fold's share of the test prediction (mean over all folds).
    fold_test_proba = model.predict_proba(X_test)[:, 1]
    test_preds += fold_test_proba / skf.n_splits



Fold 1
0:	test: 0.6764233	best: 0.6764233 (0)	total: 587ms	remaining: 7m 48s
200:	test: 0.7111117	best: 0.7111117 (200)	total: 1m 35s	remaining: 4m 45s
400:	test: 0.7197156	best: 0.7197156 (400)	total: 3m 5s	remaining: 3m 4s
600:	test: 0.7226372	best: 0.7226391 (598)	total: 4m 44s	remaining: 1m 34s
799:	test: 0.7242442	best: 0.7242442 (799)	total: 7m 13s	remaining: 0us

bestTest = 0.7242442172
bestIteration = 799


Fold 2
0:	test: 0.6772335	best: 0.6772335 (0)	total: 521ms	remaining: 6m 56s
200:	test: 0.7092721	best: 0.7092721 (200)	total: 1m 34s	remaining: 4m 42s
400:	test: 0.7178986	best: 0.7178986 (400)	total: 3m 56s	remaining: 3m 55s
600:	test: 0.7210507	best: 0.7210507 (600)	total: 6m 45s	remaining: 2m 14s
799:	test: 0.7226668	best: 0.7226668 (799)	total: 8m 37s	remaining: 0us

bestTest = 0.7226668021
bestIteration = 799


Fold 3
0:	test: 0.6781913	best: 0.6781913 (0)	total: 506ms	remaining: 6m 44s
200:	test: 0.7093929	best: 0.7093929 (200)	total: 1m 32s	remaining: 4m 34s
400:	te

In [9]:
# ROC-AUC of the out-of-fold predictions against the training labels.
auc = roc_auc_score(y_true=y, y_score=oof_preds)
print("\n CV ROC-AUC:", auc)



 CV ROC-AUC: 0.7238078714294118


In [10]:
# Pair each test id with its fold-averaged prediction and write the result without the index.
submission = pd.DataFrame({"id": test_ids, "diagnosed_diabetes": test_preds})

submission.to_csv(path_or_buf="submission.csv", index=False)
