In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
TARGET = "diagnosed_diabetes"


In [3]:
X = train.drop(columns=["id", TARGET])
y = train[TARGET].astype(int)
test_ids = test["id"]
X_test = test.drop(columns=["id"])

In [4]:
def advanced_feature_engineering(df):
    df = df.copy()
    df["bmi_age_ratio"] = df["bmi"] / (df["age"] + 1e-3)
    df["activity_sleep_ratio"] = df["physical_activity_minutes_per_week"] / (df["sleep_hours_per_day"] + 1)
    df["bp_ratio"] = df["systolic_bp"] / (df["diastolic_bp"] + 1)
    df["lipid_ratio"] = df["cholesterol_total"] / (df["triglycerides"] + 1)
    df["pulse_pressure"] = df["systolic_bp"] - df["diastolic_bp"]
    df["mean_arterial_pressure"] = df["diastolic_bp"] + (df["systolic_bp"] - df["diastolic_bp"]) / 3.0
    df["non_hdl_cholesterol"] = df["cholesterol_total"] - df["hdl_cholesterol"]
    df["ldl_hdl_ratio"] = df["ldl_cholesterol"] / (df["hdl_cholesterol"] + 1e-3)
    df["tg_hdl_ratio"] = df["triglycerides"] / (df["hdl_cholesterol"] + 1e-3)
    df["bmi_sq"] = df["bmi"] ** 2
    df["log_bmi"] = np.log1p(df["bmi"])
    df["bmi_waist_hip"] = df["bmi"] * df["waist_to_hip_ratio"]
    df["waist_hip_age"] = df["waist_to_hip_ratio"] * df["age"]
    df["sedentary_ratio"] = df["screen_time_hours_per_day"] / (df["sleep_hours_per_day"] + 1)
    df["activity_sedentary_ratio"] = df["physical_activity_minutes_per_week"] / (df["screen_time_hours_per_day"] + 1)
    df["low_sleep"] = (df["sleep_hours_per_day"] < 6).astype(int)
    df["high_screen"] = (df["screen_time_hours_per_day"] > 6).astype(int)
    df["low_activity"] = (df["physical_activity_minutes_per_week"] < 150).astype(int)
    df["poor_diet"] = (df["diet_score"] < df["diet_score"].quantile(0.3)).astype(int)
    df["lifestyle_risk_score"] = df[["low_sleep", "high_screen", "low_activity", "poor_diet"]].sum(axis=1)
    
    history_cols = ["family_history_diabetes", "hypertension_history", "cardiovascular_history"]
    for col in history_cols:
        df[col + "_bin"] = (df[col] == "Yes").astype(int)
    df["comorbid_count"] = df[[col + "_bin" for col in history_cols]].sum(axis=1)
    df["comorbid_age_interaction"] = df["comorbid_count"] * df["age"]
    
    # Outlier clipping
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        lower, upper = df[col].quantile([0.01, 0.99])
        df[col] = df[col].clip(lower, upper)
    
    return df


In [5]:
X = advanced_feature_engineering(X)
X_test = advanced_feature_engineering(X_test)

In [6]:
# Label encode categoricals
cat_features = X.select_dtypes(include="object").columns.tolist()
label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))
    label_encoders[col] = le


In [7]:
# === ANTI-OVERFIT 3-MODEL ENSEMBLE (1 seed only)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_cat, oof_xgb, oof_lgb = np.zeros(len(X)), np.zeros(len(X)), np.zeros(len(X))
test_cat, test_xgb, test_lgb = np.zeros(len(X_test)), np.zeros(len(X_test)), np.zeros(len(X_test))


In [8]:
# 1. CATBOOST (ANTI-OVERFIT)
cat_params = {
    'iterations': 1500,        #  Fewer iterations
    'learning_rate': 0.04,     #  Slightly higher LR
    'depth': 5,                #  Shallower trees
    'l2_leaf_reg': 8,          #  Heavy regularization
    'random_strength': 0.3,    #  More randomness
    'border_count': 64,        #  Fewer splits
    'subsample': 0.8,
    'colsample_bylevel': 0.6,
    'class_weights': [1, 1.1], #  Lighter class weighting
    'random_seed': 42,
    'verbose': False
}


In [9]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = CatBoostClassifier(**cat_params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val),
              early_stopping_rounds=100, verbose=False)
    
    oof_cat[val_idx] = model.predict_proba(X_val)[:, 1]
    test_cat += model.predict_proba(X_test)[:, 1] / skf.n_splits


In [10]:
print(f"CatBoost CV AUC: {roc_auc_score(y, oof_cat):}")


CatBoost CV AUC: 0.7147414755312003


In [11]:
# 2. XGBOOST (ANTI-OVERFIT)
xgb_params = {
    'n_estimators': 1500,
    'learning_rate': 0.04,
    'max_depth': 4,            # Very shallow
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'reg_alpha': 12,           # Heavy L1
    'reg_lambda': 5,           # Heavy L2
    'min_child_weight': 10,    # Prevents small splits
    'random_state': 42,
    'n_jobs': -1,
    'eval_metric': 'auc'
}


In [12]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = XGBClassifier(**xgb_params, early_stopping_rounds=100, verbose=False)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    
    oof_xgb[val_idx] = model.predict_proba(X_val)[:, 1]
    test_xgb += model.predict_proba(X_test)[:, 1] / skf.n_splits


In [13]:
print(f"XGBoost CV AUC: {roc_auc_score(y, oof_xgb):}")


XGBoost CV AUC: 0.7258790904822788


In [14]:
# 3. LIGHTGBM (ANTI-OVERFIT)
lgb_params = {
    'n_estimators': 1500,
    'learning_rate': 0.04,
    'max_depth': 4,            # Very shallow
    'num_leaves': 20,          # Conservative
    'min_child_samples': 150,  # Prevents overfitting
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'reg_alpha': 12,           # Heavy regularization
    'reg_lambda': 5,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1,
    'metric': 'auc'
}


In [15]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = LGBMClassifier(**lgb_params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
    
    oof_lgb[val_idx] = model.predict_proba(X_val)[:, 1]
    test_lgb += model.predict_proba(X_test)[:, 1] / skf.n_splits


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1493]	valid_0's auc: 0.727293
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1498]	valid_0's auc: 0.725127
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1500]	valid_0's auc: 0.726671
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1500]	valid_0's auc: 0.727289
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1500]	valid_0's auc: 0.727402


In [16]:
print(f"LightGBM CV AUC: {roc_auc_score(y, oof_lgb):}")


LightGBM CV AUC: 0.7267524175297455


In [17]:
# OPTIMIZED ENSEMBLE WEIGHTS
oof_blend = 0.35 * oof_cat + 0.35 * oof_xgb + 0.30 * oof_lgb
test_blend = 0.35 * test_cat + 0.35 * test_xgb + 0.30 * test_lgb

final_auc = roc_auc_score(y, oof_blend)
print(f"\nANTI-OVERFIT or FINAL ENSEMBLE CV ROC-AUC: {final_auc:}")


# DIAGNOSTICS
print(f"\nDIAGNOSTICS:")
print(f"OOF std: {oof_blend.std():.4f}")
print(f"Test std: {test_blend.std():.4f}")
print(f"OOF mean: {oof_blend.mean():.4f}")
print(f"Test mean: {test_blend.mean():.4f}")



ANTI-OVERFIT or FINAL ENSEMBLE CV ROC-AUC: 0.7241966075068532

DIAGNOSTICS:
OOF std: 0.1735
Test std: 0.1864
OOF mean: 0.6300
Test mean: 0.6088


In [18]:
submission = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': test_blend})
submission.to_csv('submission.csv', index=False)
