In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:

# Load data 
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
TARGET = "diagnosed_diabetes"

In [3]:
X = train.drop(columns=["id", TARGET])
y = train[TARGET].astype(int)
test_ids = test["id"]
X_test = test.drop(columns=["id"])


In [4]:
def advanced_feature_engineering(df):
    df = df.copy()
    
    # All 30+ features from previous code (BP, lipids, lifestyle, etc.)
    df["bmi_age_ratio"] = df["bmi"] / (df["age"] + 1e-3)
    df["activity_sleep_ratio"] = df["physical_activity_minutes_per_week"] / (df["sleep_hours_per_day"] + 1)
    df["bp_ratio"] = df["systolic_bp"] / (df["diastolic_bp"] + 1)
    df["lipid_ratio"] = df["cholesterol_total"] / (df["triglycerides"] + 1)
    df["pulse_pressure"] = df["systolic_bp"] - df["diastolic_bp"]
    df["mean_arterial_pressure"] = df["diastolic_bp"] + (df["systolic_bp"] - df["diastolic_bp"]) / 3.0
    df["non_hdl_cholesterol"] = df["cholesterol_total"] - df["hdl_cholesterol"]
    df["ldl_hdl_ratio"] = df["ldl_cholesterol"] / (df["hdl_cholesterol"] + 1e-3)
    df["tg_hdl_ratio"] = df["triglycerides"] / (df["hdl_cholesterol"] + 1e-3)
    df["bmi_sq"] = df["bmi"] ** 2
    df["log_bmi"] = np.log1p(df["bmi"])
    df["bmi_waist_hip"] = df["bmi"] * df["waist_to_hip_ratio"]
    df["waist_hip_age"] = df["waist_to_hip_ratio"] * df["age"]
    df["sedentary_ratio"] = df["screen_time_hours_per_day"] / (df["sleep_hours_per_day"] + 1)
    df["activity_sedentary_ratio"] = df["physical_activity_minutes_per_week"] / (df["screen_time_hours_per_day"] + 1)
    df["low_sleep"] = (df["sleep_hours_per_day"] < 6).astype(int)
    df["high_screen"] = (df["screen_time_hours_per_day"] > 6).astype(int)
    df["low_activity"] = (df["physical_activity_minutes_per_week"] < 150).astype(int)
    df["poor_diet"] = (df["diet_score"] < df["diet_score"].quantile(0.3)).astype(int)
    df["lifestyle_risk_score"] = df[["low_sleep", "high_screen", "low_activity", "poor_diet"]].sum(axis=1)
    
    history_cols = ["family_history_diabetes", "hypertension_history", "cardiovascular_history"]
    for col in history_cols:
        df[col + "_bin"] = (df[col] == "Yes").astype(int)
    df["comorbid_count"] = df[[col + "_bin" for col in history_cols]].sum(axis=1)
    df["comorbid_age_interaction"] = df["comorbid_count"] * df["age"]
    
    # Outlier clipping
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        lower, upper = df[col].quantile([0.01, 0.99])
        df[col] = df[col].clip(lower, upper)
    
    return df


In [5]:
X = advanced_feature_engineering(X)
X_test = advanced_feature_engineering(X_test)

In [6]:
# Label encode categoricals for XGBoost/LightGBM
cat_features = X.select_dtypes(include="object").columns.tolist()
label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))
    label_encoders[col] = le

print("Categorical features encoded:", cat_features)


Categorical features encoded: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']


In [7]:
# === 3-MODEL ENSEMBLE ===
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [8]:
# Store OOF predictions for each model
oof_cat = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
test_cat = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))
test_lgb = np.zeros(len(X_test))

In [9]:
# 1. CATBOOST (3 seeds for diversity)
cat_params = {
    'iterations': 2000, 'learning_rate': 0.03, 'depth': 6,
    'l2_leaf_reg': 5, 'subsample': 0.85, 'colsample_bylevel': 0.7,
    'class_weights': [1, 1.2], 'random_seed': 42, 'verbose': False
}

for seed in [42, 123, 777]:
    cat_params['random_seed'] = seed
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model = CatBoostClassifier(**cat_params)
        model.fit(X_train, y_train, eval_set=(X_val, y_val), 
                 early_stopping_rounds=150, verbose=False)
        
        oof_cat[val_idx] += model.predict_proba(X_val)[:, 1] / (skf.n_splits * 3)
        test_cat += model.predict_proba(X_test)[:, 1] / (skf.n_splits * 3)



In [20]:
print(f"CatBoost CV AUC: {roc_auc_score(y, oof_cat):}")

CatBoost CV AUC: 0.7248220472106447


In [11]:
#2. XGBOOST (3 seeds)
xgb_params = {
    'n_estimators': 2000, 'learning_rate': 0.03, 'max_depth': 6,
    'subsample': 0.85, 'colsample_bytree': 0.7, 'reg_alpha': 5,
    'reg_lambda': 1, 'random_state': 42, 'n_jobs': -1, 'eval_metric': 'auc'
}

for seed in [42, 123, 777]:
    xgb_params['random_state'] = seed
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model = XGBClassifier(**xgb_params, early_stopping_rounds=150, verbose=False)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        
        oof_xgb[val_idx] += model.predict_proba(X_val)[:, 1] / (skf.n_splits * 3)
        test_xgb += model.predict_proba(X_test)[:, 1] / (skf.n_splits * 3)




In [14]:
print(f"XGBoost CV AUC: {roc_auc_score(y, oof_xgb):.6f}")

XGBoost CV AUC: 0.726801


In [15]:
# 3. LIGHTGBM (3 seeds)
from lightgbm import LGBMClassifier
import lightgbm as lgb  
lgb_params = {
    'n_estimators': 2000, 'learning_rate': 0.03, 'max_depth': 6,
    'subsample': 0.85, 'colsample_bytree': 0.7, 'reg_alpha': 5,
    'reg_lambda': 1, 'random_state': 42, 'n_jobs': -1, 'metric': 'auc',
    'verbose': -1
}

for seed in [42, 123, 777]:
    lgb_params['random_state'] = seed
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model = LGBMClassifier(**lgb_params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)],callbacks=[lgb.early_stopping(150), lgb.log_evaluation(0)])
        
        oof_lgb[val_idx] += model.predict_proba(X_val)[:, 1] / (skf.n_splits * 3)
        test_lgb += model.predict_proba(X_test)[:, 1] / (skf.n_splits * 3)



Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1762]	valid_0's auc: 0.727363
Training until validation scores don't improve for 150 rounds
Did not meet early stopping. Best iteration is:
[1994]	valid_0's auc: 0.726088
Training until validation scores don't improve for 150 rounds
Did not meet early stopping. Best iteration is:
[1997]	valid_0's auc: 0.727262
Training until validation scores don't improve for 150 rounds
Did not meet early stopping. Best iteration is:
[1924]	valid_0's auc: 0.728042
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[1720]	valid_0's auc: 0.727912
Training until validation scores don't improve for 150 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's auc: 0.727672
Training until validation scores don't improve for 150 rounds
Did not meet early stopping. Best iteration is:
[1998]	valid_0's auc: 0.72616
Training until validation scores don't i

In [18]:
print(f"LightGBM CV AUC: {roc_auc_score(y, oof_lgb):}")


LightGBM CV AUC: 0.7276152922913872


In [17]:
# === WEIGHTED ENSEMBLE (optimized weights) ===
# Simple equal weights first, then optimize
oof_blend = 0.4 * oof_cat + 0.3 * oof_xgb + 0.3 * oof_lgb
test_blend = 0.4 * test_cat + 0.3 * test_xgb + 0.3 * test_lgb

final_auc = roc_auc_score(y, oof_blend)
print(f"\n=== FINAL ENSEMBLE CV ROC-AUC: {final_auc:}")



=== FINAL ENSEMBLE CV ROC-AUC: 0.7269622642381435


In [19]:
submission = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': test_blend})
submission.to_csv('submission.csv', index=False) 
