In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
TARGET = "diagnosed_diabetes"


In [3]:
# Prepare features
X = train.drop(columns=["id", TARGET])
y = train[TARGET].astype(int)
test_ids = test["id"]
X_test = test.drop(columns=["id"])

In [4]:
def advanced_feature_engineering(df):
    """Enhanced feature engineering for diabetes prediction"""
    df = df.copy()
    
    # === ORIGINAL FEATURES ===
    df["bmi_age_ratio"] = df["bmi"] / (df["age"] + 1e-3)
    df["activity_sleep_ratio"] = df["physical_activity_minutes_per_week"] / (df["sleep_hours_per_day"] + 1)
    df["bp_ratio"] = df["systolic_bp"] / (df["diastolic_bp"] + 1)
    df["lipid_ratio"] = df["cholesterol_total"] / (df["triglycerides"] + 1)
    
    # === BLOOD PRESSURE FEATURES ===
    df["pulse_pressure"] = df["systolic_bp"] - df["diastolic_bp"]
    df["mean_arterial_pressure"] = df["diastolic_bp"] + (df["systolic_bp"] - df["diastolic_bp"]) / 3.0
    
    # === LIPID PROFILE FEATURES ===
    df["non_hdl_cholesterol"] = df["cholesterol_total"] - df["hdl_cholesterol"]
    df["ldl_hdl_ratio"] = df["ldl_cholesterol"] / (df["hdl_cholesterol"] + 1e-3)
    df["tg_hdl_ratio"] = df["triglycerides"] / (df["hdl_cholesterol"] + 1e-3)
    
    # === ANTHROPOMETRIC FEATURES ===
    df["bmi_sq"] = df["bmi"] ** 2
    df["log_bmi"] = np.log1p(df["bmi"])
    df["bmi_waist_hip"] = df["bmi"] * df["waist_to_hip_ratio"]
    df["waist_hip_age"] = df["waist_to_hip_ratio"] * df["age"]
    
    # === LIFESTYLE FEATURES ===
    df["sedentary_ratio"] = df["screen_time_hours_per_day"] / (df["sleep_hours_per_day"] + 1)
    df["activity_sedentary_ratio"] = df["physical_activity_minutes_per_week"] / (df["screen_time_hours_per_day"] + 1)
    
    # Lifestyle risk flags (adjust thresholds based on domain knowledge)
    df["low_sleep"] = (df["sleep_hours_per_day"] < 6).astype(int)
    df["high_screen"] = (df["screen_time_hours_per_day"] > 6).astype(int)
    df["low_activity"] = (df["physical_activity_minutes_per_week"] < 150).astype(int)
    df["poor_diet"] = (df["diet_score"] < df["diet_score"].quantile(0.3)).astype(int)
    
    df["lifestyle_risk_score"] = df[["low_sleep", "high_screen", "low_activity", "poor_diet"]].sum(axis=1)
    
    # === DISEASE HISTORY FEATURES ===
    history_cols = ["family_history_diabetes", "hypertension_history", "cardiovascular_history"]
    for col in history_cols:
        df[col + "_bin"] = (df[col] == "Yes").astype(int)
    
    df["comorbid_count"] = df[[col + "_bin" for col in history_cols]].sum(axis=1)
    df["comorbid_age_interaction"] = df["comorbid_count"] * df["age"]
    
    # === OUTLIER CLIPPING (helps stability) ===
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col not in ['id', TARGET]:  # Skip ID and target
            lower, upper = df[col].quantile([0.01, 0.99])
            df[col] = df[col].clip(lower, upper)
    
    return df


In [5]:
# Apply feature engineering
X = advanced_feature_engineering(X)
X_test = advanced_feature_engineering(X_test)


In [6]:
# Identify categorical features (ensure they're object dtype)
cat_features = X.select_dtypes(include="object").columns.tolist()
print("Categorical features:", cat_features)

Categorical features: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']


In [7]:
# Ensure history columns are categorical if they weren't strings
for col in ["family_history_diabetes", "hypertension_history", "cardiovascular_history"]:
    if col in X.columns:
        X[col] = X[col].astype(str)
        X_test[col] = X_test[col].astype(str)
        if col not in cat_features:
            cat_features.append(col)


In [8]:
# === OPTIMIZED CATBOOST WITH BETTER HYPERPARAMS ===
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

In [9]:
# Improved hyperparameters (shallower trees, more iterations, better regularization)
model_params = {
    'iterations': 2000,
    'learning_rate': 0.03,
    'depth': 6,  # Reduced from 7
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'verbose': 200,
    'l2_leaf_reg': 5,  # Slightly higher regularization
    'subsample': 0.85,
    'colsample_bylevel': 0.7,  # More aggressive feature sampling
    'class_weights': [1, 1.2],  # Fine-tuned class weights
    'grow_policy': 'SymmetricTree',  # CatBoost's strength
    'random_strength': 0.1  # Helps generalization
}

In [10]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n=== Fold {fold+1} ===")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = CatBoostClassifier(**model_params)
    model.fit(
        X_train, y_train,
        cat_features=cat_features,
        eval_set=(X_val, y_val),
        early_stopping_rounds=150,  # Increased patience
        use_best_model=True
    )
    
    fold_auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
    print(f"Fold AUC: {fold_auc:.6f}")
    
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    test_preds += model.predict_proba(X_test)[:, 1] / skf.n_splits



=== Fold 1 ===
0:	test: 0.6824737	best: 0.6824737 (0)	total: 481ms	remaining: 16m
200:	test: 0.7069595	best: 0.7069595 (200)	total: 1m 24s	remaining: 12m 33s
400:	test: 0.7142913	best: 0.7142913 (400)	total: 3m 27s	remaining: 13m 46s
600:	test: 0.7189163	best: 0.7189163 (600)	total: 4m 59s	remaining: 11m 37s
800:	test: 0.7210552	best: 0.7210552 (800)	total: 6m 22s	remaining: 9m 32s
1000:	test: 0.7221698	best: 0.7221698 (1000)	total: 8m 1s	remaining: 8m
1200:	test: 0.7230353	best: 0.7230353 (1200)	total: 10m 14s	remaining: 6m 48s
1400:	test: 0.7236841	best: 0.7236841 (1400)	total: 13m 32s	remaining: 5m 47s
1600:	test: 0.7241449	best: 0.7241452 (1599)	total: 17m 17s	remaining: 4m 18s
1800:	test: 0.7246474	best: 0.7246474 (1800)	total: 19m 42s	remaining: 2m 10s
1999:	test: 0.7250736	best: 0.7250746 (1997)	total: 22m 11s	remaining: 0us

bestTest = 0.7250745741
bestIteration = 1997

Shrink model to first 1998 iterations.
Fold AUC: 0.725075

=== Fold 2 ===
0:	test: 0.6792446	best: 0.6792446

In [12]:
# Final CV score
cv_auc = roc_auc_score(y, oof_preds)
print(f"\n=== FINAL CV ROC-AUC: {cv_auc} ===")



=== FINAL CV ROC-AUC: 0.724624396048214 ===


In [13]:
submission = pd.DataFrame({
    "id": test_ids,
    "diagnosed_diabetes": test_preds
})

submission.to_csv("submission.csv", index=False)
