In [None]:
# DIABETES PREDICTION CHALLENGE
import pandas as pd
import numpy as np
import warnings
import sys
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

# Silence library warnings so the training log stays readable
warnings.filterwarnings("ignore")

# Probe each optional dependency. Every *_AVAILABLE flag records whether the
# corresponding import succeeded; a notice is printed when it did not.
try:
    from xgboost import XGBClassifier
except ImportError:
    XGB_AVAILABLE = False
    print("XGBoost not installed.")
else:
    XGB_AVAILABLE = True

try:
    from lightgbm import LGBMClassifier
except ImportError:
    LGBM_AVAILABLE = False
    print("LightGBM not installed.")
else:
    LGBM_AVAILABLE = True

try:
    from catboost import CatBoostClassifier
except ImportError:
    CATBOOST_AVAILABLE = False
    print("CatBoost not installed.")
else:
    CATBOOST_AVAILABLE = True

try:
    import optuna
except ImportError:
    OPTUNA_AVAILABLE = False
    print("Optuna not installed. Skipping hyperparameter tuning.")
else:
    OPTUNA_AVAILABLE = True

def load_and_preprocess(train_path="train.csv", test_path="test.csv"):
    """Load the train/test CSVs, engineer features, encode and impute.

    Train and test rows are concatenated so every engineered feature is
    computed identically for both, then split apart again.

    Args:
        train_path: CSV containing the features and the
            ``diagnosed_diabetes`` target column.
        test_path: CSV containing the features to predict on.

    Returns:
        A ``(train_df, test_df, test_ids)`` tuple. ``train_df`` keeps the
        ``diagnosed_diabetes`` column, ``test_df`` does not, and ``test_ids``
        is the test file's ``id`` column (or its index when there is no
        ``id`` column).
    """
    print("Loading data...")
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Handle ID column
    if "id" in train.columns:
        train = train.drop("id", axis=1)
    if "id" in test.columns:
        test_ids = test["id"]
        test = test.drop("id", axis=1)
    else:
        test_ids = test.index

    # Combine for processing; "is_train" lets us split the rows back later
    train["is_train"] = 1
    test["is_train"] = 0
    df = pd.concat([train, test], axis=0, ignore_index=True)

    # Feature Engineering
    print("Engineering features...")

    # 1. Winsorization: clip the top 1% of the lipid measurements
    for col in ["cholesterol_total", "ldl_cholesterol", "triglycerides"]:
        upper_limit = df[col].quantile(0.99)
        df[col] = df[col].clip(upper=upper_limit)

    # 2. Blood Pressure Components
    df["Pulse_Pressure"] = df["systolic_bp"] - df["diastolic_bp"]
    df["MAP"] = df["diastolic_bp"] + (1 / 3) * df["Pulse_Pressure"]

    # 3. Cholesterol Ratios (a zero HDL is nudged to 0.1 to avoid dividing by zero)
    df["hdl_cholesterol"] = df["hdl_cholesterol"].replace(0, 0.1)
    df["Cholesterol_Ratio"] = df["cholesterol_total"] / df["hdl_cholesterol"]
    df["LDL_HDL_Ratio"] = df["ldl_cholesterol"] / df["hdl_cholesterol"]
    df["Trig_HDL_Ratio"] = df["triglycerides"] / df["hdl_cholesterol"]
    df["Non_HDL"] = df["cholesterol_total"] - df["hdl_cholesterol"]

    # 4. Interaction Terms
    df["BMI_Age"] = df["bmi"] * df["age"]
    df["BP_Product"] = df["systolic_bp"] * df["diastolic_bp"]

    # 5. Binning
    # BMI categories as half-open intervals [18.5, 25), [25, 30), [30, inf):
    # right=False keeps exactly 18.5 in "Normal" and values such as 24.95 out
    # of "Overweight".
    df["BMI_Class"] = pd.cut(
        df["bmi"],
        bins=[-np.inf, 18.5, 25.0, 30.0, np.inf],
        right=False,
        labels=["Underweight", "Normal", "Overweight", "Obese"],
    )

    # Blood pressure categories. np.select takes the FIRST matching condition,
    # so the most severe stage is tested first: a reading such as 150/85 must
    # be Stage2 even though its diastolic value alone sits in the Stage1 band.
    # Comparisons with NaN are False, so rows whose missing readings make the
    # category undecidable fall through to "Unknown".
    systolic = df["systolic_bp"]
    diastolic = df["diastolic_bp"]
    bp_conditions = [
        (systolic >= 140) | (diastolic >= 90),
        (systolic >= 130) | (diastolic >= 80),
        (systolic >= 120) & (diastolic < 80),
        (systolic < 120) & (diastolic < 80),
    ]
    bp_labels = ["Stage2", "Stage1", "Elevated", "Normal"]
    df["BP_Class"] = np.select(bp_conditions, bp_labels, default="Unknown")

    # 6. Log Transforms
    skewed_cols = ["triglycerides", "ldl_cholesterol", "cholesterol_total", "bmi"]
    for col in skewed_cols:
        if col in df.columns:
            df[f"Log_{col}"] = np.log1p(df[col])

    # Encoding & Cleaning
    train_df = df[df["is_train"] == 1].drop("is_train", axis=1)
    test_df = df[df["is_train"] == 0].drop(["is_train", "diagnosed_diabetes"], axis=1)

    cat_cols = train_df.select_dtypes(include=["object", "category"]).columns.tolist()

    for col in cat_cols:
        train_df[col] = train_df[col].astype(str)
        test_df[col] = test_df[col].astype(str)

        # Fit on train + test so labels seen only in the test set still encode
        encoder = LabelEncoder()
        encoder.fit(pd.concat([train_df[col], test_df[col]]))

        train_df[col] = encoder.transform(train_df[col])
        test_df[col] = encoder.transform(test_df[col])

    # Impute missing values with medians learned from the training rows only
    num_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
    num_cols.remove("diagnosed_diabetes")

    imputer = SimpleImputer(strategy="median")
    train_df[num_cols] = imputer.fit_transform(train_df[num_cols])
    test_df[num_cols] = imputer.transform(test_df[num_cols])

    return train_df, test_df, test_ids

def optimize_xgb(X, y, n_trials=20):
    """Search XGBoost hyperparameters with Optuna, maximizing mean 3-fold CV AUC.

    Args:
        X: Feature matrix (NumPy array or DataFrame); converted with
            ``np.asarray`` so folds can be taken by position.
        y: Binary target (array or Series), aligned row-for-row with ``X``.
        n_trials: Number of Optuna trials to run.

    Returns:
        The best trial's sampled parameters as a dict, or an empty dict when
        Optuna or XGBoost is not installed.
    """
    if not (OPTUNA_AVAILABLE and XGB_AVAILABLE):
        return {}

    # Plain arrays make the positional fold indexing below work for both
    # ndarray and pandas inputs.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    # The splitter is seeded, so every trial sees the same folds; compute
    # them once instead of re-splitting inside each trial.
    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    folds = list(kf.split(X_arr, y_arr))

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'n_jobs': -1,
            'random_state': 42,
            'tree_method': 'hist'
        }

        scores = []
        for tr, val in folds:
            model = XGBClassifier(**params)
            model.fit(X_arr[tr], y_arr[tr])
            preds = model.predict_proba(X_arr[val])[:, 1]
            scores.append(roc_auc_score(y_arr[val], preds))
        return np.mean(scores)

    # Seed the sampler so the sequence of suggested parameters is repeatable,
    # matching the fixed seeds used for the folds and the model.
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)
    print(f"Best XGB params: {study.best_params}")
    return study.best_params

def _cv_oof_and_test_preds(make_model, X, y, X_test, splitter, pass_eval_set=False):
    """Fit one fresh model per fold; return (out-of-fold, averaged test) probabilities."""
    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))
    n_splits = splitter.get_n_splits()

    for tr, val in splitter.split(X, y):
        model = make_model()
        if pass_eval_set:
            model.fit(X[tr], y.iloc[tr], eval_set=[(X[val], y.iloc[val])], verbose=False)
        else:
            model.fit(X[tr], y.iloc[tr])
        oof[val] = model.predict_proba(X[val])[:, 1]
        test_pred += model.predict_proba(X_test)[:, 1] / n_splits

    return oof, test_pred


def _optimize_blend_weights(y, oof_matrix):
    """Return non-negative weights (summing to 1) that maximize the blended OOF AUC."""
    from scipy.optimize import minimize

    n_models = oof_matrix.shape[1]
    equal_weights = np.full(n_models, 1.0 / n_models)
    if n_models == 1:
        return equal_weights

    def neg_auc(raw_weights):
        weights = np.abs(raw_weights)
        total = weights.sum()
        if total == 0:
            return 0.0  # degenerate point: score it as the worst possible AUC
        return -roc_auc_score(y, oof_matrix @ (weights / total))

    # AUC depends only on the ranking of the blended scores, so it is piecewise
    # constant in the weights and finite-difference gradients are ~0. A
    # gradient-based solver would therefore stop at its starting point;
    # Nelder-Mead is derivative-free and never returns a point worse than the
    # equal-weight start it is given.
    result = minimize(
        neg_auc,
        equal_weights,
        method='Nelder-Mead',
        options={'maxiter': 150, 'xatol': 1e-3, 'fatol': 1e-6},
    )
    weights = np.abs(result.x)
    total = weights.sum()
    return weights / total if total > 0 else equal_weights


def train_and_predict(train, test, test_ids, use_optuna=False,
                      output_path="submission_advanced_optuna.csv"):
    """Train the available boosters with 10-fold CV, blend them, write a submission.

    Args:
        train: Preprocessed training frame including ``diagnosed_diabetes``.
        test: Preprocessed test frame with the same feature columns.
        test_ids: Identifiers written to the submission's ``id`` column.
        use_optuna: When True (and Optuna is installed), tune the XGBoost
            hyperparameters before training.
        output_path: Destination CSV for the blended test predictions.

    Raises:
        RuntimeError: If none of XGBoost, LightGBM or CatBoost is installed.
    """
    X = train.drop("diagnosed_diabetes", axis=1)
    y = train["diagnosed_diabetes"]

    # Robust Scaling is better for outliers
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)
    test_scaled = scaler.transform(test)

    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    oof_preds = {}
    test_preds = {}

    # XGBoost
    if XGB_AVAILABLE:
        print("\nTraining XGBoost...")
        fixed_xgb = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'n_jobs': -1,
            'random_state': 42,
            'tree_method': 'hist'
        }
        if use_optuna and OPTUNA_AVAILABLE:
            print("Running Optuna for XGBoost...")
            best_params = optimize_xgb(X_scaled, y, n_trials=15)  # Small trials for speed
            xgb_params = {**best_params, **fixed_xgb}
        else:
            # Tuned manually for general cases
            xgb_params = {
                'n_estimators': 1200,
                'learning_rate': 0.015,
                'max_depth': 6,
                'subsample': 0.8,
                'colsample_bytree': 0.6,
                'reg_alpha': 0.01,
                'reg_lambda': 1.0,
                **fixed_xgb,
            }

        oof, pred = _cv_oof_and_test_preds(
            lambda: XGBClassifier(**xgb_params),
            X_scaled, y, test_scaled, kf, pass_eval_set=True,
        )
        print(f"XGBoost CV AUC: {roc_auc_score(y, oof):.5f}")
        oof_preds['xgb'] = oof
        test_preds['xgb'] = pred

    # LightGBM
    if LGBM_AVAILABLE:
        print("\nTraining LightGBM...")
        lgb_params = {
            'n_estimators': 1500,
            'learning_rate': 0.02,
            'num_leaves': 40,
            'max_depth': -1,
            'subsample': 0.8,
            'colsample_bytree': 0.7,
            'objective': 'binary',
            'metric': 'auc',
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1
        }

        oof, pred = _cv_oof_and_test_preds(
            lambda: LGBMClassifier(**lgb_params),
            X_scaled, y, test_scaled, kf,
        )
        print(f"LightGBM CV AUC: {roc_auc_score(y, oof):.5f}")
        oof_preds['lgb'] = oof
        test_preds['lgb'] = pred

    # CatBoost
    if CATBOOST_AVAILABLE:
        print("\nTraining CatBoost...")
        cat_params = {
            'iterations': 1500,
            'learning_rate': 0.02,
            'depth': 7,
            'l2_leaf_reg': 5,
            'loss_function': 'Logloss',
            'eval_metric': 'AUC',
            'random_seed': 42,
            'verbose': 0
        }

        oof, pred = _cv_oof_and_test_preds(
            lambda: CatBoostClassifier(**cat_params),
            X_scaled, y, test_scaled, kf,
        )
        print(f"CatBoost CV AUC: {roc_auc_score(y, oof):.5f}")
        oof_preds['cat'] = oof
        test_preds['cat'] = pred

    if not oof_preds:
        raise RuntimeError(
            "No model was trained: install at least one of xgboost, lightgbm or catboost."
        )

    # Weighted Ensemble Optimization
    print("\nOptimizing Ensemble Weights...")
    models = list(oof_preds.keys())
    oof_matrix = np.column_stack([oof_preds[m] for m in models])
    test_matrix = np.column_stack([test_preds[m] for m in models])

    best_weights = _optimize_blend_weights(y, oof_matrix)

    print("Best Weights:")
    for m, w in zip(models, best_weights):
        print(f"  {m}: {w:.4f}")

    final_oof_pred = oof_matrix @ best_weights
    final_test_pred = test_matrix @ best_weights

    # NOTE: the weights were fitted on these same out-of-fold predictions, so
    # this score is an optimistic estimate of the blend's performance.
    print(f"\nFinal Ensemble CV AUC: {roc_auc_score(y, final_oof_pred):.5f}")

    submission = pd.DataFrame({
        "id": test_ids,
        "diagnosed_diabetes": final_test_pred
    })
    submission.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

# Script entry point: build the feature tables, then train, blend and write
# the submission file.
if __name__ == "__main__":
    # USE_OPTUNA is forwarded as train_and_predict's use_optuna argument.
    # True runs the Optuna search for XGBoost before training, which takes
    # noticeably longer; set it to False for a quick run with the manually
    # chosen XGBoost parameters.
    USE_OPTUNA = True 
    
    # Load the CSVs and engineer features, then hand the prepared frames and
    # the test ids to the training/ensembling step.
    train_df, test_df, test_ids = load_and_preprocess()
    train_and_predict(train_df, test_df, test_ids, use_optuna=USE_OPTUNA)

Loading data...
Engineering features...


[I 2025-12-01 15:11:19,442] A new study created in memory with name: no-name-1002ce73-6626-4a91-8cf1-e5522717d345



Training XGBoost...
Running Optuna for XGBoost...


[I 2025-12-01 15:12:34,470] Trial 0 finished with value: 0.7223411035016231 and parameters: {'n_estimators': 1025, 'learning_rate': 0.06253537504208502, 'max_depth': 6, 'subsample': 0.526753034214585, 'colsample_bytree': 0.7678852301155152, 'reg_alpha': 0.04279643621401936, 'reg_lambda': 1.6656147325727316e-05}. Best is trial 0 with value: 0.7223411035016231.
[I 2025-12-01 15:15:39,001] Trial 1 finished with value: 0.7231782094163469 and parameters: {'n_estimators': 1293, 'learning_rate': 0.007809163200743826, 'max_depth': 9, 'subsample': 0.8097868161803469, 'colsample_bytree': 0.8423420508509891, 'reg_alpha': 0.12838633439799393, 'reg_lambda': 9.879757534347639e-06}. Best is trial 1 with value: 0.7231782094163469.
[I 2025-12-01 15:17:25,141] Trial 2 finished with value: 0.7198921268582076 and parameters: {'n_estimators': 1397, 'learning_rate': 0.015396566747302819, 'max_depth': 4, 'subsample': 0.9851625852879, 'colsample_bytree': 0.8245727807355291, 'reg_alpha': 8.843304253457662e-08,

Best XGB params: {'n_estimators': 1368, 'learning_rate': 0.059344162506351314, 'max_depth': 4, 'subsample': 0.8621772098416591, 'colsample_bytree': 0.5172755423849738, 'reg_alpha': 0.00024974538350067653, 'reg_lambda': 7.478363200208106}
XGBoost CV AUC: 0.72653

Training LightGBM...
LightGBM CV AUC: 0.72616

Training CatBoost...
CatBoost CV AUC: 0.72299

Optimizing Ensemble Weights...
Best Weights:
  xgb: 0.3393
  lgb: 0.3306
  cat: 0.3301

Final Ensemble CV AUC: 0.72611
Saved: submission_advanced_optuna.csv
