## XGBoost

In [37]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import roc_auc_score, make_scorer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv("dataset/train.csv")

In [3]:
# Making sure the target feature is of type float
train_df['diagnosed_diabetes'] = train_df['diagnosed_diabetes'].astype('float')

In [4]:
train_df = train_df.drop(columns=['id'])

In [5]:
numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [col for col in numeric_cols if col != 'diagnosed_diabetes']
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

Numeric columns: ['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'family_history_diabetes', 'hypertension_history', 'cardiovascular_history']
Categorical columns: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']


In [6]:
# This is a light automated heuristic:
# Any feature that perfectly predicts the target (corr == ±1) is suspicious.
corr_matrix = train_df[numeric_cols + ['diagnosed_diabetes']].corr()
leakage_suspects = corr_matrix['diagnosed_diabetes'].abs().sort_values(ascending=False)
print("Features most correlated with target:")
print(leakage_suspects.head())

Features most correlated with target:
diagnosed_diabetes                    1.000000
family_history_diabetes               0.211064
physical_activity_minutes_per_week    0.169789
age                                   0.161162
systolic_bp                           0.107132
Name: diagnosed_diabetes, dtype: float64


In [8]:
# One-Hot encoding for NOMINAL (unordered) categorical features
# Nominal (unordered) categories → One-Hot Encoding
nominal_cols = ['gender', 'ethnicity', 'employment_status']
ordinal_cols = ['education_level', 'income_level', 'smoking_status']

# Define the order for ordinal features
education_order = ["No formal", "Highschool", "Graduate", "Postgraduate"]  
income_order    = ["Low", "Lower-Middle", "Middle", "Upper-Middle", "High"]
smoking_order   = ["Never", "Current", "Former"]

In [10]:
# Nominal -> OneHot (binary indicators). Creates a new column for each category.
nominal_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Ordinal -> integer encoding with defined order. Assigns integer values based on order.
ordinal_transformer = OrdinalEncoder(
    categories=[education_order, income_order, smoking_order],
    dtype=int
)

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", nominal_transformer, nominal_cols),
        ("ordinal", ordinal_transformer, ordinal_cols)
    ],
    remainder="passthrough" # keeps continuous columns as is. Numeric columns (passthrough) → converted to NumPy array
)

In [18]:
# XGBoost baseline model
# -----------------------------------------------------
xgb_model = XGBClassifier(
    n_estimators=300,        # number of trees (moderate baseline)
    max_depth=6,            # tree depth: controls model complexity
    learning_rate=0.05,     # smaller LR = more stable trees
    subsample=0.8,          # row sampling -> reduces overfitting
    colsample_bytree=0.8,   # feature sampling -> reduces overfitting
    objective="binary:logistic",
    eval_metric="auc",      # built-in AUC metric (matches Kaggle)
    random_state=42,
    n_jobs=-1               # use all cores
)

In [19]:
# Full pipeline: preprocessing + model
# -----------------------------------------------------
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),   # from Step 2
        ("model", xgb_model)
    ]
)

In [22]:
X = train_df.drop(columns=['diagnosed_diabetes'])
y = train_df['diagnosed_diabetes'].values

In [None]:
# Validation: Stratified K-Fold CV
# -----------------------------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scorer = make_scorer(roc_auc_score, needs_proba=True)

In [None]:
# Evaluate using cross-validation (AUC)
# -----------------------------------------------------

# Evaluation model
cv_scores = cross_val_score(
    model_pipeline,
    X,
    y,
    cv=cv,
    scoring=auc_scorer,
    n_jobs=-1
)

print("AUC scores per fold:", cv_scores)
print("Mean AUC:", cv_scores.mean())
print("Std deviation:", cv_scores.std())

AUC scores per fold: [0.72346788 0.72079512 0.7222085  0.7231096  0.72232386]
Mean AUC: 0.722380991301365
Std deviation: 0.0009230605429321103


In [None]:
# Train final model on full training data
# BaseLine Model
model_pipeline.fit(X, y)

In [32]:
test_df = pd.read_csv("dataset/test.csv")

In [33]:
test_ids = test_df['id']
test_df = test_df.drop(columns=['id'])

In [27]:
# Step 1: Apply the same exact encoders
# Step 2: Align the columns
# Step 3: Produce the exact same feature space
# Step 4: Feed into XGBoost correctly
# Step 5: Output probabilities
test_proba = model_pipeline.predict_proba(test_df)

In [42]:
# Save submission
xgb_submission = pd.DataFrame({
    'id': test_ids,
    'diagnosed_diabetes': test_proba[:, 1]
})
xgb_submission.to_csv('xgb_submission.csv', index=False)

In [70]:
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import GroupKFold

def objective(trial):
    # Suggest hyperparameters
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
        "eval_metric": "auc",
        "use_label_encoder": False,
        "tree_method": "hist",
        "n_jobs": -1,
        "random_state": 42
    }

    
    trial_model = Pipeline([
        ("preprocessor", preprocessor),
        ("xgb", XGBClassifier(**params))
    ])
    
   
    groups = train_df['gender']  # your grouping variable
    cv = GroupKFold(n_splits=3)

    # then use in cross_val_score
    scores = cross_val_score(
        trial_model, X, y,
        cv=cv.split(X, y, groups=train_df['gender']),
        scoring="roc_auc",
        n_jobs=-1
    )
        
    return scores.mean()

In [71]:
# Create Optuna study
study = optuna.create_study(direction="maximize")

[I 2025-12-08 11:30:40,337] A new study created in memory with name: no-name-17d0d7c1-41ff-4185-9c86-f7fde65190a9


In [None]:
# Build final model with tuned params
best_params = study.best_params
final_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("xgb", XGBClassifier(
        **best_params,
        eval_metric="auc",
        use_label_encoder=False,
        tree_method="hist",
        n_jobs=-1,
        random_state=42
    ))
])

# Train on all training data
final_model.fit(X, y)

In [48]:
# Keep test IDs
test_df = pd.read_csv("dataset/test.csv")
test_ids = test_df['id']
test_features = test_df.drop(columns=['id'])

# Predict probabilities
test_proba = final_model.predict_proba(test_features)[:, 1]

# Build submission DataFrame
submission = pd.DataFrame({
    "id": test_ids,
    "diagnosed_diabetes": test_proba
})

# Save to CSV
submission.to_csv("xgb_optuna_submission.csv", index=False)