## LightGBM
### Baseline Model

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

In [35]:
# Read the training and test splits from the local dataset directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/train.csv", "dataset/test.csv")
)

In [36]:
# Every object/category column of the training frame is treated as a categorical feature.
categorical_features = train_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Make sure the categorical features have the 'category' dtype.
# The test column reuses the dtype learned from the training column instead of
# inferring its own categories: two independent astype('category') calls can
# produce different category sets (and therefore different integer codes) for
# the same value. Test values that never occur in train become NaN.
for col in categorical_features:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype(train_df[col].dtype)

In [37]:
# Label column; everything except it and the 'id' column is used as a feature.
TARGET = 'diagnosed_diabetes'
y = train_df[TARGET]
X = train_df.drop(columns=['id', TARGET])

In [38]:
# Hold-out split for the baseline score: 20% validation, stratified on the target.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [39]:
# 1) Prepare model
baseline_params = dict(
    n_estimators=1000,   # upper bound on boosting rounds; early stopping is set up at fit time
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)
baseline_model = LGBMClassifier(**baseline_params)

In [40]:
# 2) Fit through the scikit-learn API.
# The validation pair passed as eval_set is what early stopping monitors (AUC);
# per-iteration logging is switched off.
baseline_callbacks = [
    lgb.early_stopping(stopping_rounds=50, verbose=False),
    lgb.log_evaluation(period=0),
]
baseline_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=baseline_callbacks,
    categorical_feature=categorical_features,  # handled natively by LightGBM
)

In [41]:
# Score the hold-out rows; the second predict_proba column is the one the AUC is computed on.
valid_proba = baseline_model.predict_proba(X_valid)
baseline_preds = valid_proba[:, 1]
baseline_auc = roc_auc_score(y_valid, baseline_preds)
print(f"BASELINE AUC: {baseline_auc:.5f}")

BASELINE AUC: 0.72594


### Feature Importance
> Checking which of the raw (original) features can be dropped.

In [22]:
# LightGBM settings shared by the cross-validation and feature-dropping experiments.
lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    objective="binary",
    boosting_type="gbdt",
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)

# One row per input feature; the cross-validation loop adds one column per fold.
feature_importances = pd.DataFrame({"feature": X.columns})

In [None]:
# 5-fold stratified CV: fit one model per fold and record its feature importances.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"Fold {fold + 1}")

    # Fold-local names on purpose: X_train / X_valid / y_train / y_valid and
    # baseline_model belong to the hold-out baseline and are read again by the
    # feature-dropping cells further down. Overwriting them here would silently
    # pair those later splits with the labels of the last CV fold.
    X_fold_train, X_fold_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_fold_train, y_fold_valid = y.iloc[train_idx], y.iloc[valid_idx]

    fold_model = LGBMClassifier(**lgb_params)

    fold_model.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_valid, y_fold_valid)],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0)  # Suppress iteration logs
        ],
        categorical_feature=categorical_features,
    )

    # One column per fold, in the same order as X.columns.
    feature_importances[f"fold_{fold + 1}"] = fold_model.feature_importances_

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [31]:
# Row-wise mean over every column after the leading 'feature' column.
importance_values = feature_importances.iloc[:, 1:]
feature_importances['average'] = importance_values.mean(axis=1)

# Most important features first.
feature_importances = feature_importances.sort_values(by="average", ascending=False)

feature_importances

Unnamed: 0,feature,fold_1,fold_2,fold_3,fold_4,fold_5,average
2,physical_activity_minutes_per_week,5185,5608,5157,5708,5773,5486.2
14,triglycerides,2693,2855,2419,2692,2901,2712.0
6,bmi,1856,2178,1754,2150,2182,2024.0
0,age,1880,2013,1589,2038,1921,1888.2
11,cholesterol_total,1535,1720,1355,1780,1853,1648.6
13,ldl_cholesterol,1549,1760,1330,1669,1694,1600.4
5,screen_time_hours_per_day,1505,1729,1210,1678,1734,1571.2
8,systolic_bp,1380,1686,1264,1619,1669,1523.6
3,diet_score,1476,1630,1208,1587,1599,1500.0
10,heart_rate,1356,1491,1188,1487,1494,1403.2


In [None]:
# Leave-one-feature-out experiment: retrain without each feature in turn and
# compare the validation AUC with the baseline.
results = []  # one record per dropped feature

for feature in X.columns:
    print(f"Dropping feature: {feature}")

    X_drop = X.drop(columns=[feature])
    # Take the labels from the same split call as the features rather than
    # from notebook-level y_train / y_valid, so the rows and labels are
    # guaranteed to line up regardless of which cells ran before this one.
    # Same test_size / random_state / stratify as the baseline split, so the
    # AUC is comparable with baseline_auc.
    X_train_d, X_valid_d, y_train_d, y_valid_d = train_test_split(
        X_drop, y, test_size=0.2, random_state=42, stratify=y
    )

    model = LGBMClassifier(**lgb_params)

    model.fit(
        X_train_d, y_train_d,
        eval_set=[(X_valid_d, y_valid_d)],
        eval_metric="auc",
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0)  # Suppress iteration logs
        ],
        # The dropped column must not be listed as categorical any more.
        categorical_feature=[c for c in categorical_features if c != feature],
    )
    preds = model.predict_proba(X_valid_d)[:, 1]
    auc = roc_auc_score(y_valid_d, preds)

    results.append({
        "feature_dropped": feature,
        "auc_without_feature": auc,
        "auc_delta": auc - baseline_auc  # positive means improvement, negative means harmful
    })

In [46]:
# Tabulate the per-feature results, most harmful removal (lowest delta) first.
result_df = pd.DataFrame(results)
result_df = result_df.sort_values(by="auc_delta")
print(result_df)

                       feature_dropped  auc_without_feature  auc_delta
2   physical_activity_minutes_per_week             0.679862  -0.046080
21             family_history_diabetes             0.691900  -0.034042
0                                  age             0.715101  -0.010841
14                       triglycerides             0.721406  -0.004536
6                                  bmi             0.725316  -0.000625
7                   waist_to_hip_ratio             0.725500  -0.000441
10                          heart_rate             0.725549  -0.000392
8                          systolic_bp             0.725570  -0.000372
3                           diet_score             0.725574  -0.000368
13                     ldl_cholesterol             0.725707  -0.000234
12                     hdl_cholesterol             0.725716  -0.000226
11                   cholesterol_total             0.725781  -0.000161
20                   employment_status             0.725839  -0.000103
9     

In [49]:
# Thematic groups of the candidate columns, so each group can be dropped on its own.
demographic = [
    "gender",
    "income_level",
    "education_level",
]
health_history = [
    "hypertension_history",
    "cardiovascular_history",
]
lifestyle = [
    "alcohol_consumption_per_week",
    "smoking_status",
]

# All seven candidates together: per the original note, the features whose
# removal gave a small positive AUC delta in the one-feature-at-a-time run.
drop_candidates = [
    "gender",
    "income_level",
    "education_level",
    "cardiovascular_history",
    "alcohol_consumption_per_week",
    "hypertension_history",
    "smoking_status",
]

In [50]:
def train_and_eval(Xtr, Xval, label="", ytr=None, yval=None):
    """Train an LGBMClassifier on one feature subset and report its validation AUC.

    Parameters
    ----------
    Xtr, Xval : DataFrame
        Training and validation features (any subset of the original columns).
    label : str
        Prefix for the printed result line.
    ytr, yval : array-like, optional
        Labels for ``Xtr`` / ``Xval``. When omitted, the notebook-level
        ``y_train`` / ``y_valid`` are used, which keeps existing calls working.

    Returns
    -------
    float
        ROC AUC on the validation rows.

    Raises
    ------
    ValueError
        If features and labels differ in length, or both carry a pandas index
        and the two indexes are not identical (same rows, same order).
    """
    # Backward-compatible fallback to the hold-out labels defined in the notebook.
    if ytr is None:
        ytr = y_train
    if yval is None:
        yval = y_valid

    # Fail loudly on misaligned labels: LightGBM consumes labels positionally,
    # so labels from a different split would otherwise train without any error
    # and yield a meaningless AUC.
    for split_name, features, labels in (("train", Xtr, ytr), ("valid", Xval, yval)):
        if len(features) != len(labels):
            raise ValueError(
                f"{split_name}: {len(features)} feature rows but {len(labels)} labels"
            )
        feature_index = getattr(features, "index", None)
        label_index = getattr(labels, "index", None)
        if (
            isinstance(feature_index, pd.Index)
            and isinstance(label_index, pd.Index)
            and not feature_index.equals(label_index)
        ):
            raise ValueError(
                f"{split_name}: features and labels come from different rows "
                "(their indexes do not match)"
            )

    model = LGBMClassifier(**lgb_params)

    # Only columns still present in this reduced feature set can be categorical.
    current_cat_cols = [c for c in categorical_features if c in Xtr.columns]

    model.fit(
        Xtr, ytr,
        eval_set=[(Xval, yval)],
        eval_metric="auc",
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0)  # Suppress iteration logs
        ],
        categorical_feature=current_cat_cols,
    )

    preds = model.predict_proba(Xval)[:, 1]
    auc = roc_auc_score(yval, preds)
    print(f"{label} AUC: {auc:.6f}")
    return auc

In [51]:
# Drop all seven candidates at once and re-score on the baseline-style split.
X_drop_all = X.drop(columns=drop_candidates)
Xtr_all, Xval_all = train_test_split(
    X_drop_all, y, test_size=0.2, stratify=y, random_state=42
)[:2]

auc_drop_all = train_and_eval(Xtr_all, Xval_all, label="DROP ALL 7")

DROP ALL 7 AUC: 0.725969


In [52]:
# Demographic group only
X_drop_demo = X.drop(columns=demographic)
Xtr_demo, Xval_demo = train_test_split(
    X_drop_demo, y, test_size=0.2, stratify=y, random_state=42
)[:2]

auc_demo = train_and_eval(Xtr_demo, Xval_demo, label="DROP: DEMOGRAPHIC")

DROP: DEMOGRAPHIC AUC: 0.725925


In [54]:
# Health-history group only
X_drop_health = X.drop(columns=health_history)
Xtr_health, Xval_health = train_test_split(
    X_drop_health, y, test_size=0.2, stratify=y, random_state=42
)[:2]
auc_health = train_and_eval(Xtr_health, Xval_health, label="DROP: HEALTH HISTORY")

DROP: HEALTH HISTORY AUC: 0.725957


In [55]:
# Lifestyle group only
X_drop_life = X.drop(columns=lifestyle)
Xtr_life, Xval_life = train_test_split(
    X_drop_life, y, test_size=0.2, stratify=y, random_state=42
)[:2]

auc_life = train_and_eval(Xtr_life, Xval_life, label="DROP: LIFESTYLE")

DROP: LIFESTYLE AUC: 0.726127
