## LightGBM
### BaseLine Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

In [3]:
test_df.head(2)

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,...,111,Female,White,Highschool,Middle,Former,Employed,0,0,0
1,700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,...,145,Female,White,Highschool,Middle,Never,Unemployed,0,0,0


In [4]:
categorical_features = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
# Makign sure categorical features has 'category' dtype
for col in categorical_features:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

In [5]:
TARGET = 'diagnosed_diabetes'
X = train_df.drop(columns=[TARGET, 'id'])
y = train_df[TARGET]

In [7]:
baseline_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "random_state": 42,
    "n_jobs": -1,
    "verbosity": -1
}

In [8]:
def cv_auc_lightgbm(X, y, categorical_features, params, n_splits=5, early_stopping_rounds=50, random_state=42):

    skf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=random_state
    )

    fold_aucs = []

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = LGBMClassifier(**params)

        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric="auc",
            categorical_feature=categorical_features,
            callbacks=[
                lgb.early_stopping(
                    stopping_rounds=early_stopping_rounds,
                    verbose=False
                ),
                lgb.log_evaluation(period=0)
            ],
        )

        preds = model.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, preds)
        fold_aucs.append(auc)

        print(f"Fold {fold}: AUC = {auc:.6f}")

    return {
        "mean_auc": np.mean(fold_aucs),
        "std_auc": np.std(fold_aucs),
        "fold_aucs": fold_aucs
    }


In [9]:
baseline_cv_results = cv_auc_lightgbm(
    X=X,
    y=y,
    categorical_features=categorical_features,
    params=baseline_params,
    n_splits=5
)

Fold 1: AUC = 0.726962
Fold 2: AUC = 0.725404
Fold 3: AUC = 0.726042
Fold 4: AUC = 0.727252
Fold 5: AUC = 0.726797


In [10]:
print(
    f"\nBaseline CV AUC: "
    f"{baseline_cv_results['mean_auc']:.6f} "
    f"± {baseline_cv_results['std_auc']:.6f}"
)


Baseline CV AUC: 0.726491 ± 0.000675


In [12]:
def objective(trial):

    params = {
        "n_estimators": 3000,  # large, rely on early stopping
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 300),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
        "objective": "binary",
        "metric": "auc",
        "random_state": 42,
        "n_jobs": -1,
        "verbosity": -1,
    }

    cv_results = cv_auc_lightgbm(
        X=X,
        y=y,
        categorical_features=categorical_features,
        params=params,
        n_splits=5,
        early_stopping_rounds=50,
    )

    return cv_results["mean_auc"]

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

In [14]:
print("Best CV AUC:", study.best_value)
print("Best params:", study.best_params)

Best CV AUC: 0.7288023629463443
Best params: {'learning_rate': 0.03946106695073694, 'num_leaves': 150, 'max_depth': 5, 'min_data_in_leaf': 107, 'feature_fraction': 0.6839615309460927, 'bagging_fraction': 0.9015512270872944, 'bagging_freq': 9, 'lambda_l1': 2.0484668907092414, 'lambda_l2': 2.337161686326926}


In [None]:
best_params = study.best_params.copy()

# Best params: {'learning_rate': 0.03946106695073694, 'num_leaves': 150, 'max_depth': 5, 'min_data_in_leaf': 107, 'feature_fraction': 0.6839615309460927, 'bagging_fraction': 0.9015512270872944, 'bagging_freq': 9, 'lambda_l1': 2.0484668907092414, 'lambda_l2': 2.337161686326926}

final_params = {
    **baseline_params,   # fixed params
    **best_params,       # tuned params override
    "objective": "binary",
    "metric": "auc",
    'seed': 42
}

In [22]:
baseline_with_best_hyperparameters_cv_results = cv_auc_lightgbm(
    X=X,
    y=y,
    categorical_features=categorical_features,
    params=final_params,
    n_splits=5
)

Fold 1: AUC = 0.727502
Fold 2: AUC = 0.725570
Fold 3: AUC = 0.726627
Fold 4: AUC = 0.727781
Fold 5: AUC = 0.727725


In [None]:
baseline_with_best_hyperparameters_cv_results

{'mean_auc': 0.7270408164014903,
 'std_auc': 0.0008442345121480178,
 'fold_aucs': [0.7275015552963221,
  0.7255695959575221,
  0.7266271215881855,
  0.7277812513578776,
  0.7277245578075443]}

In [23]:
print(
    f"\nBaseline CV AUC: "
    f"{baseline_with_best_hyperparameters_cv_results['mean_auc']:.6f} "
    f"± {baseline_with_best_hyperparameters_cv_results['std_auc']:.6f}"
)


Baseline CV AUC: 0.727041 ± 0.000844


In [24]:
test_df = pd.read_csv("dataset/test.csv")
test_ids = test_df['id']
test_features = test_df.drop(columns=['id'])

# Predict probabilities
test_proba = baseline_with_best_hyperparameters_cv_results.predict_proba(test_features)[:, 1]

# Build submission DataFrame
submission = pd.DataFrame({
    "id": test_ids,
    "diagnosed_diabetes": test_proba
})

# Save to CSV
submission.to_csv("baselineLGBM_with_best_hyperparameters_cv_results.csv", index=False)

AttributeError: 'dict' object has no attribute 'predict_proba'

### Feature Importance
> “Which features are most frequently used by the trees?”
> Ranking features  
> Identifying dominant signals (BMI, age, triglycerides, etc.)  
> Spotting features that are never used  


**By default, LGBMClassifier.feature_importances_ is:**  
> Split count (number of times a feature is used to split)

In [35]:
lgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "objective": "binary",
    "boosting_type": "gbdt",
    "random_state": 42,
    "n_jobs": -1,
    "verbosity": -1
}

# Store importances
feature_importances = pd.DataFrame()
feature_importances["feature"] = X.columns
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# 5-fold CV
#kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"Fold {fold + 1}")
    
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    baseline_model = LGBMClassifier(**lgb_params)

    baseline_model.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0)  # Suppress iteration logs
        ],
        categorical_feature=categorical_features,
        #verbose=0
    )

    feature_importances[f"fold_{fold + 1}"] = baseline_model.feature_importances_

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [31]:
# Compute average importance across folds
feature_importances['average'] = feature_importances.iloc[:, 1:].mean(axis=1)
# Sort features by importance
feature_importances = feature_importances.sort_values("average", ascending=False)

feature_importances

Unnamed: 0,feature,fold_1,fold_2,fold_3,fold_4,fold_5,average
2,physical_activity_minutes_per_week,5185,5608,5157,5708,5773,5486.2
14,triglycerides,2693,2855,2419,2692,2901,2712.0
6,bmi,1856,2178,1754,2150,2182,2024.0
0,age,1880,2013,1589,2038,1921,1888.2
11,cholesterol_total,1535,1720,1355,1780,1853,1648.6
13,ldl_cholesterol,1549,1760,1330,1669,1694,1600.4
5,screen_time_hours_per_day,1505,1729,1210,1678,1734,1571.2
8,systolic_bp,1380,1686,1264,1619,1669,1523.6
3,diet_score,1476,1630,1208,1587,1599,1500.0
10,heart_rate,1356,1491,1188,1487,1494,1403.2


In [36]:
feature_importances = pd.DataFrame({"feature": X.columns})

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), 1):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = LGBMClassifier(**lgb_params)

    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="auc",
        categorical_feature=categorical_features,
        callbacks=[
            lgb.early_stopping(50, verbose=False),
            lgb.log_evaluation(period=0)
        ]
    )

    gain = model.booster_.feature_importance(importance_type="gain")
    gain = gain / gain.sum()

    feature_importances[f"fold_{fold}"] = gain

feature_importances["average_gain"] = feature_importances.iloc[:, 1:].mean(axis=1)
feature_importances = feature_importances.sort_values("average_gain", ascending=False)

In [37]:
feature_importances

Unnamed: 0,feature,fold_1,fold_2,fold_3,fold_4,fold_5,average_gain
2,physical_activity_minutes_per_week,0.321347,0.322668,0.329443,0.327224,0.325513,0.325239
21,family_history_diabetes,0.271657,0.266755,0.275779,0.263757,0.264194,0.268429
0,age,0.151394,0.14667,0.150641,0.149857,0.146248,0.148962
14,triglycerides,0.061385,0.060831,0.060935,0.058114,0.061808,0.060615
6,bmi,0.039877,0.039638,0.039442,0.041026,0.040805,0.040158
13,ldl_cholesterol,0.026941,0.026925,0.026416,0.026783,0.026529,0.026719
3,diet_score,0.017994,0.018631,0.016374,0.018476,0.018261,0.017947
11,cholesterol_total,0.015261,0.016141,0.014517,0.016401,0.017219,0.015908
10,heart_rate,0.015446,0.016088,0.014377,0.015462,0.015653,0.015405
8,systolic_bp,0.013094,0.015078,0.013103,0.015112,0.015204,0.014318


In [None]:
drop_candidates = [
    "gender",
    "income_level",
    "education_level",
    "cardiovascular_history",
    "alcohol_consumption_per_week",
    "hypertension_history",
    "smoking_status"
]


X_drop_lifestyle = X.drop(columns=drop_candidates)

categorical_drop_lifestyle = [
    c for c in categorical_features if c not in drop_candidates
]

drop_lifestyle_results = cv_auc_lightgbm(
    X=X_drop_lifestyle,
    y=y,
    categorical_features=categorical_drop_lifestyle,
    params=baseline_params,
    n_splits=5
)

Fold 1: AUC = 0.727050
Fold 2: AUC = 0.725427
Fold 3: AUC = 0.726042
Fold 4: AUC = 0.726550
Fold 5: AUC = 0.726686


In [33]:
print(
    f"\nDrop lifestyle CV AUC: "
    f"{drop_lifestyle_results['mean_auc']:.6f} "
    f"± {drop_lifestyle_results['std_auc']:.6f}"
)


Drop lifestyle CV AUC: 0.726351 ± 0.000564


> There is no reliable evidence that dropping these features helps or hurts performance!   
The model was never “improving” or “getting worse” — we were just seeing sampling noise!  