In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [2]:
# Load the raw train / test splits from the local dataset folder.
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

# -------------------- Feature list and target
# Every training column except the label and the row identifier is a feature.
target = 'diagnosed_diabetes'
non_feature_cols = {target, 'id'}
features = [name for name in train_df.columns if name not in non_feature_cols]

# Columns read as object dtype are cast to pandas 'category' in both frames.
cat_cols = train_df[features].select_dtypes(include='object').columns
for frame in (train_df, test_df):
    for name in cat_cols:
        frame[name] = frame[name].astype('category')

# -------------------- Model inputs
X, y = train_df[features], train_df[target]
X_test = test_df[features]

In [4]:
# === Add these 6 features to both train and test ===
def add_engineered_features(df):
    """Return a copy of ``df`` with the six engineered features appended.

    Columns read: triglycerides, hdl_cholesterol, cholesterol_total, bmi,
    waist_to_hip_ratio, age, family_history_diabetes, gender, systolic_bp.

    Columns added:
      * triglycerides_to_hdl   - triglycerides / hdl_cholesterol
      * non_hdl_cholesterol    - cholesterol_total - hdl_cholesterol
      * bmi_waist_interaction  - bmi * waist_to_hip_ratio
      * age_family_interaction - age * family_history_diabetes
      * obese                  - (bmi >= 30), stored as a category
      * metabolic_risk_score   - number of risk flags raised (0-5), stored as
                                 a category

    The input frame is not modified.
    """
    out = df.copy()

    # Continuous / interaction features.
    # NOTE(review): a zero hdl_cholesterol yields inf in the ratio; confirm
    # the data never contains zeros.
    out['triglycerides_to_hdl'] = out['triglycerides'] / out['hdl_cholesterol']
    out['non_hdl_cholesterol'] = out['cholesterol_total'] - out['hdl_cholesterol']
    out['bmi_waist_interaction'] = out['bmi'] * out['waist_to_hip_ratio']
    out['age_family_interaction'] = out['age'] * out['family_history_diabetes']

    # Binary obese flag.
    is_obese = out['bmi'] >= 30
    out['obese'] = is_obese.astype('category')

    # Gender-specific thresholds: rows whose gender equals 'Male' use the
    # first cut-off, every other row uses the second.
    # NOTE(review): assumes the gender column spells the label exactly
    # 'Male' - confirm against the data.
    is_male = out['gender'] == 'Male'
    hdl_low = np.where(
        is_male, out['hdl_cholesterol'] < 40, out['hdl_cholesterol'] < 50
    ).astype(int)
    waist_high = np.where(
        is_male, out['waist_to_hip_ratio'] > 0.9, out['waist_to_hip_ratio'] > 0.85
    ).astype(int)

    # Metabolic risk score (0-5): one point per raised flag.
    out['metabolic_risk_score'] = (
        is_obese.astype(int)
        + (out['triglycerides'] >= 150).astype(int)
        + hdl_low
        + (out['systolic_bp'] >= 130).astype(int)
        + waist_high
    ).clip(0, 5).astype('category')

    return out


# Same transformation for both splits, so train and test cannot drift apart.
train_df = add_engineered_features(train_df)
test_df = add_engineered_features(test_df)

# The model inputs were built before these columns existed, so they must be
# rebuilt here; otherwise the engineered features never reach the model.
features = [col for col in train_df.columns if col not in (target, 'id')]

# All category-dtype feature columns (original string columns plus the two
# engineered categorical flags) are declared categorical to LightGBM.
cat_cols = train_df[features].select_dtypes(include='category').columns

X = train_df[features]
y = train_df[target]
X_test = test_df[features]


In [5]:
# LightGBM training parameters (scikit-learn style aliases are accepted by
# lgb.train).
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.02,           # slower → much less overfit
    'num_leaves': 32,                # reduced
    'max_depth': -1,
    'min_child_samples': 50,         # increased
    'subsample': 0.8,
    # Row subsampling is only performed when the bagging frequency is
    # non-zero; without this key 'subsample' is silently ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'verbosity': -1,
    'seed': 42,
    'n_jobs': -1
}

In [6]:
# Grouping key handed to the grouped CV splitter: one label per training row.
groups = train_df.loc[:, "ethnicity"]

In [8]:
from sklearn.model_selection import GroupKFold

# Grouped K-fold splitter: rows sharing a group label stay in the same fold.
n_splits = 5
gkf = GroupKFold(n_splits)

# Accumulators for the cross-validation run:
#   oof_preds  - one out-of-fold prediction per training row
#   test_preds - running fold-average of the test predictions
#   cv_scores  - one validation AUC per fold
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])
cv_scores = list()

In [None]:
# ---- Grouped cross-validation ------------------------------------------
# The accumulators are (re)initialised here so that re-running this cell does
# not add a second set of fold predictions on top of the first one.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
cv_scores = []
fold_best_iters = []  # early-stopped best iteration of every fold

cat_feature_names = cat_cols.to_list()

for fold, (train_idx, valid_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"\n=== Fold {fold + 1} ===")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_feature_names)
    lgb_valid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_feature_names)

    # 'valid' is listed last so early stopping monitors the held-out fold.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)
        ]
    )
    fold_best_iters.append(model.best_iteration)

    # Out-of-fold predictions for this fold, plus this fold's share of the
    # averaged test prediction.
    oof_preds[valid_idx] = model.predict(X_valid, num_iteration=model.best_iteration)
    test_preds += model.predict(X_test, num_iteration=model.best_iteration) / n_splits

    fold_auc = roc_auc_score(y_valid, oof_preds[valid_idx])
    cv_scores.append(fold_auc)
    print(f"Fold {fold + 1} AUC: {fold_auc:.6f}")

print("\nCV AUC:", np.mean(cv_scores), "±", np.std(cv_scores))
print("OOF AUC:", roc_auc_score(y, oof_preds))
print("Best iterations per fold:", fold_best_iters)

# ---- Final fit on the full training set ----------------------------------
# The number of boosting rounds is derived from ALL folds (previously only the
# last fold's booster was consulted), with a 5% buffer because the final model
# sees more rows than any single fold did.
mean_best_iter = float(np.mean(fold_best_iters))
avg_iterations = int(mean_best_iter)
best_iters = [int(n_iter * 1.05) for n_iter in fold_best_iters]
final_iterations = max(1, int(mean_best_iter * 1.05))

lgb_full = lgb.Dataset(X, label=y, categorical_feature=cat_cols.tolist())

final_model = lgb.train(
    params,
    lgb_full,
    num_boost_round=final_iterations
)

In [11]:
# Score the test set with the full-data model as well. This array is kept for
# inspection only; it is not what goes into the submission frame below.
final_test_preds = final_model.predict(X_test)

# The submission pairs each test id with the fold-averaged CV prediction.
submission = test_df[['id']].assign(diagnosed_diabetes=test_preds)

In [12]:
# Write the submission to disk without the DataFrame index column.
submission_path = "submission_trial04.csv"
submission.to_csv(submission_path, index=False)