In [9]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

In [None]:
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

# -------------------- Prepare features and target 
target = 'diagnosed_diabetes'
features = [col for col in train_df.columns if col not in [target, 'id']]

cat_cols = train_df[features].select_dtypes(include='object').columns
for col in cat_cols:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

# -------------- X and y
X = train_df[features]
y = train_df[target]
X_test = test_df[features]

In [7]:
# -------------- X and y
X = train_df[features]
y = train_df[target]
X_test = test_df[features]

In [8]:
# --------------------- LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 64,           # a bit higher than default for first trial
    'max_depth': -1,
    'min_child_samples': 20,
    'subsample': 0.8,
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0,
    'verbosity': -1,
    'seed': 42,
    'n_jobs': -1
}

In [10]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
cv_scores = []

In [None]:
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"\=== Fold {fold + 1} ===")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols.to_list())
    lgb_valid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols.tolist(), reference=lgb_train)

    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)
        ]
    )

    # OOF predictions
    oof_preds[valid_idx] = model.predict(X_valid, num_iterations=model.best_iteration)

    # Test predictions (avg over folds)
    test_preds += model.predict(X_test, num_iterations=model.best_iteration) / n_splits

    fold_auc = roc_auc_score(y_valid, oof_preds[valid_idx])
    cv_scores.append(fold_auc)
    print(f"Fold {fold+1} AUC: {fold_auc:.6f} (best iteration: {model.best_iteration})")

print("\n" + "="*50)
print(f"CV AUC: {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")
print(f"OOF AUC: {roc_auc_score(y, oof_preds):.6f}")
print("="*50)

In [15]:
# ----------- Final model on full data (using avg best-iteration)
avg_iterations = int(np.mean([model.best_iteration for model in [model]]))  # just one model per fold, but works
# Better: use the rounded average of all folds' best iterations
best_iters = [int(model.best_iteration * 1.05) for model in [model]]  # slight buffer
final_iterations = int(np.mean([m.best_iteration for m in [model]]) * 1.05)

lgb_full = lgb.Dataset(X, label=y, categorical_feature=cat_cols.tolist())

final_model = lgb.train(
    params,
    lgb_full,
    num_boost_round=final_iterations
)

In [16]:
# Final test predictions (just to be safe)
final_test_preds = final_model.predict(X_test)

# Use the CV-averaged predictions as final submission
submission = pd.DataFrame({
    'id': test_df['id'],
    'diagnosed_diabetes': test_preds
})

In [17]:
submission.to_csv("submission_trial01.csv", index=False)

In [None]:
# Get feature importance values
importance = final_model.feature_importance(importance_type='gain')  # or 'split'
feature_names = final_model.feature_name()

# Combine into DataFrame
feat_imp = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
})

# Sort descending and take top 10
top10 = feat_imp.sort_values(by='importance', ascending=False).head(13)
print(top10)

                               feature     importance
2   physical_activity_minutes_per_week  357665.685457
21             family_history_diabetes  291331.553935
0                                  age  157683.147133
14                       triglycerides   69991.332134
6                                  bmi   50119.926153
13                     ldl_cholesterol   33699.891308
8                          systolic_bp   26195.234203
3                           diet_score   21728.010202
10                          heart_rate   20663.004512
11                   cholesterol_total   20495.995180
12                     hdl_cholesterol   18028.312106
5            screen_time_hours_per_day   15636.862841
7                   waist_to_hip_ratio   14322.427335


---

In [25]:
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

# -------------------- Prepare features and target 
target = 'diagnosed_diabetes'
features = [col for col in train_df.columns if col not in [target, 'id']]

cat_cols = train_df[features].select_dtypes(include='object').columns
for col in cat_cols:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

# -------------- X and y
X = train_df[features]
y = train_df[target]
X_test = test_df[features]

In [28]:
# === Add these 6 features to both train and test ===
train_df['triglycerides_to_hdl'] = train_df['triglycerides'] / train_df['hdl_cholesterol']
train_df['non_hdl_cholesterol']  = train_df['cholesterol_total'] - train_df['hdl_cholesterol']
train_df['bmi_waist_interaction'] = train_df['bmi'] * train_df['waist_to_hip_ratio']
train_df['age_family_interaction'] = train_df['age'] * train_df['family_history_diabetes']
train_df['obese'] = (train_df['bmi'] >= 30).astype('category')

# Gender-specific thresholds
hdl_low = np.where(
    train_df['gender'] == 'Male',
    (train_df['hdl_cholesterol'] < 40).astype(int),
    (train_df['hdl_cholesterol'] < 50).astype(int)
)

waist_high = np.where(
    train_df['gender'] == 'Male',
    (train_df['waist_to_hip_ratio'] > 0.9).astype(int),
    (train_df['waist_to_hip_ratio'] > 0.85).astype(int)
)

train_df['metabolic_risk_score'] = (
    (train_df['bmi'] >= 30).astype(int) +
    (train_df['triglycerides'] >= 150).astype(int) +
    hdl_low +
    (train_df['systolic_bp'] >= 130).astype(int) +
    waist_high
).clip(0, 5).astype('category')

In [29]:
# Continuous / interaction features
test_df['triglycerides_to_hdl'] = test_df['triglycerides'] / test_df['hdl_cholesterol']
test_df['non_hdl_cholesterol']  = test_df['cholesterol_total'] - test_df['hdl_cholesterol']
test_df['bmi_waist_interaction'] = test_df['bmi'] * test_df['waist_to_hip_ratio']
test_df['age_family_interaction'] = test_df['age'] * test_df['family_history_diabetes']

# Binary obese flag
test_df['obese'] = (test_df['bmi'] >= 30).astype('category')

# Gender-specific thresholds
hdl_low = np.where(
    test_df['gender'] == 'Male',
    (test_df['hdl_cholesterol'] < 40).astype(int),
    (test_df['hdl_cholesterol'] < 50).astype(int)
)

waist_high = np.where(
    test_df['gender'] == 'Male',
    (test_df['waist_to_hip_ratio'] > 0.9).astype(int),
    (test_df['waist_to_hip_ratio'] > 0.85).astype(int)
)

# Metabolic risk score (0–5)
test_df['metabolic_risk_score'] = (
    (test_df['bmi'] >= 30).astype(int) +
    (test_df['triglycerides'] >= 150).astype(int) +
    hdl_low +
    (test_df['systolic_bp'] >= 130).astype(int) +
    waist_high
).clip(0, 5).astype('category')


In [30]:
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.02,           # slower → much less overfit
    'num_leaves': 32,                # reduced
    'max_depth': -1,
    'min_child_samples': 50,         # increased
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'verbosity': -1,
    'seed': 42,
    'n_jobs': -1
}

In [None]:
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
cv_scores = []

In [None]:
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"\=== Fold {fold + 1} ===")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols.to_list())
    lgb_valid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols.tolist(), reference=lgb_train)

    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)
        ]
    )

    # OOF predictions
    oof_preds[valid_idx] = model.predict(X_valid, num_iterations=model.best_iteration)

    # Test predictions (avg over folds)
    test_preds += model.predict(X_test, num_iterations=model.best_iteration) / n_splits

    fold_auc = roc_auc_score(y_valid, oof_preds[valid_idx])
    cv_scores.append(fold_auc)
    print(f"Fold {fold+1} AUC: {fold_auc:.6f} (best iteration: {model.best_iteration})")

print("\n" + "="*50)
print(f"CV AUC: {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")
print(f"OOF AUC: {roc_auc_score(y, oof_preds):.6f}")
print("="*50)

In [33]:
# ----------- Final model on full data (using avg best-iteration)
avg_iterations = int(np.mean([model.best_iteration for model in [model]]))  # just one model per fold, but works
# Better: use the rounded average of all folds' best iterations
best_iters = [int(model.best_iteration * 1.05) for model in [model]]  # slight buffer
final_iterations = int(np.mean([m.best_iteration for m in [model]]) * 1.05)

lgb_full = lgb.Dataset(X, label=y, categorical_feature=cat_cols.tolist())

final_model = lgb.train(
    params,
    lgb_full,
    num_boost_round=final_iterations
)

In [None]:
# Final test predictions (just to be safe)
final_test_preds = final_model.predict(X_test)

# Use the CV-averaged predictions as final submission
submission = pd.DataFrame({
    'id': test_df['id'],
    'diagnosed_diabetes': test_preds
})

In [35]:
submission.to_csv("submission_trial02.csv", index=False)

In [36]:
# Get feature importance values
importance = final_model.feature_importance(importance_type='gain')  # or 'split'
feature_names = final_model.feature_name()

# Combine into DataFrame
feat_imp = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
})

# Sort descending and take top 10
top10 = feat_imp.sort_values(by='importance', ascending=False).head(13)
print(top10)

                               feature    importance
2   physical_activity_minutes_per_week  1.090461e+06
21             family_history_diabetes  8.946099e+05
0                                  age  4.751313e+05
14                       triglycerides  1.977684e+05
6                                  bmi  1.332608e+05
13                     ldl_cholesterol  8.838968e+04
8                          systolic_bp  6.519248e+04
11                   cholesterol_total  5.370399e+04
3                           diet_score  5.146832e+04
10                          heart_rate  4.638371e+04
12                     hdl_cholesterol  4.225943e+04
5            screen_time_hours_per_day  3.487393e+04
7                   waist_to_hip_ratio  3.326510e+04


---

In [4]:
train_df.head(2)

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
