In [1]:
# Model Experimentation
# Only run once
import sys
import os

project_root = os.path.abspath('../')
os.chdir(project_root)

src_path = os.path.abspath(os.path.join(os.getcwd(), 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Setup

In [10]:

from utils.load_data import DataLoader
from utils.preprocess import Preprocessor, FeatureEngineering
from utils.evaluate_model import Evaluator
from utils.model import ModelLoader
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
import mlflow
import warnings
warnings.filterwarnings("ignore")

dl = DataLoader()
pre = Preprocessor()
fe = FeatureEngineering()
evaluator = Evaluator()
ml = ModelLoader()

In [3]:
mlflow.set_tracking_uri("file:///c:/Users/Axioo/Documents/Fahmi/ai/ml/diabetes-prediction-challenge/logs")

In [4]:
train = dl.load_data('train_clean.csv', 'data/processed')
test = dl.load_data('test_clean.csv', 'data/processed')
submission = dl.load_data('sample_submission.csv', 'data/raw')

train_df = train.copy()
test_df = test.copy()

X_train, X_val, y_train, y_val = dl.split_data(train_df, id_column='id', target_column='diagnosed_diabetes', test_size=0.2, random_state=42, stratify='diagnosed_diabetes')

# Feature engineering

In [7]:
# Function
def create_features(df):
    df_new = df.copy()
    
    # 1. BMI Categories
    df_new['bmi_category'] = pd.cut(df_new['bmi'], bins=[0, 18.5, 25, 30, 100],labels=['underweight', 'normal', 'overweight', 'obese'])
    
    # 2. Age Groups
    df_new['age_group'] = pd.cut(df_new['age'], bins=[0, 30, 45, 60, 100],labels=['young', 'middle', 'senior', 'elderly'])
    
    # 3. Blood Pressure Categories (using systolic_bp)
    df_new['bp_category'] = pd.cut(df_new['systolic_bp'],bins=[0, 120, 130, 140, 200],labels=['normal', 'elevated', 'high', 'very_high'])
    
    # 4. Cardiovascular Risk Score
    df_new['cardio_risk_score'] = (
        (df_new['systolic_bp'] / 140) * 0.3 +
        (df_new['diastolic_bp'] / 90) * 0.3 +
        (df_new['heart_rate'] / 100) * 0.2 +
        df_new['cardiovascular_history'] * 0.2
    )
    
    # 5. Cholesterol Risk Score
    df_new['cholesterol_risk'] = (
        (df_new['cholesterol_total'] / 240) * 0.4 +
        (df_new['ldl_cholesterol'] / 160) * 0.4 +
        (1 - df_new['hdl_cholesterol'] / 60) * 0.2
    ).clip(0, 5)
    
    # 6. Lifestyle Risk Score
    df_new['lifestyle_risk'] = (
        (df_new['alcohol_consumption_per_week'] / 7) * 0.2 +
        (1 - df_new['physical_activity_minutes_per_week'] / 300) * 0.3 +
        (df_new['screen_time_hours_per_day'] / 10) * 0.2 +
        (1 - df_new['sleep_hours_per_day'] / 8) * 0.3
    ).clip(0, 5)
    
    # 7. Diet-Activity Balance
    df_new['diet_activity_balance'] = (
        df_new['diet_score'] * df_new['physical_activity_minutes_per_week'] / 1000
    )
    
    # 8. Metabolic Health Index
    df_new['metabolic_index'] = (
        (df_new['triglycerides'] / 150) * 0.3 +
        (df_new['bmi'] / 30) * 0.4 +
        (df_new['waist_to_hip_ratio'] / 0.95) * 0.3
    )
    
    # 9. Overall Health Risk
    df_new['overall_health_risk'] = (
        df_new['cardio_risk_score'] * 0.25 +
        df_new['cholesterol_risk'] * 0.25 +
        df_new['lifestyle_risk'] * 0.2 +
        df_new['metabolic_index'] * 0.3
    )
    
    # 10. Interaction Features
    df_new['age_bmi'] = df_new['age'] * df_new['bmi']
    df_new['age_bp'] = df_new['age'] * df_new['systolic_bp']
    df_new['bmi_cholesterol'] = df_new['bmi'] * df_new['cholesterol_total']
    df_new['family_history_age'] = df_new['family_history_diabetes'] * df_new['age']
    
    # 11. Log Transform for Skewed Features
    df_new['log_physical_activity'] = np.log1p(df_new['physical_activity_minutes_per_week'])
    df_new['log_triglycerides'] = np.log1p(df_new['triglycerides'])
    
    # 12. Squared Features for Non-linear Relationships
    df_new['age_squared'] = df_new['age'] ** 2
    df_new['bmi_squared'] = df_new['bmi'] ** 2
    
    # 13. Binary Risk Flags
    df_new['high_bp_flag'] = (df_new['systolic_bp'] > 140).astype(int)
    df_new['high_cholesterol_flag'] = (df_new['cholesterol_total'] > 240).astype(int)
    df_new['obese_flag'] = (df_new['bmi'] > 30).astype(int)
    df_new['sedentary_flag'] = (df_new['physical_activity_minutes_per_week'] < 150).astype(int)
    
    # 14. Combined Family and Medical History
    df_new['medical_history_score'] = (
        df_new['family_history_diabetes'] + 
        df_new['hypertension_history'] + 
        df_new['cardiovascular_history']
    )
    
    return df_new

def prepare_features(df, categorical_cols, is_train=True, train_columns=None):
    df_fe = create_features(df)
    cat_cols = categorical_cols + ['bmi_category', 'age_group', 'bp_category']
    df_encoded = pd.get_dummies(df_fe, columns=cat_cols, drop_first=True)
    
    if is_train:
        return df_encoded, df_encoded.columns.tolist()
    else:
        if train_columns is None:
            raise ValueError("train_columns must be provided when is_train=False")
        
        for col in train_columns:
            if col not in df_encoded.columns:
                df_encoded[col] = 0
        
        df_encoded = df_encoded[train_columns]
        
        return df_encoded

In [9]:
# Execution
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist()

X_train_fe, train_columns = prepare_features(X_train, categorical_cols, is_train=True)
X_val_fe = prepare_features(X_val, categorical_cols, is_train=False, train_columns=train_columns)
test_fe = prepare_features(test_df, categorical_cols, is_train=False, train_columns=train_columns)

# Tuning hyperparameter

In [15]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

for col in X_train_fe.select_dtypes(include='uint8').columns:
    X_train_fe[col] = X_train_fe[col].astype('category')
    X_val_fe[col] = X_val_fe[col].astype('category')
        
def objective(trial):
    params = {
        "objective": "binary",
        "eval_metric": "auc",
        "boosting_type": "gbdt",
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 50, 300),
        "max_depth": trial.suggest_int("max_depth", -1, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-3, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-3, 10.0, log=True),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.0, 1.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1.0, 5.0),
    }
    
    dtrain = lgb.Dataset(X_train_fe, label=y_train)
    dval = lgb.Dataset(X_val_fe, label=y_val, reference=dtrain)
    
    with mlflow.start_run(nested=True):
        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dval],
            num_boost_round=500,
            callbacks=[lgb.early_stopping(100)]
        )

        preds = model.predict(X_val_fe, num_iteration=model.best_iteration)
        auc = roc_auc_score(y_val, preds)

        mlflow.log_params(params)
        mlflow.log_metric("val_auc", auc)
        mlflow.log_metric("best_iteration", model.best_iteration)
        mlflow.log_metric("train_auc", roc_auc_score(y_train, model.predict(X_train_fe, num_iteration=model.best_iteration)))
        
        signature = mlflow.models.infer_signature(X_train_fe, preds)
        mlflow.lightgbm.log_model(
            model, 
            name="model",
            signature=signature,
            input_example=None
        )

    return auc

In [16]:
mlflow.set_experiment("lgbm")
with mlflow.start_run(run_name="lgbm_hyperparameter_tuning_1"):
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)
    
    best_params = study.best_params
    best_value = study.best_value
    
    print(f"Best AUC: {study.best_value:.6f}")
    print("Best params:")
    for k, v in study.best_params.items():
        print(f"  {k}: {v}")
    
    mlflow.log_params(best_params)
    mlflow.log_metric("best_val_auc", study.best_value)
    mlflow.log_metric("n_trials_completed", len(study.trials))

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[7]	valid_0's binary_logloss: 0.650221
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[14]	valid_0's binary_logloss: 0.632126
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.661875
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's binary_logloss: 0.66183
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[18]	valid_0's binary_logloss: 0.639075
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.657096
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[8]	valid_0's binary_logloss: 0.641006
Training until validation scores don't improve 

# Model

In [17]:
final_params = best_params.copy()
final_params.update({
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "random_state": 42,
    "verbosity": -1,
})

full_train_df = pd.concat([X_train, X_val], axis=0)
full_y = pd.concat([y_train, y_val], axis=0)

full_train_fe, full_train_columns = prepare_features(full_train_df, categorical_cols, is_train=True)

# Set kategori jika perlu
for col in categorical_cols:
    if col in full_train_fe.columns:
        full_train_fe[col] = full_train_fe[col].astype('category')

# Training final model
dtrain_full = lgb.Dataset(full_train_fe, label=full_y)
final_model = lgb.train(
    final_params,
    dtrain_full,
    num_boost_round=1000,
)

ml.save_model(final_model, "lgbm_model_1.pkl")

Model saved to models\lgbm_model_1.pkl


# Predict

In [19]:
model = ml.load_model("lgbm_model_1.pkl")
preds = model.predict(test_fe)
pred_labels = (preds > 0.5).astype(int)

submission['diagnosed_diabetes'] = preds
submission.to_csv('output/submission_1.csv', index=False)

Model loaded from models\lgbm_model_1.pkl
