In [1]:
import os
from itertools import product
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [6]:
# ==============================
# 1. Load the data
# ==============================
# Project root is taken as two directory levels above the current working directory.
ROOT_DIR = Path.cwd().parents[1]
print(ROOT_DIR)
DATA_DIR = ROOT_DIR.joinpath("data", "processed")

train_path = DATA_DIR.joinpath("train_balanced.csv")
valid_path = DATA_DIR.joinpath("validation.csv")

print(f"Train file : {train_path}")
print(f"Valid file : {valid_path}")

df_train = pd.read_csv(train_path)
df_valid = pd.read_csv(valid_path)

# The target column is assumed to be named 'stroke'.
TARGET_COL = "stroke"

# Split each frame into its label series and its feature matrix
# (every column except the target).
y_train, X_train = df_train[TARGET_COL], df_train.drop(columns=[TARGET_COL])
y_valid, X_valid = df_valid[TARGET_COL], df_valid.drop(columns=[TARGET_COL])

print("Train shape :", X_train.shape)
print("Valid shape :", X_valid.shape)

C:\AI\Data_Mining\Stroke-Prediction-Project
Train file : C:\AI\Data_Mining\Stroke-Prediction-Project\data\processed\train_balanced.csv
Valid file : C:\AI\Data_Mining\Stroke-Prediction-Project\data\processed\validation.csv
Train shape : (5816, 26)
Valid shape : (1019, 26)


In [7]:
# ==============================
# 2. Function to train and evaluate a single model
# ==============================
def train_eval_lgbm(params, X_tr, y_tr, X_va, y_va):
    """
    Train one LightGBM binary classifier and score it on the validation data.

    Parameters
    ----------
    params : dict or None
        Keyword arguments forwarded to ``lgb.LGBMClassifier``. They are layered
        on top of the defaults ``objective="binary"``, ``random_state=42`` and
        ``n_estimators=5000``, so a key such as ``"n_estimators"`` overrides the
        default instead of raising a duplicate-keyword ``TypeError``.
        ``None`` or an empty dict means "defaults only".
    X_tr, y_tr
        Training features and labels, passed straight to ``model.fit``.
    X_va, y_va
        Validation features and labels. They serve both as the early-stopping
        ``eval_set`` and as the data the returned AUC is computed on, so the
        score is not an independent estimate.

    Returns
    -------
    tuple
        ``(model, auc)``: the fitted classifier and its ROC AUC on
        ``(X_va, y_va)``.
    """
    # Defaults first, caller's params last: the caller always wins on conflict.
    model_kwargs = {
        "objective": "binary",
        "random_state": 42,
        "n_estimators": 5000,  # generous upper bound; early stopping ends training sooner
        **(params or {}),
    }
    model = lgb.LGBMClassifier(**model_kwargs)

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="auc",
        callbacks=[
            # Stop once the validation metric has not improved for 100 rounds.
            lgb.early_stopping(stopping_rounds=100, verbose=False)
        ],
    )

    # Probability of the positive class (second column) on the validation set.
    y_proba = model.predict_proba(X_va)[:, 1]
    auc = roc_auc_score(y_va, y_proba)

    return model, auc



In [8]:
# ==============================
# 3. Simple tuning on the validation set
# ==============================
param_grid = {
    "num_leaves":   [15, 31, 63],
    "max_depth":    [-1, 7, 11],
    "learning_rate":[0.01, 0.05, 0.1],
    "feature_fraction": [0.8, 1.0],
    "bagging_fraction": [0.8, 1.0],
    "bagging_freq": [0, 5],
    "min_child_samples": [20, 50],
}

best_auc = -1.0
best_model = None
best_params = None

# Exhaustive grid search: every combination in the Cartesian product of the
# value lists above is a candidate (3*3*3*2*2*2*2 = 432 before de-duplication).
# `product` is imported in the notebook's import cell.
keys = list(param_grid)  # dicts preserve insertion order, so this matches the grid layout

# LightGBM only subsamples rows when bagging_freq > 0 AND bagging_fraction < 1.0.
# Every other (bagging_fraction, bagging_freq) pair leaves bagging off and would
# retrain the same model, so only the first such combination is fitted.
# The strict `>` comparison below already kept the first of tied candidates,
# so the selected params are unchanged by skipping the repeats.
seen_configs = set()

for values in product(*(param_grid[k] for k in keys)):
    params = dict(zip(keys, values))

    bagging_active = params["bagging_freq"] > 0 and params["bagging_fraction"] < 1.0
    # Identity of the effective configuration: the bagging settings only count
    # when bagging is actually switched on.
    config_key = tuple(
        value
        for name, value in params.items()
        if bagging_active or name not in ("bagging_fraction", "bagging_freq")
    )
    if config_key in seen_configs:
        continue
    seen_configs.add(config_key)

    print(f"Training with params: {params}")
    model, auc = train_eval_lgbm(params, X_train, y_train, X_valid, y_valid)
    print(f"  => AUC(valid) = {auc:.4f}")

    # Keep the model with the highest validation AUC seen so far.
    if auc > best_auc:
        best_auc = auc
        best_model = model
        best_params = params.copy()
        print(f"  >>> Update best AUC = {best_auc:.4f}")

print("\n=======================")
print("BEST PARAMS ON VALID:")
print(best_params)
print(f"BEST AUC (valid) = {best_auc:.4f}")

Training with params: {'num_leaves': 15, 'max_depth': -1, 'learning_rate': 0.01, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 0, 'min_child_samples': 20}
[LightGBM] [Info] Number of positive: 2908, number of negative: 2908
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000890 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5181
[LightGBM] [Info] Number of data points in the train set: 5816, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
  => AUC(valid) = 0.8576
  >>> Update best AUC = 0.8576
Training with params: {'num_leaves': 15, 'max_depth': -1, 'learning_rate': 0.01, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 0, 'min_child_samples': 50}
[LightGBM] [Info] Number of positive: 2908, number of negative: 2908
[LightGBM] [Info] Auto-ch

In [11]:
# ==============================
# 4. Detailed evaluation on the validation set
# ==============================
y_valid_pred = best_model.predict(X_valid)
# Positive-class probabilities; not used further in this cell.
y_valid_proba = best_model.predict_proba(X_valid)[:, 1]

# Rows are the actual classes, columns the predicted classes.
# NOTE(review): the hard-coded labels assume both classes 0 and 1 occur in the
# validation data, so that the matrix is 2x2 — confirm.
cm = confusion_matrix(y_valid, y_valid_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual_0", "Actual_1"],
    columns=["Pred_0", "Pred_1"]
)

# Despite its name, MODELS_DIR points at the results directory.
MODELS_DIR = ROOT_DIR / "data" / "results"
# to_csv does not create missing parent directories, so make sure the output
# directory exists (no-op when it is already there).
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Save to CSV
cm_path = MODELS_DIR / "confusion_matrix_lightgbm.csv"
cm_df.to_csv(cm_path, index=True)

print(f"Saved confusion matrix to CSV at: {cm_path}")



Saved confusion matrix to CSV at: C:\AI\Data_Mining\Stroke-Prediction-Project\data\results\confusion_matrix_lightgbm.csv
