In [2]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import itertools

In [11]:
# Load data
X = pd.read_csv('X_resampled.csv')
y = pd.read_csv('y_resampled.csv').squeeze()

In [5]:
# 80-10-10 split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1111, stratify=y_temp, random_state=42)

In [6]:
print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

Train: 10826, Validation: 1354, Test: 1354


In [7]:
# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [15, 31]
}

In [8]:

# Manual grid search
best_score = 0
best_params = {}
for combo in itertools.product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), combo))
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    score = accuracy_score(y_val, val_pred)

    if score > best_score:
        best_score = score
        best_params = params
        best_model = model

[LightGBM] [Info] Number of positive: 5413, number of negative: 5413
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1036
[LightGBM] [Info] Number of data points in the train set: 10826, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 5413, number of negative: 5413
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1036
[LightGBM] [Info] Number of data points in the train set: 10826, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.00000

In [9]:
print("Best Validation Accuracy:", best_score)
print("Best Parameters:", best_params)

Best Validation Accuracy: 0.9970457902511078
Best Parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1, 'num_leaves': 15}


In [18]:
# Final train evaluation
y_pred = best_model.predict(X_train)
print("\nTrain Accuracy:", accuracy_score(y_train, y_pred))
print("\nClassification Report:\n", classification_report(y_train, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_train, y_pred))


Train Accuracy: 0.9987991871420654

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5413
           1       1.00      1.00      1.00      5413

    accuracy                           1.00     10826
   macro avg       1.00      1.00      1.00     10826
weighted avg       1.00      1.00      1.00     10826


Confusion Matrix:
 [[5411    2]
 [  11 5402]]


In [19]:
# Final train evaluation
y_pred = best_model.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))


Validation Accuracy: 0.9970457902511078

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       677
           1       1.00      1.00      1.00       677

    accuracy                           1.00      1354
   macro avg       1.00      1.00      1.00      1354
weighted avg       1.00      1.00      1.00      1354


Confusion Matrix:
 [[675   2]
 [  2 675]]


In [20]:
# Final test evaluation
y_pred = best_model.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Test Accuracy: 0.9903988183161004

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       677
           1       0.99      0.99      0.99       677

    accuracy                           0.99      1354
   macro avg       0.99      0.99      0.99      1354
weighted avg       0.99      0.99      0.99      1354


Confusion Matrix:
 [[670   7]
 [  6 671]]
