### **LightGBM**

Loads data, tunes LightGBM hyperparameters with Optuna, trains a final model and evaluates its performance.

In [None]:
import os
import joblib
from PIL import Image

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torchvision import transforms

import optuna
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold

## Classical ML Model for Binary Classification

In [1]:
# Resolve the dataset location relative to the current working directory.
data_dir = os.path.join(os.getcwd(), "data")
csv_path = os.path.join(data_dir, "final.csv")

# Read the feature table and preview its first rows.
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)




          0    1         2    3  ...      509       510       511  label
0  0.000000  0.0  0.042997  0.0  ...  0.00000  0.000000  0.000000      1
1  0.000000  0.0  0.000000  0.0  ...  0.00000  0.000000  0.476462      1
2  0.702633  0.0  0.000000  0.0  ...  0.34996  0.000000  0.000000      1
3  0.000000  0.0  0.000000  0.0  ...  0.00000  0.060267  0.000000      1
4  0.000000  0.0  0.000000  0.0  ...  0.00000  0.000000  0.000000      1

[5 rows x 513 columns]

#### Configuration

In [None]:
# Number of Optuna trials to run during hyperparameter tuning.
trials = 50

# Location where the best model found during tuning is persisted.
models_dir = os.path.join(os.getcwd(), 'models')
model_path = os.path.join(models_dir, 'best_lightgbm_model.pkl')

#### Splitting the data for training and testing

In [None]:
# Every column except the last is a feature; the last column is the target,
# cast to int so the classifier sees discrete class labels.
X = df.iloc[:, :-1]
y = df.iloc[:, -1].astype(int)

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class balance the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

#### Saving the model

In [None]:
def save_best_model(study, trial):
    """Optuna callback that refits and saves the model when a trial is the new best.

    Args:
        study: The running Optuna study; queried for its best trial and params.
        trial: The frozen trial that has just finished.

    Side effects:
        Fits an ``LGBMClassifier`` on the notebook-level ``X_train``/``y_train``
        and writes it with ``joblib`` to the notebook-level ``model_path``,
        creating the parent directory if it does not exist yet.
    """
    # Check the trial state first: ``study.best_trial`` raises ValueError
    # while no trial has completed (e.g. the first trial was pruned or failed),
    # so it must only be read once we know this trial completed.
    if trial.state != optuna.trial.TrialState.COMPLETE:
        return
    if study.best_trial.number != trial.number:
        return

    print(f"\nTrial {trial.number} is the new best with F1-Score: {trial.value:.4f}")

    # Train a model with the best params on the full training data.
    # The fixed settings mirror the ones used inside the tuning objective.
    best_params_so_far = dict(study.best_params)
    best_params_so_far.update({
        'objective': 'binary',
        'metric': 'binary_logloss',
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1,
    })

    temp_lgbm = lgb.LGBMClassifier(**best_params_so_far)
    temp_lgbm.fit(X_train, y_train)

    # joblib.dump does not create missing directories; make sure the target
    # folder exists so a fresh checkout does not fail after a full trial.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Save the model
    joblib.dump(temp_lgbm, model_path)
    print(f"Model saved to: {model_path}")

#### Hyperparameter Tuning with Optuna

In [2]:
def objective(trial):
    """Optuna objective: mean cross-validated F1 of an LGBMClassifier.

    Samples one hyperparameter configuration from ``trial``, fits a classifier
    on each of three stratified folds of the notebook-level
    ``X_train``/``y_train``, and scores it on the held-out fold.

    Args:
        trial: The Optuna trial used to draw hyperparameter values.

    Returns:
        The mean F1 score over the three validation folds.
    """
    # Hyperparameters drawn from the search space for this trial.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    }

    # Settings that stay the same for every trial.
    fixed = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1,
    }
    params = {**fixed, **sampled}

    def fold_f1(fit_idx, val_idx):
        """Fit on one fold's training rows and return F1 on its validation rows."""
        clf = lgb.LGBMClassifier(**params)
        clf.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
        fold_preds = clf.predict(X_train.iloc[val_idx])
        return f1_score(y_train.iloc[val_idx], fold_preds)

    # Stratified 3-fold CV with a fixed seed, so every trial sees the same folds.
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    f1_scores = [
        fold_f1(fit_idx, val_idx)
        for fit_idx, val_idx in splitter.split(X_train, y_train)
    ]

    return np.mean(f1_scores)

# Seed the sampler explicitly: the data split, CV folds and model all use
# seed 42, but an unseeded sampler would still draw different hyperparameters
# on every run. TPESampler is Optuna's default sampler, so only the seed changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Maximise the mean CV F1; the callback persists a refit model whenever a
# trial becomes the new best.
study.optimize(objective, n_trials=trials, callbacks=[save_best_model])
print("Best F1 score during tuning:", study.best_value)

[31m---------------------------------------------------------------------------[39m
[31mNameError[39m                                 Traceback (most recent call last)
[36mCell[39m[36m [39m[32mIn[9][39m[32m, line 36[39m
[32m     33[39m     [38;5;28;01mreturn[39;00m np.mean(f1_scores)
[32m     35[39m study = optuna.create_study(direction=[33m'[39m[33mmaximize[39m[33m'[39m)
[32m---> [39m[32m36[39m [43mstudy[49m[43m.[49m[43moptimize[49m[43m([49m[43mobjective[49m[43m,[49m[43m [49m[43mn_trials[49m[43m=[49m[43mtrials[49m[43m,[49m[43m [49m[43mcallbacks[49m[43m=[49m[43m[[49m[43msave_best_model[49m[43m][49m[43m)[49m
[32m     37[39m [38;5;28mprint[39m([33m"[39m[33mBest F1 score during tuning:[39m[33m"[39m, study.best_value)

[36mFile [39m[32m~/.conda/envs/ml/lib/python3.13/site-packages/optuna/study/study.py:490[39m, in [36mStudy.optimize[39m[34m(self, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_tria

#### Evaluating the model

In [None]:
# Restore the model that was persisted during tuning.
lgbm = joblib.load(model_path)

# Predict on the held-out test split.
y_pred = lgbm.predict(X_test)

# Display names for the two classes, in label order.
# NOTE(review): assumes label 0 = Non-Biodegradable and 1 = Biodegradable — confirm against the dataset.
class_names = ['Non-Biodegradable', 'Biodegradable']

# Headline metrics.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix heatmap: rows are true labels, columns are predictions.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Tuned Model')
plt.show()