In [None]:
# Execute the upstream notebooks inside this kernel so the names they define
# become available here.
# NOTE(review): the cells below use X_train_full, y_train_full, X_train_resampled,
# y_train_resampled, X_val and y_val, which are not defined in the cells shown
# here -- presumably they are created by these two runs; confirm.
%run eda_analysis.ipynb
%run preprocessing_loader_v2.ipynb

**LightGBM CODE**

Libraries


In [None]:

import pandas as pd 
import numpy as np 
from lightgbm import LGBMClassifier
from sklearn.metrics import (f1_score, precision_score, recall_score,roc_auc_score,average_precision_score,confusion_matrix,classification_report)
import matplotlib.pyplot as plt 
import seaborn as sns 
from time import time 
import optuna 
import warnings
warnings.filterwarnings("ignore", category=UserWarning)



Loading data

In [None]:
# Bind the two training variants under descriptive names: the untouched
# training split and its SMOTE counterpart.
X_train_original, y_train_original = X_train_full, y_train_full
X_train_smote, y_train_smote = X_train_resampled, y_train_resampled

# Quick sanity check on the size of each feature set.
print(f"   Original train: {X_train_original.shape}")
print(f"   SMOTE train: {X_train_smote.shape}")
print(f"   Validation: {X_val.shape}")

**-------------------------------------------------- WITHOUT SMOTE --------------------------------------------------** 

First training

In [None]:
# Imbalance ratio = (# label-0 rows) / (# label-1 rows); handed to LightGBM as
# scale_pos_weight further down.
n_negative = (y_train_original == 0).sum()
n_positive = (y_train_original == 1).sum()
scale = n_negative / n_positive
print(f"Class imbalance ratio : {scale:.1f}:1")
print(f"scale_pos_weight: {scale:.1f}")


# Baseline (untuned) LightGBM configuration, kept in one dict so it can be
# inspected or reused.
# NOTE(review): `subsample` is LightGBM's bagging_fraction; per the LightGBM
# docs it only takes effect when `subsample_freq` (bagging_freq) is > 0, and
# that is left at its default here -- confirm whether bagging was intended.
baseline_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    num_leaves=31,
    scale_pos_weight=scale,
    min_child_samples=20,
    min_child_weight=0.001,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=0.0,
    n_jobs=-1,
    random_state=42,
    verbose=-1,
)
lgbm_base = LGBMClassifier(**baseline_params)

# Fit on the original (non-resampled) training data and record wall-clock time.
start_time = time()
lgbm_base.fit(X_train_original, y_train_original)
training_time_base = time() - start_time

print(f"Trained in {training_time_base:.2f} seconds\n \n")

# Validation predictions: hard labels for F1/precision/recall, and the
# probability column at index 1 for the ranking metrics (ROC-AUC, PR-AUC).
y_pred_base = lgbm_base.predict(X_val)
y_pred_proba_base = lgbm_base.predict_proba(X_val)[:, 1]

print(f"F1-Score:          {f1_score(y_val, y_pred_base)*100:.4f} %")
print(f"Precision:         {precision_score(y_val, y_pred_base)*100:.4f} %")
print(f"Recall:            {recall_score(y_val, y_pred_base)*100:.4f} %")
print(f"ROC-AUC:           {roc_auc_score(y_val, y_pred_proba_base)*100:.4f} %")
print(f"PR-AUC:            {average_precision_score(y_val, y_pred_proba_base)*100:.4f} %")

Optimization using Optuna

In [None]:
def objective(trial):
    """Optuna objective for LightGBM trained on the original (non-resampled) data.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``LGBMClassifier`` on ``X_train_original`` / ``y_train_original`` and
    scores its hard predictions on ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameter values.

    Returns
    -------
    float
        F1-score on the validation set (the study maximizes this value).

    Notes
    -----
    Relies on notebook-level names: ``scale``, ``X_train_original``,
    ``y_train_original``, ``X_val`` and ``y_val``.
    """
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'n_estimators': 1000,
        'random_state': 42,
        'n_jobs': -1,
        # Silence per-tree LightGBM logging, as the other models in this notebook do.
        'verbose': -1,
        'scale_pos_weight': scale,

        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        # NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction)
        # only takes effect when `subsample_freq` (bagging_freq) is > 0; it is
        # not set here, so this search dimension is presumably inert -- confirm.
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

    model = LGBMClassifier(**params)

    # The previous call passed eval_set=[(X_val, y_val)] and eval_metric='f1'.
    # 'f1' is not one of LightGBM's built-in metric names, and without an
    # early-stopping callback the eval set never influenced training, so both
    # arguments were dropped. The F1-score is computed explicitly below.
    model.fit(X_train_original, y_train_original)

    # NOTE(review): the validation set is used both to pick hyperparameters
    # here and to report final metrics later, which makes those metrics
    # optimistic; a separate hold-out set would give an unbiased estimate.
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred)


# Run the hyperparameter search. The sampler is seeded so that re-running the
# notebook explores the same sequence of configurations (TPESampler is Optuna's
# default single-objective sampler; only the seed is new here).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Report the outcome of the search.
print("Nombre d'essais terminés: ", len(study.trials))
print("Meilleur essai:")
trial = study.best_trial

print("  Valeur (F1-Score): ", trial.value)
print("  Meilleurs paramètres: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Hyperparameters of the best trial, kept for training the tuned model.
best_params = trial.params

After optimization

In [None]:
# Tuned LightGBM on the original (non-resampled) data.
#
# The tuned hyperparameters are taken from `best_params` (the best Optuna trial
# found above) instead of literals copied by hand from an earlier run, so this
# model always matches the study that was actually executed. The fixed settings
# below are the ones the objective does not tune.
lgbm_base_new = LGBMClassifier(
    n_estimators=1000,
    scale_pos_weight=scale,
    min_child_weight=0.001,
    n_jobs=-1,
    random_state=42,
    verbose=-1,
    **best_params,
)

# Fit and record wall-clock training time.
# NOTE(review): this reuses the name `training_time_base` and therefore
# overwrites the timing of the first (untuned) model.
start_time = time()
lgbm_base_new.fit(X_train_original, y_train_original)
training_time_base = time() - start_time

print(f"Trained in {training_time_base:.2f} seconds\n \n")

# Validation predictions: hard labels for F1/precision/recall, and the
# probability column at index 1 for the ranking metrics (ROC-AUC, PR-AUC).
y_pred_base_new = lgbm_base_new.predict(X_val)
y_pred_proba_base_new = lgbm_base_new.predict_proba(X_val)[:, 1]

print(f"F1-Score:          {f1_score(y_val, y_pred_base_new)*100:.4f} %")
print(f"Precision:         {precision_score(y_val, y_pred_base_new)*100:.4f} %")
print(f"Recall:            {recall_score(y_val, y_pred_base_new)*100:.4f} %")
print(f"ROC-AUC:           {roc_auc_score(y_val, y_pred_proba_base_new)*100:.4f} %")
print(f"PR-AUC:            {average_precision_score(y_val, y_pred_proba_base_new)*100:.4f} %")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix of the tuned model (trained without SMOTE) on the validation set.
cm = confusion_matrix(y_val, y_pred_base_new)
disp = ConfusionMatrixDisplay(cm, display_labels=['Legit', 'Fraud'])
disp.plot(cmap='Blues')
# The display keeps a handle on the axes it drew on; title that axes directly.
disp.ax_.set_title('Confusion Matrix (Val Set)')
plt.show()

In [None]:
import lightgbm as lgb

# Ten most important features of the tuned model, ranked by gain.
# plot_importance returns the Axes it drew on, so the title is set on it directly.
lgb.plot_importance(
    lgbm_base_new,
    importance_type='gain',
    max_num_features=10,
).set_title('Top Features (Gain)')
plt.show()

**-------------------------------------------------- WITH SMOTE --------------------------------------------------** 

In [None]:
def objective_smote(trial):
    """Optuna objective for LightGBM trained on the SMOTE-resampled data.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``LGBMClassifier`` on ``X_train_resampled`` / ``y_train_resampled`` and
    scores its hard predictions on the original validation set
    ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameter values.

    Returns
    -------
    float
        F1-score on the validation set (the study maximizes this value).

    Notes
    -----
    Relies on notebook-level names: ``X_train_resampled``,
    ``y_train_resampled``, ``X_val`` and ``y_val``. Unlike the non-SMOTE
    objective, no ``scale_pos_weight`` is passed.
    """
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'n_estimators': 1000,
        'random_state': 42,
        'n_jobs': -1,
        # Silence per-tree LightGBM logging, as the non-SMOTE models do.
        'verbose': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        # NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction)
        # only takes effect when `subsample_freq` (bagging_freq) is > 0; it is
        # not set here, so this search dimension is presumably inert -- confirm.
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

    model = LGBMClassifier(**params)

    # Train on the SMOTE-resampled data.
    # The previous call passed eval_set=[(X_val, y_val)] and eval_metric='f1',
    # with a comment announcing pruning. 'f1' is not one of LightGBM's built-in
    # metric names, no early-stopping or pruning callback was attached, and the
    # trial never reports intermediate values, so those arguments had no effect
    # on the fitted model and were dropped.
    model.fit(X_train_resampled, y_train_resampled)

    # Score on the ORIGINAL (non-resampled) validation set.
    # NOTE(review): the same validation set is later used to report final
    # metrics, which makes those metrics optimistic.
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred)

# Launch the hyperparameter search for the SMOTE model.
# Fix: the status messages below previously contained "\\n", which prints a
# literal backslash-n instead of a line break; they now use a real newline.
print("--- Lancement de l'optimisation Optuna pour le modèle SMOTE ---\n")
# The sampler is seeded so the search is reproducible across notebook runs.
# NOTE(review): MedianPruner only acts on trials that report intermediate
# values; objective_smote reports none, so no trial is ever pruned.
study_smote = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(),
)
study_smote.optimize(objective_smote, n_trials=50, show_progress_bar=True)  # 50 trials

# Show the best hyperparameters found.
print("\n--- Meilleurs hyperparamètres trouvés pour SMOTE ---")
best_params_smote = study_smote.best_trial.params
for key, value in best_params_smote.items():
    print(f"    {key}: {value}")

# Train the final tuned model on the SMOTE-resampled data.
print("\n--- Entraînement du modèle final optimisé avec SMOTE ---")
lgbm_smote_best = LGBMClassifier(
    n_estimators=1000,
    random_state=42,
    n_jobs=-1,
    verbose=-1,  # quiet LightGBM logging, consistent with the non-SMOTE models
    **best_params_smote
)

start_time = time()
lgbm_smote_best.fit(X_train_resampled, y_train_resampled)
training_time_smote = time() - start_time
print(f"Entraîné en {training_time_smote:.2f} secondes\n")

# Evaluate on the validation set: hard labels for F1/precision/recall, and the
# probability column at index 1 for the ranking metrics (ROC-AUC, PR-AUC).
y_pred_smote_best = lgbm_smote_best.predict(X_val)
y_pred_proba_smote_best = lgbm_smote_best.predict_proba(X_val)[:, 1]

print("--- Performances sur le jeu de validation (Modèle SMOTE optimisé) ---")
print(f"F1-Score:          {f1_score(y_val, y_pred_smote_best) * 100:.4f} %")
print(f"Precision:         {precision_score(y_val, y_pred_smote_best) * 100:.4f} %")
print(f"Recall:            {recall_score(y_val, y_pred_smote_best) * 100:.4f} %")
print(f"ROC-AUC:           {roc_auc_score(y_val, y_pred_proba_smote_best) * 100:.4f} %")
print(f"PR-AUC:            {average_precision_score(y_val, y_pred_proba_smote_best) * 100:.4f} %\n")


In [None]:
import lightgbm as lgb

# Ten most important features of the tuned SMOTE model, ranked by gain.
# plot_importance returns the Axes it drew on, so the title is set on it directly.
lgb.plot_importance(
    lgbm_smote_best,
    importance_type='gain',
    max_num_features=10,
).set_title('Top Features (Gain)')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix of the tuned SMOTE model on the validation set.
cm = confusion_matrix(y_val, y_pred_smote_best)
disp = ConfusionMatrixDisplay(cm, display_labels=['Legit', 'Fraud'])
disp.plot(cmap='Blues')
# The display keeps a handle on the axes it drew on; title that axes directly.
disp.ax_.set_title('Confusion Matrix (Val Set)')
plt.show()

**COMPARISON**

In [None]:
# Collect the validation metrics of both tuned models in one table:
# one row per metric, one column per approach.
metrics_data = {
    'Metric': ['F1-Score', 'Precision', 'Recall', 'ROC-AUC', 'PR-AUC'],
    'Without SMOTE': [
        f1_score(y_val, y_pred_base_new),
        precision_score(y_val, y_pred_base_new),
        recall_score(y_val, y_pred_base_new),
        roc_auc_score(y_val, y_pred_proba_base_new),
        average_precision_score(y_val, y_pred_proba_base_new)
    ],
    'With SMOTE': [
        f1_score(y_val, y_pred_smote_best),
        precision_score(y_val, y_pred_smote_best),
        recall_score(y_val, y_pred_smote_best),
        roc_auc_score(y_val, y_pred_proba_smote_best),
        average_precision_score(y_val, y_pred_proba_smote_best)
    ]
}
df_metrics = pd.DataFrame(metrics_data).set_index('Metric')

# Grouped bar chart: one group per metric, one bar per approach.
ax = df_metrics.plot(kind='bar', figsize=(14, 8), rot=0, colormap='viridis')
plt.title('Complete Performance Comparison: Without SMOTE vs. With SMOTE', fontsize=16)
plt.ylabel('Score', fontsize=12)

# Zoom the y-axis to make small differences visible. The lower bound stays at
# 0.88 when every score sits comfortably above it, and is lowered (never below
# 0) when a score is smaller, so that no bar is cut off by the axis limit.
y_lower = min(0.88, float(df_metrics.to_numpy().min()) - 0.02)
plt.ylim(max(0.0, y_lower), 1.0)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Add data labels on top of the bars. A printf-style format is used because
# '{}'-style format strings are only accepted by newer matplotlib releases;
# the rendered labels are the same.
for container in ax.containers:
    ax.bar_label(container, fmt='%.4f')

plt.legend(title='Approach', fontsize=11)
plt.tight_layout()

# Display the plot
plt.show()