# LGBM avec les meilleurs paramètres, lancé sur l'ensemble des images (400 000)

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA #tester prince ? 
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from skopt import BayesSearchCV
from lightgbm import LGBMClassifier
from skopt.space import Categorical, Real, Integer
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import joblib
import time

In [None]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable, so that the
# `src` package imported right below can be resolved from this notebook.
project_root = Path().resolve().parent
known_locations = [Path(entry).resolve() for entry in sys.path]
if project_root not in known_locations:
    # Only append when no existing sys.path entry resolves to the same folder.
    sys.path.append(str(project_root))

from src import PATHS
from src.visualization.visualize import draw_spider_graph_dark, conf_matrix_dark

In [None]:
# Load the full table from the processed-data folder
# (presumably pixel values plus hand-crafted features, going by the file name — confirm).
pixels_features_path = os.path.join(
    PATHS.processed_data,
    "ML_images_100x100",
    "df_pixels_features.parquet",
)
df_all = pd.read_parquet(pixels_features_path)

## Création des jeux d'entraînement (train), de validation (val) et de test

In [None]:
# Drop the identifier / path columns; what remains is the label ('cat'),
# the split marker ('set') and the columns used as model inputs further down.
non_feature_columns = ["document_id", "path", "file_path", "relative_path", "filename"]
df = df_all.drop(columns=non_feature_columns)


In [None]:
# Feature columns: every column except the label ('cat') and the split marker ('set').
features = df.columns.difference(['cat', 'set'])

# Evaluate each split mask once instead of repeating the comparison for X and y.
is_train = df['set'] == 'train'
is_val = df['set'] == 'val'
is_test = df['set'] == 'test'

# Split
# A single `.loc` selection followed by `.copy()` replaces the chained
# `df[mask][features]` indexing: the resulting frames are independent of `df`,
# so the in-place column assignments done during scaling write to these
# objects without triggering pandas' SettingWithCopyWarning.
X_train = df.loc[is_train, features].copy()
X_val   = df.loc[is_val, features].copy()
X_test  = df.loc[is_test, features].copy()

y_train = df.loc[is_train, 'cat']
y_val   = df.loc[is_val, 'cat']
y_test  = df.loc[is_test, 'cat']

## normalisation des données

In [None]:
# Columns to rescale; all other columns are left untouched.
cols_to_normalize = [
    'top_marge', 'bottom_marge', 'left_marge', 'right_marge',
    'nb_lignes', 'nb_colonnes', 'sharpness', 'noise',
    'ratio_b', 'ratio_n', 'entropy', 'width',
]

# The scaler statistics are learned on the training split only, then the same
# transformation is applied to every split (train first, then val and test).
scaler = RobustScaler()
scaler.fit(X_train[cols_to_normalize])

for split_frame in (X_train, X_val, X_test):
    split_frame[cols_to_normalize] = scaler.transform(split_frame[cols_to_normalize])

## PCA : réduction à 2000 composantes principales

In [None]:
# Project the features onto a fixed number of principal components.
n_components = 2000

# With the default svd_solver='auto', scikit-learn may pick the randomized SVD
# for a truncated fit like this one, which makes the projection seed-dependent.
# A fixed seed keeps the components identical from one run to the next, so a
# classifier trained on them stays compatible with a re-fitted PCA.
pca = PCA(n_components=n_components, random_state=42)

# Fit on the training split only; val and test are projected with the same components.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
X_val_pca = pca.transform(X_val)

## LGBM avec les meilleurs paramètres

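Meilleurs hyperparamètres (la cellule suivante utilise des valeurs arrondies ; `reg_alpha` et `reg_lambda` y sont fixés à 0.0) :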
- 'colsample_bytree', 0.5802168967298673
- 'learning_rate', 0.029216387145600653
- 'max_depth', 15
- 'min_child_samples', 69
- 'n_estimators', 500
- 'num_leaves', 30
- 'reg_alpha', 1.3883805031132697e-08
- 'reg_lambda', 0.00016690235239007222
- 'subsample', 0.654130102375878


In [None]:
# Wall-clock timer covering both training and prediction on the test split.
start_time = time.time()

# Hyper-parameters passed to the classifier. They are rounded relative to the
# search result listed in the notes above, and both regularisation terms are
# set to exactly 0.0.
# NOTE(review): per the LightGBM parameter docs, row subsampling (bagging) is
# only active when subsample_freq > 0; it is left at its default here, so
# `subsample` may have no effect — confirm.
lgbm_params = {
    "num_leaves": 30,
    "max_depth": 15,
    "learning_rate": 0.03,
    "n_estimators": 500,
    "subsample_for_bin": 200000,
    "min_child_samples": 69,
    "subsample": 0.65,
    "colsample_bytree": 0.58,
    "reg_alpha": 0.0,
    "reg_lambda": 0.0,
    "n_jobs": -1,
}
clf = LGBMClassifier(**lgbm_params)

# Train on the PCA-projected training split, then predict the test split.
clf.fit(X_train_pca, y_train)
y_pred = clf.predict(X_test_pca)

end_time = time.time()
elapsed = end_time - start_time
print(f" Temps d'exécution total : {elapsed / 60:.2f} minutes ({elapsed:.1f} secondes)")

In [None]:
# Persist the trained classifier, then read it back from disk.
# NOTE(review): only `clf` is saved in this cell; the fitted `scaler` and `pca`
# are not persisted in the visible notebook, yet they are needed to prepare
# new data for this model — confirm whether they are saved elsewhere.
model_path = 'best_lgbm_model_all_images.pkl'
joblib.dump(clf, model_path)

# Reload (this is how the model would be restored in a later session).
final_model = joblib.load(model_path)

In [None]:
# Predict the test split with the reloaded model and build the confusion
# matrix (rows: true classes, columns: predicted classes).
y_pred = final_model.predict(X_test_pca)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:

# Text version of the per-class precision / recall / f1 report.
print("\n Rapport de classification :")
print(classification_report(y_test, y_pred))

# Heatmap of the confusion matrix computed in the previous cell.
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Matrice de confusion")
plt.xlabel("Classe prédite")
plt.ylabel("Classe réelle")
plt.show()

# Same report as a DataFrame (one row per report entry, one column per metric).
report_dict = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report_dict).T
print("\n Rapport sous forme de DataFrame :")
print(report_df.head(16))  # first 16 rows, i.e. the 16 classes per the original note
print('')
print('###############################')

# Note: the validation set would be used to test the final algorithm once it
# has already seen X_train and X_test. It is not used here.

In [None]:
# Render the confusion matrix with the project's dark-theme helper; the second
# argument is presumably the output image path — confirm against the helper.
cm_figure_path = "illustrations/lgbm_cm_all_images.png"
conf_matrix_dark(cm, cm_figure_path)


In [None]:
# Draw the project's dark-theme spider graph from true vs. predicted labels
# and hand it the path given as `save_path`.
spider_figure_path = "illustrations/lgbm_spider_all_images.png"
draw_spider_graph_dark(y_test, y_pred, save_path=spider_figure_path)
