In [None]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

from skopt import BayesSearchCV
from skopt.space import Categorical, Real, Integer
from lazypredict.Supervised import LazyClassifier

In [None]:
# Make the project-local `src` package importable from the notebook.
import sys
from pathlib import Path

# Parent of the current working directory; it is appended to sys.path only once,
# so re-running the cell does not pile up duplicate entries.
project_root = Path().resolve().parent
if project_root not in [Path(p).resolve() for p in sys.path]:
    sys.path.append(str(project_root))

from src import PATHS
from src.visualization.visualize import plot_spider_graph

# 1. Premiers essais
On travaille d'abord sur un échantillon, car je n'ai pas assez de RAM pour traiter le jeu de données complet.

In [None]:
# Load the full image-feature table from the processed-data folder.
features_path = PATHS.processed_data / "df_img_features_pixels.parquet"
df_all = pd.read_parquet(features_path)

In [None]:
# Keep only the documents of the 4k sample: left join of the features on the sample index.
sample_path = PATHS.samples / 'df_documents_sample_4k_3.parquet'
sample = pd.read_parquet(sample_path)
df = sample.join(df_all)
df

## 1.1. Création des sets de train, test et validation

In [None]:
# Split assignment and encoded labels, both restricted to the sampled documents.
data_sets = sample.join(pd.read_parquet(PATHS.metadata / "df_data_sets.parquet"))
labels = sample.join(pd.read_parquet(PATHS.metadata / "df_encoded_labels.parquet")).label

# One boolean mask per split, shared by the features and the labels.
in_train = data_sets.data_set == "train"
in_val = data_sets.data_set == "val"
in_test = data_sets.data_set == "test"

X_train, y_train = df[in_train], labels[in_train]
X_val, y_val = df[in_val], labels[in_val]
X_test, y_test = df[in_test], labels[in_test]

# Free memory: the unsplit tables are no longer needed.
del df, labels, data_sets

## 1.2. Preprocessing 

In [None]:
# Columns rescaled with RobustScaler; every other column is left untouched.
cols_to_normalize = [
    'top_marge', 'bottom_marge', 'left_marge', 'right_marge',
    'nb_lignes', 'nb_colonnes', 'sharpness', 'noise',
    'ratio_b', 'ratio_n', 'entropy', 'width',
]

# Fit on the training split only, so the scaler never sees validation/test rows.
scaler = RobustScaler().fit(X_train[cols_to_normalize])

# Apply the train-fitted scaling to each split, overwriting the columns in place.
for split in (X_train, X_val, X_test):
    split[cols_to_normalize] = scaler.transform(split[cols_to_normalize])


## 1.3. Lazy Classifier ?
Bon, il y a trop de colonnes et ça ne tourne pas : je commence donc par une PCA pour conserver le plus d'information possible tout en réduisant drastiquement le nombre de colonnes.

In [None]:
# Unrestricted PCA on the training split, used to see how many dimensions must be kept
# to avoid losing too much information.
pca = PCA()
train_coords = pca.fit_transform(X_train)

# Name one column per fitted component. PCA returns min(n_samples, n_features)
# components, which is not necessarily the number of training rows, so the count is
# read from the projected array itself.
coord_pca = pd.DataFrame(
    data=train_coords,
    columns=[f'PC{i + 1}' for i in range(train_coords.shape[1])],
)
coord_pca.head()

In [None]:
# Component loadings: one row per principal component, one column per input feature.
CL = pd.DataFrame(pca.components_, index=coord_pca.columns, columns=X_train.columns)
CL.head()

In [None]:
# Share of the total variance explained by each principal component.
var = pd.DataFrame(pca.explained_variance_ratio_, index=coord_pca.columns)
var.head()
# A single component explains almost 30% of the variance, then it drops off very quickly...

In [None]:
# Cumulative explained variance against the number of components kept.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# The k-th cumulative value corresponds to keeping k components, so the x-axis starts
# at 1 rather than at the default positional index 0.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_counts, cumulative_variance, marker='o', ms=2)
plt.axhline(y=0.9, color='r', linestyle='--')  # 90% variance threshold
plt.xlabel("Nombre de composantes")
plt.ylabel("Variance expliquée cumulée")
plt.title("Variance expliquée")
plt.grid(True)
plt.show()


In [None]:
## PCA
# Truncated PCA fitted on the training split only, then applied to the test split.
n_components = 1000  # or fewer, to be tested
# With a truncated decomposition scikit-learn may pick its randomized SVD solver;
# fixing random_state keeps the projection reproducible across kernel restarts.
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [None]:
# Quick benchmark of many off-the-shelf classifiers on the PCA-reduced features.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train_pca, X_test_pca, y_train, y_test)
display(models)

## 1.4. On se lance dans les classifiers suivants: 
- LGBM
- XGB
- SGD

### LGBM Classifier

In [None]:
# LightGBM baseline with the library's default hyper-parameters.
clf_lgbm = LGBMClassifier()

In [None]:
# Train the baseline on the PCA-reduced training split.
clf_lgbm.fit(X=X_train_pca, y=y_train)

In [None]:
# Predicted classes for the PCA-reduced test split.
y_pred = clf_lgbm.predict(X=X_test_pca)

In [None]:
# Confusion matrix as a labelled table: true classes in rows, predicted classes in columns.
cm = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['classe réelle'],
    colnames=['classe prédite'],
)

In [None]:
# Heatmap of the confusion matrix. fmt='d' prints the counts as plain integers (the
# default format switches to scientific notation), as the other confusion matrices
# in this notebook do.
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Matrice de confusion, LGBM, paramètres par défaut')
plt.show()

In [None]:
# Per-class precision / recall / F1 of the default LightGBM model on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

# 2. Recherche des meilleurs paramètres pour les 3 modèles

### Chargement des données

In [None]:
# Reload the three tables from disk: features, split assignment and encoded labels.
metadata_dir = PATHS.metadata
df = pd.read_parquet(PATHS.processed_data / "df_img_features_pixels.parquet")
data_sets = pd.read_parquet(metadata_dir / "df_data_sets.parquet")
labels = pd.read_parquet(metadata_dir / "df_encoded_labels.parquet")

In [None]:
# Restrict the three tables to the documents of the 10k sample (left joins on the index).
# Skip this cell to work on the full dataset instead.
sample_path = PATHS.samples / 'df_documents_sample_10k_1.parquet'
sample = pd.read_parquet(sample_path)
df = sample.join(df)
data_sets = sample.join(data_sets)
labels = sample.join(labels)

In [None]:
# Keep only the label column (DataFrame -> Series).
labels = labels.label

# One boolean mask per split, shared by the features and the labels.
in_train = data_sets.data_set == "train"
in_val = data_sets.data_set == "val"
in_test = data_sets.data_set == "test"

X_train, y_train = df[in_train], labels[in_train]
X_val, y_val = df[in_val], labels[in_val]
X_test, y_test = df[in_test], labels[in_test]

# Free memory: the unsplit tables are no longer needed.
del df, labels, data_sets

### Preprocessing

In [None]:
# Columns rescaled with RobustScaler; every other column is left untouched.
cols_to_normalize = [
    'top_marge', 'bottom_marge', 'left_marge', 'right_marge',
    'nb_lignes', 'nb_colonnes', 'sharpness', 'noise',
    'ratio_b', 'ratio_n', 'entropy', 'width',
]

# Fit on the training split only, so the scaler never sees validation/test rows.
scaler = RobustScaler().fit(X_train[cols_to_normalize])

# Apply the train-fitted scaling to each split, overwriting the columns in place.
for split in (X_train, X_val, X_test):
    split[cols_to_normalize] = scaler.transform(split[cols_to_normalize])

In [None]:
# Project the robust-scaled splits onto principal components fitted on the training
# split only. The previous version called `pipeline.transform`, but `pipeline` is only
# created in section 3 (NameError on a fresh kernel) and it embeds its own scaler,
# which would have rescaled the already-scaled columns a second time.
# 2000 matches the dimensionality used by the section-3 pipeline; it is capped by the
# data shape because PCA cannot return more than min(n_samples, n_features) components.
n_components = min(2000, *X_train.shape)
pca = PCA(n_components=n_components, random_state=42)  # seeded for reproducibility
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

### LGBM

In [None]:
start_time = time.time()

# 1. Search space explored by the Bayesian optimiser
search_spaces = {
    'num_leaves': Integer(20, 150),
    'max_depth': Integer(3, 15),
    'learning_rate': Real(0.01, 0.3, 'log-uniform'),
    'n_estimators': Integer(50, 500),
    'min_child_samples': Integer(10, 100),
    'subsample': Real(0.5, 1.0),
    # LightGBM only applies `subsample` when `subsample_freq` > 0 (its default of 0
    # disables bagging). Without this dimension the `subsample` values sampled above
    # would be silently ignored.
    'subsample_freq': Integer(1, 10),
    'colsample_bytree': Real(0.5, 1.0),
    'reg_alpha': Real(1e-8, 10.0, 'log-uniform'),
    'reg_lambda': Real(1e-8, 10.0, 'log-uniform')
}

# 2. Stratified 3-fold cross-validation used to score each candidate
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 3. Base LightGBM classifier
clf = LGBMClassifier(random_state=25, n_jobs=-1)

# 4. Bayesian search configuration
opt = BayesSearchCV(
    estimator=clf,
    search_spaces=search_spaces,
    n_iter=20,  # number of parameter combinations evaluated
    cv=cv,
    scoring='f1_weighted',  # 'balanced_accuracy' is an alternative worth trying
    verbose=2,
    n_jobs=-1,
    random_state=96,
)

# 5. Run the search on the PCA-reduced training split
opt.fit(X_train_pca, y_train)

# 6. Best parameters and their cross-validated score
print("Meilleurs hyperparamètres :")
print(opt.best_params_)
print(f" Meilleur score (f1_weighted, cross-val) : {opt.best_score_:.4f}")

# 7. Predict the test split with the best configuration found
y_pred = opt.predict(X_test_pca)

# 8. Classification report
print("\n Rapport de classification :")
print(classification_report(y_test, y_pred))

# 9. Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Matrice de confusion")
plt.xlabel("Classe prédite")
plt.ylabel("Classe réelle")
plt.show()

# 10. (optional) Same report as a DataFrame
report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
print("\n Rapport sous forme de DataFrame :")
print(report_df.head(16))  # first 16 rows: the per-class scores
print('')
print('###############################')
end_time = time.time()
elapsed = end_time - start_time
print(f" Temps d'exécution total : {elapsed / 60:.2f} minutes ({elapsed:.1f} secondes)")

In [None]:
# Refit LightGBM with the best parameters found by the search before checking it on
# the validation split. NOTE: despite the `trainval` name, the refit data is the
# concatenation of the train and *test* splits.
best_params = opt.best_params_
X_trainval = np.concatenate((X_train_pca, X_test_pca))
y_trainval = np.concatenate((y_train, y_test))

final_model = LGBMClassifier(random_state=91, n_jobs=-1, **best_params)
final_model.fit(X_trainval, y_trainval)



In [None]:
# Score the refitted LightGBM model on the validation split.
y_val_pred = final_model.predict(X_val_pca)

print("Évaluation sur le set de validation :")
print(classification_report(y_val, y_val_pred))

# Confusion matrix on the validation split
cm_val = confusion_matrix(y_val, y_val_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm_val, annot=True, fmt='d', cmap='Oranges', ax=ax)
ax.set_title("Matrice de confusion - Validation")
ax.set_xlabel("Classe prédite")
ax.set_ylabel("Classe réelle")
plt.show()

### XGB Classifier

In [None]:
start_time = time.time()

# Stratified 3-fold cross-validation used to score each candidate.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=93)

# Base XGBoost classifier for the 16-class problem.
clf = XGBClassifier(
    objective='multi:softmax',  # or 'multi:softprob' to get probabilities
    num_class=16,
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=21,
)

# Search space explored by the Bayesian optimiser.
search_spaces = dict(
    n_estimators=Integer(50, 500),
    max_depth=Integer(3, 15),
    learning_rate=Real(0.01, 0.3, 'log-uniform'),
    subsample=Real(0.5, 1.0),
    colsample_bytree=Real(0.5, 1.0),
    gamma=Real(0, 5.0),  # min split loss
    reg_alpha=Real(1e-8, 10.0, 'log-uniform'),
    reg_lambda=Real(1e-8, 10.0, 'log-uniform'),
)

# 20 candidates, each scored by weighted F1 over the folds.
opt = BayesSearchCV(
    estimator=clf,
    search_spaces=search_spaces,
    n_iter=20,
    cv=cv,
    scoring='f1_weighted',
    verbose=2,
    n_jobs=-1,
    random_state=56,
)
opt.fit(X_train_pca, y_train)

# Best parameters and their cross-validated score.
print("Meilleurs hyperparamètres :")
print(opt.best_params_)
print(f" Meilleur score (f1_weighted, cross-val) : {opt.best_score_:.4f}")

# Test-split predictions with the best configuration found.
y_pred = opt.predict(X_test_pca)

print("\n Rapport de classification :")
print(classification_report(y_test, y_pred))

# Confusion matrix on the test split.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Matrice de confusion - XGBoost")
ax.set_xlabel("Classe prédite")
ax.set_ylabel("Classe réelle")
plt.show()

# Same report as a DataFrame (first 16 rows: the per-class scores).
report_dict = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
print("\n Rapport sous forme de DataFrame :")
print(report_df.head(16))

print('')
print('###############################')
end_time = time.time()
elapsed = end_time - start_time
print(f" Temps d'exécution total : {elapsed / 60:.2f} minutes ({elapsed:.1f} secondes)")

In [None]:
# Refit XGBoost with the best parameters found by the search, then score it on the
# validation split. NOTE: despite the `trainval` name, the refit data is the
# concatenation of the train and *test* splits.
best_params_1 = opt.best_params_
X_trainval = np.concatenate((X_train_pca, X_test_pca))
y_trainval = np.concatenate((y_train, y_test))

final_model_1 = XGBClassifier(random_state=91, n_jobs=-1, **best_params_1)
final_model_1.fit(X_trainval, y_trainval)

y_val_pred_1 = final_model_1.predict(X_val_pca)

print("Évaluation sur le set de validation :")
print(classification_report(y_val, y_val_pred_1))

# Confusion matrix on the validation split
cm_val_1 = confusion_matrix(y_val, y_val_pred_1)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm_val_1, annot=True, fmt='d', cmap='Oranges', ax=ax)
ax.set_title("Matrice de confusion XGB- Validation")
ax.set_xlabel("Classe prédite")
ax.set_ylabel("Classe réelle")
plt.show()

### SGD Classifier

In [None]:
start_time = time.time()

# Stratified 3-fold cross-validation used to score each candidate.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=93)

# Base linear classifier trained by SGD, with early stopping enabled.
clf_3 = SGDClassifier(random_state=21, n_jobs=-1, early_stopping=True)

# Search space explored by the Bayesian optimiser.
search_spaces = dict(
    loss=Categorical(['hinge', 'log_loss', 'modified_huber', 'squared_hinge']),
    penalty=Categorical(['l2', 'l1', 'elasticnet']),
    alpha=Real(1e-6, 1e-2, prior='log-uniform'),
    learning_rate=Categorical(['constant', 'optimal', 'invscaling', 'adaptive']),
    eta0=Real(1e-4, 1e-1, prior='log-uniform'),
    max_iter=Integer(1000, 3000),
    tol=Real(1e-5, 1e-2, prior='log-uniform'),
)

# 30 candidates, each scored by weighted F1 over the folds.
opt_3 = BayesSearchCV(
    estimator=clf_3,
    search_spaces=search_spaces,
    n_iter=30,
    cv=cv,
    scoring='f1_weighted',
    verbose=2,
    n_jobs=-1,
    random_state=56,
)
opt_3.fit(X_train_pca, y_train)

# Best parameters and their cross-validated score.
print("Meilleurs hyperparamètres :")
print(opt_3.best_params_)
print(f" Meilleur score (f1_weighted, cross-val) : {opt_3.best_score_:.4f}")

# Test-split predictions with the best configuration found.
y_pred_3 = opt_3.predict(X_test_pca)

print("\n Rapport de classification :")
print(classification_report(y_test, y_pred_3))

# Confusion matrix on the test split.
cm_3 = confusion_matrix(y_test, y_pred_3)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm_3, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Matrice de confusion - SGDClassifier")
ax.set_xlabel("Classe prédite")
ax.set_ylabel("Classe réelle")
plt.show()

# Same report as a DataFrame (first 16 rows: the per-class scores).
report_dict = classification_report(y_test, y_pred_3, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
print("\n Rapport sous forme de DataFrame :")
print(report_df.head(16))

end_time = time.time()
elapsed = end_time - start_time
print(f" Temps d'exécution total : {elapsed / 60:.2f} minutes ({elapsed:.1f} secondes)")

In [None]:
# Refit the SGD classifier with the best parameters found by the search, then score it
# on the validation split. NOTE: despite the `trainval` name, the refit data is the
# concatenation of the train and *test* splits.
best_params_3 = opt_3.best_params_
X_trainval = np.concatenate([X_train_pca, X_test_pca])
y_trainval = np.concatenate([y_train, y_test])

# early_stopping=True is repeated here because it is set on the estimator handed to
# the search and is not part of the search space, so `best_params_3` does not carry it.
# Without it the refit would train differently from the tuned model.
final_model_3 = SGDClassifier(
    **best_params_3,
    early_stopping=True,
    random_state=35,
    n_jobs=-1,
)
final_model_3.fit(X_trainval, y_trainval)

y_val_pred_3 = final_model_3.predict(X_val_pca)


print("Évaluation sur le set de validation :")
print(classification_report(y_val, y_val_pred_3))

# Confusion matrix on the validation split
cm_val_3 = confusion_matrix(y_val, y_val_pred_3)
plt.figure(figsize=(10, 8))
sns.heatmap(cm_val_3, annot=True, fmt='d', cmap='Oranges')
plt.title("Matrice de confusion SGD - Validation")
plt.xlabel("Classe prédite")
plt.ylabel("Classe réelle")
plt.show()

In [None]:
# Conclusion of this section: LGBM was found to be the best of the three models.

# 3. Création du pipeline et des modèles

### Chargement des données

In [None]:
# Load the full (unsampled) tables: features, split assignment and encoded labels.
metadata_dir = PATHS.metadata
df = pd.read_parquet(PATHS.processed_data / "df_img_features_pixels.parquet")
data_sets = pd.read_parquet(metadata_dir / "df_data_sets.parquet")
labels = pd.read_parquet(metadata_dir / "df_encoded_labels.parquet")

In [None]:
# Keep only the label column (DataFrame -> Series).
labels = labels.label

# One boolean mask per split, shared by the features and the labels.
in_train = data_sets.data_set == "train"
in_val = data_sets.data_set == "val"
in_test = data_sets.data_set == "test"

X_train, y_train = df[in_train], labels[in_train]
X_val, y_val = df[in_val], labels[in_val]
X_test, y_test = df[in_test], labels[in_test]

# Free memory: the unsplit tables are no longer needed.
del df, labels, data_sets

### Preprocessing

In [None]:
t0 = time.time()

# Columns standardised by the pipeline; every other column is passed through unchanged.
cols_to_normalize = ['top_marge', 'bottom_marge', 'left_marge',
       'right_marge', 'nb_lignes', 'nb_colonnes', 'sharpness', 'noise',
       'ratio_b', 'ratio_n', 'entropy','width']

scaler = ColumnTransformer(
    transformers=[
        ("partial_scaling", StandardScaler(), cols_to_normalize)
    ],
    remainder="passthrough"
)

# Scaling followed by a 2000-component PCA, fitted on the training split only.
# random_state is fixed because a truncated PCA may use scikit-learn's randomized SVD
# solver; since this pipeline is saved and reused, refits should give the same projection.
pipeline = Pipeline(steps=[
    ("scaling", scaler),
    ("pca", PCA(n_components=2000, random_state=42))
])
pipeline.fit(X_train)
print(f"Fit réalisé en {time.time()-t0:.0f} secondes")

In [None]:
# Persist the fitted preprocessing pipeline to the models folder.
pipeline_path = PATHS.models / "img_ml_pipeline.joblib"
joblib.dump(pipeline, pipeline_path)

In [None]:
# Run each split through the fitted pipeline (scaling, then PCA projection).
X_train_pca, X_val_pca, X_test_pca = (
    pipeline.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Free memory: only the PCA-projected splits are used from here on.
del X_train
del X_val
del X_test

### LGBM

In [None]:
start_time = time.time()

# Hard-coded LightGBM hyper-parameters (presumably the best ones from the section-2
# search — TODO confirm).
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0, which is
# not set here — confirm whether bagging is intended.
lgbm_params = dict(
    num_leaves=30,
    max_depth=15,
    learning_rate=0.029216387145600653,
    n_estimators=500,
    min_child_samples=69,
    subsample=0.654130102375878,
    colsample_bytree=0.5802168967298673,
    reg_alpha=1.3883805031132697e-08,
    reg_lambda=0.00016690235239007222,
    random_state=25,
    n_jobs=-1,
)
clf = LGBMClassifier(**lgbm_params)
clf.fit(X_train_pca, y_train)

# Report the wall-clock training time.
end_time = time.time()
elapsed = end_time - start_time
print(f" Temps d'exécution total : {elapsed / 60:.2f} minutes ({elapsed:.1f} secondes)")

In [None]:
# Persist the trained LightGBM model to the models folder.
lgbm_path = PATHS.models / "img_lgbm.joblib"
joblib.dump(clf, lgbm_path)

In [None]:
# Evaluate the trained LightGBM model on the test split.
y_pred = clf.predict(X_test_pca)

print("\n Rapport de classification :")
print(classification_report(y_test, y_pred))

# Confusion matrix on the test split
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Matrice de confusion")
ax.set_xlabel("Classe prédite")
ax.set_ylabel("Classe réelle")
plt.show()

### XGB Classifier

In [None]:
start_time = time.time()

# Hard-coded XGBoost hyper-parameters (presumably the best ones from the section-2
# search — TODO confirm), plus the fixed estimator settings.
xgb_params = dict(
    n_estimators=350,
    max_depth=10,
    learning_rate=0.03190583920977902,
    gamma=0.0,
    subsample=0.7400533807082111,
    colsample_bytree=0.7755797682013672,
    reg_alpha=2.2536648641150143e-05,
    reg_lambda=1.3280200367043574e-05,
    objective='multi:softmax',  # 'multi:softprob' to get probabilities
    num_class=16,
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=21,
)
clf = XGBClassifier(**xgb_params)
clf.fit(X_train_pca, y_train)

# Report the wall-clock training time.
end_time = time.time()
elapsed = end_time - start_time
print(f" Temps d'exécution total : {elapsed / 60:.2f} minutes ({elapsed:.1f} secondes)")

In [None]:
# Persist the trained XGBoost model to the models folder.
xgb_path = PATHS.models / "img_xgboost.joblib"
joblib.dump(clf, xgb_path)

In [None]:
# Evaluate the trained XGBoost model on the test split.
y_pred = clf.predict(X_test_pca)

print("\n Rapport de classification :")
print(classification_report(y_test, y_pred))

# Confusion matrix on the test split
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Matrice de confusion")
ax.set_xlabel("Classe prédite")
ax.set_ylabel("Classe réelle")
plt.show()

### SGD Classifier

In [None]:
start_time = time.time()

# Hard-coded SGD hyper-parameters. The loss deliberately deviates from the tuned
# best_params: 'log_loss' replaces 'squared_hinge' so the model can be put in a
# ModelWrapper with a predict_proba function (squared_hinge does not allow that).
sgd_params = dict(
    loss='log_loss',
    penalty='l1',
    alpha=0.0069955313416296735,
    max_iter=1102,
    tol=0.00032010771743838396,
    n_jobs=-1,
    random_state=21,
    learning_rate='adaptive',
    eta0=0.04760145917148742,
    early_stopping=True,
)
clf = SGDClassifier(**sgd_params)
clf.fit(X_train_pca, y_train)

# Report the wall-clock training time.
end_time = time.time()
elapsed = end_time - start_time
print(f" Temps d'exécution total : {elapsed / 60:.2f} minutes ({elapsed:.1f} secondes)")

In [None]:
# Persist the trained SGD model to the models folder.
sgd_path = PATHS.models / "img_sgd.joblib"
joblib.dump(clf, sgd_path)

In [None]:
# Evaluate the trained SGD model on the test split.
y_pred = clf.predict(X_test_pca)

print("\n Rapport de classification :")
print(classification_report(y_test, y_pred))

# Confusion matrix on the test split
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Matrice de confusion")
ax.set_xlabel("Classe prédite")
ax.set_ylabel("Classe réelle")
plt.show()

In [None]:
# Conclusion of this section: LGBM was found to be the best of the three models.