# Modèles ML par traitement du texte

## README
Ce notebook permet la création et l'évaluation de modèles ML sur le texte.

Il s'appuie pour cela sur les données OCR prétraitées contenues dans le DataFrame `df_txt_ocr1`.

## 1. Préparation

In [None]:
import sys
from pathlib import Path

# Make the project root importable so that `from src import ...` works from this notebook.
# The root is taken as the parent of the current working directory.
# NOTE(review): assumes the notebook is run from a direct sub-folder of the project root — confirm.
project_root = Path().resolve().parent
# Compare resolved paths so the root is not appended twice under a different spelling.
if project_root not in [Path(p).resolve() for p in sys.path]:
    sys.path.append(str(project_root))

from src import PATHS

In [None]:
import joblib
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

## 2. Chargement des données

In [None]:
# Read the OCR text table from the processed-data folder and display its dimensions.
ocr_features_path = PATHS.processed_data / 'df_txt_ocr1.parquet'
features = pd.read_parquet(ocr_features_path)
features.shape

In [None]:
# Read the two metadata tables: `data_sets` (its `data_set` column is used later to
# select train/val/test rows) and `labels` (its `label` column is the target).
metadata_dir = PATHS.metadata
data_sets = pd.read_parquet(metadata_dir / "df_data_sets.parquet")
labels = pd.read_parquet(metadata_dir / "df_encoded_labels.parquet")

In [None]:
# Read the document sample and keep only the feature rows whose index appears in it
# (inner join on the index), then display the resulting dimensions.
sample_path = PATHS.samples / 'df_documents_sample_4k_3.parquet'
sample = pd.read_parquet(sample_path)
features = features.join(sample, how='inner')
features.shape

In [None]:
# In case `features` is incomplete: restrict `data_sets` and `labels` to the rows
# whose index is present in `features`. `features[[]]` selects no columns, so the
# inner join filters rows without adding any column.
features_index_only = features[[]]
data_sets = data_sets.join(features_index_only, how="inner")
labels = labels.join(features_index_only, how="inner")

features.shape, data_sets.shape, labels.shape

In [None]:
# Inputs are the `ocr` column (the vectorizer expects a 1D array); targets are the `label` column.
X = features.ocr
y = labels.label

# One boolean mask per split, built from the `data_set` column.
split_of_row = data_sets.data_set
is_train = split_of_row == "train"
is_val = split_of_row == "val"
is_test = split_of_row == "test"

X_train, y_train = X[is_train], y[is_train]
X_val, y_val = X[is_val], y[is_val]
X_test, y_test = X[is_test], y[is_test]

# Free memory: the full tables and the temporary masks are no longer needed.
del features, labels, data_sets
del split_of_row, is_train, is_val, is_test

## 3. Vectorisation

In [None]:
# The vectorizer was already trained in the ML notebook (5.3), so it is only loaded
# here and applied with `transform` (never re-fitted) to each split.
vectorizer_path = PATHS.models / "txt_tfid_vectorizer.joblib"
vectorizer = joblib.load(vectorizer_path)
X_train_vect, X_val_vect, X_test_vect = (
    vectorizer.transform(texts) for texts in (X_train, X_val, X_test)
)

## 4. Modélisation

### 4.1 Premier MLP

In [None]:
# Definition of the Keras MLP: the input width equals the number of vectorized
# features, followed by two ReLU hidden layers, each with dropout.
n_features = X_train_vect.shape[1]

first_mlp = Sequential()
first_mlp.add(Input(shape=(n_features,)))
first_mlp.add(Dense(128, activation='relu'))
first_mlp.add(Dropout(0.5))
first_mlp.add(Dense(64, activation='relu'))
first_mlp.add(Dropout(0.3))
# Softmax output for multi-class prediction (16 output units).
first_mlp.add(Dense(16, activation='softmax'))

first_mlp.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Training: the vectorized matrices are converted with `toarray()` before being
# handed to Keras; the validation split is evaluated at the end of every epoch.
first_mlp.fit(
    X_train_vect.toarray(),
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_val_vect.toarray(), y_val),
)


In [None]:
# Evaluate the Keras MLP on the validation split.
# The validation matrix is converted with `toarray()` exactly as it was for
# training, so `predict` receives the same kind of input as `fit`.
val_probabilities = first_mlp.predict(X_val_vect.toarray())
# The predicted class is the index of the highest softmax output on each row.
# The array's own `argmax` is used because `np` is not imported in this notebook.
y_pred = val_probabilities.argmax(axis=1)
print(classification_report(y_val, y_pred))

### 4.2 Second MLP

In [None]:
# Lightweight MLP (scikit-learn): a single hidden layer of 100 units, trained
# directly on the vectorized training matrix with a fixed random seed.
second_mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    random_state=42,
)
second_mlp.fit(X_train_vect, y_train)

In [None]:
# Predict the validation split with the scikit-learn MLP and print the per-class metrics.
y_pred = second_mlp.predict(X_val_vect)
second_mlp_report = classification_report(y_val, y_pred)
print(second_mlp_report)

In [None]:
# Confusion matrix of the current `y_pred` against the validation labels.
cm = confusion_matrix(y_val, y_pred)

# Draw it as an annotated heatmap (integer counts in each cell).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)

ax.set_xlabel("Prédit")
ax.set_ylabel("Réel")
ax.set_title("Matrice de confusion - MLPClassifier")
fig.tight_layout()
plt.show()

In [None]:
# Plot the loss values recorded in `second_mlp.loss_curve_` during training.
fig, ax = plt.subplots()
ax.plot(second_mlp.loss_curve_)
ax.set_title("Courbe de perte - MLP")
ax.set_xlabel("Itérations")
ax.set_ylabel("Loss")
ax.grid(True)
plt.show()

### 4.3 Recherche de paramètres optimaux

In [None]:
# Hyper-parameter grid explored for the scikit-learn MLP.
# NOTE(review): per the scikit-learn documentation, `learning_rate` is only used
# with solver='sgd'; with the default solver the two values probably lead to
# duplicate fits — confirm before trimming the grid.
param_grid = dict(
    hidden_layer_sizes=[(100,), (128,), (100, 50), (128, 64)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001, 0.01],  # L2 regularization
    learning_rate=['constant', 'adaptive'],
)

# Exhaustive search with 3-fold cross-validation scored on accuracy; each
# candidate is limited to 10 iterations.
search_estimator = MLPClassifier(max_iter=10, random_state=42)
grid = GridSearchCV(
    estimator=search_estimator,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=2,
)
grid.fit(X_train_vect, y_train)

print("Meilleurs paramètres :", grid.best_params_)


In [None]:
# Evaluate the grid-search winner on the validation split.
# Predictions come from the fitted search object (GridSearchCV refits the best
# estimator by default) rather than from the `y_pred` left over from an earlier cell.
# No `target_names` are passed: the `encoder` object this cell used to reference
# is never defined in this notebook, so the report shows the encoded labels,
# as the other reports do.
y_pred = grid.predict(X_val_vect)
print(classification_report(y_val, y_pred))


In [None]:
# Best parameters reported by the grid search: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (128, 64), 'learning_rate': 'constant'}

In [None]:
# Train a scikit-learn MLP with two hidden layers (128 then 64 units) and tanh
# activation, using the same iteration budget and seed as the second MLP.
tuned_params = dict(hidden_layer_sizes=(128, 64), activation='tanh')
third_mlp = MLPClassifier(max_iter=300, random_state=42, **tuned_params)
third_mlp.fit(X_train_vect, y_train)

## 5. Sauvegarde du modèle

In [None]:
# Save the Keras model to the models folder in the `.keras` format.
# `first_mlp` is the only Keras model built in this notebook; the name
# `model_mlp` used here before is never defined and raised a NameError.
# NOTE(review): the scikit-learn MLPs have no `.save` method; persisting one of
# them instead would need `joblib.dump` — confirm which model should be kept.
first_mlp.save(PATHS.models / "txt_mlp.keras")