# Modèles ML par traitement du texte

## README
Ce notebook permet la création et l'évaluation de modèles ML sur le texte.

Il s'appuie pour cela sur les données OCR prétraitées contenues dans le DataFrame `df_txt_ocr1` (fichier `df_txt_ocr1.parquet`).

## 1. Préparation

In [None]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable, so that
# `from src import ...` resolves (assumes the notebook is run from a
# sub-folder of the project root -- TODO confirm).
project_root = Path().resolve().parent
# Compare resolved paths so that relative or non-normalised sys.path entries
# are still recognised; the generator stops at the first match.
if project_root not in (Path(entry).resolve() for entry in sys.path):
    sys.path.append(str(project_root))

from src import PATHS

In [None]:
import joblib
import pandas as pd
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

## 2. Chargement des données

In [None]:
# Load the OCR text table from the processed-data folder.
ocr_parquet = PATHS.processed_data / 'df_txt_ocr1.parquet'
features = pd.read_parquet(ocr_parquet)
# Last expression of the cell: shows (n_rows, n_columns) in the notebook.
features.shape

In [None]:
# Read the two metadata tables: `data_sets` (its `data_set` column is later
# compared with "train"/"val"/"test") and `labels` (its `label` column is
# later used as the target).
data_sets, labels = (
    pd.read_parquet(PATHS.metadata / file_name)
    for file_name in ("df_data_sets.parquet", "df_encoded_labels.parquet")
)

In [None]:
# In case `features` is incomplete: keep only the rows of `data_sets` and
# `labels` whose index is also present in `features`.
# `features[[]]` is a column-less frame carrying only the index, so the inner
# join filters rows without adding any column.
index_only = features[[]]
data_sets = data_sets.join(index_only, how="inner")
labels = labels.join(index_only, how="inner")

# Last expression of the cell: shows the three shapes side by side.
features.shape, data_sets.shape, labels.shape

In [None]:
# Build the train / validation / test subsets.
#
# The three tables are selected on ONE shared index, in the row order of
# `features`, so that row i of X and row i of y always describe the same
# sample. Label-aligned boolean masks applied separately to X and y would keep
# each one in its own original order, which silently mismatches texts and
# labels if the parquet files are not ordered identically.
shared = features.index.isin(labels.index) & features.index.isin(data_sets.index)
X = features.loc[shared, "ocr"]  # 1-D Series (vectorizer expects 1D array)
y = labels.loc[X.index, "label"]
split = data_sets.loc[X.index, "data_set"]

# Positional masks: X, y and split now share the same index and order.
is_train = (split == "train").to_numpy()
is_val = (split == "val").to_numpy()
is_test = (split == "test").to_numpy()

X_train, y_train = X[is_train], y[is_train]
X_val, y_val = X[is_val], y[is_val]
X_test, y_test = X[is_test], y[is_test]

# Free memory: the full tables and the temporaries are no longer needed.
del features, labels, data_sets
del shared, split, is_train, is_val, is_test

## 3. Vectorisation

In [None]:
# The vectorizer was already trained in the ML notebook (5.3); load it as is.
vectorizer = joblib.load(PATHS.models / "txt_tfid_vectorizer.joblib")
# The vectorizer is already fitted, so each subset is only transformed
# (train, then val, then test).
X_train_vect, X_val_vect, X_test_vect = (
    vectorizer.transform(texts) for texts in (X_train, X_val, X_test)
)

## 4. Modélisation

In [None]:
# MLP definition: two hidden ReLU layers with dropout, softmax output.
n_features = X_train_vect.shape[1]
model_mlp = Sequential([
    # Explicit Input object rather than `input_shape=` on the first Dense
    # layer: recent Keras versions warn against passing `input_shape` to a
    # layer inside a Sequential model.
    tf.keras.Input(shape=(n_features,)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    # Softmax for multi-class output. The width 16 is hard-coded; presumably
    # the number of encoded label classes -- TODO confirm against `labels`.
    Dense(16, activation='softmax'),
])

# Sparse categorical cross-entropy: targets are passed as integer class ids
# (y_train / y_val are used directly, without one-hot encoding).
model_mlp.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Training.
# The vectorised matrices are densified with .toarray() before being passed to
# Keras. Casting to float32 first avoids building dense float64 arrays
# (assumes scipy sparse matrices, whose .astype returns a sparse copy; Keras
# computes in float32 by default, so the values fed to the model are the same).
model_mlp.fit(X_train_vect.astype("float32").toarray(), y_train,
              epochs=5, batch_size=32,
              validation_data=(X_val_vect.astype("float32").toarray(), y_val))


## 5. Sauvegarde du modèle

In [None]:
# Write the trained MLP to the models folder.
mlp_path = PATHS.models / "txt_mlp.keras"
model_mlp.save(mlp_path)