# Modèles ML par traitement du texte

## README
Ce notebook permet la création et l'évaluation de modèles ML sur le texte.

Il s'appuie pour cela sur les données OCR prétraitées contenues dans le DataFrame `df_txt_ocr1`.

## 1. Préparation

In [None]:
import sys
from pathlib import Path

# Make the project root (the parent of the current working directory)
# importable, so that `from src import ...` works from this notebook.
project_root = Path().resolve().parent
# Compare resolved paths so an equivalent entry written differently is still
# recognised; the generator stops at the first match.
if project_root not in (Path(entry).resolve() for entry in sys.path):
    sys.path.append(str(project_root))

from src import PATHS

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
import joblib

## 2. Chargement des données

In [None]:
# Load the pre-processed OCR text table and display its dimensions.
features_path = PATHS.processed_data / 'df_txt_ocr1.parquet'
features = pd.read_parquet(features_path)
features.shape

In [None]:
# Load the split assignment and the label table from the metadata
# directory, in that order.
metadata_dir = PATHS.metadata
data_sets, labels = (
    pd.read_parquet(metadata_dir / file_name)
    for file_name in ("df_data_sets.parquet", "df_encoded_labels.parquet")
)

In [None]:
# Restrict the three frames to the rows they all share AND put them in the
# same row order. `features` may be incomplete, and the frames come from
# separate parquet files whose row order is not guaranteed to match.
# The split that follows selects rows with index-aligned boolean masks, but
# scikit-learn pairs X and y by position: without a common order, texts could
# silently be paired with the wrong labels, and rows present only in
# `features` would make the masks unalignable.
# NOTE(review): assumes each frame has a unique index — confirm.
common_index = features.index.intersection(data_sets.index).intersection(labels.index)
features = features.loc[common_index]
data_sets = data_sets.loc[common_index]
labels = labels.loc[common_index]

features.shape, data_sets.shape, labels.shape

In [None]:
# Model input is the `ocr` column taken as a 1-D Series (the vectorizer
# expects a 1-D sequence of texts); the target is the `label` column.
X = features.ocr
y = labels.label

# Build each split's boolean mask once and reuse it for both X and y.
split_name = data_sets.data_set
train_mask = split_name == "train"
val_mask = split_name == "val"
test_mask = split_name == "test"

X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask], y[val_mask]
X_test, y_test = X[test_mask], y[test_mask]

# Drop the source frames (and the temporaries derived from them) so their
# memory can be reclaimed.
del features, labels, data_sets
del split_name, train_mask, val_mask, test_mask

## 3. Vectorisation

In [None]:
# Fit the TF-IDF vectorizer (max_features=5000) on the training texts only,
# then reuse the fitted vectorizer to transform the validation and test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_val_vect, X_test_vect = (
    vectorizer.transform(texts) for texts in (X_val, X_test)
)

## 4. Modélisation

In [None]:
# Modèles à tester
# Candidate classifiers, keyed by the display name used in reports and plots.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Random Forest"] = RandomForestClassifier()
models["Naive Bayes"] = MultinomialNB()


In [None]:
# Fit every candidate on the training matrix and print its validation report.
for name, estimator in models.items():
    # scikit-learn's fit() returns the estimator itself, so it can be chained.
    y_pred = estimator.fit(X_train_vect, y_train).predict(X_val_vect)
    print(f"\n--- {name} ---")
    print(classification_report(y_val, y_pred))

In [None]:
# Macro-averaged validation metrics, keyed by model name.
scores = {}

for name, estimator in models.items():
    # scikit-learn's fit() returns the estimator itself, so it can be chained.
    y_pred = estimator.fit(X_train_vect, y_train).predict(X_val_vect)
    print(f"\n--- {name} ---")
    print(classification_report(y_val, y_pred))

    # Same report in dict form, to extract the macro averages.
    macro_avg = classification_report(y_val, y_pred, output_dict=True)['macro avg']
    scores[name] = {
        metric: macro_avg[metric]
        for metric in ('precision', 'recall', 'f1-score')
    }

In [None]:
# Per-model macro-averaged metrics, in the insertion order of `scores`.
labels = list(scores)
precision, recall, f1 = (
    [scores[label][metric] for label in labels]
    for metric in ('precision', 'recall', 'f1-score')
)

x = range(len(labels))
width = 0.25

# Grouped bar chart: one group per model, the three metrics side by side
# (shifted by one bar width to the left, centred, and to the right).
plt.figure(figsize=(10, 6))
bar_series = (
    (-width, precision, 'Précision'),
    (0, recall, 'Rappel'),
    (width, f1, 'F1-score'),
)
for shift, values, legend in bar_series:
    plt.bar([pos + shift for pos in x], values, width=width, label=legend)

plt.xticks(x, labels, rotation=45)
plt.ylabel("Score (macro avg)")
plt.title("Comparaison des métriques par modèle")
plt.ylim(0, 1.05)  # small headroom above the maximum possible score of 1
plt.legend()
plt.tight_layout()
plt.show()


## 5. Sauvegarde des modèles

In [None]:
# Persist the fitted vectorizer and each trained model under PATHS.models,
# one joblib file per object.
artifacts = {
    "txt_tfid_vectorizer.joblib": vectorizer,
    "txt_logistic_regressor.joblib": models['Logistic Regression'],
    "txt_random_forest.joblib": models['Random Forest'],
    "txt_naive_bayes.joblib": models['Naive Bayes'],
}
for file_name, fitted_object in artifacts.items():
    joblib.dump(fitted_object, PATHS.models / file_name)
