# Modèles multimodaux autonomes

## README
Ce notebook permet la création et l'évaluation de modèles multimodaux autonomes.

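Il réalise tout d'abord certaines opérations préalables (chapitre 1), dont la définition des variables globales d'exécution, puis charge les données (chapitre 2), les pipelines (chapitre 3) et les modèles (chapitre 4), avant d'entraîner et d'évaluer une régression logistique multimodale (chapitre 5) et d'étudier l'impact de la taille du jeu de données (chapitre 6).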

## 1. Préparation

In [None]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable, so that the
# `src` package can be imported from this notebook.
project_root = Path().resolve().parent
# Compare resolved paths so that the same directory is not appended twice
# under a different spelling (relative path, symlink, trailing slash, ...).
if project_root not in [Path(p).resolve() for p in sys.path]:
    sys.path.append(str(project_root))

from src import PATHS

In [None]:
import os
import time
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from src.models.multimodal import MultiModalVoter, MultiModalClassWeightedVoter, MultiModalLogisticRegressor
from src.visualization.visualize import visual_classification_report

## 2. Chargement des données

In [None]:
# Read the text feature table from the processed-data directory.
text_features_file = os.path.join(PATHS.processed_data, "df_txt_ocr1.parquet")
text_features = pd.read_parquet(text_features_file)
# Cell output: (rows, columns) of the loaded table.
text_features.shape

In [None]:
# Read the flattened image feature table from the processed-data directory.
image_features_file = os.path.join(PATHS.processed_data, "df_img_features_flattened.parquet")
image_features = pd.read_parquet(image_features_file)
# Cell output: (rows, columns) of the loaded table.
image_features.shape

In [None]:
# Combine text and image features on their index; the inner join keeps only
# the rows that are present in both tables.
features = text_features.join(image_features, how="inner")
# The per-modality tables are no longer needed once they are merged.
del image_features, text_features
# Cell output: (rows, columns) of the merged table.
features.shape

In [None]:
# Read the split assignment and the encoded labels from the metadata directory.
data_sets_file = os.path.join(PATHS.metadata, "df_data_sets.parquet")
labels_file = os.path.join(PATHS.metadata, "df_encoded_labels.parquet")

data_sets = pd.read_parquet(data_sets_file)
labels = pd.read_parquet(labels_file)

In [None]:
# In case `features` is incomplete: keep only the rows of `data_sets` and
# `labels` whose index is also present in `features`. `features[[]]` selects
# no columns, so the inner join filters on the index without adding columns.
# NOTE(review): the joined frames keep their own row order, which may differ
# from the row order of `features` -- confirm that downstream code aligns
# rows by index rather than by position.
data_sets = data_sets.join(features[[]], how="inner")
labels = labels.join(features[[]], how="inner")

# Cell output: the three shapes, to check that the row counts agree.
features.shape, data_sets.shape, labels.shape

In [None]:
# Split features and labels into train / validation / test subsets according
# to the `data_set` column of `data_sets`.
assignment = data_sets["data_set"]

X_train, y_train = features[assignment == "train"], labels[assignment == "train"]
X_val, y_val = features[assignment == "val"], labels[assignment == "val"]
X_test, y_test = features[assignment == "test"], labels[assignment == "test"]

# The full tables are no longer needed once the splits exist.
del features, labels, data_sets, assignment

## 3. Chargement des pipelines

In [None]:
# List the pickle files found in the pipelines directory, split by their
# "img" / "txt" file-name prefix.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the printed lists reproducible from one run to the next.
available_pipelines = sorted(
    file_name for file_name in os.listdir(PATHS.pipelines) if file_name.endswith(".pkl")
)
img_pipelines = [file_name for file_name in available_pipelines if file_name.startswith("img")]
txt_pipelines = [file_name for file_name in available_pipelines if file_name.startswith("txt")]
print(img_pipelines)
print(txt_pipelines)

## TODO : déplacer la fonction utilitaire ci-dessous dans `src`

In [None]:
def df_to_serie(df):
    """Return the single column of ``df``, selected by its column label.

    An AssertionError is raised when ``df`` does not have exactly one column.
    """
    column_labels = list(df.columns)
    assert len(column_labels) == 1
    (only_label,) = column_labels
    return df[only_label]

In [None]:
# Unpickle the image and text pipelines selected for this notebook.
img_pipeline_file = os.path.join(PATHS.pipelines, "img_lgbm.pkl")
txt_pipeline_file = os.path.join(PATHS.pipelines, "txt_ml.pkl")

with open(img_pipeline_file, "rb") as stream:
    img_pipeline = pickle.load(stream)
with open(txt_pipeline_file, "rb") as stream:
    txt_pipeline = pickle.load(stream)


## 4. Chargement des modèles

In [None]:
# List the pickle files found in the models directory, split by their
# "img" / "txt" file-name prefix.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the printed lists reproducible from one run to the next.
available_models = sorted(
    file_name for file_name in os.listdir(PATHS.models) if file_name.endswith(".pkl")
)
img_models = [file_name for file_name in available_models if file_name.startswith("img")]
txt_models = [file_name for file_name in available_models if file_name.startswith("txt")]
print(img_models)
print(txt_models)

In [None]:
# Unpickle the image and text models selected for this notebook.
img_model_file = os.path.join(PATHS.models, "img_lgbm.pkl")
txt_model_file = os.path.join(PATHS.models, "txt_logistic_regression.pkl")

with open(img_model_file, "rb") as stream:
    img_model = pickle.load(stream)
with open(txt_model_file, "rb") as stream:
    txt_model = pickle.load(stream)

In [None]:
# Cell output: show both loaded models.
img_model, txt_model

## 5. Régression logistique

In [None]:
# Build the multimodal logistic regressor from the loaded image/text
# pipelines and models, then fit it on the training split.
model = MultiModalLogisticRegressor(img_pipeline, img_model, txt_pipeline, txt_model)
model.fit(X_train, y_train)

In [None]:
# Classification report of the fitted model on the test split.
# NOTE(review): compare_with_components=True presumably also reports the
# individual image and text models -- confirm in src.visualization.visualize.
visual_classification_report(model, X_test, y_test, "Logistic Regressor", compare_with_components=True)

In [None]:
# Cell output: score of the fitted model on the test split (the metric is
# whatever the model's `score` method implements).
model.score(X_test, y_test)

## 6. Impact de la taille du jeu de données

In [None]:
# Measure how the score evolves when the amount of data is halved repeatedly.
# The loop works on local views (X_fit / y_fit / X_eval / y_eval) so that the
# X_train / y_train / X_test / y_test splits stay intact: the cell can be
# re-run and other cells keep seeing the full splits.
n_docs = X_train.shape[0]
scores = []       # score obtained at each size
n_documents = []  # matching number of training documents
X_fit, y_fit = X_train, y_train
X_eval, y_eval = X_test, y_test
for i in range(10):
    if n_docs == 0:
        # Nothing left to fit on: the training split has fewer than 2**i rows.
        break
    print(i, n_docs, '...', end='')
    model = MultiModalLogisticRegressor(img_pipeline, img_model, txt_pipeline, txt_model)
    print(' created ...', end='')
    model.fit(X_fit, y_fit)
    print(' fitted ...', end='')
    n_documents.append(n_docs)
    scores.append(model.score(X_eval, y_eval))
    print(' evaluated')
    # Halve the data for the next round; the first rows are kept.
    n_docs //= 2
    X_fit, y_fit = X_fit[:n_docs], y_fit[:n_docs]
    # NOTE(review): the evaluation data is capped at n_docs rows as well, so
    # scores at different sizes are not all measured on the same test rows --
    # confirm this is intended (e.g. to speed up evaluation).
    X_eval, y_eval = X_eval[:n_docs], y_eval[:n_docs]
    

In [None]:
# Plot the score obtained for each number of training documents.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(n_documents, scores)
ax.set_xlabel("Nombre de documents")
ax.set_ylabel("Score MMO Logistic Regressor")
# The trailing semicolon suppresses the textual cell output in the notebook.
ax.set_title("Score obtenu en fonction du volume de données utilisées");

           