# Modèles multimodaux - CLIP

## README
Ce notebook permet la création et l'évaluation d'un modèle basé sur l'architecture existante CLIP.

# TODO

Le chapitre 1 prépare les données nécessaires

Le chapitre 2 crée et évalue un modèle de voting simple, par moyennage des prédictions de 2 modèles (1 image et 1 texte)

Le chapitre 3 propose une version avancée, avec pondérations, apprises ou non, des résultats des deux modèles utilisés

## 1. Préparation

In [None]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable, so that the
# `src` package imported right after this can be resolved.
# NOTE(review): this assumes the notebook runs from a direct subdirectory of
# the repository root -- confirm if the notebook is moved.
project_root = Path().resolve().parent
# Compare resolved paths so that an equivalent entry already on sys.path
# (relative, or reached through a symlink) is not appended a second time.
if project_root not in {Path(entry).resolve() for entry in sys.path}:
    sys.path.append(str(project_root))

from src import PATHS

In [None]:
import os
import time
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from src.models.multimodal import MultiModalVoter, MultiModalClassWeightedVoter, MultiModalLogisticRegressor
from src.visualization.visualize import visual_classification_report

## 2. Chargement des données

In [None]:
# Load the text features table from the processed-data directory and display
# its dimensions.
text_features_file = os.path.join(PATHS.processed_data, 'df_txt_ocr1.parquet')
text_features = pd.read_parquet(text_features_file)
text_features.shape

In [None]:
# TODO: move to src/utils?
def get_converted_image_path(tif_path):
    """Build the path of the converted .jpg image matching a raw .tif path.

    Every occurrence of 'raw/RVL-CDIP/images/' is removed from ``tif_path``
    and every occurrence of '.tif' is replaced by '.jpg' (not only a leading
    prefix or a trailing extension); the result is then joined onto
    ``PATHS.converted_images``.

    Args:
        tif_path: Path string of the original .tif image.

    Returns:
        The path string located under ``PATHS.converted_images``.
    """
    relative_jpg_path = tif_path.replace('raw/RVL-CDIP/images/', '')
    relative_jpg_path = relative_jpg_path.replace('.tif', '.jpg')
    return os.path.join(PATHS.converted_images, relative_jpg_path)


In [None]:
# Build a one-column frame ("filepath") holding the converted image path of
# every row of the file-paths metadata table; the original index is kept and
# the raw 'rvl_image_path' column is not carried over.
image_features = (
    pd.read_parquet(os.path.join(PATHS.metadata, 'df_filepaths.parquet'))["rvl_image_path"]
    .apply(get_converted_image_path)
    .to_frame("filepath")
)
image_features.shape

In [None]:
# Combine text and image features on their index, keeping only the rows
# present in both tables, then drop the two source frames, which are no
# longer referenced in this section.
features = text_features.join(image_features, how="inner")
del text_features
del image_features
features.shape

In [None]:
# Load the data-set assignment table and the encoded labels table from the
# metadata directory (read in that order).
data_sets, labels = (
    pd.read_parquet(os.path.join(PATHS.metadata, file_name))
    for file_name in ("df_data_sets.parquet", "df_encoded_labels.parquet")
)

In [None]:
# In case `features` is incomplete: restrict `data_sets` and `labels` to the
# index values that also exist in `features`.
# `features[[]]` is a frame without columns, so the inner join adds no column.
features_index_only = features[[]]
data_sets = data_sets.join(features_index_only, how="inner")
labels = labels.join(features_index_only, how="inner")

features.shape, data_sets.shape, labels.shape

In [None]:
# Split features and labels into train / validation / test subsets using the
# `data_set` column of `data_sets` as a boolean row selector.
split = data_sets.data_set

X_train, y_train = features[split == "train"], labels[split == "train"]

X_val, y_val = features[split == "val"], labels[split == "val"]

X_test, y_test = features[split == "test"], labels[split == "test"]

In [None]:
from src.models.multimodal_clip import MultimodalCLIPBasedClassifier

In [None]:
# Instantiate the CLIP-based multimodal classifier with its default arguments.
clf = MultimodalCLIPBasedClassifier()

In [None]:
# Load the CLIP embeddings table from the processed-data directory.
embeddings_file = os.path.join(PATHS.processed_data, 'df_clip_embeddings.parquet')
embeddings = pd.read_parquet(embeddings_file)

In [None]:
# Display the dimensions of the embeddings table.
embeddings.shape

In [None]:
# Fit the classifier on the training split and report how long it took.
# The `embeddings` frame is left-joined onto the (column-less) training index,
# so the classifier receives one embeddings row per training row.
# perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring durations, whereas the wall
# clock can jump if the system time is adjusted during the fit.
t0 = time.perf_counter()
clf.fit(X_train, y_train, embeddings=X_train[[]].join(embeddings))
print(f"Terminé en {time.perf_counter() - t0:.2f} secondes")

In [None]:
# Evaluate the fitted classifier on the test split through the project's
# visual_classification_report helper; the last argument is the title string
# passed to that helper.
from src.visualization.visualize import visual_classification_report

visual_classification_report(
    clf,
    X_test,
    y_test,
    "CLIP-based Logistic Regressor Model",
)