# Création des dataframes de base

## README
Ce notebook permet de créer plusieurs DataFrames, qui pourront être utilisés par la suite dans les différents notebooks. Pour tous les DataFrames dont les lignes correspondent à des documents, l'index sera l'identifiant du document en question, tel que défini dans le jeu de données IIT-CDIP. Cela garantit l'unicité de cet identifiant, et facilitera, pour toute la suite des travaux, les jointures entre documents.

Il réalise tout d'abord certaines opérations préalables (chapitre 1), dont la configuration des répertoires du projet.

À l'issue (chapitre 2), il permet de créer les DataFrames utiles à la suite du projet, dont les caractéristiques sont rappelées dans le compte-rendu de projet.

## 1. Préparation

In [None]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable, so that the
# `from src import PATHS` statement below can be resolved.
project_root = Path().resolve().parent
# Compare resolved paths so the same directory is not appended twice under
# a different spelling.
if project_root not in [Path(p).resolve() for p in sys.path]:
    sys.path.append(str(project_root))

from src import PATHS

In [None]:
import os
import time
import numpy as np
import pandas as pd
from lxml import etree
from functools import reduce
from utils import remove_ds_store_files

In [None]:
# Root of the local checkout; every contributor has to adapt it to their own
# directory tree.
project_path = '/Users/ben/Work/mle/ds-project/mai25_bds_extraction/'

# Data directory and its three sub-directories.
data_path = os.path.join(project_path, 'data')
raw_data_path, processed_data_path, metadata_data_path = (
    os.path.join(data_path, sub_dir) for sub_dir in ('raw', 'processed', 'metadata')
)

# RVL-CDIP raw data: images and label files.
raw_rvl_cdip_path = os.path.join(raw_data_path, 'RVL-CDIP')
rvl_cdip_images_path, rvl_cdip_labels_path = (
    os.path.join(raw_rvl_cdip_path, sub_dir) for sub_dir in ('images', 'labels')
)

# IIT-CDIP raw data: images and XML files.
iit_cdip_images_path, iit_cdip_xmls_path = (
    os.path.join(raw_data_path, 'IIT-CDIP', sub_dir) for sub_dir in ('images', 'xmls')
)

## 2. Création du DataFrame DOCUMENTS

### 2.1 Filename, rvl_image_path 

In [None]:
def get_documents_base():
    """Build the base table listing every ``.tif`` file under ``PATHS.rvl_cdip_images``.

    Returns:
        A DataFrame with one row per ``.tif`` file, sorted, with the columns:
        ``document_id`` (name of the folder containing the file),
        ``filename`` (name of the file) and ``rvl_image_path`` (path of the
        file relative to ``PATHS.data``).
    """
    rows = sorted(
        (
            os.path.basename(folder),
            name,
            os.path.join(os.path.relpath(folder, PATHS.data), name),
        )
        for folder, _, names in os.walk(PATHS.rvl_cdip_images)
        for name in names
        if name.endswith(".tif")
    )
    return pd.DataFrame(rows, columns=["document_id", "filename", "rvl_image_path"])

In [None]:
# Build the base documents table and report how long the scan took.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by a system clock adjustment (unlike time.time()).
t = time.perf_counter()
df_base = get_documents_base()
print(f"Duree d'exécution: {time.perf_counter() - t:.3f} secondes.")
df_base.head()

### 2.2 Labels et Data Sets

In [None]:
def get_labels_and_data_sets():
    """Load the label files and tag each document with its label and data set.

    The files ``train.txt``, ``val.txt`` and ``test.txt`` are read from
    ``PATHS.labels``; each line holds an image path and a label separated by
    a space. The document identifier is the second-to-last component of the
    image path, and the data set name is the file name without its extension.

    Returns:
        A DataFrame with the columns ``document_id``, ``label`` and
        ``data_set``, sorted by ``document_id`` with a fresh integer index.
    """
    frames = []
    for file_name in ["train.txt", "val.txt", "test.txt"]:
        raw = pd.read_csv(
            os.path.join(PATHS.labels, file_name),
            sep=' ',
            names=["image_path", "label"],
        )
        tagged = raw.assign(
            document_id=raw["image_path"].apply(lambda path: path.split("/")[-2]),
            data_set=os.path.splitext(file_name)[0],
        )
        # Keep the final column order and leave the raw image path out.
        frames.append(tagged[["document_id", "label", "data_set"]])
    combined = pd.concat(frames)
    return combined.sort_values(by="document_id").reset_index(drop=True)

In [None]:
# Build the labels / data sets table and report how long it took.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by a system clock adjustment (unlike time.time()).
t = time.perf_counter()
df_labels_and_data_sets = get_labels_and_data_sets()
print(f"Duree d'exécution: {time.perf_counter() - t:.3f} secondes.")
df_labels_and_data_sets.head()

### 2.3 iit_image_path, iit_individual_xml_path 

In [None]:
def get_iit_individual_data():
    """List, per folder of ``PATHS.iit_cdip_images``, its ``.tif`` and ``.xml`` file.

    A folder produces a row as soon as it contains a ``.tif`` or an ``.xml``
    file. When a folder holds several files with the same extension, the last
    one returned by ``os.walk`` is kept.

    Returns:
        A DataFrame sorted by ``document_id`` with the columns
        ``document_id`` (name of the folder), ``iit_image_path`` and
        ``iit_individual_xml_path`` (paths relative to ``PATHS.data``, NaN
        when the folder has no file of that kind).
    """
    tmp_list = []
    for foldername, _, filenames in os.walk(PATHS.iit_cdip_images):
        # The relative folder is the same for every file it contains.
        rel_folder = os.path.relpath(foldername, PATHS.data)
        tif_path = None
        xml_path = None
        for filename in filenames:
            if filename.endswith(".tif"):
                tif_path = os.path.join(rel_folder, filename)
            elif filename.endswith(".xml"):
                xml_path = os.path.join(rel_folder, filename)
        if tif_path or xml_path:
            tmp_list.append((
                os.path.basename(foldername),
                tif_path,
                xml_path,
            ))
    # Sort on the identifier only: comparing whole tuples would fall through
    # to the path fields on equal identifiers and raise TypeError when one of
    # them is None.
    tmp_list.sort(key=lambda row: row[0])
    return pd.DataFrame(
        tmp_list,
        columns=["document_id", "iit_image_path", "iit_individual_xml_path"],
    ).replace({None: np.nan})

In [None]:
# Build the IIT-CDIP individual files table and report how long it took.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by a system clock adjustment (unlike time.time()).
t = time.perf_counter()
df_iit_ind = get_iit_individual_data()
print(f"Duree d'exécution: {time.perf_counter() - t:.3f} secondes.")
df_iit_ind.head()

### 2.4 iit_collective_xml_path

In [None]:
def get_iit_collective_data():
    """Map each document identifier to the collective XML file that lists it.

    Every ``.xml`` file under ``PATHS.iit_cdip_xmls`` is parsed with a
    recovering parser; each ``record`` child of the root contributes the text
    of its ``docid`` child. Records without a usable ``docid`` and files for
    which no root element could be recovered are skipped instead of aborting
    the whole scan.

    Returns:
        A sorted DataFrame with the columns ``document_id`` and
        ``iit_collective_xml_path`` (path relative to ``PATHS.data``).
    """
    tmp_list = []
    for foldername, _, filenames in os.walk(PATHS.iit_cdip_xmls):
        # The relative folder is the same for every file it contains.
        rel_folder = os.path.relpath(foldername, PATHS.data)
        for filename in filenames:
            if not filename.endswith(".xml"):
                continue
            xml_file = os.path.join(rel_folder, filename)
            parser = etree.XMLParser(recover=True)
            tree = etree.parse(os.path.join(PATHS.data, xml_file), parser)
            root = tree.getroot()
            if root is None:
                # Nothing could be recovered from this file.
                continue
            for record in root.findall("record"):
                docid_node = record.find("docid")
                # A missing or empty <docid> gives no identifier to join on,
                # and a None value would break the tuple sort below.
                if docid_node is None or docid_node.text is None:
                    continue
                tmp_list.append((docid_node.text, xml_file))
    tmp_list.sort()
    return pd.DataFrame(tmp_list, columns=["document_id", "iit_collective_xml_path"])

In [None]:
# Build the IIT-CDIP collective XML table and report how long it took.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by a system clock adjustment (unlike time.time()).
t = time.perf_counter()
df_iit_coll = get_iit_collective_data()
print(f"Duree d'exécution: {time.perf_counter() - t:.3f} secondes.")
df_iit_coll.head()

### 2.5 Concaténation

In [None]:
# Left-join the four tables on document_id, starting from df_base, then use
# document_id as the index of the resulting table.
dfs = [df_base, df_labels_and_data_sets, df_iit_ind, df_iit_coll]
df_documents = reduce(
    lambda merged, extra: merged.merge(extra, on='document_id', how="left"),
    dfs,
).set_index('document_id')
df_documents.head()

In [None]:
# Display the type of each column of df_documents.
df_documents.dtypes

In [None]:
# Count the missing values of each column.
# The counts show that XML data is missing for some documents;
# this will be handled later (text preprocessing).
df_documents.isna().sum()

### 2.6 Sauvegarde
Un comparatif sur le DataFrame df_documents met en avant le gain de place obtenu avec parquet (24 Mo), par rapport à pickle (82 Mo) ou CSV (88 Mo).

In [None]:
# df_documents.to_pickle(os.path.join(processed_data_path, "df_documents.pkl")) # environ 82 Mo
# df_documents.to_csv(os.path.join(processed_data_path, "df_documents.csv")) # environ 88 Mo
# df_documents.to_parquet(os.path.join(processed_data_path, "df_documents.parquet")) # environ 24 Mo

In [None]:
# `[[]]` selects no column at all: the frame written here keeps only the
# index of df_documents.
# NOTE(review): presumably intentional (identifier-only table) — confirm.
df_documents[[]].to_parquet(os.path.join(PATHS.metadata, "df_documents.parquet"))

In [None]:
# Save the file name and the four path columns of every document.
df_documents.loc[
    :,
    [
        'filename',
        'rvl_image_path',
        'iit_image_path',
        'iit_individual_xml_path',
        'iit_collective_xml_path',
    ],
].to_parquet(PATHS.metadata / "df_filepaths.parquet")

In [None]:
# Save the encoded label of every document as a one-column table.
df_documents.loc[:, ["label"]].to_parquet(PATHS.metadata / "df_encoded_labels.parquet")

In [None]:
# info from data/raw/RVL-CDIP/readme.txt
labels_map = {
    0: "letter",
    1: "form",
    2: "email",
    3: "handwritten",
    4: "advertisement",
    5: "scientific report",
    6: "scientific publication",
    7: "specification",
    8: "file folder",
    9: "news article",
    10: "budget",
    11: "invoice",
    12: "presentation",
    13: "questionnaire",
    14: "resume",
    15: "memo"
}

In [None]:
# Save the label of every document with each encoded value replaced by its
# class name from labels_map.
df_documents.loc[:, ["label"]].replace(labels_map).to_parquet(
    PATHS.metadata / "df_plain_labels.parquet"
)

In [None]:
# Save the encoded label -> class name correspondence as a table indexed by
# encoded_label.
pd.DataFrame(
    {"plain_label": list(labels_map.values())},
    index=pd.Index(list(labels_map.keys()), name="encoded_label"),
).to_parquet(PATHS.metadata / "df_labels_mapping.parquet")

In [None]:
# Save the data set of every document. The column holds the split names
# taken from the label file names, not encoded labels, so the label mapping
# must not be applied to it.
df_documents[["data_set"]].to_parquet(PATHS.metadata / "df_data_sets.parquet")