# Création DataFrame DOCUMENTS

## README
Ce notebook permet de télécharger sur le site de huggingface les images de la BDD RVL-CDIP.

Il réalise tout d'abord certaines opérations préalables (chapitre 1), dont la définition des variables globales d'exécution (**A METTRE A JOUR LORS D'UNE PREMIERE UTILISATION**)

A l'issue (chapitre 2), il permet de créer un premier DataFrame intitulé DOCUMENTS,  qui rassemble les informations de bases relatives aux documents de la base de données RVL-CDIP:
- document_id
- filename
- label
- data_set
- rvl_image_path
- iit_image_path
- iit_individual_xml_path
- iit_grouped_xml_path

## 1. Préparation

In [1]:
import os
import time
import numpy as np
import pandas as pd
from lxml import etree
from functools import reduce
from utils import remove_ds_store_files

In [2]:
project_path = '/Users/ben/Work/mle/ds-project/mai25_bds_extraction/' # à modifier par chacun en fonction de son arborescence

data_path = os.path.join(project_path, 'data')
raw_data_path = os.path.join(data_path, 'raw')
processed_data_path = os.path.join(data_path, 'processed')

raw_rvl_cdip_path = os.path.join(raw_data_path, 'RVL-CDIP')
rvl_cdip_images_path = os.path.join(raw_rvl_cdip_path, 'images')
rvl_cdip_labels_path = os.path.join(raw_rvl_cdip_path, 'labels')

iit_cdip_images_path = os.path.join(raw_data_path, 'IIT-CDIP', 'images')
iit_cdip_xmls_path = os.path.join(raw_data_path, 'IIT-CDIP', 'xmls')

## 2. Création du DataFrame DOCUMENTS

### 2.1 Filename, rvl_image_path 

In [3]:
def get_documents_base():
    tmp_list = []
    for foldername, _, filenames in os.walk(rvl_cdip_images_path):
        for filename in filenames:
            if filename.endswith(".tif"):
                tmp_list.append((
                    os.path.basename(foldername), 
                    filename,
                    os.path.join(os.path.relpath(foldername, data_path), filename),
                ))
    tmp_list.sort()
    return pd.DataFrame(tmp_list, columns = ["document_id", "filename", "rvl_image_path"])

In [4]:
t = time.time()
df_base = get_documents_base()
print(f"Duree d'exécution: {time.time() - t:.3f} secondes.")
df_base.head()

Duree d'exécution: 18.183 secondes.


Unnamed: 0,document_id,filename,rvl_image_path
0,aaa06d00,50486482-6482.tif,raw/RVL-CDIP/images/imagesa/a/a/a/aaa06d00/504...
1,aaa08d00,2072197187.tif,raw/RVL-CDIP/images/imagesa/a/a/a/aaa08d00/207...
2,aaa09e00,2029372116.tif,raw/RVL-CDIP/images/imagesa/a/a/a/aaa09e00/202...
3,aaa10c00,2085133627a.tif,raw/RVL-CDIP/images/imagesa/a/a/a/aaa10c00/208...
4,aaa11d00,515558347+-8348.tif,raw/RVL-CDIP/images/imagesa/a/a/a/aaa11d00/515...


### 2.2 Labels et Data Sets

In [5]:
def get_labels_and_data_sets():
    data_sets = ["train.txt", "val.txt", "test.txt"]
    dataframes = []
    for data_set in data_sets:
        df = pd.read_csv(os.path.join(rvl_cdip_labels_path, data_set), sep= ' ', names = ["image_path", "label"])
        df.insert(
            loc = 0,
            column = "document_id",
            value = df.image_path.apply(lambda x: x.split("/")[-2])
        )
        df.insert(
            loc = 3,
            column = "data_set",
            value = data_set[:-4]
        )
        df.drop(columns="image_path", inplace=True)
        dataframes.append(df)
    return pd.concat(dataframes).sort_values(by="document_id").reset_index(drop=True)

In [6]:
t = time.time()
df_labels_and_data_sets = get_labels_and_data_sets()
print(f"Duree d'exécution: {time.time() - t:.3f} secondes.")
df_labels_and_data_sets.head()

Duree d'exécution: 0.517 secondes.


Unnamed: 0,document_id,label,data_set
0,aaa06d00,6,test
1,aaa08d00,9,train
2,aaa09e00,11,val
3,aaa10c00,2,train
4,aaa11d00,3,train


### 2.3 iit_image_path, iit_individual_xml_path 

In [7]:
def get_iit_individual_data():
    tmp_list = []
    for foldername, _, filenames in os.walk(iit_cdip_images_path):
        tif_path = None
        xml_path = None
        for filename in filenames:
            if filename.endswith(".tif"):
                tif_path = os.path.join(os.path.relpath(foldername, data_path), filename),
            if filename.endswith(".xml"):
                xml_path = os.path.join(os.path.relpath(foldername, data_path), filename),
        if tif_path or xml_path:
            tmp_list.append((
                os.path.basename(foldername), 
                tif_path,
                xml_path
            ))
    tmp_list.sort()
    return pd.DataFrame(tmp_list, columns = ["document_id", "iit_image_path", "iit_individual_xml_path"]).replace({None: np.nan})

In [8]:
t = time.time()
df_iit_ind = get_iit_individual_data()
print(f"Duree d'exécution: {time.time() - t:.3f} secondes.")
df_iit_ind.head()

Duree d'exécution: 21.376 secondes.


Unnamed: 0,document_id,iit_image_path,iit_individual_xml_path
0,aaa06d00,(raw/IIT-CDIP/images/imagesa/a/a/a/aaa06d00/50...,(raw/IIT-CDIP/images/imagesa/a/a/a/aaa06d00/aa...
1,aaa08d00,(raw/IIT-CDIP/images/imagesa/a/a/a/aaa08d00/20...,(raw/IIT-CDIP/images/imagesa/a/a/a/aaa08d00/aa...
2,aaa09e00,(raw/IIT-CDIP/images/imagesa/a/a/a/aaa09e00/20...,(raw/IIT-CDIP/images/imagesa/a/a/a/aaa09e00/aa...
3,aaa10c00,(raw/IIT-CDIP/images/imagesa/a/a/a/aaa10c00/20...,(raw/IIT-CDIP/images/imagesa/a/a/a/aaa10c00/aa...
4,aaa11d00,(raw/IIT-CDIP/images/imagesa/a/a/a/aaa11d00/51...,(raw/IIT-CDIP/images/imagesa/a/a/a/aaa11d00/aa...


### 2.4 iit_collective_xml_path

In [9]:
def get_iit_collective_data():
    tmp_list = []
    for foldername, _, filenames in os.walk(iit_cdip_xmls_path):
        for filename in filenames:
            if filename.endswith(".xml"):
                xml_file = os.path.join(os.path.relpath(foldername, data_path), filename)
                parser = etree.XMLParser(recover=True)
                tree = etree.parse(os.path.join(data_path, xml_file), parser)
                root = tree.getroot()
                ids_in_file = [record.find("docid").text for record in root.findall("record")]
                tmp_list += [(id_, xml_file) for id_ in ids_in_file]
    tmp_list.sort()
    return pd.DataFrame(tmp_list, columns = ["document_id", "iit_collective_xml_path"])

In [10]:
t = time.time()
df_iit_coll = get_iit_collective_data()
print(f"Duree d'exécution: {time.time() - t:.3f} secondes.")
df_iit_coll.head()

Duree d'exécution: 10.164 secondes.


Unnamed: 0,document_id,iit_collective_xml_path
0,aaa06d00,raw/IIT-CDIP/xmls/aa.xml
1,aaa08d00,raw/IIT-CDIP/xmls/aa.xml
2,aaa09e00,raw/IIT-CDIP/xmls/aa.xml
3,aaa10c00,raw/IIT-CDIP/xmls/aa.xml
4,aaa11d00,raw/IIT-CDIP/xmls/aa.xml


### 2.5 Fusion et sauvegarde

In [11]:
dfs = [df_base, df_labels_and_data_sets, df_iit_ind, df_iit_coll]  # liste de DataFrames
df_documents = reduce(lambda left, right: pd.merge(left, right, on='document_id', how="left"), dfs)

In [12]:
# ATTENTION: PROBLEME A INVESTIGUER SUR LA COLONNE iit_collective_xml_path:
df_documents.isna().sum()

document_id                     0
filename                        0
rvl_image_path                  0
label                           0
data_set                        0
iit_image_path                  0
iit_individual_xml_path    241651
iit_collective_xml_path      1990
dtype: int64

In [13]:
# df_documents.to_pickle(os.path.join(processed_data_path, "df_documents.pkl")) # environ 82 Mo
# df_documents.to_csv(os.path.join(processed_data_path, "df_documents.csv")) # environ 88 Mo
df_documents.to_parquet(os.path.join(processed_data_path, "df_documents.parquet")) # environ 24 Mo