# Extraction des données RVL-CDIP

## README
Ce notebook permet de télécharger, depuis le site Hugging Face, les images de la base de données (BDD) RVL-CDIP.

Il réalise tout d'abord certaines opérations préalables (chapitre 1), dont la définition des variables globales d'exécution (**À METTRE À JOUR LORS D'UNE PREMIÈRE UTILISATION**).

Ensuite (chapitre 2), il télécharge l'archive contenant l'ensemble des documents de la BDD RVL-CDIP, la décompresse, puis télécharge les fichiers de labels et les fichiers annexes.

Remarque :
- **[IMPORTANT] L'utilisation de ce script écrira environ 50 Go de données (principalement des fichiers tif) sur le disque dur.**


## 1. Préparation

In [None]:
import os
import time
import tarfile
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import remove_ds_store_files

In [None]:
# Root of the local project checkout -- each user must adapt this to their own directory tree.
project_path = '/Users/ben/Work/mle/ds-project/mai25_bds_extraction/'

# Top-level data directory and its raw / extracted / processed subdirectories.
data_path = os.path.join(project_path, 'data')
raw_data_path, extracted_data_path, processed_data_path = (
    os.path.join(data_path, stage) for stage in ('raw', 'extracted', 'processed')
)

# RVL-CDIP raw data: dataset root plus its images/ and labels/ subfolders.
raw_rvl_cdip_path = os.path.join(raw_data_path, 'RVL-CDIP')
rvl_cdip_images_path, rvl_cdip_labels_path = (
    os.path.join(raw_rvl_cdip_path, leaf) for leaf in ('images', 'labels')
)

# IIT-CDIP raw data: images/ and xmls/ subfolders.
iit_cdip_images_path, iit_cdip_xmls_path = (
    os.path.join(raw_data_path, 'IIT-CDIP', leaf) for leaf in ('images', 'xmls')
)

In [None]:
# Create the RVL-CDIP images and labels directories (and any missing parents);
# exist_ok=True makes this cell safe to re-run.
for target_dir in (rvl_cdip_images_path, rvl_cdip_labels_path):
    os.makedirs(target_dir, exist_ok=True)

## 2. Téléchargement des fichiers RVL-CDIP

### 2.1. Fichiers images

In [None]:
# Download the RVL-CDIP image archive.
url = "https://huggingface.co/datasets/aharley/rvl_cdip/resolve/main/data/rvl-cdip.tar.gz?download=true"
tar_file_path = os.path.join(raw_rvl_cdip_path, "rvl-cdip.tar.gz")

# 1 MiB chunks: the archive is very large, and 1 KiB chunks would mean tens of
# millions of Python-level loop iterations and progress-bar updates.
block_size = 1024 * 1024

# Stream the body to disk with a progress bar. The response is used as a
# context manager so the connection is released even if writing fails, and the
# (connect, read) timeout keeps a stalled connection from hanging forever.
with requests.get(url, stream=True, timeout=(10, 60)) as response:
    # Fail loudly on HTTP errors instead of saving an error page as the archive.
    response.raise_for_status()

    # Total size for the progress bar; 0 when the server sends no Content-Length.
    total = int(response.headers.get('content-length', 0))

    with open(tar_file_path, 'wb') as f, tqdm(
        desc="Téléchargement RVL-CDIP",
        total=total,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024
    ) as bar:
        for data in response.iter_content(block_size):
            f.write(data)
            bar.update(len(data))

In [None]:
# Extract the images from the downloaded archive into the RVL-CDIP raw directory.
with tarfile.open(tar_file_path, "r:gz") as archive:
    if hasattr(tarfile, "data_filter"):
        # The archive comes from the network: the "data" extraction filter
        # rejects members that would be written outside the destination
        # (absolute paths, "..", unsafe links) and ignores special files.
        archive.extractall(path=raw_rvl_cdip_path, filter="data")
    else:
        # Interpreters without extraction-filter support: unfiltered extraction.
        archive.extractall(path=raw_rvl_cdip_path)

In [None]:
# Delete the archive file, but only once the extraction is confirmed complete.
from pathlib import Path

# Number of .tif files that must be present for the extraction to count as complete.
expected_image_count = 400000

# Recursively collect every .tif file under the images directory.
images_path = Path(rvl_cdip_images_path)
image_files = list(images_path.rglob("*.tif"))

if len(image_files) == expected_image_count:
    # The archive may already have been deleted if this cell is re-run.
    if os.path.exists(tar_file_path):
        os.remove(tar_file_path)
else:
    # Keep the archive and tell the user why, instead of silently doing nothing.
    print(
        f"Extraction incomplète : {len(image_files)} fichiers .tif trouvés "
        f"sur {expected_image_count} attendus ; l'archive est conservée."
    )

### 2.2. Autres fichiers

In [None]:
# Label files (one per split) are saved into the labels directory.
for label_set in ["train", "val", "test"]:
    url = f"https://huggingface.co/datasets/aharley/rvl_cdip/resolve/main/data/{label_set}.txt?download=true"
    # No streaming: the whole body is read at once via response.content.
    response = requests.get(url, timeout=(10, 60))
    # Fail loudly on HTTP errors instead of saving an error page as a label file.
    response.raise_for_status()
    with open(os.path.join(rvl_cdip_labels_path, f"{label_set}.txt"), "wb") as f:
        f.write(response.content)

# Additional repository files are saved at the root of the RVL-CDIP raw directory.
for additional_file in ["README.md", "dataset_infos.json"]:
    url = f"https://huggingface.co/datasets/aharley/rvl_cdip/resolve/main/{additional_file}?download=true"
    response = requests.get(url, timeout=(10, 60))
    response.raise_for_status()
    with open(os.path.join(raw_rvl_cdip_path, additional_file), "wb") as f:
        f.write(response.content)