In [None]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import pandas as pd

from functools import reduce
import pickle
import json
from tqdm import tqdm
import os

from datasets import Dataset


In [None]:
# CSV exports that were downloaded from the website with HTTP requests.
# The "22_*" files hold paintings and gravures; the "13_*" files hold drawings.
paintings_and_gravures_csvs = (
    "22_0_to_5000.csv",
    "22_5000_to_10000.csv",
    "22_10000_to_15000.csv",
)
drawings_csvs = (
    "13_0_to_5000.csv",
    "13_5000_to_10000.csv",
)

# Every export is semicolon-separated with double-quoted fields.
louvre_0, louvre_1, louvre_2, louvre_3, louvre_4 = [
    pd.read_csv(csv_path, sep=";", quotechar='"')
    for csv_path in paintings_and_gravures_csvs + drawings_csvs
]

# Stack all exports row-wise into a single table.
louvre = pd.concat([louvre_0, louvre_1, louvre_2, louvre_3, louvre_4], axis=0)


In [None]:
def _ark_to_json_url(ark):
    """Return the collections.louvre.fr JSON URL built from one ARK value."""
    return f"https://collections.louvre.fr/ark:/53355/{ark}.json"


# Derive the per-record JSON URL from the "ARK" column.
louvre["url"] = louvre["ARK"].map(_ark_to_json_url)

In [None]:
def get_retrying_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Build a ``requests.Session`` that retries failed HTTP(S) requests.

    Args:
        retries: Retry budget, applied to the total, read and connect counters.
        backoff_factor: Factor passed to urllib3's ``Retry`` to space out
            successive attempts.
        status_forcelist: HTTP status codes that should trigger a retry.

    Returns:
        A session with the same retrying adapter mounted for both the
        ``http://`` and ``https://`` prefixes.
    """
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
        # Hand back the last response instead of raising once retries run out;
        # the caller decides how to treat the status code.
        raise_on_status=False,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    for url_prefix in ("http://", "https://"):
        session.mount(url_prefix, retrying_adapter)
    return session


In [None]:
# (connect, read) timeout in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection and the whole download loop hangs.
REQUEST_TIMEOUT = (10, 60)

louvre_ds = []    # raw response bodies (bytes), one per successfully fetched URL
failed_urls = []  # URLs whose download failed, kept so they can be retried later
session = get_retrying_session()
for piece_url in tqdm(louvre["url"].values):
    try:
        response = session.get(piece_url, timeout=REQUEST_TIMEOUT)
        # Turn 4xx/5xx answers (including those left after retries) into exceptions.
        response.raise_for_status()
        louvre_ds.append(response.content)
    except Exception as e:
        # Best-effort download: record the failure and move on to the next URL.
        failed_urls.append(piece_url)
        print(f"Failed to GET reponse from {piece_url}: {e}")

        


In [None]:
# Serialize the collected payloads to the local file "louvre_ds".
with open("louvre_ds", "wb") as cache_file:
    pickle.dump(louvre_ds, cache_file)

In [None]:
# Read the pickled payloads back from "louvre_ds" and parse each one as JSON.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# a "louvre_ds" file that this notebook produced itself.
with open("louvre_ds", "rb") as cache_file:
    raw_payloads = pickle.load(cache_file)
louvre_ds = list(map(json.loads, raw_payloads))

In [None]:
def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat


In [None]:
# Field names, listed one per line for readability.
fields = [
    'title',
    'classification',
    'subjects',
    'techniques',
    'materials',
    'description',
    'categories',
    'artist',
    'id',
    'date_start',
    'date_end',
    'department_title',
    'image_id',
    'full_info',
    'image_url',
]

In [None]:
# Wrap the `louvre` table in a DataFrame and convert it into a `Dataset`.
# NOTE(review): this builds the dataset from `louvre` (the CSV table), not from
# the parsed JSON records in `louvre_ds` — confirm that this is the intended source.
louvre_frame = pd.DataFrame(data=louvre)
louvre_dataset = Dataset.from_pandas(louvre_frame)


In [None]:
# Number of worker processes to use. `os.cpu_count()` returns None when the
# CPU count cannot be determined, so fall back to a single process.
num_proc = os.cpu_count() or 1

In [None]:

# Label that precedes the date text in the 'displayDateCreated' column.
DATE_CREATED_PREFIX = 'Date de création/fabrication :'


def _strip_date_created_prefix(batch):
    """Remove the creation-date label from a batch of 'displayDateCreated' values.

    Args:
        batch: Mapping of column name to a list of values, as passed by
            ``Dataset.map`` when ``batched=True``.

    Returns:
        A dict holding the cleaned 'displayDateCreated' column. String values
        have the label removed and surrounding whitespace stripped; non-string
        values (e.g. ``None`` for a missing date) are passed through unchanged
        instead of raising ``AttributeError``.
    """
    return {
        'displayDateCreated': [
            value.replace(DATE_CREATED_PREFIX, '').strip() if isinstance(value, str) else value
            for value in batch['displayDateCreated']
        ]
    }


louvre_dataset = louvre_dataset.map(
    _strip_date_created_prefix,
    batched=True,
    batch_size=1000,
    num_proc=num_proc,
)

In [None]:
# Persist the processed dataset under the path "louvre_ds_22_13".
output_path = "louvre_ds_22_13"
louvre_dataset.save_to_disk(output_path)