In [None]:
%pip install requests tqdm

In [None]:
import requests
import csv
import os
from tqdm import tqdm

def fetch_louvre_data(start: int = 1, end: int = 100, output: str = "louvre_peintures.csv", error_log: str = "erreurs.log", done_ids_path: str = "done_ids.txt") -> None:
    """Fetch Louvre collection records and append the paintings to a CSV file.

    For every integer ``i`` in ``start..end`` (both inclusive) the record
    ``https://collections.louvre.fr/ark:/53355/cl<i zero-padded to 9 digits>.json``
    is requested. A record is appended to ``output`` only when the response
    status is 200 and its ``collection`` field equals
    ``"Département des Peintures"``.

    The run is resumable: IDs already present in the ``arkId`` column of
    ``output`` or listed in ``done_ids_path`` are skipped without a request.
    Every ID that is actually attempted is appended to ``done_ids_path``
    exactly once, whatever the outcome (stored, filtered out, HTTP error or
    exception), so failed IDs are not retried by later runs.

    Args:
        start: First record number to try.
        end: Last record number to try (inclusive).
        output: CSV file to append to; the header row is written when the
            file is empty.
        error_log: Text file receiving one ``"<arkId> - <message>"`` line per
            unexpected exception. Exceptions whose lower-cased message
            contains "404", "403", "timeout" or "status code" are not logged.
        done_ids_path: Text file with one attempted ID per line. It is opened
            in append mode for the whole run, so it is created even when no
            ID is attempted.

    Returns:
        None. Results are written to the three files and a completion line is
        printed.
    """
    record_url = "https://collections.louvre.fr/ark:/53355/{}.json"
    fieldnames = ["arkId", "title", "collection", "creator", "image"]
    # Exception messages containing one of these markers are treated as
    # expected noise and kept out of the error log.
    ignored_markers = ("404", "403", "timeout", "status code")

    # ARK ids already stored in the CSV. `.get` tolerates a pre-existing file
    # that lacks the "arkId" column instead of raising KeyError.
    existing_arks = set()
    if os.path.exists(output):
        with open(output, newline="", encoding="utf-8") as f:
            existing_arks = {row.get("arkId") for row in csv.DictReader(f)}

    # ARK ids already attempted by a previous run (successful or not).
    done_ids = set()
    if os.path.exists(done_ids_path):
        with open(done_ids_path, encoding="utf-8") as f:
            done_ids = {line.strip() for line in f}

    # All output files are opened once in append mode; one HTTP session is
    # shared so the connection to the host is reused across requests.
    with open(output, mode="a", newline="", encoding="utf-8") as f_out, \
         open(error_log, mode="a", encoding="utf-8") as f_err, \
         open(done_ids_path, mode="a", encoding="utf-8") as f_done, \
         requests.Session() as session:

        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        if os.stat(output).st_size == 0:
            writer.writeheader()

        for i in tqdm(range(start, end + 1)):
            ark_id = f"cl{i:09d}"
            if ark_id in existing_arks or ark_id in done_ids:
                continue

            try:
                r = session.get(record_url.format(ark_id), timeout=5)
                if r.status_code != 200:
                    # Expected misses (404, ...) are not logged; the
                    # `finally` clause below still records the ID as done.
                    continue

                data = r.json()
                if data.get("collection", "") != "Département des Peintures":
                    continue

                image = ""
                if isinstance(data.get("image"), list) and data["image"]:
                    image = data["image"][0].get("urlImage", "")

                # `or` guards against an explicit null creator list or label,
                # which would otherwise raise and lose the whole row.
                creators = ", ".join(
                    c.get("label") or "" for c in (data.get("creator") or [])
                )

                writer.writerow({
                    "arkId": data.get("arkId", ""),
                    "title": data.get("title", ""),
                    "collection": data.get("collection", ""),
                    "creator": creators,
                    "image": image,
                })
                # Flush per row so an interrupted run keeps what it fetched.
                f_out.flush()

            except Exception as e:
                msg = str(e).lower()
                if not any(marker in msg for marker in ignored_markers):
                    f_err.write(f"{ark_id} - {e}\n")
                    f_err.flush()
            finally:
                # Single place where an attempted ID is recorded: it runs for
                # every outcome, including the `continue` branches above.
                f_done.write(ark_id + "\n")
                f_done.flush()

    print(f"✅ Traitement terminé de {start} à {end}")


In [None]:
# Fetch record numbers 50000 through 51000 (both inclusive), using the
# function's default CSV, error-log and done-ids file paths.
fetch_louvre_data(start=50000, end=51000)

In [None]:
import os
import csv
import requests
from tqdm import tqdm
from urllib.parse import urlparse

def telecharger_images(csv_path: str = "louvre_peintures.csv", dossier_images: str = "images", log_erreurs: str = "erreurs_images.log") -> None:
    """Download the image referenced by each CSV row into a local folder.

    Each row of ``csv_path`` is read by column name. Rows with an empty
    ``arkId`` or ``image`` value are skipped. The image is saved as
    ``<dossier_images>/<arkId>.jpg``; a row whose target file already exists
    is skipped, so the function can be re-run to resume.

    The response body is first written to ``<target>.part`` and then moved
    onto the final name, so an interrupted write never leaves a truncated
    ``.jpg`` that a later run would mistake for a finished download. A
    leftover ``.part`` file is simply overwritten on the next attempt.

    Args:
        csv_path: CSV file with at least ``arkId`` and ``image`` columns.
        dossier_images: Destination folder; created if missing.
        log_erreurs: Text file (append mode) receiving one
            ``"<arkId> - <url> - <message>"`` line per failed download,
            including non-200 responses (``"Status code <n>"``).

    Returns:
        None. A completion line is printed.
    """
    os.makedirs(dossier_images, exist_ok=True)

    # One HTTP session is shared so connections are reused across downloads.
    with open(csv_path, newline="", encoding="utf-8") as f_csv, \
         open(log_erreurs, mode="a", encoding="utf-8") as f_log, \
         requests.Session() as session:

        reader = csv.DictReader(f_csv)
        for row in tqdm(reader):
            ark_id = row.get("arkId", "")
            url_image = row.get("image", "")

            if not ark_id or not url_image:
                continue  # nothing to download for this row

            chemin_image = os.path.join(dossier_images, f"{ark_id}.jpg")
            if os.path.exists(chemin_image):
                continue  # already downloaded

            chemin_partiel = chemin_image + ".part"
            erreur = None
            try:
                r = session.get(url_image, timeout=10)
                if r.status_code == 200:
                    with open(chemin_partiel, "wb") as f_img:
                        f_img.write(r.content)
                    # Publish the file under its final name only once it has
                    # been written completely.
                    os.replace(chemin_partiel, chemin_image)
                else:
                    erreur = f"Status code {r.status_code}"
            except Exception as e:
                # Best effort: record the failure and move on to the next row.
                erreur = str(e)

            if erreur is not None:
                f_log.write(f"{ark_id} - {url_image} - {erreur}\n")
                f_log.flush()

    print("✅ Téléchargement terminé.")

# Start the image download with its default arguments when this code runs as
# the main module.
if __name__ == "__main__":
    telecharger_images()