In [1]:
import requests
import json

In [2]:
def request(url, params):
    data = None
    try:
        # Make the GET request
        response = requests.get(url, params=params)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the JSON response
            data = response.json()
        else:
            print(f"Error: {response.status_code} - {response.text}")
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
    return data


In [None]:
# https://<domain>/api/explore/v2.1/<resource>
BASE_URL = "https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records"

# Call to specicific museum
params = {

}

response = request(BASE_URL, params)

print(json.dumps(response, indent=4))


In [None]:
# TO DO
# which fields are useful


In [None]:
#  play around extractions
# how many records are in specific museum:
field0 = "nom_officiel_musee"
value0 = "musée du Louvre"
BASE_URL = "https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records"

# Call to specicific museum
params = {
    "where": f"search({field0},\"{value0}\")"
}

response = request(BASE_URL, params)

print(json.dumps(response, indent=4))

In [None]:
url = "https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records"
field = "nom_officiel_musee"
params = {
    "select": field,
    "group_by": field
}

response = request(url, params)
print(json.dumps(response, indent=4))

In [13]:
unique_museums = [entity.get("nom_officiel_musee") for entity in response["results"]]
unique_museums = unique_museums[1:]

# [m for m in unique_museums if "Musée du Louvre" in m]

### Extract unique museums

In [15]:
import unidecode

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [31]:
def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    text = "".join([char for char in text if char.isalnum() or char.isspace()])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words("french")]
    return " ".join(tokens)


In [None]:
preprocessed_titles = [preprocess_text(museum_name) for museum_name in unique_museums]
preprocessed_titles

### Compute similarity

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_titles)

similarity_matrix = cosine_similarity(tfidf_matrix)
print(similarity_matrix)

In [45]:
similarity_matrix_normalized = (similarity_matrix + 1)/2


In [52]:
distance = 1 - similarity_matrix_normalized
distance = np.maximum(0, distance)


In [36]:
from sklearn.cluster import DBSCAN
import numpy as np

In [None]:
similarity_threshold = 0.7
distance_matrix = 1 - similarity_matrix_normalized
distance_matrix = np.maximum(0, distance_matrix)
clustering = DBSCAN(eps=1-similarity_threshold, min_samples=1, metric="precomputed").fit(distance_matrix)

clusters = clustering.labels_
for cluster_id, title in sorted(zip(clusters, unique_museums)):
    print(f"Cluster {cluster_id}: {title}")

In [None]:
specific_museum = "musée du Louvre"
BASE_URL = "https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records"

# Call to specicific museum
params = {
    "where": f"search(nom_officiel_musee,\"{specific_museum}\")"
}

louvre_collection = request(BASE_URL, params)
louvre_collection

# Web-scraping

## 1. musée du Louvre 

In [22]:
# Find a piece "Service du roi d'Angleterre George III : Seau à verres, d'une paire"


In [12]:
BASE_URL = "https://collections.louvre.fr"
SEARCH = "recherche"
QUERY = "q"


In [3]:
from bs4 import BeautifulSoup
import requests

In [3]:
def search(query):
    """Search for Louvre's collection"""
    urls_to_results = []
    try:
        response = requests.get(url=f"{BASE_URL}/{SEARCH}", 
                                params={QUERY: query})
        soup = BeautifulSoup(response._content, "html.parser")
        results_container = soup.find("div", class_ = "search__results__container")
        results = results_container.find_all("div", class_="card__outer")
        urls_to_results = [f"{BASE_URL}/{r.find("a").attrs.get("href")}" for r in results]
    except:
        ...
    finally:
        return urls_to_results


In [4]:
piece = "Service du roi d'Angleterre George III : Seau à verres, d'une paire"
results = search(piece)

In [11]:
results[0]

'https://collections.louvre.fr//ark:/53355/cl010111604'

In [174]:
# example of one of the Louvre's pieces
"""
https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records?where=search%28titre%2C%22Service+du+roi+d%27Angleterre+George+III+%3A+Seau+%C3%A0+verres%2C+d%27une+paire%22%29
"""

'\nhttps://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records?where=search%28titre%2C%22Service+du+roi+d%27Angleterre+George+III+%3A+Seau+%C3%A0+verres%2C+d%27une+paire%22%29\n'

In [4]:
def get_piece_info(url):
    """returns dictionary of:
    - title
    - image resource url
    - author
    - means (materials, technique)
    - period of creation (start - end)
    - hystorical period 
    - place of creation
    - physical location
    - department of Louvre museum
    - description
    - historical information"""
    piece_info = None
    try:
        resource = requests.get(url)
        soup = BeautifulSoup(resource.content, "html.parser")
        content = soup.find("div", attrs={"id": "content"})
        
        piece_title = content.find(class_="notice__title h_1").text.strip()
        # image resource url
        piece_image_url = f"{BASE_URL}/{content.find("picture").find("img").attrs.get("src")}"
        # author
        piece_author = content.find(class_ = "notice__author").text.strip()
        # physical location
        piece_georeference = content.find(class_ = "notice__date geo-reference")
        if piece_georeference is not None:
            piece_georeference = piece_georeference.text.strip() 
        # department of Louvre museum
        museum_department = content.find_all(class_ = "notice__date")[-1].text.strip() # in joconde
        
        piece_details = content.find_all("div", class_="notice__fullcartel__part")

        detailed_info = dict()
        for i, _ in enumerate(['description', 'technical_info', 'creation', 'history'], start=1):
            fields = [field.text.strip() for field in piece_details[i].find_all("div", class_ = "part__label")]
            values = [field.text.strip() for field in piece_details[i].find_all("div", class_ = "part__content")]

            for f, v in zip(fields, values):
                detailed_info[f] = v
        piece_material_technique = detailed_info.get("Matière et technique")
        piece_info = {
            "title": piece_title,
            "image_url": piece_image_url,
            "author": piece_author,
            "means": piece_material_technique, 
            "creation_period": detailed_info.get("Date de création / fabrication"), 
            "creation_location": detailed_info.get("Lieu de création / fabrication / exécution"), 
            "physical_location": piece_georeference,
            "department": museum_department,
            "description": detailed_info.get("Description / Décor"), 
            "history": detailed_info.get("Historique de l'œuvre")
        }
    except:
        ...
    finally:
        return piece_info

# Test

In [None]:
get_piece_info('https://collections.louvre.fr/ark:/53355/cl010111604')

In [None]:
get_piece_info("https://collections.louvre.fr/ark:/53355/cl010052603")

In [61]:
from random import sample

In [None]:
louvre_collection

In [None]:
pieces = [piece["titre"] for piece in sample(louvre_collection["results"], 10)]
pieces

In [6]:
pieces = ["Service du roi d'Angleterre George III : Moutardier en forme de vase, d'une paire (OA 10611).",
 'Tapisserie Millefleurs : scène allégorique',
 'Secrétaire à abattant',
 'Histoire de Don Quichotte : Don Quichotte',
 "Nécessaire à thé du Régent, Philippe d'Orléans : tasse",
 "Service du roi d'Angleterre George III : Seau à verres, d'une paire",
 "Service du roi d'Angleterre George III : Huilier d'une paire",
 "Service du roi d'Angleterre George III : Chandelier, d'une paire",
 'Commode à trois vantaux ornée de 5 plaques en porcelaine',
 'Vase à dauphins']

In [7]:
search("Service du roi d'Angleterre George III : Moutardier en forme de vase, d'une paire (OA 10611).")

[]

In [14]:
query = "Service du roi d'Angleterre George III : Moutardier en forme de vase, d'une paire (OA 10611)."
response = requests.get(url=f"{BASE_URL}/{SEARCH}", 
                                params={QUERY: query})
response.status_code

200

In [15]:
soup = BeautifulSoup(response._content, "html.parser")

In [16]:
results_container = soup.find("div", class_ = "search__results__container")

In [17]:
results = results_container.find_all("div", class_="card__outer")

In [18]:
urls_to_results = [f"{BASE_URL}/{r.find("a").attrs.get("href")}" for r in results]
urls_to_results

['https://collections.louvre.fr//ark:/53355/cl010110033',
 'https://collections.louvre.fr//ark:/53355/cl010114056']

In [None]:
searches = [search(item) for item in pieces]
searches

In [None]:
searches

### import csv file of all Louvres entries


In [4]:
# import csv file of all Lovres entries
field0 = "nom_officiel_musee"
value0 = "musée du Louvre"
BASE_URL = "https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/exports/json"

# Call to specicific museum
params = {
    "where": f"domaine in (\"peinture\", \"dessin\") and search({field0},\"{value0}\")" # extract only paintings and drawings
}

louvre_collection = requests.get(BASE_URL, params).content



In [8]:
louvre_collection is None

False

In [26]:
BASE_URL

'https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/exports/json'

# Louvre csv file analysis

In [1]:
import pandas as pd

In [11]:
# paintings and gravures
louvre_0 = pd.read_csv("22_0_to_5000.csv", sep=";", quotechar='"')
louvre_1 = pd.read_csv("22_5000_to_10000.csv", sep=";", quotechar='"')
louvre_2 = pd.read_csv("22_10000_to_15000.csv", sep=";", quotechar='"')
# drawings
louvre_3 = pd.read_csv("13_0_to_5000.csv", sep=";", quotechar='"')
louvre_4 = pd.read_csv("13_5000_to_10000.csv", sep=";", quotechar='"')

louvre = pd.concat([louvre_0, louvre_1, louvre_2, louvre_3, louvre_4], axis=0)

In [26]:
louvre["url"] = louvre['ARK'].map(lambda ark: f"https://collections.louvre.fr/ark:/53355/{ark}.json")

In [30]:
len(louvre)

16702

In [31]:
import json

In [36]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [33]:
def get_retrying_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,  # Exponential delay: 0.3, 0.6, 1.2, ...
        status_forcelist=status_forcelist,
        raise_on_status=False
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


In [34]:
louvre["url"].values

array(['https://collections.louvre.fr/ark:/53355/cl010048442.json',
       'https://collections.louvre.fr/ark:/53355/cl010049217.json',
       'https://collections.louvre.fr/ark:/53355/cl010052603.json', ...,
       'https://collections.louvre.fr/ark:/53355/cl020629645.json',
       'https://collections.louvre.fr/ark:/53355/cl020629646.json',
       'https://collections.louvre.fr/ark:/53355/cl020629647.json'],
      dtype=object)

In [None]:
from tqdm import tqdm

In [37]:
louvre_ds = []
session = get_retrying_session()
for piece_url in tqdm(louvre["url"].values):
    try:
        request = session.get(piece_url)
        request.raise_for_status()
        louvre_ds.append(request.content)
    except Exception as e:
        print(f"Failed to GET reponse from {piece_url}: {e}")

        


In [38]:
import pickle

with open("louvre_ds", "wb") as f:
    pickle.dump(louvre_ds, f)

In [9]:
import json
import pickle

In [11]:
with open("louvre_ds", "rb") as f:
    louvre_ds = [json.loads(entity) for entity in pickle.load(f)]

In [12]:
louvre_ds[0]

{'arkId': 'cl010048442',
 'url': 'https://collections.louvre.fr/ark:/53355/cl010048442',
 'modified ': '2025-04-29',
 'title': 'Tyché de Constantinople',
 'titleComplement': '',
 'denominationTitle': [{'type': 'Titre', 'value': 'Tyché de Constantinople'},
  {'type': 'Dénomination', 'value': 'peinture'}],
 'displayDateCreated': 'Date de création/fabrication : époque byzantine (1e moitié VIIe s. ap. JC) (inscription/dédicace/signature) (600 - 649)',
 'currentLocation': 'non exposé',
 'room': '',
 'isMuseesNationauxRecuperation': False,
 'dateCreated': [{'startYear': 600,
   'endYear': 649,
   'imprecision': '',
   'text': 'époque byzantine',
   'type': 'Date de création/fabrication',
   'doubt': ''}],
 'creator': [],
 'objectNumber': [{'value': 'AF 10878', 'type': 'Numéro principal'},
  {'value': 'AF 10879', 'type': "Autre numéro d'inventaire"}],
 'collection': 'Département des Arts de Byzance et des Chrétientés en Orient',
 'printsDrawingsEntity': '',
 'printsDrawingsCollection': '',
 '

In [None]:
# 'title', 'classification', 'subjects', 'techniques', 'materials', 'description', 'categories', 'artist', 'id', 'date_start', 'date_end', 'department_title', 'image_id', 'full_info', 'image_url'


In [15]:
louvre_ds[2]

{'arkId': 'cl010052603',
 'url': 'https://collections.louvre.fr/ark:/53355/cl010052603',
 'modified ': '2025-04-29',
 'title': 'Vénitienne',
 'titleComplement': '',
 'denominationTitle': [{'type': 'Titre', 'value': 'Vénitienne'},
  {'type': 'Ancien titre',
   'value': 'Bourgeoise de Bologne - Femme à la mantille'}],
 'displayDateCreated': 'Date de création/fabrication : 2e tiers du XVIIIe siècle (vers 1755 - 1759)',
 'currentLocation': 'non exposé',
 'room': '',
 'isMuseesNationauxRecuperation': False,
 'dateCreated': [{'startYear': 1755,
   'endYear': 1759,
   'imprecision': 'vers',
   'text': '2e tiers du XVIIIe siècle',
   'type': 'Date de création/fabrication',
   'doubt': ''}],
 'creator': [{'label': 'Barbault, Jean',
   'attributionLevel': 'Attribution actuelle',
   'linkType': '',
   'dates': [{'date': '1762', 'place': 'Rome', 'type': 'date de mort'},
    {'date': '1718',
     'place': "Viarmes (Val-d'Oise)",
     'type': 'date de naissance'}],
   'creatorRole': '',
   'authenti

In [59]:
fields = ['image', 'index', 'materialsAndTechniques', 'description', 'inscriptions', 'displayDateCreated', 'date_end', 'date_start', 'title', 'url', 'arkId', 'attribution', 'artist', 'bibliography']

In [None]:
from functools import reduce

In [98]:
louvre = {f: [] for f in fields}

for i, entity in enumerate(louvre_ds):
    try:
        for f in fields:
            if f in ["date_end", 'date_start'] :
                d = entity.get('dateCreated', [])
                if len(d) > 0:
                    if f == "date_end":
                        d = str(d[0].get('endYear', ''))
                    else:
                        d = str(d[0].get('startYear', ''))
                else:
                    d = ''
                louvre[f].append(d)

            elif f in ['attribution', 'artist']:
                cs = entity.get('creator', '')
                author, creator = '',  ''
                if cs: 
                    author = cs[0].get('label', '') if len(cs) > 0 else ''
                    creator = ". ".join([f"{c.get('linkType', '')} {c.get('label', '')}, {c.get('attributionLevel', '')}, {c.get('creatorRole', '')}"  for c in cs])
                if f == 'artist':
                    louvre[f].append(author)
                else: 
                    louvre[f].append(creator)

            elif f == 'index':
                features = entity.get('index')
                index = ''
                if not isinstance(features, list):
                    index = '. '.join([f'{feature_name}: {', '.join(reduce(lambda x, y: x + y, 
                                                               [list(v.values()) for v in values]))}' 
                                for feature_name, values in features.items()])
                louvre[f].append(index)

            elif f == 'image':
                images = entity.get('image', [])
                images = [im.get('urlImage') for im in images]
                louvre[f].append(images[0] if len(images) > 0 else '')

            elif f == 'bibliography':
                bibliographies = entity.get('bibliography', [])
                bibliographies = '; '.join([bib.get('bibliographyRef', '') for bib in bibliographies])
                louvre[f].append(bibliographies)

            else:
                louvre[f].append(entity.get(f, ''))
    except Exception as e:
        print(f"[{i}] error on entity {entity.get('url')}: {e}")

                

In [None]:
# NER on 'artist'

In [68]:
from datasets import Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [103]:
louvre_dataset = Dataset.from_pandas(pd.DataFrame(data=louvre))
# louvre_dataset.save_to_disk("louvre_ds")

In [104]:
louvre_dataset

Dataset({
    features: ['image', 'index', 'materialsAndTechniques', 'description', 'inscriptions', 'displayDateCreated', 'date_end', 'date_start', 'title', 'url', 'arkId', 'attribution', 'artist', 'bibliography'],
    num_rows: 16702
})

In [108]:
import os

num_proc = os.cpu_count()

In [110]:
louvre_dataset = louvre_dataset.map(lambda batch: {
    'displayDateCreated': [x.replace('Date de création/fabrication :', '').strip() 
                           for x in batch['displayDateCreated']]
    }, 
                   batched=True, 
                   batch_size=1000, 
                   num_proc=num_proc)

Map (num_proc=8):   0%|          | 0/16702 [00:00<?, ? examples/s]

Map (num_proc=8): 100%|██████████| 16702/16702 [00:07<00:00, 2232.90 examples/s]


In [115]:
louvre_dataset.save_to_disk("louvre_ds_22_13")

Saving the dataset (1/1 shards): 100%|██████████| 16702/16702 [00:00<00:00, 291812.01 examples/s]


In [116]:
from datasets import load_from_disk

In [117]:
louvre_dataset = load_from_disk("louvre_ds_22_13")

In [118]:
louvre_dataset

Dataset({
    features: ['image', 'index', 'materialsAndTechniques', 'description', 'inscriptions', 'displayDateCreated', 'date_end', 'date_start', 'title', 'url', 'arkId', 'attribution', 'artist', 'bibliography'],
    num_rows: 16702
})

{'image': 'https://collections.louvre.fr/media/cache/large/0000000021/0000048442/0000040685_OG.JPG',
 'index': "Mode d'acquisition: partage après fouilles. denomination: peinture. material: encre, épicéa. technic: écriture, badigeonnage, peinture. description: couronne, palmette, Tyché, manteau. period: époque byzantine. place: Edfou. inscriptionWritingSystem: grec oncial, arabe, araméen",
 'materialsAndTechniques': 'Matériau : épicéa\r\nMatériau secondaire : encre\r\nTechnique : peinture, écriture, badigeonnage',
 'description': 'Décor : Tyché ; couronne ; manteau ; palmette',
 'inscriptions': 'Écriture : \r ; arabe (au revers)\r ; araméen (melkite, sous la peinture)\r ; grec oncial\r ; Traduction : \r ; la belle florissante',
 'displayDateCreated': 'époque byzantine (1e moitié VIIe s. ap. JC) (inscription/dédicace/signature) (600 - 649)',
 'date_end': '649',
 'date_start': '600',
 'title': 'Tyché de Constantinople',
 'url': 'https://collections.louvre.fr/ark:/53355/cl010048442',
 'ar