In [1]:
import requests
import json

In [2]:
def request(url, params):
    data = None
    try:
        # Make the GET request
        response = requests.get(url, params=params)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the JSON response
            data = response.json()
        else:
            print(f"Error: {response.status_code} - {response.text}")
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
    return data

In [4]:
# https://<domain>/api/explore/v2.1/<resource>
BASE_URL = "https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records"

# Call to specicific museum
params = {
    # "limit": 5,  
    # "offset": 0, 
    # "lan": "fr"
}

response = request(BASE_URL, params)

print(json.dumps(response, indent=4))

{
    "total_count": 707540,
    "results": [
        {
            "reference": "50510016755",
            "ancien_depot": null,
            "appellation": null,
            "ancienne_appartenance": "Polakovits Mathias",
            "ancienne_attribution": null,
            "auteur": "DUBREUIL Toussaint",
            "bibliographie": null,
            "commentaires": null,
            "presence_image": "non",
            "date_d_acquisition": "1987 entr\u00e9e mat\u00e9rielle",
            "date_de_depot": null,
            "decouverte_collecte": null,
            "denomination": "recto verso",
            "lieu_de_depot": null,
            "description": "plume et encre, lavis brun, trac\u00e9 pr\u00e9paratoire \u00e0 la sanguine",
            "mesures": "H. 28.9 ; L. 41.5",
            "date_de_mise_a_jour": "2022-07-22",
            "date_creation": "2002-05-21",
            "domaine": [
                "dessin"
            ],
            "region": "Ile-de-France",
            "dep

In [5]:
# fields in french
fields = response["results"][0].keys()


In [13]:
# display fields in fr to copy-paste to translator 
[print(" ".join(field.split("_"))) for field in fields]

reference
ancien depot
appellation
ancienne appartenance
ancienne attribution
auteur
bibliographie
commentaires
presence image
date d acquisition
date de depot
decouverte collecte
denomination
lieu de depot
description
mesures
date de mise a jour
date creation
domaine
region
departement
date sujet represente
ecole pays
epoque
exposition
genese
geographie historique
inscription
numero inventaire
appellation musee de france
lien base arcade
lieu de creation utilisation
localisation
ville
lien video
manquant
manquant com
millesime de creation
millesime d utilisation
code museofile
nom officiel musee
genre
onomastique
precisions sur l auteur
precisions decouverte collecte
periode de l original copie
periode de creation
periode d utilisation
precisions inscriptions
precisions lieux creations
precisions sujets representes
precisions utilisation
references memoires
references merimee
reference maj
references palissy
sujet represente
lien inha
source de la representation
statut juridique
mater

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [6]:
translated_fields = """Reference
Former repository
Name
Former ownership
Former attribution
Author
Bibliography
Comments
Image presence
Acquisition date
Deposit date
Discovery collection
Name
Deposit location
Description
Measurements
Update date
Creation date
Domain
Region
Department
Date of subject represented
School country
Era
Exhibition
Genesis
Historical geography
Inscription
Inventory number
Museum of France name
Archway database link
Place of creation use
Location
City
Video link
Missing
Missing com
Creation year
Use year
Museofile code
Official museum name
Genre
Onomastic
Author details
Discovery collection details
Original copy period
Creation period
Use period
Inscription details
Creation location details
Subject represented details
Use details
References memoirs
Merimee references
Reference Update
Palissy references
Subject represented
Link to inha
Source of representation
Legal status
Technical materials
Title
Use
Link to associated site
Contact information
Artist under copyright
Date of entry into the public domain""".split("\n")

translated_fields = ["_".join(field.lower().split()) for field in translated_fields]
# [print(f) for f in translated_fields]
fr_to_eng_fields = {fr_field: eng_field for fr_field, eng_field in zip(fields, translated_fields)}


In [None]:
# TO DO
# which fields are useful


In [41]:
#  play around extractions
# how many records are in specific museum:
field0 = "nom_officiel_musee"
value0 = "musée de l'Ecole nationale supérieure des beaux-arts"
BASE_URL = "https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records"

# Call to specicific museum
params = {
    "where": f"search({field0},\"{value0}\")"
}

response = request(BASE_URL, params)

print(response["total_count"])

618


### Extract list of distinct museums 

In [7]:
url = "https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records"
field = "nom_officiel_musee"
params = {
    "select": field,
    "group_by": field
}

response = request(url, params)
print(json.dumps(response, indent=4))

{
    "results": [
        {
            "nom_officiel_musee": null
        },
        {
            "nom_officiel_musee": "ARCH\u00c9A, Arch\u00e9ologie en Pays de France"
        },
        {
            "nom_officiel_musee": "Biblioth\u00e8que de l\u2019Assembl\u00e9e Nationale"
        },
        {
            "nom_officiel_musee": "Biblioth\u00e8que municipale de Besan\u00e7on"
        },
        {
            "nom_officiel_musee": "Biblioth\u00e8que municipale de Grasse"
        },
        {
            "nom_officiel_musee": "CAPC-mus\u00e9e d'Art Contemporain"
        },
        {
            "nom_officiel_musee": "Carr\u00e9 d'art - mus\u00e9e d'art contemporain"
        },
        {
            "nom_officiel_musee": "Ch\u00e2teau-mus\u00e9e"
        },
        {
            "nom_officiel_musee": "Collections de la Fondation de Coubertin"
        },
        {
            "nom_officiel_musee": "Ecomus\u00e9e de la Basse-Seine"
        },
        {
            "nom_officiel_musee

In [8]:
unique_museums = [entity.get("nom_officiel_musee") for entity in response["results"]]
unique_museums

[None,
 'ARCHÉA, Archéologie en Pays de France',
 'Bibliothèque de l’Assemblée Nationale',
 'Bibliothèque municipale de Besançon',
 'Bibliothèque municipale de Grasse',
 "CAPC-musée d'Art Contemporain",
 "Carré d'art - musée d'art contemporain",
 'Château-musée',
 'Collections de la Fondation de Coubertin',
 'Ecomusée de la Basse-Seine',
 'Ethnothèque, Musée des Boucles de la Seine Normande',
 'Familistère de Guise',
 'Historial de la Vendée',
 "LAAC, Lieu d'art et d'action contemporaine",
 'La Piscine – musée d’art et d’industrie André Diligent de Roubaix',
 "LaM, Lille Métropole musée d'art moderne, d'art contemporain et d'art brut",
 'Le Beffroi - musée Boucher de Perthes - Manessier',
 "Le Carroi, musée d'arts et d'histoire",
 'Le Musée - Arts & Figures des Pyrénées Centrales',
 "Les Abattoirs, musée d'art moderne et contemporain",
 'Les Pêcheries, musée de Fécamp',
 'Lugdunum-musée et théâtres romains',
 "MUDO - musée de l'Oise",
 "MUS - musée d'histoire urbaine et sociale de Sure

In [9]:
# Remove the 'no-name' museum
unique_museums.pop(0)
len(unique_museums)

473

### Extract unique museums

In [15]:
import unidecode

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [31]:
def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    text = "".join([char for char in text if char.isalnum() or char.isspace()])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words("french")]
    return " ".join(tokens)


In [32]:
preprocessed_titles = [preprocess_text(museum_name) for museum_name in unique_museums]
preprocessed_titles

['archea archeologie pays france',
 'bibliotheque lassemblee nationale',
 'bibliotheque municipale besancon',
 'bibliotheque municipale grasse',
 'capcmusee dart contemporain',
 'carre dart musee dart contemporain',
 'chateaumusee',
 'collections fondation coubertin',
 'ecomusee basseseine',
 'ethnotheque musee boucles seine normande',
 'familistere guise',
 'historial vendee',
 'laac lieu dart daction contemporaine',
 'piscine musee dart dindustrie andre diligent roubaix',
 'lam lille metropole musee dart moderne dart contemporain dart brut',
 'beffroi musee boucher perthes manessier',
 'carroi musee darts dhistoire',
 'musee arts figures pyrenees centrales',
 'abattoirs musee dart moderne contemporain',
 'pecheries musee fecamp',
 'lugdunummusee theatres romains',
 'mudo musee loise',
 'mus musee dhistoire urbaine sociale suresnes',
 'muba eugene leroy tourcoing',
 'maison victor hugo',
 'maison larcheologie vosges nord',
 'maison lumieres denis diderot',
 'maison verre cristal',
 'm

### Compute similarity

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [34]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_titles)

similarity_matrix = cosine_similarity(tfidf_matrix)
print(similarity_matrix)

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.3061462  ... 0.         0.         0.        ]
 [0.         0.3061462  1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.45142483 0.53075849]
 [0.         0.         0.         ... 0.45142483 1.         0.52531495]
 [0.         0.         0.         ... 0.53075849 0.52531495 1.        ]]


In [45]:
similarity_matrix_normalized = (similarity_matrix + 1)/2


In [52]:
distance = 1 - similarity_matrix_normalized
distance = np.maximum(0, distance)

In [36]:
from sklearn.cluster import DBSCAN
import numpy as np

In [56]:
similarity_threshold = 0.7
distance_matrix = 1 - similarity_matrix_normalized
distance_matrix = np.maximum(0, distance_matrix)
clustering = DBSCAN(eps=1-similarity_threshold, min_samples=1, metric="precomputed").fit(distance_matrix)

clusters = clustering.labels_
for cluster_id, title in sorted(zip(clusters, unique_museums)):
    print(f"Cluster {cluster_id}: {title}")

Cluster 0: ARCHÉA, Archéologie en Pays de France
Cluster 1: Bibliothèque de l’Assemblée Nationale
Cluster 2: Bibliothèque municipale de Besançon
Cluster 2: Bibliothèque municipale de Grasse
Cluster 3: CAPC-musée d'Art Contemporain
Cluster 3: Carré d'art - musée d'art contemporain
Cluster 3: LaM, Lille Métropole musée d'art moderne, d'art contemporain et d'art brut
Cluster 3: Les Abattoirs, musée d'art moderne et contemporain
Cluster 3: MUDO - musée de l'Oise
Cluster 3: Musée d'Art Moderne André Malraux - MuMa
Cluster 3: collections du musée de la société archéologique de Touraine
Cluster 3: hôtel Cabu, musée d’histoire et d’archéologie
Cluster 3: les Abattoirs, musée d'art moderne et contemporain
Cluster 3: musée Antoine Lécuyer
Cluster 3: musée Antoine Vivenel
Cluster 3: musée Archéologique départemental du Val-d'Oise
Cluster 3: musée Bonnat-Helleu, musée des Beaux-Arts de Bayonne
Cluster 3: musée Municipal
Cluster 3: musée Municipal d'Art et d'Histoire
Cluster 3: musée Saint-Raymond


In [None]:
"https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records?where=search(nom_officiel_musee,%20%22mus%C3%A9e%20de%20l%27Ecole%20nationale%20sup%C3%A9rieure%20des%20beaux-arts%22)"
"https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records?where=search(nom_officiel_musee,mus%C3%A9e%20de%20l%27Ecole%20nationale%20sup%C3%A9rieure%20des%20beaux-arts"
"https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records?where=search(nom_officiel_musee,___%22mus%C3%A9e%20de%20l%27Ecole%20nationale%20sup%C3%A9rieure%20des%20beaux-arts%22"


In [37]:
BASE_URL + "?where=" + params["where"]

'https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records?where=search(nom_officiel_musee,mus%C3%A9e%20de%20l%27Ecole%20nationale%20sup%C3%A9rieure%20des%20beaux-arts)'

In [30]:
encoded_value0

'%22mus%C3%A9e%20de%20l%27Ecole%20nationale%20sup%C3%A9rieure%20des%20beaux-arts%22'

In [21]:
# filte records
filtered_records = [
    record
    for record in response["results"]
    if record.get("nom_officiel_musee") == specific_museum
]


In [23]:
len(filtered_records)

6

In [None]:
specific_museum = "musée de l'Ecole nationale supérieure des beaux-arts"
BASE_URL = "https://data.culture.gouv.fr/api/explore/v2.1/catalog/datasets/base-joconde-extrait/records"

# Call to specicific museum
params = {
    # "limit": 5,  
    # "offset": 0, 
    # "nom_officiel_musee": specific_museum,
    "search": f"(nom_officiel_musee, \"{'+'.join(specific_museum.split())}\""
    # "lan": "fr"
}

response = request(BASE_URL, params)

In [24]:
print(f"(nom_officiel_musee, \"{'+'.join(specific_museum.split())}\"")

(nom_officiel_musee, "musée+de+l'Ecole+nationale+supérieure+des+beaux-arts"
