This notebook queries Wikidata to enrich the artwork catalogue with a picture URL and the dimensions (width and height) of each painting.

In [1]:
from bs4 import BeautifulSoup
import requests

from tqdm.autonotebook import tqdm
import pandas as pd
import numpy as np
import json

from string import punctuation
from enum import Enum
import re

from PIL import Image



In [2]:
# Load the artwork catalogue; later cells normalise it and add the scraped columns.
artworks_df = pd.read_csv(filepath_or_buffer="data/catalogue_artworks.csv")

In [3]:
def standardize_string(s):
    """
    Normalise a raw value into the plain lowercase text used to build queries.

    The value is converted with ``str()``, lowercased, every hyphen becomes a
    space, leading/trailing spaces are trimmed and, last, every punctuation
    character except the apostrophe is deleted. Because trimming happens
    before the punctuation is removed, a space that preceded a trailing
    punctuation mark is kept.

    Parameters:
    s : a raw string (any other value is converted with ``str()``)

    return:
    string: standardized string

    """
    # Apostrophes survive; all other ASCII punctuation is deleted.
    removable = punctuation.replace("'", "")
    deletion_table = str.maketrans("", "", removable)

    lowered = str(s).lower()
    trimmed = lowered.replace("-", " ").strip(" ")
    return trimmed.translate(deletion_table)

In [4]:
# Titles are normalised so they can be dropped straight into a search query.
artworks_df["title"] = artworks_df["title"].apply(standardize_string)

# Authors: keep only the first two comma-separated parts of the raw field,
# normalise them, then split on single spaces into a list of name tokens.
artworks_df["author"] = artworks_df["author"].apply(
    lambda raw_author: standardize_string("".join(raw_author.split(",")[:2])).split(" ")
)

# Query Wikidata

In [5]:
def find_property(soup, prop):
    """
    Find the first value of a given property in a Wikidata page.

    Parameters:
    soup : parsed Wikidata item page
    prop : Wikidata property identifier, e.g. "P2048"

    return:
    string: text of the first value block found under a division tagged with
            that property id, or None when no such value exists
    """
    value_class = "wikibase-snakview-value wikibase-snakview-variation-valuesnak"

    for division in soup.findAll("div", {"data-property-id": prop}):
        values = division.findAll("div", {"class": value_class})
        # A division without any value block is skipped; the next one is tried.
        if values:
            return values[0].text
    return None
                
def find_width(soup):
    """Return the first value of Wikidata property P2049 in the page, or None if absent."""
    width_property_id = "P2049"
    return find_property(soup, width_property_id)

def find_height(soup):
    """Return the first value of Wikidata property P2048 in the page, or None if absent."""
    height_property_id = "P2048"
    return find_property(soup, height_property_id)

def find_image_joconde(soup):
    """
    Look up a picture of the artwork through the French "Joconde" database.

    The Joconde identifier is read from Wikidata property P347 of the given
    page. The matching notice on pop.culture.gouv.fr is then downloaded and
    the first <img> whose ``src`` starts with "http" is returned.

    Parameters:
    soup : parsed Wikidata item page

    return:
    string: URL of the image, or None when the page has no Joconde ID or the
            notice contains no usable image
    """
    joconde_id = find_property(soup, "P347")
    if joconde_id is None:
        return None

    url = "https://www.pop.culture.gouv.fr/notice/joconde/" + joconde_id
    soup_joconde = BeautifulSoup(requests.get(url).text, "html.parser")

    for image in soup_joconde.findAll("img"):
        # An <img> tag is not guaranteed to carry a src attribute; indexing
        # image["src"] would raise KeyError and abort the caller's loop.
        src = image.get("src")
        if src and src.startswith("http"):
            return src
    return None

def find_image(soup):
    """
    Return an image URL for the artwork described by a Wikidata page.

    Parameters:
    soup : parsed Wikidata item page

    return:
    string: the "content" of the first og:image <meta> tag when the page has
            one, otherwise whatever find_image_joconde returns (possibly None)
    """
    og_images = soup.findAll("meta", {"property": "og:image"})
    if og_images:
        return og_images[0]["content"]

    # No image advertised by the Wikidata page: fall back to the Joconde reference.
    return find_image_joconde(soup)


    
# Base of the Wikidata search URL; it is the same for every row, so build it once.
url_base = "https://www.wikidata.org/w/index.php?search=&search="

# iterrows() is a generator without a length, so tqdm needs total= to draw a
# real progress bar instead of an indeterminate one.
for index, row in tqdm(artworks_df.iterrows(), total=len(artworks_df)):
    # Query: first author token, last author token, then the title words.
    url = url_base+"+"+row.author[0]+"+"+row.author[-1]+"+"+row.title.replace(" ","+")

    soup_search = BeautifulSoup(requests.get(url).text, "html.parser")

    # Only the top search hit is used; rows without a hit are left untouched.
    first_result = soup_search.find("li", class_="mw-search-result")
    if first_result is None:
        continue
    link = first_result.find("a")
    if link is None:
        continue

    url_painting = "https://www.wikidata.org" + link["href"]
    soup_painting = BeautifulSoup(requests.get(url_painting).text, "html.parser")

    # Each helper returns None when the information is missing from the page.
    artworks_df.loc[index, "width"] = find_width(soup_painting)
    artworks_df.loc[index, "height"] = find_height(soup_painting)
    artworks_df.loc[index, "image_url"] = find_image(soup_painting)
                    

                

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
# Summary of how much information the crawl retrieved.
print("Total number of rows :",len(artworks_df))
# "Fully retrieved" means width, height and image are all present. A bare
# dropna() would also discard rows with a missing value in any other column.
print("Artwork fully retrieved :",len(artworks_df.dropna(subset=["width","height","image_url"])))
print("Image retrieved :",len(artworks_df["image_url"].dropna()))
print("Dimensions retrieved :",len(artworks_df[["width","height"]].dropna()))

Total number of rows : 2254
Artwork fully retrieved : 658
Image retrieved : 690
Dimensions retrieved : 696


In [7]:
# Fixed random_state so the preview is identical on every Restart & Run All.
artworks_df.sample(n=5, random_state=0)

Unnamed: 0.1,Unnamed: 0,number,author,life,title,position,width,height,image_url
189,204,2801,"[corot, jean, baptiste, camille]",(1796-1875).,le vallon,T. T.,53.5±0.1 centimetre,35±1 centimetre,https://upload.wikimedia.org/wikipedia/commons...
1037,1168,796 b,"[rigaud, et, sevin, de, la, pennaye, charles]",(1685- 1741).,portrait de jacques benigne bossuet évêque de ...,XIV-N,,,
1232,1473,1123,"[amerighi, exactement, merisi]",(vers 1560 ou 1500-1600).,concert,VI tr. D-N,,,
291,310,191,"[david, jacques, louis]",(1748-1825).,les licteurs rapportant à brutus les corps de ...,111-0,422 centimetre,323 centimetre,https://upload.wikimedia.org/wikipedia/commons...
709,776,S. N°,"[la, tour, m, quentin, de]",(1704-1788).,portrait de la tour par lui même pastel ébauche,S. des pastels,,,


# Saving

In [8]:
# Persist the enriched catalogue (original columns plus width, height and image_url).
# NOTE(review): the row index is written as an extra unnamed column; the preview
# above already shows "Unnamed: 0" / "Unnamed: 0.1" columns, so consider
# index=False once downstream readers are confirmed not to rely on that column.
artworks_df.to_csv(path_or_buf="data/complete_artworks.csv")