# Retrieve paintings and their descriptions from the Web Gallery of Art
This notebook retrieves paintings and their descriptions from the Web Gallery of Art digital collection.

### 0. Import libraries

In [1]:
import os
import json
import time

import requests
import polars as pl
from tqdm import tqdm
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup

from preprocess_data_utils import *

# Directory (relative to the notebook) that holds the input catalogue
# `artists_wga.xlsx` and receives the scraped `wqa_paintings/` output.
RAW_DATA_PATH = "../../data/raw/"
# Directory for intermediate artefacts; not referenced by the cells visible in
# this notebook.
INTERMEDIATE_DATA_PATH = "../../data/intermediate/"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bogdan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Create the output directory for the scraped paintings data.
# `makedirs(..., exist_ok=True)` also creates missing parent directories and is
# a no-op when the directory already exists, so the cell is safe to re-run on a
# fresh checkout. Unlike the previous `try/except FileExistsError: pass`, it
# still raises if the path exists but is not a directory.
os.makedirs(RAW_DATA_PATH + "wqa_paintings/", exist_ok=True)

In [3]:
# Build the feature table for WGA paintings:
#   1. read the catalogue and keep rows whose FORM is "painting";
#   2. keep the raw artist/title/url values and derive cleaned `artist`,
#      `title`, `year` and `type` columns via the project helpers;
#   3. drop rows without a usable type or with a year outside 1201-2000;
#   4. remove duplicate rows.
# All derived expressions read only the original upper-case columns, so they
# can be computed in a single `with_columns` call.
wga_features_df = (
    pl.read_excel(RAW_DATA_PATH + "artists_wga.xlsx")
    .filter(pl.col("FORM") == "painting")
    .with_columns(
        raw_artist=pl.col("AUTHOR"),
        raw_title=pl.col("TITLE"),
        url=pl.col("URL"),
        artist=pl.col("AUTHOR").map_elements(
            lambda name: clean_artist_name(rearrange_artist_name(name)),
            return_dtype=pl.String,
        ),
        title=pl.col("TITLE").map_elements(
            lambda raw: clean_title_name(raw), return_dtype=pl.String
        ),
        year=pl.col("DATE").map_elements(
            lambda raw: clean_date(raw), return_dtype=pl.Int64
        ),
        # Harmonise type labels; "other" is mapped to null so that the filter
        # below discards those rows.
        type=pl.col("TYPE").replace(
            {
                "historical": "history",
                "other": None,
                "still-life": "still life",
                "study": "sketch and study",
            }
        ),
    )
    .select("title", "artist", "type", "year", "raw_title", "raw_artist", "url")
    .filter(
        pl.col("type").is_not_null()
        & (pl.col("year") >= 1201)
        & (pl.col("year") < 2001)
    )
    .unique()
)

wga_features_df

title,artist,type,year,raw_title,raw_artist,url
str,str,str,i64,str,str,str
"""fat kitchen""","""jan steen""","""still life""",1667,"""The Fat Kitchen""","""STEEN, Jan""","""https://www.wga.hu/html/s/stee…"
"""assumption detail""","""filippino lippi""","""religious""",1489,"""Assumption (detail)""","""LIPPI, Filippino""","""https://www.wga.hu/html/l/lipp…"
"""hell detail""","""hieronymus bosch""","""religious""",1500,"""Hell (detail)""","""BOSCH, Hieronymus""","""https://www.wga.hu/html/b/bosc…"
"""storm lake""","""pierre henri de valenciennes""","""landscape""",1780,"""Storm by a Lake""","""VALENCIENNES, Pierre-Henri de""","""https://www.wga.hu/html/v/vale…"
"""girls bathing seaside""","""claude joseph vernet""","""landscape""",1761,"""Girls Bathing at the Seaside""","""VERNET, Claude-Joseph""","""https://www.wga.hu/html/v/vern…"
…,…,…,…,…,…,…
"""laundress quai d'anjou""","""honoré daumier""","""genre""",1860,"""Laundress on the Quai d'Anjou""","""DAUMIER, Honoré""","""https://www.wga.hu/html/d/daum…"
"""story psyche 2 zephyr transpor…","""maurice denis""","""mythological""",1908,"""The Story of Psyche: 2. Zephyr…","""DENIS, Maurice""","""https://www.wga.hu/html/d/deni…"
"""poor woman village""","""gustave courbet""","""landscape""",1866,"""Poor Woman of the Village""","""COURBET, Gustave""","""https://www.wga.hu/html/c/cour…"
"""triptych holy kinship central …","""lucas the elder cranach""","""religious""",1509,"""Triptych with the Holy Kinship…","""CRANACH, Lucas the Elder""","""https://www.wga.hu/html/c/cran…"


In [4]:
# Scrape the free-text description of every painting from its WGA page.
# The result is a list aligned row-for-row with `wga_features_df`; a page
# without a description block contributes an empty string.
#
# Raises:
#     Exception: if a page answers with a non-2xx status code.
#     requests.exceptions.RequestException: on connection errors or timeouts.
request_timeout_seconds = 30  # fail fast instead of hanging on a stalled connection

retrieved_descriptions = []

# One shared session reuses the underlying connection across the many requests.
with requests.Session() as session:
    for painting_url in tqdm(wga_features_df["url"].to_list()):
        response = session.get(painting_url, timeout=request_timeout_seconds)

        if not 200 <= response.status_code < 300:
            raise Exception("Wrong call")

        soup = BeautifulSoup(response.text, "html.parser")

        # Reset for every page: otherwise a page with no matching cell would
        # silently reuse the previous painting's description (or raise a
        # NameError on the very first page).
        description = ""

        # `str(cell)` includes nested markup, so more than one <td> may contain
        # the marker; the last match in document order wins.
        for cell in soup.find_all("td"):
            if "<!-- Comment Start -->" in str(cell):
                description = cell.get_text().strip()

        # Keep only the text before the first "\r\n\n\n\r\n" separator.
        retrieved_descriptions.append(description.split("\r\n\n\n\r\n")[0])

100%|██████████| 25914/25914 [1:04:48<00:00,  6.66it/s]


In [5]:
# Attach the scraped texts as a new "description" column; the list was built in
# the row order of `wga_features_df`, so values line up positionally.
description_column = pl.Series("description", retrieved_descriptions)
wga_features_df = wga_features_df.with_columns(description_column)
wga_features_df

title,artist,type,year,raw_title,raw_artist,url,description
str,str,str,i64,str,str,str,str
"""fat kitchen""","""jan steen""","""still life""",1667,"""The Fat Kitchen""","""STEEN, Jan""","""https://www.wga.hu/html/s/stee…","""Jan Steen frequently incorpora…"
"""assumption detail""","""filippino lippi""","""religious""",1489,"""Assumption (detail)""","""LIPPI, Filippino""","""https://www.wga.hu/html/l/lipp…","""The picture shows the detail o…"
"""hell detail""","""hieronymus bosch""","""religious""",1500,"""Hell (detail)""","""BOSCH, Hieronymus""","""https://www.wga.hu/html/b/bosc…","""The marked contrasts between l…"
"""storm lake""","""pierre henri de valenciennes""","""landscape""",1780,"""Storm by a Lake""","""VALENCIENNES, Pierre-Henri de""","""https://www.wga.hu/html/v/vale…",""""""
"""girls bathing seaside""","""claude joseph vernet""","""landscape""",1761,"""Girls Bathing at the Seaside""","""VERNET, Claude-Joseph""","""https://www.wga.hu/html/v/vern…","""This painting is characteristi…"
…,…,…,…,…,…,…,…
"""laundress quai d'anjou""","""honoré daumier""","""genre""",1860,"""Laundress on the Quai d'Anjou""","""DAUMIER, Honoré""","""https://www.wga.hu/html/d/daum…",""""""
"""story psyche 2 zephyr transpor…","""maurice denis""","""mythological""",1908,"""The Story of Psyche: 2. Zephyr…","""DENIS, Maurice""","""https://www.wga.hu/html/d/deni…","""The monumental canvases depict…"
"""poor woman village""","""gustave courbet""","""landscape""",1866,"""Poor Woman of the Village""","""COURBET, Gustave""","""https://www.wga.hu/html/c/cour…","""In addition to the many nudes,…"
"""triptych holy kinship central …","""lucas the elder cranach""","""religious""",1509,"""Triptych with the Holy Kinship…","""CRANACH, Lucas the Elder""","""https://www.wga.hu/html/c/cran…","""In the central panel Mary is s…"


In [6]:
# Keep only paintings whose description is long enough to be useful.
# Only the words left after `clean_description` are counted (i.e. without
# punctuation, numbers or stopwords).
min_description_word_count = 20

artworks_with_description = wga_features_df.with_columns(
    pl.col("description")
    # `split()` without an argument ignores leading/trailing/repeated
    # whitespace and returns [] for an empty string, whereas `split(" ")`
    # returns [""] (count 1) and adds empty tokens for consecutive spaces,
    # which inflated the word count.
    .map_elements(lambda text: len(clean_description(text).split()), return_dtype=pl.Int64)
    .alias("description word count")
).filter(pl.col("description word count") >= min_description_word_count)

artworks_with_description

title,artist,type,year,raw_title,raw_artist,url,description,description word count
str,str,str,i64,str,str,str,str,i64
"""fat kitchen""","""jan steen""","""still life""",1667,"""The Fat Kitchen""","""STEEN, Jan""","""https://www.wga.hu/html/s/stee…","""Jan Steen frequently incorpora…",23
"""hell detail""","""hieronymus bosch""","""religious""",1500,"""Hell (detail)""","""BOSCH, Hieronymus""","""https://www.wga.hu/html/b/bosc…","""The marked contrasts between l…",21
"""girls bathing seaside""","""claude joseph vernet""","""landscape""",1761,"""Girls Bathing at the Seaside""","""VERNET, Claude-Joseph""","""https://www.wga.hu/html/v/vern…","""This painting is characteristi…",48
"""portrait john 20th earl crawfo…","""richard waitt""","""portrait""",1725,"""Portrait of John 20th Earl of …","""WAITT, Richard""","""https://www.wga.hu/html/w/wait…","""This painting represents the f…",50
"""three ages man""","""tiziano vecellio""","""mythological""",1512,"""The Three Ages of Man""","""TIZIANO Vecellio""","""https://www.wga.hu/html/t/tizi…","""The young Titian was influence…",52
…,…,…,…,…,…,…,…,…
"""coronation virgin detail""","""fra filippo lippi""","""religious""",1441,"""Coronation of the Virgin (deta…","""LIPPI, Fra Filippo""","""https://www.wga.hu/html/l/lipp…","""The detail shows the upper lef…",39
"""story nastagio degli onesti fi…","""sandro botticelli""","""history""",1483,"""The Story of Nastagio degli On…","""BOTTICELLI, Sandro""","""https://www.wga.hu/html/b/bott…","""The first episode depicts Degl…",25
"""story psyche 2 zephyr transpor…","""maurice denis""","""mythological""",1908,"""The Story of Psyche: 2. Zephyr…","""DENIS, Maurice""","""https://www.wga.hu/html/d/deni…","""The monumental canvases depict…",41
"""triptych holy kinship central …","""lucas the elder cranach""","""religious""",1509,"""Triptych with the Holy Kinship…","""CRANACH, Lucas the Elder""","""https://www.wga.hu/html/c/cran…","""In the central panel Mary is s…",24


In [7]:
# Persist the filtered paintings (with descriptions and word counts) as CSV
# inside the scraped-data directory.
paintings_csv_path = RAW_DATA_PATH + "wqa_paintings/wqa_paintings_data.csv"
artworks_with_description.write_csv(paintings_csv_path)

In [8]:
# Sanity check: look at the shortest descriptions that passed the word-count
# filter. Only the first 10 are shown; dumping every description floods the
# notebook output and bloats the saved file.
artworks_with_description.sort("description word count")["description"].head(10).to_list()

["On one of the room's short walls, between the figures of Adam and Eve, there was a lunette containing a Madonna and Child beneath a canopy supported by angels that was only discovered several years after the rest of the decoration.",
 'It has proved possible to identify the book that lies open in front of this mystically-clad astronomer. It is by Adriaen Metius and is called The Exploration and Observation of the Stars. The globe was made by Jodocus Hondius.',
 'The provenance of the altarpiece is not known. This curious "pastiche" was probably made after the death of Francesco by his former collaborators. The principal characters (Christ, the Madonna and St Joseph) are probably from Francesco\'s hand.',
 "Melbye's early pictures followed the style of Eckersberg's marine paintings, which are characterized by a heightened calm and clear colour, but Melbye soon moved towards a more international, Romantic style.",
 'This painting, close to the securely serene art of Perugino, is also k

In [9]:

# painting_url = artworks_with_description["url"][3]

# response = requests.get(painting_url)

# if str(response.status_code)[0] != "2":
#     raise Exception("Wrong call")

# soup = BeautifulSoup(response.text, "html.parser")
# soup


In [10]:
# Scratch code (disabled): derive the image URL from a parsed painting page.
# NOTE(review): relies on `soup` from the commented-out cell above, and `re` is
# not imported explicitly in this notebook (it would only be available if the
# star import from `preprocess_data_utils` happens to provide it) - confirm
# before re-enabling.
# image_url_identifier = soup.find("a", {"href": re.compile(r".*\.jpg$")}, {"onclick": ".*"}).attrs["href"]
# "https://www.wga.hu/" + image_url_identifier

In [11]:
# image = requests.get("https://www.wga.hu/" + image_url_identifier).content

In [12]:
# Image.open(BytesIO(image))