# Retrieve paintings and their descriptions from the Web Gallery of Art
This notebook retrieves paintings and their descriptions from the Web Gallery of Art digital collection.

### 0. Import libraries

In [1]:
import json
import os
import re
import time
from io import BytesIO

import polars as pl
import requests
from bs4 import BeautifulSoup
from PIL import Image
from tqdm import tqdm

from preprocess_data_utils import *

# Base data directories, given as relative paths.
# The trailing slash matters: file paths in this notebook are built by plain
# string concatenation (e.g. RAW_DATA_PATH + "wga_paintings/").
RAW_DATA_PATH = "../../data/raw/"
INTERMEDIATE_DATA_PATH = "../../data/intermediate/"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bogdan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Make sure the WGA output folders exist before anything is written to them.
# `os.makedirs(..., exist_ok=True)` is a no-op when the folder is already there
# and, unlike `os.mkdir`, also creates missing parent folders, so the notebook
# runs on a fresh checkout where the data directories do not exist yet.
for base_path in (RAW_DATA_PATH, INTERMEDIATE_DATA_PATH):
    os.makedirs(base_path + "wga_paintings/", exist_ok=True)

### 1. Load and preprocess the data

In [None]:
# Build the table of WGA paintings to scrape.
#
# Every derived column reads only the original upper-case spreadsheet columns
# (AUTHOR, TITLE, URL, DATE, TYPE), so they are all computed in one
# `with_columns` pass instead of one pass per column.
wga_features_df = (
    pl.read_excel(RAW_DATA_PATH + "artists_wga.xlsx")
    # Keep only the rows whose FORM is "painting".
    .filter(pl.col("FORM") == "painting")
    .with_columns(
        # Untouched copies of the original artist and title strings.
        pl.col("AUTHOR").alias("raw_artist"),
        pl.col("TITLE").alias("raw_title"),
        pl.col("URL").alias("url"),
        # Normalized artist / title / year, produced by the project helpers.
        pl.col("AUTHOR")
        .map_elements(
            lambda name: clean_artist_name(rearrange_artist_name(name)),
            return_dtype=pl.String,
        )
        .alias("artist"),
        pl.col("TITLE").map_elements(clean_title_name, return_dtype=pl.String).alias("title"),
        pl.col("DATE").map_elements(clean_date, return_dtype=pl.Int64).alias("year"),
        # Rename some WGA type labels; "other" becomes null and is filtered out below.
        pl.col("TYPE")
        .replace(
            {
                "historical": "history",
                "other": None,
                "still-life": "still life",
                "study": "sketch and study",
            }
        )
        .alias("type"),
    )
    .select("title", "artist", "type", "year", "raw_title", "raw_artist", "url")
    # Keep typed paintings dated from 1201 up to and including 2000.
    .filter(pl.col("type").is_not_null() & (pl.col("year") >= 1201) & (pl.col("year") < 2001))
    # `maintain_order=True` keeps the spreadsheet order after de-duplication, so
    # repeated runs yield the same row order (and, later, the same painting ids).
    .unique(maintain_order=True)
)

wga_features_df

### 2. Scrape painting descriptions and keep only paintings with sufficiently long descriptions

In [None]:
# Seconds to wait for the server before giving up on a request; without a
# timeout a single stalled connection would hang the whole scraping loop.
REQUEST_TIMEOUT_SECONDS = 30

# One description per row of `wga_features_df`, in row order.
retrieved_descriptions = []

for painting_url in tqdm(wga_features_df["url"].to_list()):
    response = requests.get(painting_url, timeout=REQUEST_TIMEOUT_SECONDS)

    # Any non-2xx answer aborts the run.
    if not 200 <= response.status_code < 300:
        raise Exception("Wrong call")

    soup = BeautifulSoup(response.text, "html.parser")

    # Reset for every painting: if a page has no matching cell, the painting gets
    # an empty description instead of a NameError (first row) or, worse, the
    # description scraped for the previous painting.
    description = ""

    # The description lives in a <td> that contains the "Comment Start" marker;
    # when several cells match, the last one wins.
    for table_cell in soup.find_all("td"):
        if "<!-- Comment Start -->" in str(table_cell):
            description = table_cell.get_text().strip()

    # Keep only the text before the first "\r\n\n\n\r\n" separator.
    retrieved_descriptions.append(description.split("\r\n\n\n\r\n")[0])

wga_features_df = wga_features_df.with_columns(
    pl.Series(retrieved_descriptions).alias("description")
)

wga_features_df

In [None]:
# Minimum number of counted words a description needs in order to be kept.
# Words are counted on the output of `clean_description` (presumably with
# punctuation, numbers and stopwords already removed - see that helper).
min_description_word_count = 20


def _description_word_count(description):
    """Return how many space-separated tokens remain after `clean_description`."""
    cleaned_description = clean_description(description)
    return len(cleaned_description.split(" "))


artworks_with_description = wga_features_df.with_columns(
    pl.col("description")
    .map_elements(_description_word_count, return_dtype=pl.Int64)
    .alias("description word count")
).filter(pl.col("description word count") >= min_description_word_count)

artworks_with_description

In [None]:
# Checkpoint the filtered paintings on disk; section 3 reloads this file.
wga_paintings_csv_path = RAW_DATA_PATH + "wga_paintings/wga_paintings_data.csv"
artworks_with_description.write_csv(wga_paintings_csv_path)

In [None]:
# List the kept descriptions, ordered from the fewest to the most counted words.
descriptions_by_word_count = artworks_with_description.sort("description word count")
descriptions_by_word_count.get_column("description").to_list()

### 3. Keep the paintings that have not been collected so far and get their style and image

In [None]:
# Reload the checkpoint written at the end of section 2.
artworks_with_description_df = pl.read_csv(
    f"{RAW_DATA_PATH}wga_paintings/wga_paintings_data.csv"
)

# Row-wise array used by the matching loop below. Column positions follow the
# CSV column order: 0 title, 1 artist, 2 type, 3 year, 4 raw_title,
# 5 raw_artist, 6 url, 7 description, 8 description word count.
artworks_with_description = artworks_with_description_df.to_numpy()

In [4]:
# Features of the processed WikiArt paintings as a row-wise array:
# index 0 = title, 1 = artist, 2 = type, 3 = style.
all_wikiart_features = (
    pl.read_csv(f"{INTERMEDIATE_DATA_PATH}wikiart_paintings/wikiart_processed.csv")
    .select(["title", "artist", "type", "style"])
    .to_numpy()
)

In [5]:
def _load_normalized_paintings(json_path):
    """Load an enhanced-paintings JSON file with normalized artist and title.

    Args:
        json_path: Path of a JSON file readable by `pl.read_json` that has at
            least the columns "id", "title", "artist" and "type".

    Returns:
        A polars DataFrame with the columns "id", "title", "artist", "type",
        where "artist" went through `rearrange_artist_name` + `clean_artist_name`
        and "title" through `clean_title_name`.
    """
    return (
        pl.read_json(json_path)
        .select("id", "title", "artist", "type")
        .with_columns(
            pl.col("artist")
            .map_elements(
                lambda name: clean_artist_name(rearrange_artist_name(name)),
                return_dtype=pl.String,
            )
            .alias("artist"),
            pl.col("title").map_elements(clean_title_name, return_dtype=pl.String).alias("title"),
        )
    )


# Paintings already collected from the MET and WikiArt sources; both files get
# exactly the same normalization so they can be compared with the WGA paintings.
met_paintings_df = _load_normalized_paintings(
    INTERMEDIATE_DATA_PATH + "met_paintings/met_paintings_enhanced_data.json"
)
wikiart_paintings_df = _load_normalized_paintings(
    INTERMEDIATE_DATA_PATH + "wikiart_paintings/wikiart_paintings_enhanced_data.json"
)

existing_paintings_df = pl.concat([met_paintings_df, wikiart_paintings_df])

# Row-wise array without the id: index 0 = title, 1 = artist, 2 = type.
existing_paintings = existing_paintings_df.drop("id").to_numpy()

In [11]:
# Seconds to wait for the server before giving up on a request; without a
# timeout a single stalled connection would hang the whole loop.
REQUEST_TIMEOUT_SECONDS = 30


def _get_or_raise(url):
    """GET `url` and return the response; raise if the status code is not 2xx."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    if not 200 <= response.status_code < 300:
        raise Exception("Wrong call")
    return response


types_mapping = []  # [WGA type, WikiArt type] pairs for paintings matched in WikiArt
kept_paintings = []  # metadata of every newly collected WGA painting
failed_paintings = []  # (page url, error) of paintings that could not be retrieved
# New paintings continue the id sequence of the already collected ones.
painting_id = existing_paintings_df["id"].max()

# `painting` columns: 0 title, 1 artist, 2 type, 3 year, 4 raw_title,
# 5 raw_artist, 6 url, 7 description.
for painting in tqdm(artworks_with_description):
    # Skip paintings that are already part of the MET / WikiArt collections.
    already_collected = any(
        is_same_painting(painting[0], painting[1], existing_painting[0], existing_painting[1])
        for existing_painting in existing_paintings
    )
    if already_collected:
        continue

    painting_type = painting[2]
    painting_style = None

    # Borrow the style from the first WikiArt entry describing the same painting
    # and remember how the WGA type maps to the WikiArt type.
    for wikiart_features in all_wikiart_features:
        if is_same_painting(painting[0], painting[1], wikiart_features[0], wikiart_features[1]):
            painting_style = wikiart_features[3]
            types_mapping.append([painting_type, wikiart_features[2]])
            break

    try:
        time.sleep(0.1)  # be gentle with the server
        page_response = _get_or_raise(painting[6])
        soup = BeautifulSoup(page_response.text, "html.parser")

        # First link on the page that points to a .jpg file. The original call
        # passed an extra positional dict ({"onclick": ".*"}); BeautifulSoup's
        # `find` takes its third positional argument as the `recursive` flag, so
        # that dict never filtered anything. Dropping it selects the same link.
        image_link = soup.find("a", {"href": re.compile(r".*\.jpg$")})
        if image_link is None:
            raise ValueError("No .jpg link found on the painting page")

        # Check the image response too, so an error page is never saved as an image.
        image = _get_or_raise("https://www.wga.hu/" + image_link.attrs["href"]).content

        # Write the image first and only then commit the id and the metadata, so a
        # failed write neither consumes an id nor leaves a record without an image.
        # NOTE(review): the bytes are stored as downloaded from a ".jpg" URL but the
        # file is named ".png"; kept as is because other code may rely on the name.
        new_painting_id = painting_id + 1
        with open(RAW_DATA_PATH + f"wga_paintings/{new_painting_id}.png", "wb") as handler:
            handler.write(image)

        painting_id = new_painting_id
        kept_paintings.append(
            {
                "id": painting_id,
                "title": painting[4],
                "artist": painting[5],
                "year": painting[3],
                "type": painting_type,
                "style": painting_style,
                "description": painting[7],
            }
        )

        # Checkpoint after every painting so an interrupted run keeps its progress.
        with open(
            INTERMEDIATE_DATA_PATH + "wga_paintings/wga_paintings_enhanced_data.json", "w"
        ) as json_file:
            json.dump(kept_paintings, json_file, indent=4)

        # Nothing to write until at least one WikiArt match exists (previously this
        # case raised inside the `try` and was silently swallowed).
        if types_mapping:
            types_mapping_df = pl.DataFrame(
                types_mapping, schema=["wga_type", "wikiart_type"], orient="row"
            ).unique(maintain_order=True)
            types_mapping_df.write_csv(INTERMEDIATE_DATA_PATH + "wga_paintings/types_mapping.csv")
    except Exception as error:
        # Still best effort - one bad painting must not stop the run - but the
        # failure is recorded and reported instead of disappearing, and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        failed_paintings.append((painting[6], repr(error)))
        tqdm.write(f"Skipping {painting[6]}: {error!r}")

100%|██████████| 100/100 [01:19<00:00,  1.26it/s]
