# Retrieve paintings and their descriptions from Wikiart
This notebook retrieves paintings and their descriptions from the WikiArt digital collection. The resulting dataset is also used to fill in the style and type of artworks from museums that do not provide this information.

### 0. Import libraries

In [None]:
import os
import json
import time

import requests
import polars as pl
from tqdm import tqdm
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup

from preprocess_data_utils import *

# Root folder of the raw data: the per-artist Wikiart JSON dumps are read from here,
# and the concatenated raw CSV and the downloaded images are written here.
RAW_DATA_PATH = "../../data/raw/"
# Root folder of the processed data: the cleaned Wikiart CSV and the enhanced JSON
# metadata are written here, and the processed MET / WGA tables are read from here.
INTERMEDIATE_DATA_PATH = "../../data/intermediate/"

### 1. Load and preprocess the data

In [None]:
# Make sure the Wikiart output folders exist before anything is written to them.
# `makedirs(..., exist_ok=True)` is a no-op when the folder is already there and,
# unlike `os.mkdir`, also creates missing parent folders instead of failing with
# FileNotFoundError on a fresh checkout.
for base_path in (RAW_DATA_PATH, INTERMEDIATE_DATA_PATH):
    os.makedirs(base_path + "wikiart_paintings/", exist_ok=True)

In [None]:
# Read every per-artist JSON dump and stack the painting metadata into a single frame.
wikiart_data_dir = RAW_DATA_PATH + "wikiart_paintings/wikiart_data/"
wikiart_paintings_files = os.listdir(wikiart_data_dir)

# Metadata fields kept from each dump, in the order they appear in the raw CSV.
kept_fields = [
    "title",
    "artistName",
    "description",
    "style",
    "genre",
    "url",
    "artistUrl",
    "completitionYear",
    "image",
]

wikiart_features = []

for file_name in wikiart_paintings_files:
    artist_frame = pl.read_json(wikiart_data_dir + file_name, infer_schema_length=10000)

    # Dumps that contain no painting are ignored.
    if artist_frame.shape[0] == 0:
        continue

    # Every kept field is cast to String (presumably so that frames whose inferred
    # dtypes differ between artists can still be concatenated).
    wikiart_features.append(artist_frame.select(pl.col(kept_fields).cast(pl.String)))

raw_wikiart_features_df = pl.concat(wikiart_features)
raw_wikiart_features_df.write_csv(RAW_DATA_PATH + "wikiart_paintings/wikiart_data.csv")

# Clean the raw metadata into the table used by the rest of the notebook.
# Resulting column order (relied upon by positional row access later on):
# title, artist, type, style, year, description, url, raw_title, raw_artist, image_url.
wikiart_features_df = (
    raw_wikiart_features_df.with_columns(
        # Keep the untouched title / artist next to their cleaned versions.
        pl.col("title").alias("raw_title"),
        pl.col("artistName").alias("raw_artist"),
    )
    .with_columns(
        pl.col("title").map_elements(clean_title_name, return_dtype=pl.String).alias("title"),
        pl.col("completitionYear").map_elements(clean_date, return_dtype=pl.Int64).alias("year"),
        pl.col("artistName")
        .map_elements(clean_artist_name, return_dtype=pl.String)
        .alias("artist"),
        # Strip only the trailing "!<variant>" suffix of the image link. The previous
        # `"".join(x.split("!")[:-1])` turned links without any "!" into an empty
        # string and removed every inner "!" when the link contained several.
        pl.col("image").str.replace(r"![^!]*$", "").alias("image_url"),
        pl.col("style").str.to_lowercase(),
        pl.col("genre").str.to_lowercase(),
        # Full page link, built from the artist slug and the painting slug.
        ("https://www.wikiart.org/en/" + pl.col("artistUrl") + "/" + pl.col("url")).alias("url"),
    )
    # Drop rows whose cleaned artist name or title is too short to be meaningful.
    .filter(~((pl.col("artist").str.len_bytes() <= 4) | (pl.col("title").str.len_bytes() <= 2)))
    # Drop rows that have neither a style nor a genre.
    .filter(~(pl.col("style").is_null() & pl.col("genre").is_null()))
    .rename({"genre": "type"})
    .unique()
    .select(
        "title",
        "artist",
        "type",
        "style",
        "year",
        "description",
        "url",
        "raw_title",
        "raw_artist",
        "image_url",
    )
)
wikiart_features = wikiart_features_df.to_numpy()
wikiart_features_df.write_csv(INTERMEDIATE_DATA_PATH + "wikiart_paintings/wikiart_processed.csv")

wikiart_features_df

### 2. Keep only paintings with a description and a suitable style and type

In [None]:
# Only the words left after `clean_description` are counted (per the original note:
# words without punctuation, numbers or stopwords).
# `split()` is used instead of `split(" ")` so that an empty cleaned description
# counts as 0 words (not 1) and repeated spaces do not produce empty "words".
artworks_with_description = wikiart_features_df.with_columns(
    pl.col("description")
    .map_elements(lambda x: len(clean_description(x).split()), return_dtype=pl.Int64)
    .alias("description word count")
).filter(
    # Keep descriptions that are long enough and years strictly inside the allowed range.
    (pl.col("description word count") >= MIN_DESCRIPTION_WORD_COUNT)
    & (pl.col("year") > MIN_YEAR)
    & (pl.col("year") < MAX_YEAR)
)

In [None]:
# One frame per kept type: the rows whose type matches that pattern.
selected_paintings = [
    artworks_with_description.filter(pl.col("type").str.contains(kept_type))
    for kept_type in WIKIART_KEPT_TYPES
]
# A row matching several kept types appears in several frames, hence the unique().
selected_paintings_df = pl.concat(selected_paintings).unique()

# Then discard every row whose type matches one of the left-out patterns.
for excluded_type in WIKIART_LEFT_OUT_TYPES:
    matches_excluded_type = pl.col("type").str.contains(excluded_type)
    selected_paintings_df = selected_paintings_df.filter(~matches_excluded_type)
selected_paintings_df

In [None]:
print("Left out types:")
# Rows that passed the description / year filter but were not selected (matched on url).
left_out_rows = artworks_with_description.join(selected_paintings_df, on="url", how="anti")
set(left_out_rows["type"].to_list())

In [None]:
print("Selected descriptions sorted by word count:")
# Shortest descriptions first, to eyeball whether the minimum word count is sensible.
paintings_by_word_count = selected_paintings_df.sort("description word count")
paintings_by_word_count["description"].to_list()

In [None]:
def get_media(paintings, timeout=30):
    """Scrape the media listed on the Wikiart page of each painting.

    Args:
        paintings: Iterable of rows; element 6 of each row is the link of the
            painting page to fetch.
        timeout: Seconds to wait for each page before giving up on it.

    Returns:
        A list with exactly one entry per painting, in input order. Each entry is
        the list of texts of the "paintings-by-media" links found on the page; it
        is empty when the page could not be fetched or lists no media.
    """
    # Local import: this file never imports `re` itself.
    import re

    # Compiled once instead of on every iteration.
    media_link_pattern = re.compile(r"/en/paintings-by-media/.*")
    all_media = []

    for painting in tqdm(paintings):
        painting_url = painting[6]
        media = []

        try:
            response = requests.get(painting_url, timeout=timeout)
        except requests.RequestException:
            # A single unreachable page must not abort the whole scrape.
            response = None

        if response is not None and 200 <= response.status_code < 300:
            soup = BeautifulSoup(response.text, "html.parser")
            media = [
                media_tag.get_text()
                for media_tag in soup.find_all("a", {"href": media_link_pattern})
            ]

        # Always append, even on failure: skipping an entry would make the result
        # shorter than the input and misalign it with the rows it is attached to.
        all_media.append(media)

    return all_media

In [None]:
# Scrape the media of every selected painting and attach them as a "media" column.
selected_paintings = selected_paintings_df.to_numpy()
all_media = get_media(selected_paintings)
media_column = pl.Series("media", all_media)
selected_paintings_df = selected_paintings_df.with_columns(media_column)

In [None]:
# Discard paintings made with a left-out medium or belonging to a left-out style.
# The two filters are independent, so they are applied in two separate loops: when
# nested, nothing was filtered at all as soon as one of the two lists was empty, and
# every filter was re-applied once per element of the other list.
for medium in WIKIART_LEFT_OUT_MEDIA:
    selected_paintings_df = selected_paintings_df.filter(
        ~pl.col("media").list.contains(medium)
    )

# NOTE(review): rows with a null style give a null predicate and are therefore
# dropped by this filter as well (same as before) - confirm that this is intended.
for style in WIKIART_LEFT_OUT_STYLES:
    selected_paintings_df = selected_paintings_df.filter(
        ~pl.col("style").str.contains(style)
    )

### 3. Keep the paintings that have not been collected so far and add the coarse type from the Web Gallery of Art

In [None]:
# Processed MET and Web Gallery of Art tables, also kept as arrays of rows.
met_paintings_df = pl.read_csv(INTERMEDIATE_DATA_PATH + "met_paintings/met_processed.csv")
wga_features_df = pl.read_csv(INTERMEDIATE_DATA_PATH + "wga_paintings/wga_processed.csv")
met_paintings = met_paintings_df.to_numpy()
wga_features = wga_features_df.to_numpy()

# The helper columns added while filtering are not needed in the row array.
wikiart_rows_df = selected_paintings_df.drop("description word count", "media")
selected_paintings = wikiart_rows_df.to_numpy()

In [None]:
# Keep the Wikiart paintings that are not already in the MET table, download their
# images and save their metadata.
#
# Positions used in each `selected_paintings` row: 0 title, 1 artist, 2 type, 3 style,
# 4 year, 5 description, 6 url, 7 raw_title, 8 raw_artist, 9 image_url.
# NOTE(review): columns 1 and 2 of the MET rows and 0, 1, 2 of the WGA rows are
# presumably title / artist and title / artist / coarse type - confirm against the CSVs.
kept_paintings = []
# New ids continue after the first column of the last MET row. int() because array
# elements may be numpy integers, which `json.dump` cannot serialise.
painting_id = int(met_paintings[-1][0])
metadata_path = INTERMEDIATE_DATA_PATH + "wikiart_paintings/wikiart_paintings_enhanced_data.json"

for painting in tqdm(selected_paintings):
    title, artist = painting[0], painting[1]

    # Paintings already collected from the MET are skipped.
    if any(
        is_same_painting(title, artist, met_painting[1], met_painting[2])
        for met_painting in met_paintings
    ):
        continue

    # Borrow the coarse type of the first matching WGA painting, if there is one.
    coarse_painting_type = next(
        (
            wga_painting[2]
            for wga_painting in wga_features
            if is_same_painting(title, artist, wga_painting[0], wga_painting[1])
        ),
        None,
    )

    next_painting_id = painting_id + 1
    try:
        time.sleep(0.1)  # short pause between two downloads
        response = requests.get(painting[9], timeout=30)
        # Do not store an HTML error page as if it were the image.
        response.raise_for_status()
        image = response.content

        # NOTE: the bytes are stored as downloaded; only the file name says ".png".
        with open(RAW_DATA_PATH + f"wikiart_paintings/{next_painting_id}.png", "wb") as handler:
            handler.write(image)
    except (requests.RequestException, OSError) as error:
        # Best effort: report the failure and move on. Unlike a bare `except`, this
        # still lets KeyboardInterrupt and programming errors surface.
        tqdm.write(f"Skipping {painting[6]}: {error}")
        continue

    # The id is consumed and the metadata recorded only once the image is on disk,
    # so every saved record has a matching image file.
    painting_id = next_painting_id
    year = painting[4]
    kept_paintings.append(
        {
            "id": painting_id,
            "title": painting[7],
            "artist": painting[8],
            "year": int(year) if year is not None else None,
            "coarse_type": coarse_painting_type,
            "fine_grained_type": painting[2],
            "style": painting[3],
            "description": painting[5],
        }
    )

    # Rewritten after every kept painting so that an interrupted run keeps its progress.
    with open(metadata_path, "w") as json_file:
        json.dump(kept_paintings, json_file, indent=4)

In [None]:
# Preview the bytes currently held in `image` (the last image that was downloaded).
last_image_buffer = BytesIO(image)
Image.open(last_image_buffer)