# Add style and type features to the retrieved paintings
This notebook adds to the retrieved paintings the type and style if they were not available initially. These characteristics are retrieved from Wikiart.

### 0. Import libraries

In [1]:
import re
import os
import json

import nltk
import polars as pl
from tqdm import tqdm
from nltk.corpus import stopwords

nltk.download("stopwords")
STOP_WORDS = stopwords.words("english")
RAW_DATA_PATH = "../../data/raw/"
INTERMEDIATE_DATA_PATH = "../../data/intermediate/"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bogdan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1. Load and preprocess data

#### 1.1. Wikiart data about the style and type of paintings

In [2]:
def clean_artist_name(artist):
    first_artist_name = re.sub(r"\([^)]*\)", "", artist.lower()).split("|")[0]
    artist_wo_punctuation = re.sub(r"[.,\-!?;:()\[\]{}]", " ", first_artist_name).strip()
    artist_wo_multiple_spaces = re.sub(r"\s+", " ", artist_wo_punctuation).strip()

    return artist_wo_multiple_spaces

In [3]:
def clean_title_name(title):
    title_wo_punctuation = re.sub(
        r"[.,\-!?;:()\[\]{}]", " ", title.lower().replace("\xa0", " ")
    ).strip()
    title_wo_multiple_spaces = re.sub(r"\s+", " ", title_wo_punctuation).strip().split(" ")
    title_wo_stop_words = " ".join(
        [word for word in title_wo_multiple_spaces if word not in STOP_WORDS]
    )

    return title_wo_stop_words

In [4]:
def clean_genre(genre):
    cleaned_genre = genre.replace(" painting", "").replace(" (nu)", "").strip().lower()

    if cleaned_genre == "none":
        return None
    else:
        return cleaned_genre

In [5]:
def clean_style(style):
    cleaned_style = style.replace(" painting", "").replace("\xa0", " ").strip().lower()

    if clean_style == "none":
        return None
    else:
        return cleaned_style

In [6]:
def clean_date(year):
    cleaned_year = re.findall(r"\b\d{4}\b", year.lower())

    if len(cleaned_year) == 0:
        return None
    else:
        return int(cleaned_year[0])

In [7]:
wikiart_features_df = (
    (
        pl.read_parquet(RAW_DATA_PATH + "attributes/*.parquet")
        .drop("description", "filename", "image", "embeddings_pca512")
        .with_columns(
            pl.col("genre")
            .map_elements(lambda x: clean_genre(x), return_dtype=pl.String)
            .alias("type")
        )
        .with_columns(
            pl.col("style")
            .map_elements(lambda x: clean_style(x), return_dtype=pl.String)
            .replace({"none": None})
            .alias("style")
        )
        .with_columns(
            pl.col("title")
            .map_elements(lambda x: clean_title_name(x), return_dtype=pl.String)
            .alias("title")
        )
        .with_columns(
            pl.col("artist")
            .map_elements(lambda x: clean_artist_name(x), return_dtype=pl.String)
            .alias("artist")
        )
        .with_columns(
            pl.col("date")
            .map_elements(lambda x: clean_date(x), return_dtype=pl.Int64)
            .alias("date")
        )
    )
    .filter(~((pl.col("artist").str.len_bytes() <= 4) | (pl.col("title").str.len_bytes() <= 2)))
    .filter(~(pl.col("style").is_null() & pl.col("type").is_null()))
    .unique()
).select("title", "artist", "type", "style")

wikiart_features = wikiart_features_df.to_numpy()
wikiart_features_df

title,artist,type,style
str,str,str,str
"""peacocks looks""","""ion tuculescu""","""figurative""","""expressionism"""
"""aragats mount ara""","""martiros saryan""","""landscape""","""post-impressionism"""
"""chopin performing guest hall a…","""henryk siemiradzki""","""history""","""romanticism"""
"""la lutte""","""jacques villon""","""abstract""","""cubism"""
"""lilacs glass apple lemon""","""meijer de haan""","""still life""","""post-impressionism"""
…,…,…,…
"""les musiciens""","""nicolas de staël""","""figurative""","""lyrical abstraction"""
"""flute passage""","""jack bush""","""abstract""","""color field painting"""
"""john plampin""","""thomas gainsborough""","""portrait""","""rococo"""
"""reverie""","""ipolit strambu""","""nude""","""impressionism"""


In [16]:
wikiart_paintings_files = os.listdir(RAW_DATA_PATH + "wikiart/")

wikiart_features = []

for file in wikiart_paintings_files:
    artist_paintings = pl.read_json(RAW_DATA_PATH + "wikiart/" + file, infer_schema_length=1000)

    if artist_paintings.shape[0] != 0:
        artist_paintings = artist_paintings.select(
            "title", "artistName", "description", "style", "genre", "url", "artistUrl"
        )

        for col in artist_paintings.columns:
            artist_paintings = artist_paintings.with_columns(pl.col(col).cast(pl.String).alias(col))

        wikiart_features.append(artist_paintings)

wikiart_features_df = (
    pl.concat(wikiart_features)
    .with_columns(pl.col("title").alias("raw_title"))
    .with_columns(pl.col("artistName").alias("raw_artist"))
    .with_columns(
        pl.col("title")
        .map_elements(lambda x: clean_title_name(x), return_dtype=pl.String)
        .alias("title")
    )
    .with_columns(
        pl.col("artistName")
        .map_elements(lambda x: clean_artist_name(x), return_dtype=pl.String)
        .alias("artist")
    )
    .with_columns(
        pl.col("style").map_elements(lambda x: x.lower(), return_dtype=pl.String).alias("style")
    )
    .with_columns(
        pl.col("genre").map_elements(lambda x: x.lower(), return_dtype=pl.String).alias("genre")
    )
    .with_columns((pl.col("artistUrl") + "/" + pl.col("url")).alias("url"))
    .filter(~((pl.col("artist").str.len_bytes() <= 4) | (pl.col("title").str.len_bytes() <= 2)))
    .filter(~(pl.col("style").is_null() & pl.col("genre").is_null()))
    .rename({"genre": "type"})
    .unique()
    .select(
        "title",
        "artist",
        "style",
        "type",
        "description",
        "url",
        "raw_title",
        "raw_artist",
    )
)
wikiart_features = wikiart_features_df.to_numpy()
wikiart_features_df

title,artist,style,type,description,url,raw_title,raw_artist
str,str,str,str,str,str,str,str
"""still life self portrait""","""carlsen emil""","""realism""","""still life""",,"""emil-carlsen/still-life-with-s…","""Still Life with Self Portrait""","""Carlsen Emil"""
"""field marshal radetzky staff b…","""adam albrecht""","""romanticism""","""battle painting""",,"""albrecht-adam/field-marshal-ra…","""Field Marshal Radetzky and his…","""Adam Albrecht"""
"""children country road first st…","""brendekilde hans andersen""","""realism""","""genre painting""",,"""hans-andersen-brendekilde/chil…","""Children on a Country Road (Th…","""Brendekilde Hans Andersen"""
"""temptation st anthony detail""","""bosch hieronymus""","""northern renaissance""","""religious painting""",,"""hieronymus-bosch/the-temptatio…","""The Temptation of St Anthony (…","""Bosch Hieronymus"""
"""madame philippe panon desbassa…","""benoist marie guillemine""","""neoclassicism""","""portrait""",,"""marie-guillemine-benoist/madam…","""Madame Philippe Panon Desbassa…","""Benoist Marie-Guillemine"""
…,…,…,…,…,…,…,…
"""temptation st anthony detail""","""bosch hieronymus""","""northern renaissance""","""religious painting""",,"""hieronymus-bosch/the-temptatio…","""The Temptation of St. Anthony …","""Bosch Hieronymus"""
"""green stripe""","""calder alexander""","""abstract art""","""abstract""",,"""alexander-calder/the-green-str…","""The Green Stripe""","""Calder Alexander"""
"""abbe fanfreluche""","""beardsley aubrey""","""art nouveau (modern)""","""portrait""",,"""aubrey-beardsley/the-abbe-fanf…","""The Abbe Fanfreluche""","""Beardsley Aubrey"""
"""untitled""","""beksinski zdislav""","""surrealism""","""symbolic painting""",,"""zdzislaw-beksinski/untitled-46…","""Untitled""","""Beksinski Zdislav"""


#### 1.2. Web Gallery of Art data about the style and type of paintings

In [9]:
def rearrange_artist_name(name):
    divided_name = name.split(", ")

    if len(divided_name) == 1:
        return divided_name[0]

    elif len(divided_name) == 2:
        return divided_name[1] + " " + divided_name[0]

    else:
        return divided_name[1] + " " + divided_name[0] + " " + " ".join(divided_name[2:])

In [10]:
wga_features_df = (
    (
        pl.read_excel(RAW_DATA_PATH + "artists_wga.xlsx")
        .with_columns(
            pl.col("AUTHOR")
            .map_elements(
                lambda x: clean_artist_name(rearrange_artist_name(x)), return_dtype=pl.String
            )
            .alias("artist")
        )
        .with_columns(
            pl.col("TITLE")
            .map_elements(lambda x: clean_title_name(x), return_dtype=pl.String)
            .alias("title")
        )
        .with_columns(
            pl.col("TYPE")
            .replace(
                {
                    "historical": "history",
                    "other": None,
                    "still-life": "still life",
                    "study": "sketch and study",
                }
            )
            .alias("type")
        )
        .select("title", "artist", "type")
    )
    .filter(pl.col("type").is_not_null())
    .unique()
)

wga_features = wga_features_df.to_numpy()
wga_features_df

title,artist,type
str,str,str
"""virgin child""","""giovanni di lorenzo larciani""","""religious"""
"""interior view""","""georg moller""","""interior"""
"""christ handing keys st peter""","""pietro perugino""","""religious"""
"""marauders""","""willem cornelisz duyster""","""genre"""
"""polyptych resurrection""","""tiziano vecellio""","""religious"""
…,…,…
"""virgin appears st bernard""","""fra filippo lippi""","""religious"""
"""palazzo ducale venice""","""pierre auguste renoir""","""landscape"""
"""view apse calotte""","""jacopo torriti""","""religious"""
"""peasant sitting table""","""vincent van gogh""","""genre"""


#### 1.3. MET data

In [11]:
met_paintings_df = (
    pl.read_json(RAW_DATA_PATH + "met_paintings/met_paintings_data.json")
    .unique()
    .sort("id")
    .with_columns(pl.col("title").alias("raw_title"))
    .with_columns(pl.col("artist").alias("raw_artist"))
    .with_columns(
        pl.col("artist")
        .map_elements(lambda x: clean_artist_name(x), return_dtype=pl.String)
        .alias("artist")
    )

    .with_columns(
        pl.col("title")
        .map_elements(lambda x: clean_title_name(x), return_dtype=pl.String)
        .alias("title")
    )
    .sort("id")
)

met_paintings = met_paintings_df.to_numpy()
met_paintings_df

id,title,artist,year,description,raw_title,raw_artist
i64,str,str,i64,str,str,str
0,"""ship stormy sea""","""ivan konstantinovich aivazovsk…",1900,"""Aivazovsky was a celebrated pa…","""A Ship in a Stormy Sea""","""Ivan Konstantinovich Aivazovsk…"
1,"""saint giles christ triumphant …","""miguel alcañiz""",1413,"""These panels, from an altarpie…","""Saint Giles with Christ Triump…","""Miguel Alcañiz (or Miquel Alca…"
2,"""flora zephyr""","""jacopo amigoni""",1739,"""The composition celebrates the…","""Flora and Zephyr""","""Jacopo Amigoni"""
4,"""jérôme bonaparte 1784–1860 kin…","""giacomo andreoli""",1813,"""The following miniature is cle…","""Jérôme Bonaparte (1784–1860), …","""Giacomo Andreoli"""
5,"""saint alexander""","""fra angelico""",1430,"""This early work by Fra Angelic…","""Saint Alexander""","""Fra Angelico (Guido di Pietro)"""
…,…,…,…,…,…,…
2137,"""picquigny""","""frits thaulow""",1899,"""Thaulow earned great success w…","""Picquigny""","""Frits Thaulow"""
2138,"""bust length study man""","""françois auguste biard""",1848,"""Despite the nuanced depiction …","""Bust-Length Study of a Man""","""François-Auguste Biard"""
2139,"""man seated asleep""","""giuseppe abbati""",1870,"""This picture’s lack of pretens…","""A Man Seated and Asleep""","""Giuseppe Abbati"""
2140,"""rachel ruysch 1664–1750""","""michiel van musscher""",1692,"""Over a career that spanned mor…","""Rachel Ruysch (1664–1750)""","""Michiel van Musscher|Rachel Ru…"


### 2. Get style and type for MET paintings available in the Wikiart dataset

In [12]:
def match_painting(painting, painting_features, found_paintings):
    searched_title = painting[1]
    searched_artist = painting[2]

    title = painting_features[0]
    artist = painting_features[1]

    found_artist = searched_artist in artist or artist in searched_artist
    found_title = searched_title in title or title in searched_title

    if found_artist and found_title:
        found_paintings.append(
            {
                "id": painting[0],
                "title": painting[5],
                "artist": painting[6],
                "year": painting[3],
                "type": painting_features[2],
                "style": painting_features[3] if len(painting_features) >= 4 else None,
                "description": painting[4],
            }
        )

    return found_artist and found_title

In [13]:
found_paintings = []

for index, painting in enumerate(tqdm(met_paintings)):
    found = False

    for painting_features in wikiart_features:
        found = match_painting(painting, painting_features, found_paintings)
        if found:
            break

    if found:
        continue

    for painting_features in wga_features:
        found =  match_painting(painting, painting_features, found_paintings)
        if found:
            break

    if not found:
        found_paintings.append(
            {
                "id": painting[0],
                "title": painting[5],
                "artist": painting[6],
                "year": painting[3],
                "type": None,
                "style": None,
                "description": painting[4],
            }
        )

100%|██████████| 2053/2053 [03:17<00:00, 10.38it/s]


In [15]:
with open(INTERMEDIATE_DATA_PATH + "met_paintings/met_paintings_enhanced_data.json", "w") as f:
    json.dump(found_paintings, f, indent=4)