# Add style and type features to the retrieved paintings
This notebook enriches the retrieved MET paintings with their type and style when these were not available initially. These characteristics are retrieved from Wikiart, with the Web Gallery of Art used as a fallback source for the type.

### 0. Import libraries

In [None]:
import re
import os


import json


import nltk
import polars as pl



from tqdm import tqdm


from nltk.corpus import stopwords



# Fetch the NLTK stop-word corpus so that `stopwords.words` below can be read.
nltk.download("stopwords")


# English stop words that `clean_title_name` removes from painting titles.
STOP_WORDS = stopwords.words("english")


# Base folders for input and output data; paths are relative and are
# concatenated with plain `+`, so they must keep their trailing slash.
RAW_DATA_PATH = "../../data/raw/"
INTERMEDIATE_DATA_PATH = "../../data/intermediate/"

### 1. Load data

#### 1.1. Load Wikiart data about the style and type of paintings

In [None]:
def clean_artist_name(artist):
    """Normalise an artist name so that names from different sources can be compared.

    The name is lower-cased, parenthesised fragments are removed, only the part
    before the first "|" is kept, punctuation is replaced by spaces and runs of
    whitespace are collapsed into a single space.
    """
    lowered = artist.lower()
    without_parentheticals = re.sub(r"\([^)]*\)", "", lowered)

    # Only the first artist of a "|"-separated list is kept.
    primary_artist, _, _ = without_parentheticals.partition("|")

    spaced_out = re.sub(r"[.,\-!?;:()\[\]{}]", " ", primary_artist)
    return re.sub(r"\s+", " ", spaced_out.strip()).strip()

In [None]:
def clean_title_name(title):
    """Normalise a painting title so that titles from different sources can be compared.

    The title is lower-cased, non-breaking spaces become regular spaces,
    punctuation is replaced by spaces, whitespace is collapsed and every word
    found in STOP_WORDS is dropped. The remaining words are joined by a single
    space.
    """
    normalized = title.lower().replace("\xa0", " ")
    depunctuated = re.sub(r"[.,\-!?;:()\[\]{}]", " ", normalized).strip()
    words = re.sub(r"\s+", " ", depunctuated).strip().split(" ")

    return " ".join(word for word in words if word not in STOP_WORDS)

In [None]:
def clean_genre(genre):
    """Normalise a genre label.

    Removes the " painting" and " (nu)" fragments (case-sensitively, before
    lower-casing), trims surrounding whitespace and lower-cases the result.
    Returns None when the normalised label is the literal "none".
    """
    normalized = genre.replace(" painting", "")
    normalized = normalized.replace(" (nu)", "")
    normalized = normalized.strip().lower()

    return None if normalized == "none" else normalized

In [None]:
def clean_style(style):
    """Normalise a style label.

    Removes the " painting" fragment (case-sensitively, before lower-casing),
    turns non-breaking spaces into regular spaces, trims surrounding
    whitespace and lower-cases the result.

    Args:
        style: Raw style label.

    Returns:
        The normalised label, or None when it is the literal "none".
    """
    cleaned_style = style.replace(" painting", "").replace("\xa0", " ").strip().lower()

    # Compare the cleaned value, not the function object `clean_style`:
    # the latter is never equal to a string, so "none" used to leak through.
    if cleaned_style == "none":
        return None

    return cleaned_style

In [None]:
def clean_date(year):
    """Extract the first standalone four-digit number from a date string.

    Returns the number as an int, or None when the string contains no
    standalone four-digit number.
    """
    first_year = re.search(r"\b\d{4}\b", year.lower())

    return int(first_year.group(0)) if first_year else None

In [None]:
# Build the Wikiart feature table from the parquet attribute files: clean the
# text columns, drop unusable rows, de-duplicate, and keep only
# (title, artist, style, type) in that order.
# NOTE(review): the following cell reassigns both `wikiart_features_df` and
# `wikiart_features`, so this result is overwritten when the notebook is run
# top to bottom - confirm which of the two sources is the intended one.
wikiart_features_df = (
    (
        pl.read_parquet(RAW_DATA_PATH + "attributes/*.parquet")
        .drop("description", "filename", "image", "embeddings_pca512")
        # The cleaned genre is stored in a new `type` column.
        .with_columns(
            pl.col("genre")
            .map_elements(lambda x: clean_genre(x), return_dtype=pl.String)
            .alias("type")
        )
        # Any literal "none" left after cleaning is mapped to null.
        .with_columns(
            pl.col("style")
            .map_elements(lambda x: clean_style(x), return_dtype=pl.String)
            .replace({"none": None})
            .alias("style")
        )
        .with_columns(
            pl.col("title")
            .map_elements(lambda x: clean_title_name(x), return_dtype=pl.String)
            .alias("title")
        )
        .with_columns(
            pl.col("artist")
            .map_elements(lambda x: clean_artist_name(x), return_dtype=pl.String)
            .alias("artist")
        )
        # `date` is cleaned here but is not part of the final `select` below.
        .with_columns(
            pl.col("date")
            .map_elements(lambda x: clean_date(x), return_dtype=pl.Int64)
            .alias("date")
        )
    )
    # Discard rows whose cleaned artist is at most 4 bytes long or whose cleaned
    # title is at most 2 bytes long.
    .filter(~((pl.col("artist").str.len_bytes() <= 4) | (pl.col("title").str.len_bytes() <= 2)))
    # Discard rows that carry neither a style nor a type.
    .filter(~(pl.col("style").is_null() & pl.col("type").is_null()))
    .unique()
).select("title", "artist", "style", "type")

# Row-wise array used for the positional matching further down.
wikiart_features = wikiart_features_df.to_numpy()
wikiart_features_df

In [None]:
# Build the Wikiart feature table from the JSON files found in the `wikiart/`
# raw-data folder (each file is read into the `artist_paintings` frame).
wikiart_paintings_files = os.listdir(RAW_DATA_PATH + "wikiart/")

wikiart_features = []

for file in wikiart_paintings_files:
    artist_paintings = pl.read_json(RAW_DATA_PATH + "wikiart/" + file, infer_schema_length=1000)

    # Files that yield no rows are skipped.
    if artist_paintings.shape[0] != 0:
        artist_paintings = artist_paintings.select(
            "title", "artistName", "description", "style", "genre", "url", "artistUrl"
        )

        # Cast every column to String so that all frames share one schema
        # before they are concatenated.
        for col in artist_paintings.columns:
            artist_paintings = artist_paintings.with_columns(pl.col(col).cast(pl.String).alias(col))

        wikiart_features.append(artist_paintings)

wikiart_features_df = (
    pl.concat(wikiart_features)
    # Keep the original title and artist next to their cleaned versions.
    .with_columns(pl.col("title").alias("raw_title"))
    .with_columns(pl.col("artistName").alias("raw_artist"))
    .with_columns(
        pl.col("title")
        .map_elements(lambda x: clean_title_name(x), return_dtype=pl.String)
        .alias("title")
    )
    .with_columns(
        pl.col("artistName")
        .map_elements(lambda x: clean_artist_name(x), return_dtype=pl.String)
        .alias("artist")
    )
    .with_columns(
        pl.col("style")
        .map_elements(lambda x: x.lower(), return_dtype=pl.String)
        .alias("style")
    )
    .with_columns(
        pl.col("genre")
        .map_elements(lambda x: x.lower(), return_dtype=pl.String)
        .alias("genre")
    )
    # `url` becomes "<artistUrl>/<url>".
    .with_columns((pl.col("artistUrl") + "/" + pl.col("url")).alias("url"))
    # Discard rows whose cleaned artist is at most 4 bytes long or whose cleaned
    # title is at most 2 bytes long.
    .filter(~((pl.col("artist").str.len_bytes() <= 4) | (pl.col("title").str.len_bytes() <= 2)))
    # Discard rows that carry neither a style nor a genre.
    .filter(~(pl.col("style").is_null() & pl.col("genre").is_null()))
    .rename({"genre": "type"})
    .unique()
    # The column order matters: the rows are later read by position
    # (0 = title, 1 = artist, 2 = style, 3 = type, ...).
    .select(
        "title",
        "artist",
        "style",
        "type",
        "description",
        "url",
        "raw_title",
        "raw_artist",
    )
)
# From here on `wikiart_features` is the row-wise array, no longer the list of
# per-file frames built above.
wikiart_features = wikiart_features_df.to_numpy()
wikiart_features_df

#### 1.2. Load Web Gallery of Art data about the style and type of paintings

In [None]:
def rearrange_artist_name(name):
    """Swap the first two ", "-separated parts of an artist name.

    A name without ", " is returned unchanged. Otherwise the second part is
    placed before the first one, and any further parts are appended after
    them, all separated by single spaces.
    """
    parts = name.split(", ")

    if len(parts) == 1:
        return parts[0]

    first_part, second_part, *extras = parts
    reordered = second_part + " " + first_part

    if extras:
        reordered += " " + " ".join(extras)

    return reordered

In [None]:
# Build the Web Gallery of Art feature table: cleaned title and artist plus a
# `type` label; this source provides no style column.
wga_features_df = (
    (
        pl.read_excel(RAW_DATA_PATH + "artists_wga.xlsx")
        # Author names are rearranged around their ", " separator before the
        # usual artist cleaning is applied.
        .with_columns(
            pl.col("AUTHOR")
            .map_elements(
                lambda x: clean_artist_name(rearrange_artist_name(x)), return_dtype=pl.String
            )
            .alias("artist")
        )
        .with_columns(
            pl.col("TITLE")
            .map_elements(lambda x: clean_title_name(x), return_dtype=pl.String)
            .alias("title")
        )
        # Rename some type labels; "other" becomes null and is filtered out below.
        # NOTE(review): presumably the replacements align the labels with the
        # Wikiart vocabulary - confirm.
        .with_columns(
            pl.col("TYPE")
            .replace(
                {
                    "historical": "history",
                    "other": None,
                    "still-life": "still life",
                    "study": "sketch and study",
                }
            )
            .alias("type")
        )
        # The column order matters: the rows are later read by position
        # (0 = title, 1 = artist, 2 = type).
        .select("title", "artist", "type")
    )
    .filter(pl.col("type").is_not_null())
    .unique()
)

# Row-wise array used for the positional matching further down.
wga_features = wga_features_df.to_numpy()
wga_features_df

#### 1.3. Load MET data

In [None]:
# Load the MET paintings, keep the original title and artist in `raw_title` and
# `raw_artist`, and replace `title` and `artist` with their cleaned versions.
# NOTE(review): the matching code reads these rows by position (indices 0-6);
# the positions depend on the column order of the JSON file, which is not
# visible here - confirm that `raw_title` and `raw_artist` land at 5 and 6.
met_paintings_df = (
    pl.read_json(RAW_DATA_PATH + "met_paintings/met_paintings_data.json")
    .unique()
    .sort("id")
    .with_columns(pl.col("title").alias("raw_title"))
    .with_columns(pl.col("artist").alias("raw_artist"))
    .with_columns(
        pl.col("artist")
        .map_elements(lambda x: clean_artist_name(x), return_dtype=pl.String)
        .alias("artist")
    )

    .with_columns(
        pl.col("title")
        .map_elements(lambda x: clean_title_name(x), return_dtype=pl.String)
        .alias("title")
    )
    # NOTE(review): second sort on `id`; looks redundant with the one above.
    .sort("id")
)


# Row-wise array used for the positional matching further down.
met_paintings = met_paintings_df.to_numpy()
met_paintings_df

### 2. Get style and type for MET paintings available in the Wikiart or Web Gallery of Art datasets

In [None]:
def match_painting(painting, raw_painting, painting_features, found_paintings):
    """Record a MET painting when it matches a reference row of painting features.

    Two rows match when one artist name contains the other and one title
    contains the other (substring test in either direction).

    Args:
        painting: MET row holding the cleaned title at index 1 and the cleaned
            artist at index 2.
        raw_painting: MET row supplying the output fields: id (index 0),
            year (3), description (4), title (5) and artist (6).
        painting_features: Reference row starting with the cleaned title and
            cleaned artist. Rows with at least four fields are read as
            (title, artist, style, type, ...); shorter rows are read as
            (title, artist, type).
        found_paintings: List that receives the enriched record on a match
            (mutated in place).

    Returns:
        True if the rows match and a record was appended, False otherwise.
    """
    searched_title = painting[1]
    searched_artist = painting[2]

    title = painting_features[0]
    artist = painting_features[1]

    # An empty string is a substring of every string and None cannot be used
    # with `in`, so a missing title or artist must never count as a match.
    if not (searched_title and searched_artist and title and artist):
        return False

    found_artist = searched_artist in artist or artist in searched_artist
    found_title = searched_title in title or title in searched_title

    if not (found_artist and found_title):
        return False

    # The feature tables put `style` before `type` when both are present
    # (Wikiart rows), while three-field rows (WGA) only carry a `type`.
    if len(painting_features) >= 4:
        painting_style = painting_features[2]
        painting_type = painting_features[3]
    else:
        painting_style = None
        painting_type = painting_features[2]

    found_paintings.append(
        {
            "id": raw_painting[0],
            "title": raw_painting[5],
            "artist": raw_painting[6],
            "year": raw_painting[3],
            "type": painting_type,
            "style": painting_style,
            "description": raw_painting[4],
        }
    )

    return True

In [None]:
# Enrich every MET painting with the features of the first matching reference
# row: Wikiart is searched first and the Web Gallery of Art is only used as a
# fallback when Wikiart has no match.
found_paintings = []

for painting in tqdm(met_paintings):
    found = False

    for painting_features in wikiart_features:
        # `match_painting` takes four arguments. The MET rows carry both the
        # cleaned and the raw (`raw_title`, `raw_artist`) columns, so the same
        # row is passed as `painting` and as `raw_painting`.
        # NOTE(review): assumes the raw columns sit at indices 5 and 6 of the
        # MET rows - confirm against the JSON schema.
        if match_painting(painting, painting, painting_features, found_paintings):
            found = True
            break

    if found:
        continue

    for painting_features in wga_features:
        if match_painting(painting, painting, painting_features, found_paintings):
            break

print(f"Enhanced with additional features {len(found_paintings)}/{len(met_paintings)} paintings.")

In [None]:
# Write the enriched MET records to the intermediate data folder as indented JSON.
enhanced_data_path = INTERMEDIATE_DATA_PATH + "met_paintings/met_paintings_enhanced_data.json"

with open(enhanced_data_path, "w") as output_file:
    json.dump(found_paintings, output_file, indent=4)