# Add style and type features to the retrieved paintings
This notebook adds the type and style to the retrieved paintings when these were not initially available. Both characteristics are retrieved from Wikiart.

### 0. Import libraries

In [None]:
import re
import json
import nltk 
import polars as pl
from tqdm import tqdm
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available before reading it.
nltk.download('stopwords')
# English stop words removed from painting titles. Stored as a frozenset so
# that the per-word membership tests run in constant time.
STOP_WORDS = frozenset(stopwords.words('english'))
# Relative path to the folder holding the raw data files.
DATA_PATH = "../../data/raw/"

### 1. Load data

#### 1.1. Load Wikiart data about the style and type of paintings

In [None]:
def clean_artist_name(artist):
    """Return a normalized version of an artist name.

    The name is lowercased, parenthesized segments are removed, only the part
    before the first "|" is kept, punctuation is replaced by spaces and runs
    of whitespace are collapsed into a single space.

    Args:
        artist: Raw artist string.

    Returns:
        The normalized artist name.
    """
    lowered = artist.lower()
    without_parentheticals = re.sub(r"\([^)]*\)", "", lowered)

    # Several artists may be listed separated by "|"; keep the first one only.
    primary_artist, _, _ = without_parentheticals.partition("|")

    spaced_out = re.sub(r"[.,\-!?;:()\[\]{}]", " ", primary_artist).strip()
    return re.sub(r"\s+", " ", spaced_out).strip()

In [None]:
def clean_title_name(title):
    """Return a normalized version of a painting title.

    The title is lowercased, non-breaking spaces become regular spaces,
    punctuation is replaced by spaces, whitespace runs are collapsed and the
    words listed in STOP_WORDS are dropped.

    Args:
        title: Raw title string.

    Returns:
        The remaining words joined by single spaces.
    """
    normalized = title.lower().replace("\xa0", " ")
    depunctuated = re.sub(r"[.,\-!?;:()\[\]{}]", " ", normalized).strip()
    words = re.sub(r"\s+", " ", depunctuated).strip().split(" ")

    meaningful_words = filter(lambda word: word not in STOP_WORDS, words)
    return " ".join(meaningful_words)

In [None]:
def clean_genre(genre):
    """Return a normalized genre label, or None for the "none" placeholder.

    The substrings " painting" and " (nu)" are removed, then the label is
    trimmed and lowercased.

    Args:
        genre: Raw genre string.

    Returns:
        The normalized genre, or None when it normalizes to "none".
    """
    label = genre.replace(" painting", "")
    label = label.replace(" (nu)", "")
    label = label.strip().lower()

    return None if label == "none" else label

In [None]:
def clean_style(style):
    """Return a normalized style label, or None for the "none" placeholder.

    The substring " painting" is removed, non-breaking spaces become regular
    spaces, then the label is trimmed and lowercased.

    Args:
        style: Raw style string.

    Returns:
        The normalized style, or None when it normalizes to "none".
    """
    cleaned_style = style.replace(" painting", "").replace("\xa0", " ").strip().lower()

    # Compare the cleaned value, not the function object itself, so that the
    # "none" placeholder is really turned into a null.
    if cleaned_style == "none":
        return None

    return cleaned_style

In [None]:
def clean_date(year):
    """Extract the first standalone four-digit number from a date string.

    Args:
        year: Raw date string.

    Returns:
        The first four-digit number as an int, or None when there is none.
    """
    first_year = re.search(r"\b\d{4}\b", year.lower())

    if first_year is None:
        return None

    return int(first_year.group())

In [None]:
# Load the Wikiart attribute files, drop the columns that are not needed here
# and normalize the text and date columns with the cleaning helpers. Each
# cleaned column replaces the original one in place.
paintings_features_df = (
    pl.read_parquet(DATA_PATH + "attributes/*.parquet")
    .drop("description", "filename", "image", "embeddings_pca512")
    .with_columns(
        pl.col("genre").map_elements(clean_genre, return_dtype=pl.String).alias("genre"),
        pl.col("style").map_elements(clean_style, return_dtype=pl.String).alias("style"),
        pl.col("title").map_elements(clean_title_name, return_dtype=pl.String).alias("title"),
        pl.col("artist").map_elements(clean_artist_name, return_dtype=pl.String).alias("artist"),
        pl.col("date").map_elements(clean_date, return_dtype=pl.Int64).alias("date"),
    )
    # Discard rows whose cleaned artist or title is very short.
    .filter(~((pl.col("artist").str.len_bytes() <= 4) | (pl.col("title").str.len_bytes() <= 2)))
    .unique()
)

# Row-wise array used by the matching loop further down.
paintings_features = paintings_features_df.to_numpy()
paintings_features_df

#### 1.2. Load MET data

In [None]:
# Untouched MET records: these values are the ones written to the output file.
original_met_paintings_df = pl.read_json(DATA_PATH + "met_paintings/met_paintings_data.json").unique()
original_met_paintings = original_met_paintings_df.to_numpy()

# Same records with artist and title normalized for matching. The frame is
# derived from the original one, so both arrays share the same row order.
met_paintings_df = original_met_paintings_df.with_columns(
    pl.col("artist").map_elements(clean_artist_name, return_dtype=pl.String).alias("artist"),
    pl.col("title").map_elements(clean_title_name, return_dtype=pl.String).alias("title"),
)
met_paintings = met_paintings_df.to_numpy()
met_paintings_df

### 2. Get style and type for MET paintings available in the Wikiart dataset

In [None]:
# Match every MET painting against the Wikiart rows and keep the type and
# style of the first row whose cleaned artist and title are contained in, or
# contain, the MET ones.
# NOTE(review): the positional indexes assume the MET columns are ordered
# (id, title, artist, year, description) and that the Wikiart rows hold title
# and artist at positions 0 and 1, and type and style at positions 3 and 4.
# Confirm against the actual schemas.
met_paintings_with_features = []

# `tqdm` goes first in the zip so that its iterator is the one exhausted at
# the end of the loop.
for met_painting, original_painting in zip(tqdm(met_paintings), original_met_paintings):
    painting_id = met_painting[0]
    title = met_painting[1]
    artist = met_painting[2]

    # An empty string is a substring of every string, so an empty cleaned
    # title or artist would produce spurious matches; a missing (None) value
    # would make the `in` tests raise a TypeError. Skip both cases.
    if not title or not artist:
        continue

    for painting_features in paintings_features:
        current_title = painting_features[0]
        current_artist = painting_features[1]

        same_artist = artist in current_artist or current_artist in artist
        same_title = title in current_title or current_title in title

        if same_artist and same_title:
            # Output the original (uncleaned) MET values, enriched with the
            # Wikiart type and style.
            met_paintings_with_features.append({
                "id": painting_id,
                "title": original_painting[1],
                "artist": original_painting[2],
                "year": original_painting[3],
                "type": painting_features[3],
                "style": painting_features[4],
                "description": original_painting[4],
            })
            # Only the first matching Wikiart row is used.
            break

print(f"{len(met_paintings_with_features)} / {len(met_paintings)} paintings have an attached style and type")

In [None]:
# Save the enriched MET paintings as an indented JSON file.
enhanced_data_path = DATA_PATH + "met_paintings/met_paintings_enhanced_data.json"

with open(enhanced_data_path, "w") as output_file:
    output_file.write(json.dumps(met_paintings_with_features, indent=4))