# Retrieve paintings and their descriptions from MET
This notebook retrieves European paintings and their descriptions from the Metropolitan Museum of Art.

### 0. Import libraries

In [None]:
import os
import json
import time
import random
import requests
import polars as pl
from tqdm import tqdm
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup

from preprocess_data_utils import *

### 1. Load data

In [None]:
# Folder that holds the MET export ("met_data.csv") and receives the downloaded painting images.
RAW_DATA_PATH = "../../data/raw/met_paintings/"
# Root folder of the intermediate datasets read and written by this notebook.
INTERMEDIATE_DATA_PATH = "../../data/intermediate/"

In [None]:
# Create the output folders up front. `os.makedirs(..., exist_ok=True)` also
# creates any missing parent folders, which `os.mkdir` cannot do, and it is a
# no-op when the folder already exists.
os.makedirs(RAW_DATA_PATH, exist_ok=True)
os.makedirs(INTERMEDIATE_DATA_PATH + "met_paintings/", exist_ok=True)

In [None]:
# Load the MET export; `ignore_errors=True` asks polars to keep reading when it
# meets values it cannot parse.
met_csv_path = RAW_DATA_PATH + "met_data.csv"
data = pl.read_csv(met_csv_path, ignore_errors=True)
data

In [None]:
# Keep only public-domain works of the "European Paintings" department whose
# end date lies strictly between MIN_YEAR and MAX_YEAR.
is_european_painting = pl.col("Department") == "European Paintings"
is_public_domain = pl.col("Is Public Domain") == True  # noqa: E712 (polars expression)
end_year = pl.col("Object End Date")
is_in_year_range = (end_year > MIN_YEAR) & (end_year < MAX_YEAR)

european_paintings = data.filter(
    is_european_painting & is_public_domain & is_in_year_range
)

# Only these four columns are needed downstream; the NumPy array is what the
# retrieval loop iterates over.
european_paintings_details_df = european_paintings.select(
    ["Title", "Artist Display Name", "Object End Date", "Link Resource"]
)
european_paintings_details = european_paintings_details_df.to_numpy()
european_paintings_details

In [None]:
# Display the selected painting details as a DataFrame.
european_paintings_details_df

### 2. Retrieve paintings

In [None]:
# Scrape every selected MET painting page: collect its textual description and,
# when the description is long enough, download and store the painting's image.

# Upper bound (in seconds) for each HTTP request, so that a single stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT_SECONDS = 30

# Pool of browser User-Agent strings; hoisted out of the loop because it never changes.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
]

painting_id = -1
paintings_data = []

for title, artist, year, painting_url in tqdm(european_paintings_details):
    headers = {"User-Agent": random.choice(USER_AGENTS)}

    try:
        response = requests.get(
            painting_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException:
        # Best-effort scraping: a network failure on one page must not abort
        # the run and discard everything collected so far.
        continue
    finally:
        # Short pause after every page request, successful or not.
        time.sleep(0.1)

    # Skip anything that is not a 2xx response.
    if not 200 <= response.status_code < 300:
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    description = ""

    # `find` returns None when the element is missing from the page, so every
    # lookup is checked before its text is read.
    intro = soup.find(
        "div", {"class": "artwork__intro__desc js-artwork__intro__desc"}
    )
    brief_description = intro.get_text().strip() if intro is not None else ""
    if brief_description:
        description += brief_description + "\n"

    # The catalogue entry is optional; when present its text is appended.
    catalogue_section = soup.find("section", {"id": "catalogue-entry"})
    catalogue_body = (
        catalogue_section.find("div", class_="show-more__body js-show-more__body")
        if catalogue_section is not None
        else None
    )
    if catalogue_body is not None:
        description += catalogue_body.get_text().strip()

    # Only paintings with a long-enough cleaned description are kept.
    if len(clean_description(description).split(" ")) < MIN_DESCRIPTION_WORD_COUNT:
        continue

    image_meta = soup.find("meta", {"property": "og:image"})
    image_url = image_meta.get("content") if image_meta is not None else None
    if not image_url:
        continue

    try:
        image_response = requests.get(image_url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException:
        continue
    if not 200 <= image_response.status_code < 300:
        # Do not store an error page as if it were an image.
        continue
    image = image_response.content

    # The id is assigned only once both description and image are available,
    # so ids stay contiguous and every id has a matching image file.
    painting_id += 1

    paintings_data.append(
        {
            "id": painting_id,
            "title": title,
            "artist": artist,
            "year": year,
            "description": description,
        }
    )

    # NOTE(review): the bytes are written unchanged under a ".png" name, whatever
    # format the server returned; the name is kept so existing consumers still work.
    with open(RAW_DATA_PATH + f"{painting_id}.png", "wb") as handler:
        handler.write(image)

In [None]:
# Preview the bytes currently held in `image`; the name is only defined once
# the retrieval loop has downloaded at least one image.
Image.open(BytesIO(image))

### 3. Keep artworks with long-enough descriptions

In [None]:
# Build the processed table from the scraped records: drop duplicate rows,
# order by id, keep the scraped title/artist as raw_* columns and overwrite
# title/artist with their cleaned versions.
scraped_df = pl.DataFrame(paintings_data).unique().sort("id")

# All expressions read the columns of `scraped_df`, so the raw_* copies hold the
# uncleaned values. New columns are appended in the order listed, which keeps
# the column layout: id, title, artist, year, description, raw_title, raw_artist.
met_paintings_df = scraped_df.with_columns(
    pl.col("title").alias("raw_title"),
    pl.col("artist").alias("raw_artist"),
    pl.col("artist")
    .map_elements(clean_artist_name, return_dtype=pl.String)
    .alias("artist"),
    pl.col("title")
    .map_elements(clean_title_name, return_dtype=pl.String)
    .alias("title"),
)

processed_csv_path = INTERMEDIATE_DATA_PATH + "met_paintings/met_processed.csv"
met_paintings_df.write_csv(processed_csv_path)
met_paintings = met_paintings_df.to_numpy()
met_paintings_df

### 4. Get style and type from Wikidata and the Web Gallery of Art
This step requires the processed data from both of these sources.

In [None]:
# Load the processed WikiArt table and keep a NumPy copy for row-wise matching.
wikiart_csv_path = INTERMEDIATE_DATA_PATH + "wikiart_paintings/wikiart_processed.csv"
wikiart_features_df = pl.read_csv(wikiart_csv_path)
wikiart_features = wikiart_features_df.to_numpy()
wikiart_features_df

In [None]:
# Load the processed WGA table and keep a NumPy copy for row-wise matching.
wga_csv_path = INTERMEDIATE_DATA_PATH + "wga_paintings/wga_processed.csv"
wga_features_df = pl.read_csv(wga_csv_path)
wga_features = wga_features_df.to_numpy()
wga_features_df

In [None]:
def match_painting(painting, painting_features, found_paintings):
    """Append `painting` to `found_paintings` if it matches `painting_features`.

    The match is decided by `is_same_painting`, called with `painting[1]`,
    `painting[2]`, `painting_features[0]` and `painting_features[1]`
    (presumably title/artist pairs -- confirm against `is_same_painting`).

    Args:
        painting: Indexable row; positions 0 and 3-6 are copied into the record
            as id, year, description, title and artist respectively.
        painting_features: Indexable row; position 2 becomes "type" and
            position 3, when the row has at least four entries, becomes "style".
        found_paintings: List that receives the new record (mutated in place).

    Returns:
        The value returned by `is_same_painting`.
    """
    is_match = is_same_painting(
        painting[1], painting[2], painting_features[0], painting_features[1]
    )

    # Nothing to record when the two rows describe different paintings.
    if not is_match:
        return is_match

    has_style = len(painting_features) >= 4
    record = {
        "id": painting[0],
        "title": painting[5],
        "artist": painting[6],
        "year": painting[3],
        "type": painting_features[2],
        "style": painting_features[3] if has_style else None,
        "description": painting[4],
    }
    found_paintings.append(record)

    return is_match

In [None]:
def _first_matching_features(painting, candidate_features):
    """Return the first row of `candidate_features` that matches `painting`, or None."""
    return next(
        (
            features
            for features in candidate_features
            if is_same_painting(painting[1], painting[2], features[0], features[1])
        ),
        None,
    )


# Enrich every MET painting with the style / type of the first matching row in
# the WikiArt and WGA tables; a field stays None when no match is found.
found_paintings = []

for painting in tqdm(met_paintings):
    painting_style = None
    coarse_painting_type = None
    fine_grained_painting_type = None

    # WikiArt rows supply the style and the fine-grained type.
    wikiart_match = _first_matching_features(painting, wikiart_features)
    if wikiart_match is not None:
        painting_style = wikiart_match[3]
        fine_grained_painting_type = wikiart_match[2]

    # WGA rows supply the coarse type.
    wga_match = _first_matching_features(painting, wga_features)
    if wga_match is not None:
        coarse_painting_type = wga_match[2]

    found_paintings.append(
        {
            "id": painting[0],
            "title": painting[5],
            "artist": painting[6],
            "year": painting[3],
            "coarse_type": coarse_painting_type,
            "fine_grained_type": fine_grained_painting_type,
            "style": painting_style,
            "description": painting[4],
        }
    )

# Written once, after the loop: dumping the whole list on every iteration
# rewrote the same growing file again and again for an identical final result.
with open(INTERMEDIATE_DATA_PATH + "met_paintings/met_paintings_enhanced_data.json", "w") as f:
    json.dump(found_paintings, f, indent=4)