# Retrieve paintings and their descriptions from MET
This notebook retrieves public-domain European paintings dated from the 13th to the 20th century, together with their descriptions, from the Metropolitan Museum of Art (MET) website.

### 0. Import libraries

In [None]:
import os
import json
import time
import random
import requests
import polars as pl
from tqdm import tqdm
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup

from preprocess_data_utils import *

### 1. Load data

In [7]:
# Directory that holds the MET objects CSV and receives the downloaded images
# and the scraped metadata JSON. The trailing slash matters: file paths in this
# notebook are built by plain string concatenation.
RAW_DATA_PATH = "../../data/raw/met_paintings/"
# Directory that receives the processed CSV written at the end of the notebook.
INTERMEDIATE_DATA_PATH = "../../data/intermediate/met_paintings/"

In [None]:
# Create the data directories used by this notebook.
# os.makedirs also creates missing parent directories, and exist_ok=True makes
# the cell safe to re-run. The intermediate directory is created here too,
# because the final cell writes its CSV there.
for data_dir in (RAW_DATA_PATH, INTERMEDIATE_DATA_PATH):
    os.makedirs(data_dir, exist_ok=True)

In [None]:
# Load the MET objects table from the raw data directory.
# ignore_errors=True asks polars to keep reading when some values fail to parse.
met_objects_csv = f"{RAW_DATA_PATH}met_objects.csv"
data = pl.read_csv(met_objects_csv, ignore_errors=True)
data.head()

In [None]:
# Row predicates: European Paintings department, public-domain works only,
# and an end date in the range 1201-2000 inclusive (13th-20th centuries).
in_european_paintings = pl.col("Department") == "European Paintings"
is_public_domain = pl.col("Is Public Domain") == True  # noqa: E712 (polars expression)
end_date = pl.col("Object End Date")
ends_in_period = (end_date >= 1201) & (end_date < 2001)

european_paintings = data.filter(
    in_european_paintings & is_public_domain & ends_in_period
)

# Keep only the columns the scraper needs, as a DataFrame and as a NumPy array
# whose rows are unpacked by the retrieval loop.
european_paintings_details_df = european_paintings.select(
    ["Title", "Artist Display Name", "Object End Date", "Link Resource"]
)
european_paintings_details = european_paintings_details_df.to_numpy()
european_paintings_details

In [None]:
# Display the selected columns (title, artist, end date, link) as a polars DataFrame.
european_paintings_details_df

### 2. Retrieve paintings

In [None]:
# Browser-like User-Agent strings; one is picked at random for each page request.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
]
# Upper bound (seconds) for a single HTTP request, so a stalled connection
# cannot hang the whole scrape.
REQUEST_TIMEOUT_SECONDS = 30
# Pause (seconds) after every page request.
REQUEST_DELAY_SECONDS = 1

paintings_data = []

# Ids are the 1-based positions in `european_paintings_details`; paintings that
# are skipped (failed request, no description, no image) leave gaps in the ids.
for painting_id, (title, artist, year, painting_url) in enumerate(
    tqdm(european_paintings_details), start=1
):
    headers = {'User-Agent': random.choice(USER_AGENTS)}

    try:
        response = requests.get(
            painting_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException:
        # Connection error, timeout or unusable URL: skip this painting and
        # keep scraping instead of aborting the whole run.
        continue
    finally:
        # Runs on success and on failure, so every page request is followed by a pause.
        time.sleep(REQUEST_DELAY_SECONDS)

    # Only 2xx responses carry a page worth parsing.
    if not 200 <= response.status_code < 300:
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    description = ""

    # Short intro text; some pages have no such element, so check for None
    # instead of letting `.get_text()` raise AttributeError.
    intro_div = soup.find(
        "div", {"class": "artwork__intro__desc js-artwork__intro__desc"}
    )
    brief_description = intro_div.get_text().strip() if intro_div is not None else ""
    if brief_description:
        description += brief_description + "\n"

    # Optional long catalogue entry, appended after the intro when present.
    catalogue_section = soup.find("section", {"id": "catalogue-entry"})
    catalogue_body = (
        catalogue_section.find("div", class_="show-more__body js-show-more__body")
        if catalogue_section is not None
        else None
    )
    if catalogue_body is not None:
        description += catalogue_body.get_text().strip()

    # Paintings without any description are not stored.
    if not description:
        continue

    # The image URL is taken from the page's Open Graph metadata.
    image_meta = soup.find("meta", {"property": "og:image"})
    image_url = image_meta.get("content") if image_meta is not None else None
    if not image_url:
        continue

    try:
        image_response = requests.get(image_url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Reject error responses so an HTML error page is never saved as an image.
        image_response.raise_for_status()
    except requests.RequestException:
        continue

    # `image` intentionally stays a notebook-level name: the next cell previews it.
    image = image_response.content

    # Write the image first so the metadata never references a missing file.
    # NOTE: the bytes are saved exactly as downloaded under a ".png" name; the
    # real image format is whatever the server returned.
    with open(RAW_DATA_PATH + f"{painting_id}.png", "wb") as handler:
        handler.write(image)

    paintings_data.append(
        {
            "id": painting_id,
            "title": title,
            "artist": artist,
            "year": year,
            "description": description,
        }
    )

    # Rewrite the full JSON after every painting so an interrupted run keeps
    # everything collected so far.
    with open(RAW_DATA_PATH + "met_paintings_data.json", 'w') as json_file:
        json.dump(paintings_data, json_file, indent=4)

In [None]:
# Preview the most recently downloaded painting. `image` holds the raw bytes
# left over from the retrieval loop, so that cell must have run first.
image_buffer = BytesIO(image)
Image.open(image_buffer)

### 3. Store processed data as a CSV file

In [8]:
# Make sure the output directory exists before writing the processed CSV.
os.makedirs(INTERMEDIATE_DATA_PATH, exist_ok=True)

# Build the processed table from the scraped metadata:
#   * drop exact duplicate rows and order by id;
#   * keep the original strings as `raw_title` / `raw_artist`;
#   * overwrite `title` / `artist` with their cleaned versions.
# All expressions inside one `with_columns` read the input frame, so the raw_*
# copies capture the values from before cleaning.
# `clean_title_name` / `clean_artist_name` are presumably provided by the
# `preprocess_data_utils` star import -- TODO confirm.
met_paintings_df = (
    # RAW_DATA_PATH already ends with "/", so no extra separator is added.
    pl.read_json(RAW_DATA_PATH + "met_paintings_data.json")
    .unique()
    .sort("id")
    .with_columns(
        pl.col("title").alias("raw_title"),
        pl.col("artist").alias("raw_artist"),
        pl.col("title").map_elements(clean_title_name, return_dtype=pl.String),
        pl.col("artist").map_elements(clean_artist_name, return_dtype=pl.String),
    )
)

met_paintings_df.write_csv(INTERMEDIATE_DATA_PATH + "met_processed.csv")
met_paintings_df

id,title,artist,year,description,raw_title,raw_artist
i64,str,str,i64,str,str,str
0,"""ship stormy sea""","""ivan konstantinovich aivazovsk…",1900,"""Aivazovsky was a celebrated pa…","""A Ship in a Stormy Sea""","""Ivan Konstantinovich Aivazovsk…"
1,"""saint giles christ triumphant …","""miguel alcañiz""",1413,"""These panels, from an altarpie…","""Saint Giles with Christ Triump…","""Miguel Alcañiz (or Miquel Alca…"
2,"""flora zephyr""","""jacopo amigoni""",1739,"""The composition celebrates the…","""Flora and Zephyr""","""Jacopo Amigoni"""
4,"""jérôme bonaparte 1784–1860 kin…","""giacomo andreoli""",1813,"""The following miniature is cle…","""Jérôme Bonaparte (1784–1860), …","""Giacomo Andreoli"""
5,"""saint alexander""","""fra angelico""",1430,"""This early work by Fra Angelic…","""Saint Alexander""","""Fra Angelico (Guido di Pietro)"""
…,…,…,…,…,…,…
2137,"""picquigny""","""frits thaulow""",1899,"""Thaulow earned great success w…","""Picquigny""","""Frits Thaulow"""
2138,"""bust length study man""","""françois auguste biard""",1848,"""Despite the nuanced depiction …","""Bust-Length Study of a Man""","""François-Auguste Biard"""
2139,"""man seated asleep""","""giuseppe abbati""",1870,"""This picture’s lack of pretens…","""A Man Seated and Asleep""","""Giuseppe Abbati"""
2140,"""rachel ruysch 1664–1750""","""michiel van musscher""",1692,"""Over a career that spanned mor…","""Rachel Ruysch (1664–1750)""","""Michiel van Musscher|Rachel Ru…"
