# Retrieve paintings and their descriptions from MET
This notebook retrieves public-domain European paintings dated from the 13th to the 20th century, together with their descriptions, from the Metropolitan Museum of Art (MET).

### 0. Import libraries

In [None]:
import os
import json
import time
import random
import requests
import polars as pl
from tqdm import tqdm
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup

### 1. Load data

In [None]:
# Directory used for both the input CSV (met_objects.csv) and everything this
# notebook writes: the downloaded images and met_paintings_data.json.
# The trailing slash matters: file names are appended by string concatenation.
DATA_PATH = "../../data/raw/met_paintings/"

In [None]:
# Make sure the data directory exists. makedirs also creates any missing
# parent directories (os.mkdir would raise FileNotFoundError for those), and
# exist_ok=True makes re-running the cell a no-op.
os.makedirs(DATA_PATH, exist_ok=True)

In [None]:
# Read the MET objects table from the data directory. ignore_errors=True is
# passed through to polars so that parse errors do not abort the load.
met_objects_csv = DATA_PATH + "met_objects.csv"
data = pl.read_csv(met_objects_csv, ignore_errors=True)

# Show the first rows as the cell output.
data.head()

In [None]:
# Row filters: European Paintings department, public domain only, and an
# object end date in the range 1201 <= year < 2001.
in_department = pl.col("Department") == "European Paintings"
is_public_domain = pl.col("Is Public Domain") == True
ends_from_1201 = pl.col("Object End Date") >= 1201
ends_before_2001 = pl.col("Object End Date") < 2001

european_paintings = data.filter(
    in_department & is_public_domain & ends_from_1201 & ends_before_2001
)

# Keep only the columns the scraping loop unpacks, in this exact order:
# (title, artist, year, page URL).
detail_columns = ["Title", "Artist Display Name", "Object End Date", "Link Resource"]
european_paintings_details = european_paintings.select(detail_columns).to_numpy()

# Show the resulting array as the cell output.
european_paintings_details

In [None]:
# Scrape every selected painting page: build a description from the intro
# blurb and/or the catalogue entry, download the page's og:image, and record
# the metadata. Paintings without any description, without an image, or whose
# requests fail are skipped instead of aborting the whole run.
paintings_data = []

# Browser-like User-Agent strings; one is chosen at random for each page request.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
]

# Seconds to wait on a single HTTP request; without a timeout a stalled
# connection would block the loop indefinitely.
request_timeout = 30

# start=1 gives each painting the id "row position + 1", so ids (and image
# file names) stay tied to the row even when earlier rows are skipped.
for painting_id, (title, artist, year, painting_url) in enumerate(
    tqdm(european_paintings_details), start=1
):
    headers = {'User-Agent': random.choice(user_agents)}

    try:
        response = requests.get(painting_url, headers=headers, timeout=request_timeout)
    except requests.RequestException:
        # Covers connection errors, timeouts and malformed/missing URLs.
        response = None

    # Pause after every page request, successful or not.
    time.sleep(1)

    if response is None or not 200 <= response.status_code < 300:
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    description = ""

    # Short intro text; the div may be absent, in which case find() returns None.
    intro = soup.find("div", {"class": "artwork__intro__desc js-artwork__intro__desc"})
    brief_description = intro.get_text().strip() if intro is not None else ""
    if brief_description:
        description += brief_description + "\n"

    # Longer catalogue entry; either the section or its body div may be absent.
    catalogue_section = soup.find("section", {"id": "catalogue-entry"})
    catalogue_body = (
        catalogue_section.find("div", class_="show-more__body js-show-more__body")
        if catalogue_section is not None
        else None
    )
    if catalogue_body is not None:
        catalog_entry = catalogue_body.get_text().strip()
        description += catalog_entry

    if not description:
        continue

    # The image URL is taken from the page's Open Graph metadata.
    image_tag = soup.find("meta", {"property": "og:image"})
    image_url = image_tag.get("content") if image_tag is not None else None
    if not image_url:
        continue

    try:
        image_response = requests.get(image_url, timeout=request_timeout)
        # Do not store an HTTP error body as if it were an image.
        image_response.raise_for_status()
    except requests.RequestException:
        continue
    image = image_response.content

    paintings_data.append(
        {
            "id": painting_id,
            "title": title,
            "artist": artist,
            "year": year,
            "description": description,
        }
    )

    # NOTE(review): the bytes are written exactly as served under a .png name;
    # the actual image format is not checked here.
    with open(DATA_PATH + f"{painting_id}.png", "wb") as handler:
        handler.write(image)

    # The JSON file is rewritten after each saved painting, so the results
    # collected so far are on disk if the run is interrupted.
    with open(DATA_PATH + "met_paintings_data.json", 'w') as json_file:
        json.dump(paintings_data, json_file, indent=4)

In [None]:
# Preview the most recently downloaded image: wrap its raw bytes in an
# in-memory buffer and let PIL decode it as the cell output.
image_buffer = BytesIO(image)
Image.open(image_buffer)