In [None]:
import os
import json
import requests
import polars as pl
from PIL import Image
from io import BytesIO
from tqdm import tqdm
from bs4 import BeautifulSoup

In [None]:
# Root directory for raw data. Paths in this notebook are built with plain string
# concatenation (DATA_PATH + "name"), so the trailing slash is required.
DATA_PATH = "../../data/raw/"

In [None]:
# Ensure the met_paintings/ directory exists under DATA_PATH.
# exist_ok=True keeps the cell safe to re-run, and makedirs (unlike mkdir) also
# creates any missing parent directories instead of raising FileNotFoundError.
os.makedirs(DATA_PATH + "met_paintings/", exist_ok=True)

In [None]:
# Load the Met objects table from the raw-data directory.
# ignore_errors=True asks polars to keep reading when some values fail to parse.
met_data = pl.read_csv(
    f"{DATA_PATH}met_objects.csv",
    ignore_errors=True,
)
met_data.head()

In [None]:
# Keep public-domain rows of the "European Paintings" department whose
# "Object End Date" lies strictly between 1201 and 2001.
met_european_paintings = met_data.filter(
    (pl.col("Department") == "European Paintings")
    & (pl.col("Is Public Domain") == True)
    & (pl.col("Object End Date") > 1201)
    & (pl.col("Object End Date") < 2001)
)

# One row per painting, in the column order the scraping loop unpacks:
# title, artist, end date, page URL.
european_paintings_urls = met_european_paintings.select(
    ["Title", "Artist Display Name", "Object End Date", "Link Resource"]
).to_numpy()
european_paintings_urls

In [None]:
# Scrape each painting page for its description text and preview image.
# A painting is recorded (metadata entry + image file) only when some description
# text was found and the page exposes an og:image URL.
paintings_data = []

# Images are stored in the met_paintings/ directory under DATA_PATH rather than
# directly in DATA_PATH; make sure it exists so this cell can run on its own.
IMAGES_PATH = DATA_PATH + "met_paintings/"
os.makedirs(IMAGES_PATH, exist_ok=True)

# Without a timeout, requests.get can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30  # seconds

# NOTE: only the first 5 rows are processed; painting_id is the row's position in that slice.
for painting_id, (title, artist, year, painting_url) in enumerate(tqdm(european_paintings_urls[:5])):
    # Fetch the painting page; an HTTP error status aborts the run.
    response = requests.get(painting_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    description = ""

    # The intro blurb is optional: a page without it must not crash the loop.
    intro_node = soup.find("div", {"class": "artwork__intro__desc js-artwork__intro__desc"})
    brief_description = intro_node.get_text().strip() if intro_node is not None else ""
    if brief_description:
        description += brief_description + "\n"

    # The catalogue entry is optional too; append its text when both the section
    # and its body div are present.
    catalog_section = soup.find("section", {"id": "catalogue-entry"})
    catalog_body = (
        catalog_section.find("div", class_="show-more__body js-show-more__body")
        if catalog_section is not None
        else None
    )
    if catalog_body is not None:
        description += catalog_body.get_text().strip()

    # Paintings without any description text are skipped entirely.
    if not description:
        continue

    # Skip pages that do not advertise a preview image instead of raising TypeError.
    image_meta = soup.find("meta", {"property": "og:image"})
    image_url = image_meta.get("content") if image_meta is not None else None
    if not image_url:
        continue

    # Check the status so an error page body is never saved as an image.
    image_response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
    image_response.raise_for_status()
    image = image_response.content

    paintings_data.append({"id": painting_id, "title": title, "artist": artist, "year": year, "description": description})

    # The bytes are written unchanged under a .png name; the actual image format
    # is whatever the server returned.
    with open(IMAGES_PATH + f"{painting_id}.png", "wb") as handler:
        handler.write(image)

In [None]:
# Persist the collected painting records as 4-space-indented JSON under DATA_PATH.
with open(f"{DATA_PATH}met_paintings_data.json", "w") as out_file:
    json.dump(paintings_data, fp=out_file, indent=4)

In [None]:
# Preview the most recently downloaded image. `image` is the raw bytes left over
# from the scraping loop, so that cell must have stored at least one painting.
image_buffer = BytesIO(image)
Image.open(image_buffer)