In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects; the scraping cell relies on it
# to show a progress bar while applying `scrape` to every URI.
tqdm.pandas()
import json
import re
import requests
from bs4 import BeautifulSoup
import time
import random
from sklearn.model_selection import train_test_split

In [None]:
def _as_list(value):
    """Return ``value`` unchanged if it is a list, otherwise wrap it in one."""
    return value if isinstance(value, list) else [value]


def _pad(values, size):
    """Truncate or right-pad the list ``values`` with NaN to exactly ``size`` items."""
    return (values + [np.nan] * size)[:size]


def _load_json_ld(soup):
    """Return the page's first JSON-LD object as a dict, or ``None`` if absent/invalid."""
    tag = soup.find("script", type="application/ld+json")
    if tag is None:
        return None
    txt = tag.string or tag.get_text()
    # Keep only the span from the first "{" to the last "}" so that anything
    # wrapped around the JSON object inside the <script> tag is discarded.
    try:
        js = json.loads(txt[txt.find("{"):txt.rfind("}") + 1])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return js if isinstance(js, dict) else None


def scrape(url, timeout=30):
    """Download one page and extract film metadata as a flat ``pd.Series``.

    The page is fetched with a browser-like User-Agent and, after a successful
    response, the function sleeps for a random 1-3 seconds to throttle
    successive calls. Fields are read from three places in the HTML:

    * ``<meta name="description">``            -> ``Plot``
    * ``<p class="text-link text-footer">``    -> ``Duration`` (first ``<n> min`` match)
    * first ``<script type="application/ld+json">`` -> directors, genres,
      countries, release year and aggregate rating

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    timeout : float or tuple, default 30
        Forwarded to ``requests.get``. Without it a stalled connection would
        block forever and hang a long batch run.

    Returns
    -------
    pd.Series
        Keys ``Plot``, ``Duration``, ``Director1``-``Director2``,
        ``Genre1``-``Genre3``, ``Country1``-``Country3``, ``Year``,
        ``RatingAvg``, ``RatingCount``. Any field that cannot be found is
        ``np.nan``. ``Year`` is the raw ``startDate`` value of the first
        ``releasedEvent`` entry (not converted to a number).

    Raises
    ------
    requests.HTTPError
        On any 4xx/5xx response (including 429 rate limiting).
    requests.exceptions.Timeout
        If the server does not answer within ``timeout``.
    """
    # 1) GET with a User-Agent, then a short pause
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()              # raises HTTPError on 429 or other 4xx/5xx
    time.sleep(random.uniform(1, 3))  # 1-3 s pause between requests

    soup = BeautifulSoup(r.text, "html.parser")

    # 2) Plot: content of the description meta tag, NaN when tag/attribute is missing
    meta = soup.find("meta", {"name": "description"})
    content = meta.get("content") if meta is not None else None
    plot = content.strip() if isinstance(content, str) else np.nan

    # 3) Duration in minutes, parsed from the footer paragraph
    duration = np.nan
    footer = soup.find("p", class_="text-link text-footer")
    if footer is not None:
        m = re.search(r"(\d+)\s*min", footer.get_text())
        if m:
            duration = int(m.group(1))

    # 4) JSON-LD: an empty dict makes every lookup below fall back to NaN
    js = _load_json_ld(soup) or {}

    # Directors: keep the names of dict entries only, at most two
    directors = _pad(
        [p.get("name", np.nan) for p in _as_list(js.get("director", [])) if isinstance(p, dict)],
        2,
    )

    # Genres: at most three
    genres = _pad(_as_list(js.get("genre", [])), 3)

    # Countries: entries may be dicts carrying a "name" or plain values; at most three
    countries = _pad(
        [
            c.get("name", np.nan) if isinstance(c, dict) else c
            for c in _as_list(js.get("countryOfOrigin", []))
        ],
        3,
    )

    # Year: startDate of the first release event, when present
    year = np.nan
    released = js.get("releasedEvent", [])
    if isinstance(released, list) and released and isinstance(released[0], dict):
        year = released[0].get("startDate", np.nan)

    # Rating: ignore an aggregateRating that is not an object
    agg = js.get("aggregateRating", {})
    if not isinstance(agg, dict):
        agg = {}
    rating_avg = agg.get("ratingValue", np.nan)
    rating_count = agg.get("ratingCount", np.nan)

    # 5) Result
    return pd.Series({
        "Plot": plot,
        "Duration": duration,
        "Director1": directors[0],
        "Director2": directors[1],
        "Genre1": genres[0],
        "Genre2": genres[1],
        "Genre3": genres[2],
        "Country1": countries[0],
        "Country2": countries[1],
        "Country3": countries[2],
        "Year": year,
        "RatingAvg": rating_avg,
        "RatingCount": rating_count
    })


In [None]:
# Load the ratings export, give the columns the names used in the rest of the
# notebook, and keep only the URI, the title and the personal rating.
ratings = (
    pd.read_csv('letterboxd_data/ratings.csv')
    .rename(columns={"Name": "Title", "Letterboxd URI": "URI", "Rating": "MyRating"})
    .loc[:, ["URI", "Title", "MyRating"]]
)

In [4]:
# Scrape every rated URI (one HTTP request per row, so this cell is slow) and
# append the scraped fields as extra columns next to the original ratings.
new_cols = ratings.URI.progress_apply(scrape)
df = pd.concat(objs=[ratings, new_cols], axis="columns")

  0%|          | 0/1715 [00:00<?, ?it/s]

100%|██████████| 1715/1715 [1:22:18<00:00,  2.88s/it]


In [None]:
# Keep rows whose duration is between 60 and 240 minutes (rows with a missing
# duration are dropped too) and whose three genre slots never say 'TV Movie',
# then renumber the index from zero.
df = (
    df.loc[lambda d: d['Duration'].between(60, 240)]
      .loc[lambda d: ~d[['Genre1', 'Genre2', 'Genre3']].eq('TV Movie').any(axis=1)]
      .reset_index(drop=True)
)

# Persist the full filtered table, with all scraped columns.
df.to_csv('ml_data/full.csv', index=False)

In [36]:
# Narrow the table to the modelling columns; only the first director and the
# first country are kept, under the shorter names 'Director' and 'Country'.
df = df.loc[
    :,
    ['URI', 'Title', 'Year', 'Plot', 'Director1', 'Genre1', 'Genre2', 'Genre3',
     'Country1', 'RatingAvg', 'RatingCount', 'MyRating'],
].rename(columns={'Director1': 'Director', 'Country1': 'Country'})

In [None]:
# Hold out exactly 200 rows for testing. An integer `test_size` is taken by
# scikit-learn as an absolute row count. Expressing it as the fraction
# 200/len(df) is fragile: the fraction is converted back with a ceiling, so
# floating-point error can give 201 rows, and an empty frame divides by zero.
train, test = train_test_split(df, test_size=200, random_state=99)

train.to_csv('ml_data/train.csv', index=False)
test.to_csv('ml_data/test.csv', index=False)