<a href="https://colab.research.google.com/github/AnsarYesma/letterboxd_watchlist_picker/blob/main/lb_rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [124]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import requests
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

!pip install bs4 aiohttp

def load_data(file_path):
    df = pd.read_csv(file_path)
    return df



Applied Z-score normalisation

In [125]:
def preprocess_data(df):
    df = df.rename(columns={"user_id": "uid", "movie_id": "fid", "rating_val": "rating"})
    df["rating"] = (df["rating"] - df["rating"].mean()) / df["rating"].std()

    uid_mapping = {uid: i for i, uid in enumerate(df["uid"].unique())}
    fid_mapping = {fid: i for i, fid in enumerate(df["fid"].unique())}

    df["uid_index"] = df["uid"].map(uid_mapping)
    df["fid_index"] = df["fid"].map(fid_mapping)
    return df, uid_mapping, fid_mapping

The dataset is sparse so we use csr

In [126]:
def build_matrix(df, num_users, num_films):
    matrix = csr_matrix(
        (df["rating"], (df["uid_index"], df["fid_index"])),
        shape=(num_users, num_films)
    )
    return matrix

SVD is often used for collaborative filtering

In [127]:
def train_svd(matrix, n_components=50):
    svd = TruncatedSVD(n_components=n_components)
    latent_matrix = svd.fit_transform(matrix)
    return svd, latent_matrix

This part deals with functions responsible for scraping data about the user.

In [128]:
import requests

from bs4 import BeautifulSoup
import aiohttp
import asyncio

def scrape_letterboxd_diary(username):
    url = f"https://letterboxd.com/{username}/films/diary/"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    films = []
    for entry in soup.select("tr.diary-entry-row"):
        title_tag = entry.select_one(".headline-3 a")
        fid = title_tag["href"].split("/")[-2] if title_tag and title_tag["href"] else None

        rating_tag = entry.select_one(".td-rating input")
        rating = rating_tag['value'] if rating_tag else None

        if fid and rating:
          films.append({"fid": fid, "rating": rating, "uid": username})

    return films[:10]

import aiohttp
import asyncio

async def fetch_page(session, url):
    async with session.get(url) as response:
        return await response.text()

async def scrape_watched(username):
    all_fids = []
    base_url = f"https://letterboxd.com/{username}/films/"

    async with aiohttp.ClientSession() as session:
        # Get total pages
        first_page_html = await fetch_page(session, base_url)
        soup = BeautifulSoup(first_page_html, "html.parser")

        try:
            page_count = int(soup.select("li.paginate-page")[-1].text) + 1
        except IndexError:
            page_count = 2

        tasks = [fetch_page(session, f"{base_url}page/{page}/") for page in range(1, page_count)]
        pages_html = await asyncio.gather(*tasks)

        for html in pages_html:
            soup = BeautifulSoup(html, "html.parser")
            fids = [li.div["data-film-slug"] for li in soup.select("li.poster-container")]
            all_fids.extend(fids)

    return all_fids

async def scrape_watchlist(username):
    all_fids = []
    base_url = f"https://letterboxd.com/{username}/watchlist/"

    async with aiohttp.ClientSession() as session:
        first_page_html = await fetch_page(session, base_url)
        soup = BeautifulSoup(first_page_html, "html.parser")

        try:
            page_count = int(soup.select("li.paginate-page")[-1].text) + 1
        except IndexError:
            page_count = 2

        tasks = [fetch_page(session, f"{base_url}page/{page}/") for page in range(1, page_count)]
        pages_html = await asyncio.gather(*tasks)

        for html in pages_html:
            soup = BeautifulSoup(html, "html.parser")
            fids = [li.div["data-film-slug"] for li in soup.select("li.poster-container")]
            all_fids.extend(fids)

    return all_fids

async def scrap_lb(username):
  last_20 = scrape_letterboxd_diary(username)
  watched = await scrape_watched(username)
  watchlist = await scrape_watchlist(username)
  return last_20, watched, watchlist

This function makes scraped data usable

In [129]:
def create_new_user_vector(scraped_user_data, fid_mapping, num_latent_factors=50):
    new_user_df = pd.DataFrame(scraped_user_data)
    new_user_df["fid_index"] = new_user_df["fid"].map(fid_mapping)
    new_user_df = new_user_df.dropna(subset=["fid_index"])

    new_user_vector = np.zeros(num_latent_factors)

    for _, row in new_user_df.iterrows():
        try:
            fid_index = int(row["fid_index"])
            new_user_vector[fid_index] = row["rating"]
        except IndexError:
            pass

    return new_user_vector

This function finds the most appropriate movie recommendations with a bias towards films in user's watchlist

In [130]:
def generate_list_films(user_vector, latent_matrix, fid_mapping, watched_films, boost_fids=[], boost_factor=2):
    similarities = np.dot(latent_matrix, user_vector)

    for fid in boost_fids:
        if fid in fid_mapping and fid_mapping[fid] < len(similarities):
            similarities[fid_mapping[fid]] *= boost_factor

    recommended_film_indices = np.argsort(similarities)[::-1]
    recommended_fids = [
        list(fid_mapping.keys())[i] for i in recommended_film_indices
        if list(fid_mapping.keys())[i] not in watched_films
    ][:10]
    return recommended_fids

In [131]:
async def recommend_films(username, latent_matrix, fid_mapping, boost_factor=1.5):
  last_20, watched, watchlist = await scrap_lb(username)
  new_user_vector = create_new_user_vector(last_20, fid_mapping)
  return generate_list_films(new_user_vector, latent_matrix, fid_mapping, watched, watchlist, boost_factor)

This part prepares the model to be used. We will be using this dataset https://www.kaggle.com/datasets/samlearner/letterboxd-movie-ratings-data


In [132]:
data_file = "/content/drive/MyDrive/lb_recs/ratings_export.csv"
df = load_data(data_file)
df, uid_mapping, fid_mapping = preprocess_data(df)
num_users, num_films = len(uid_mapping), len(fid_mapping)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

rating_matrix = build_matrix(train_df, num_users, num_films)
svd, latent_matrix = train_svd(rating_matrix)

Here you must input your letterboxd username

In [None]:
username = input()
recommendations = await recommend_films(username, latent_matrix, fid_mapping)
print(f"🎬 Recommendations: {recommendations}")

anturgan


The next part is to showcase the accurasy of the model. Average results:
RMSE: 0.807
MAE: 0.628

In [None]:
y_true = []
y_pred = []

for _, row in test_df.iterrows():
    uid_idx = row["uid_index"]
    fid_idx = row["fid_index"]

    if uid_idx < latent_matrix.shape[0] and fid_idx < latent_matrix.shape[1]:
        pred_rating = np.dot(latent_matrix[uid_idx], svd.components_[:, fid_idx])
        y_pred.append(pred_rating)
        y_true.append(row["rating"])

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)

print(f"Model Performance:")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")