In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from google.colab import drive
import time

def http_request(url, timeout=30):
  """Send a GET request to `url` with a browser-like User-Agent header.

  Args:
    url: Address to request.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, `requests.get` can block indefinitely on a stalled connection.

  Returns:
    The `requests.Response` object, whatever its status code.

  Raises:
    requests.exceptions.Timeout: If the server does not answer within `timeout` seconds.
  """
  headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}
  return requests.get(url, headers=headers, timeout=timeout)

In [3]:
# Username crawling function definitions
def get_usernames(page_idx, subfolder):
  """Scrape the usernames listed on one page of a "popular members" listing.

  Args:
    page_idx: Page number inserted into the listing URL.
    subfolder: Path fragment placed between "popular/" and "page/" in the URL,
      e.g. "" or "this/week/".

  Returns:
    A set of usernames found on the page. An empty set is returned when the
    request does not answer with HTTP 200.

  Raises:
    AssertionError: If the page has no element with class "person-table".
  """
  members_url = f"https://letterboxd.com/members/popular/{subfolder}page/{page_idx}"

  response = http_request(members_url)
  if response.status_code != 200:
    # Always hand back a set (never a list) so callers can union the result
    # without caring whether the request succeeded.
    return set()

  person_tables = BeautifulSoup(response.text, 'html.parser').find_all(class_="person-table")
  assert len(person_tables) > 0, f"No 'person-table' element found at {members_url}"

  name_links = person_tables[0].find_all("a", class_="name")
  # Drop the first and last character of each href; presumably hrefs look
  # like "/<username>/" -- TODO confirm against the live markup.
  return {link["href"][1:-1] for link in name_links}

def crawl_usernames(subfolder):
  """Collect the usernames from every page of one "popular members" listing.

  Pages are fetched in increasing order, starting at 1, until `get_usernames`
  returns an empty result.

  Args:
    subfolder: Listing selector forwarded to `get_usernames`.

  Returns:
    The set of all usernames gathered across the crawled pages.
  """
  print(f"  Crawling 'popular/{subfolder}' set")
  usernames = set()
  page_idx = 1

  while True:
    page_users = get_usernames(page_idx, subfolder)
    if not page_users:
      # An empty page marks the end of the listing (or a failed request).
      break
    usernames |= page_users
    page_idx += 1

  # page_idx points at the first page that yielded nothing, hence the -1.
  print(f"  Crawled {page_idx-1} pages, found {len(usernames)} unique users")
  return usernames

In [None]:
# Run this cell to crawl and store user IDs
drive.mount('/content/drive')

# Destination CSV. It is defined before crawling so that a missing or wrong path
# fails immediately instead of after the whole crawl has finished.
# This is the file the "Load user ids" cell reads; change it to store elsewhere.
output_path = "/content/drive/MyDrive/deep_learning_training/experiments/letterboxd/crawled/user_ids.csv"

usernames = set()
leaderboards = ["", "this/week/", "this/month/", "this/year/"]

print("Crawling usernames...")
for lb in leaderboards:
  # Union the leaderboards: the same user can appear in several of them.
  usernames |= crawl_usernames(lb)
print(f"Crawling complete. Found {len(usernames)} unique user_ids.")

# Sort so the CSV row order is deterministic (set iteration order is not).
df = pd.DataFrame(sorted(usernames), columns=["user_id"])
df.to_csv(output_path, index=False)

In [13]:
# User rating crawling function definitions
from itertools import repeat

def stars_to_score(stars):
  """Convert a star string such as "★★★½" into a numeric score.

  Every character counts as one point, except a trailing "½", which counts
  as half a point.

  Args:
    stars: Rating text; leading and trailing whitespace is ignored.

  Returns:
    The score: an int for whole-star strings, a float when the string ends
    in "½", and 0 for an empty or whitespace-only string.
  """
  # Scraped text may carry surrounding whitespace, which must not be counted.
  stars = stars.strip()
  score = len(stars)
  # endswith() is safe on an empty string, unlike indexing stars[-1].
  if stars.endswith("½"):
    score -= 0.5
  return score

def crawl_ratings_one_page(user_id, page_idx, max_retries=10, retry_delay=1):
  """Scrape the rated films on one page of a user's film list.

  Args:
    user_id: Username whose film list is crawled.
    page_idx: Page number of the film list.
    max_retries: How many times the page is requested before giving up.
    retry_delay: Seconds to wait between two attempts.

  Returns:
    A `(success, rows)` tuple. `success` is False when no attempt returned
    HTTP 200 or when the page contains no "poster-container" items; `rows` is
    then empty. Otherwise `rows` is a list of `(user_id, film_slug, score)`
    tuples, one per film on the page that has a rating (it may be empty if
    none of the films on the page are rated).
  """
  movies_url = f"https://letterboxd.com/{user_id}/films/page/{page_idx}"

  response = None
  for attempt in range(max_retries):
    try:
      candidate = http_request(movies_url)
    except requests.exceptions.RequestException:
      # Network errors (connection reset, timeout, ...) are retried exactly
      # like a non-200 answer instead of aborting the whole crawl.
      candidate = None
    if candidate is not None and candidate.status_code == 200:
      response = candidate
      break
    print(f"Request failed. Id: {user_id}. Page: {page_idx}.")
    # No point in waiting after the final attempt.
    if attempt < max_retries - 1:
      time.sleep(retry_delay)
  if response is None:
    return False, []

  posters = BeautifulSoup(response.text, 'html.parser').find_all("li", class_="poster-container")
  if len(posters) == 0:
    # A page without posters means we ran past the user's last page.
    return False, []

  page_ratings = []
  for poster in posters:
    rating_span = poster.find("span", class_="rating")
    if rating_span is None:
      # Film without a rating: skip it.
      continue
    slug = poster.select_one("div")["data-film-slug"]
    page_ratings.append((user_id, slug, stars_to_score(rating_span.get_text())))

  return True, page_ratings

def crawl_ratings_one_user(user_id):
  """Gather all ratings of one user by walking their film-list pages.

  Pages are crawled in increasing order, starting at 1, until
  `crawl_ratings_one_page` reports failure.

  Args:
    user_id: Username whose ratings are crawled.

  Returns:
    A set holding the rows returned by the successfully crawled pages.
  """
  ratings = set()
  page_idx = 1

  success, page_ratings = crawl_ratings_one_page(user_id, page_idx)
  while success:
    ratings.update(page_ratings)
    page_idx += 1
    success, page_ratings = crawl_ratings_one_page(user_id, page_idx)

  # page_idx is the first page that failed, so page_idx-1 pages were crawled.
  print(f" User: {user_id}. Movie pages: {page_idx-1}, Ratings: {len(ratings)}.")
  return ratings

In [None]:
# Load the previously crawled user ids from Google Drive
import pandas as pd
drive.mount('/content/drive')

input_path = "/content/drive/MyDrive/deep_learning_training/experiments/letterboxd/crawled/user_ids.csv"
df = pd.read_csv(input_path)
# Keep only the "user_id" column; later cells iterate over it.
user_ids = df.loc[:, "user_id"]
# Show how many user ids were loaded.
print(user_ids.size)

Mounted at /content/drive
11893


In [None]:
# Run this cell to crawl and store user ratings
import ipywidgets as widgets
import pandas as pd
drive.mount('/content/drive')

batch_size = 100
# Split the user ids into consecutive chunks; each chunk is written to its own CSV.
batches = [user_ids[i:i + batch_size] for i in range(0, len(user_ids), batch_size)]

for i, batch in enumerate(batches):
  print(f"Starting crawling batch {i}:")
  ratings = []
  # Size the bar by the actual batch length: the last batch may be shorter
  # than batch_size and would otherwise never reach 100%.
  progress = widgets.IntProgress(value=0, min=0, max=len(batch))
  percent = widgets.Label(value="0%")
  display(widgets.HBox([progress, percent]))

  for k, user_id in enumerate(batch, start=1):
    ratings.extend(crawl_ratings_one_user(user_id))
    progress.value = k
    # A real percentage, correct for any batch size (not only 100).
    percent.value = f"{100 * k // len(batch)}%"

  df = pd.DataFrame(ratings, columns=["user_id", "movie_id", "rating"])
  output_path = f"/content/drive/MyDrive/deep_learning_training/experiments/letterboxd/crawled/ratings/{i}.csv"
  df.to_csv(output_path, index=False)

In [18]:
# Run this cell to crawl hand-picked users
import pandas as pd
drive.mount('/content/drive')

output_path = f"/content/drive/MyDrive/deep_learning_training/experiments/letterboxd/crawled/ratings/special.csv"
special_users = ["drakman", "maszu", "f_frusciante", "bvterrcvp"]

# Gather the rating rows of every hand-picked user into one flat list.
ratings = []
for user in special_users:
  ratings.extend(crawl_ratings_one_user(user))

# One CSV row per rating.
df = pd.DataFrame(ratings, columns=["user_id", "movie_id", "rating"])
df.to_csv(output_path, index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 User: drakman. Movie pages: 21, Ratings: 1466.
 User: maszu. Movie pages: 30, Ratings: 1874.
 User: f_frusciante. Movie pages: 57, Ratings: 4041.
 User: bvterrcvp. Movie pages: 3, Ratings: 172.
