In [19]:
import warnings

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [11]:
# Star-glyph strings mapped onto a 0.1-1.0 scale. Every half star is worth 0.1,
# so '½' -> 0.1 and '★★★★★' -> 1.0. Entries run from the highest rating to the
# lowest.
rating_map = {
    '★' * (half_stars // 2) + ('½' if half_stars % 2 else ''): half_stars / 10
    for half_stars in range(10, 0, -1)
}


def normalize_rating(star_rating):
    """Look up the numeric score for a star string.

    Returns the matching value from ``rating_map``, or ``None`` when
    ``star_rating`` is not one of its keys.
    """
    return rating_map.get(star_rating)

In [41]:
def scrape_letterboxd_reviews(movie_url, num_reviews, timeout=10):
    """Scrape rated reviews for one film from its paginated review listing.

    Pages ``{movie_url}/reviews/page/<n>/`` are requested in order, starting
    at page 1, until ``num_reviews`` reviews have been collected or the
    listing yields nothing more.

    Parameters
    ----------
    movie_url : str
        Film URL, e.g. ``'https://letterboxd.com/film/deadpool-wolverine'``.
        A trailing slash is tolerated.
    num_reviews : int
        Maximum number of reviews to collect.
    timeout : float, optional
        Seconds to wait for each HTTP response (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` (review text) and ``rating`` (score returned by
        ``normalize_rating``). May contain fewer than ``num_reviews`` rows;
        both columns exist even when nothing was collected.

    Raises
    ------
    requests.RequestException
        If a request fails at the network level or times out.

    Notes
    -----
    A non-success HTTP status stops the scrape with a warning and returns the
    rows gathered so far, rather than discarding them.
    """
    base_url = movie_url.rstrip('/')  # avoid '//reviews' when the caller passes a trailing slash
    reviews = []
    page = 1
    while len(reviews) < num_reviews:
        response = requests.get(f"{base_url}/reviews/page/{page}/", timeout=timeout)
        if not response.ok:
            # Do not parse an error page as if it were a review listing.
            warnings.warn(
                f"Stopping at page {page}: HTTP {response.status_code} from {response.url}",
                stacklevel=2,
            )
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        review_elements = soup.find_all('div', class_='film-detail-content')
        if not review_elements:
            # No review containers on this page (past the last page, or the
            # markup changed). Without this exit the loop would keep
            # requesting pages forever.
            break

        for element in review_elements:
            if len(reviews) >= num_reviews:
                break

            # NOTE(review): a multi-class string passed to class_ is matched
            # against the exact class attribute value, so this lookup is
            # sensitive to class order - confirm against the live markup.
            review_text_element = element.find('div', class_='body-text -prose collapsible-text')
            review_text = review_text_element.get_text(strip=True) if review_text_element else None

            rating_element = element.find('span', class_='rating')
            star_rating = rating_element.get_text(strip=True) if rating_element else None

            # Only keep reviews whose star string maps to a known score, so
            # the rating column never contains None/NaN.
            rating = normalize_rating(star_rating) if star_rating else None
            if review_text and rating is not None:
                reviews.append({'review': review_text, 'rating': rating})

        page += 1

    # Explicit columns keep `reviews_df.rating` usable on an empty result.
    return pd.DataFrame(reviews, columns=['review', 'rating'])

In [42]:
# Example usage: scrape reviews for a single film and preview the first rows.
# Running this cell issues live HTTP requests (one per listing page).
movie_url = 'https://letterboxd.com/film/deadpool-wolverine'  # Replace with actual movie URL
num_reviews = 300  # number of reviews requested from the scraper
reviews_df = scrape_letterboxd_reviews(movie_url, num_reviews)
# Uncomment the next line to persist the scraped reviews to CSV:
#reviews_df.to_csv('data/reviews.csv', index=False)
reviews_df.head()

Unnamed: 0,review,rating
0,If it's not too meta is it even a Deadpool fil...,0.5
1,Genuinely good fight scenes I know it’s Stefan...,0.7
2,Ooh I been waiting a long time fo dis,1.0
3,Very funny. The plot was hard to follow but so...,0.8
4,who else replayed the end cus of wolverine,0.8


In [43]:
# Sanity check: list the distinct ratings to confirm that the rating map
# produced only known values and no NaN.
np.sort(reviews_df['rating'].unique())

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])