In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

def get_parsed_page(url: str, timeout: float = 30) -> "BeautifulSoup":
    """Download ``url`` and return its HTML parsed with the ``lxml`` parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a single stalled connection blocks a long
            scraping run indefinitely.

    Returns:
        A ``BeautifulSoup`` document built from the response body.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    # Browser-like headers: without them the request was blocked by Cloudflare.
    headers = {
        "referer": "https://letterboxd.com",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

def _fk_text_or_none(detail, selector):
    """Return the stripped text of the first node matching ``selector``, or None when nothing matches."""
    node = detail.select_one(selector)
    return node.get_text(strip=True) if node is not None else None


def fk_movie_popular_reviews(movie, n=None) -> "list[dict] | None":
    """Scrape the reviews listed under ``<movie url>/reviews/by/activity``.

    Args:
        movie: Mapping with a ``'url'`` entry pointing at the film page.
        n: Number of review pages to fetch. Page 1 is always fetched; when
            ``n`` is truthy, pages ``2..n`` are fetched as well.

    Returns:
        A list with one dict per ``film-detail`` element found, each holding
        the keys ``'stars'``, ``'review'`` and ``'date'`` (``None`` where the
        element is missing). Returns ``None`` when ``movie`` has no usable URL.
    """
    try:
        base = movie.get('url')
    except Exception as e:
        # movie is not a mapping (e.g. None from a failed parse): skip it.
        print(e, movie)
        return None
    if not base:
        # A missing URL used to blow up on string concatenation below.
        print("missing url", movie)
        return None

    # Stored URLs may end with "/"; strip it so the path has no "//" in it.
    base = base.rstrip("/")
    urls = [base + "/reviews/by/activity"]
    if n:
        urls.extend(base + f"/reviews/by/activity/page/{i}" for i in range(2, n + 1))

    ret = []
    for url in urls:
        page = get_parsed_page(url)
        for detail in page.find_all(class_='film-detail'):
            ret.append({
                'stars': _fk_text_or_none(detail, '.rating'),
                'review': _fk_text_or_none(detail, '.body-text'),
                # Previously a missing date reset 'review' to None and left
                # the 'date' key absent.
                'date': _fk_text_or_none(detail, '.date ._nobr'),
            })
    return ret

In [2]:
import os 

def find_repo_root(start_path):
    """
    Walk upwards from ``start_path`` and return the closest directory that
    looks like a repository root, so data paths resolve the same way on
    different machines.

    A directory qualifies when it contains a ``.git`` directory or a
    ``README.md`` file. Returns ``None`` if the filesystem root is reached
    without finding one.
    """
    candidate = os.path.abspath(start_path)
    previous = None

    # dirname() of the filesystem root is the root itself, so the walk ends
    # once the candidate stops changing.
    while candidate != previous:
        git_marker = os.path.join(candidate, '.git')
        readme_marker = os.path.join(candidate, 'README.md')
        if os.path.isdir(git_marker) or os.path.isfile(readme_marker):
            return candidate
        previous, candidate = candidate, os.path.dirname(candidate)

    return None

# Resolve the repository root once, starting from the current working directory;
# later cells build their data paths as f"{root}/Data/...".
# NOTE: find_repo_root returns None when no marker is found, in which case those
# paths silently become "None/Data/...".
root = find_repo_root(os.getcwd())

In [3]:
import ast

def fk_apply_literal(x):
    """Parse ``x`` as a Python literal, returning ``None`` when it cannot be parsed.

    Args:
        x: Usually the string form of a literal (e.g. a dict read from a CSV cell).

    Returns:
        The value produced by ``ast.literal_eval``, or ``None`` if evaluation
        raises (malformed text, non-string input such as ``None``/NaN, ...).
    """
    try:
        return ast.literal_eval(x)
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long-running apply().
        return None



# Load the movie table and turn the stringified ``Movie`` column into Python objects
# (cells that fail to parse become None).
movie_df = pd.read_csv(f"{root}/Data/tropes_year_movie.csv")
movie_df = movie_df.dropna(subset=['Movie'])
movie_df['Movie'] = movie_df['Movie'].apply(fk_apply_literal)
# Sanity check on the first remaining row. Use positional access: dropna keeps the
# original index labels, so label 0 is not guaranteed to exist any more.
print(movie_df['Movie'].iloc[0].get('url'))

https://letterboxd.com/film/puerta-de-hierro-el-exilio-de-peron/


In [4]:
from datetime import datetime
def get_reviews_all(df, chunksize=50, begin=0, length=None, n=8, output_file=None):
    """Scrape reviews for every movie in ``df`` and append them to a CSV, chunk by chunk.

    Each chunk is written as soon as it is scraped, so an interrupted run can be
    resumed by passing the last reported start position as ``begin``.

    Args:
        df: Frame with a ``Movie`` column holding stringified movie dicts.
        chunksize: Number of rows scraped between two writes to disk.
        begin: Positional row (in the cleaned frame) to start from.
        length: Positional row (exclusive) to stop at; a falsy value means
            "until the end of the cleaned frame".
        n: Number of review pages requested per movie.
        output_file: CSV path to append to. Defaults to
            ``{root}/Data/movie_n={n}_comments.csv``.

    Returns:
        None. Results are appended to ``output_file`` with an added
        ``comments`` column; the header is written only if the file does not
        exist yet.
    """
    # Clean the frame first. Rows whose Movie cell equals the literal "Movie"
    # are dropped (presumably stray header rows from appended CSVs - verify).
    # Duplicates must be removed before parsing because the parsed values are
    # not guaranteed to be hashable. The trailing copy() avoids assigning into
    # a view of the caller's frame.
    cleaned = (
        df[df['Movie'] != 'Movie']
        .dropna(subset=['Movie'])
        .drop_duplicates(keep='first')
        .copy()
    )
    cleaned['Movie'] = cleaned['Movie'].apply(fk_apply_literal)

    if not output_file:
        output_file = f'{root}/Data/movie_n={n}_comments.csv'

    if not length:
        length = len(cleaned)

    for start in range(begin, length, chunksize):
        # print start time
        current_time = datetime.now()
        print(f"started {start} to {start + chunksize}  at  " + current_time.strftime("%Y-%m-%d %H:%M:%S"))

        chunk = cleaned.iloc[start : start + chunksize].copy()
        # Honour the caller's page count (it used to be hard-coded to 8, so
        # ``n`` only affected the output file name).
        chunk['comments'] = chunk['Movie'].apply(lambda movie: fk_movie_popular_reviews(movie, n))
        chunk.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)

        # print end time
        current_time = datetime.now()
        print(f"finished {start} to {start + chunksize}  at  " + current_time.strftime("%Y-%m-%d %H:%M:%S"))


In [6]:
# Resume the scrape at positional row 8550 of the cleaned frame, writing to the
# default output CSV every 25 movies.
get_reviews_all(df = pd.read_csv(f"{root}/Data/tropes_year_movie.csv"), chunksize=25, begin=8550 )

started 8550 to 8575  at  2024-10-27 15:25:03
finished 8550 to 8575  at  2024-10-27 15:27:51
started 8575 to 8600  at  2024-10-27 15:27:51
finished 8575 to 8600  at  2024-10-27 15:31:34
started 8600 to 8625  at  2024-10-27 15:31:34
finished 8600 to 8625  at  2024-10-27 15:35:09
started 8625 to 8650  at  2024-10-27 15:35:09
finished 8625 to 8650  at  2024-10-27 15:39:02
started 8650 to 8675  at  2024-10-27 15:39:02
finished 8650 to 8675  at  2024-10-27 15:42:30
started 8675 to 8700  at  2024-10-27 15:42:30
finished 8675 to 8700  at  2024-10-27 15:46:15
started 8700 to 8725  at  2024-10-27 15:46:15
finished 8700 to 8725  at  2024-10-27 15:50:00
started 8725 to 8750  at  2024-10-27 15:50:00
finished 8725 to 8750  at  2024-10-27 15:53:42
started 8750 to 8775  at  2024-10-27 15:53:42
finished 8750 to 8775  at  2024-10-27 15:57:27
started 8775 to 8800  at  2024-10-27 15:57:27
finished 8775 to 8800  at  2024-10-27 16:01:02
started 8800 to 8825  at  2024-10-27 16:01:02
finished 8800 to 8825  a