In [35]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

def get_parsed_page(url: str, timeout: float = 30) -> "BeautifulSoup":
    """Download ``url`` and parse the response body with the lxml parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        The parsed document. The HTTP status code is not checked, so an
        error page is parsed like any other response.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    # Sending a referer and a browser-like user-agent works around a
    # "blocked by Cloudflare" error seen with the default headers.
    headers = {
        "referer": "https://letterboxd.com",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

def _text_or_none(detail, selector):
    """Return the stripped text of the first node matching ``selector``, or None if nothing matches."""
    node = detail.select_one(selector)
    return node.get_text(strip=True) if node is not None else None


def fk_movie_popular_reviews(movie, n=None) -> list:
    """Scrape the reviews of one movie, ordered by activity.

    Args:
        movie: Mapping with a ``'url'`` entry holding the movie's page URL.
        n: Number of numbered review pages (``page/1`` .. ``page/n``) to
            fetch in addition to the unnumbered first page. ``None`` or 0
            fetches only the unnumbered page.

    Returns:
        A list with one dict per ``film-detail`` element found, with keys
        ``'stars'``, ``'review'`` and ``'date'``. A value is ``None`` when the
        corresponding element is absent. Returns an empty list when the
        movie has no URL.
    """
    base = movie.get('url')
    if not base:
        # Nothing to scrape; avoids a TypeError from concatenating None with a str.
        return []
    # The URL may already end with '/', which would produce '//reviews/...'.
    base = base.rstrip('/')

    urls = [base + "/reviews/by/activity"]
    if n:
        # NOTE(review): page/1 presumably serves the same content as the
        # unnumbered URL above, which would duplicate the first page of
        # reviews -- confirm against the site before changing the range.
        urls.extend(base + f"/reviews/by/activity/page/{i}" for i in range(1, n + 1))

    ret = []
    for url in urls:
        page = get_parsed_page(url)
        for detail in page.find_all(class_='film-detail'):
            ret.append({
                'stars': _text_or_none(detail, '.rating'),
                'review': _text_or_none(detail, '.body-text'),
                # A missing date sets 'date' to None and leaves 'review' untouched.
                'date': _text_or_none(detail, '.date ._nobr'),
            })
    return ret

In [3]:
import os 

def find_repo_root(start_path):
    """
    Walk upward from start_path and return the nearest directory that looks like a
    repo root (it contains a .git directory or a README.md file), so repo-relative
    file paths resolve the same way on different machines.
    Returns None when no such directory exists on the way up.
    """
    def looks_like_root(directory):
        # Either marker is enough to treat the directory as the root.
        return (os.path.isdir(os.path.join(directory, '.git'))
                or os.path.isfile(os.path.join(directory, 'README.md')))

    candidate = os.path.abspath(start_path)
    while not looks_like_root(candidate):
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # dirname() no longer changes the path: the filesystem root was reached
            return None
        candidate = parent
    return candidate

# Resolve the repo root once, starting from the kernel's working directory;
# the data-loading cells build their file paths from this value.
# find_repo_root returns None when no marker is found, in which case
# f"{root}/..." paths would silently start with "None/".
root = find_repo_root(os.getcwd())

In [36]:
import ast

# Load the movie table and drop rows that have no 'Movie' entry.
movie_df = pd.read_csv(f"{root}/Data/tropes_year_movie.csv")
movie_df = movie_df.dropna(subset=['Movie'])
# The 'Movie' column is stored as Python-literal text; literal_eval parses it
# back into objects without executing arbitrary code.
movie_df['Movie'] = movie_df['Movie'].apply(ast.literal_eval)
# Positional lookup: after dropna() the index label 0 may no longer exist,
# so the label-based movie_df['Movie'][0] could raise KeyError.
print(movie_df['Movie'].iloc[0].get('url'))

https://letterboxd.com/film/puerta-de-hierro-el-exilio-de-peron/


In [52]:
# Trial run of the scraper on the first four movies only (12 numbered pages each).
small_df = movie_df.head(4).copy()
small_df['comments'] = small_df['Movie'].apply(
    lambda movie: fk_movie_popular_reviews(movie, 12)
)

In [None]:
# Load the full movie table as one chain: read the CSV, discard rows whose
# 'year' cell is the literal text 'year', drop rows without a 'Movie' entry,
# then parse the 'Movie' text back into Python objects.
df = (
    pd.read_csv(f"{root}/Data/tropes_year_movie.csv")
    .loc[lambda frame: frame['year'] != 'year']
    .dropna(subset=['Movie'])
    .assign(Movie=lambda frame: frame['Movie'].apply(ast.literal_eval))
)


def get_reviews_all(df, chunksize=50, begin=0, length=None, output_file=f'{root}/Data/movie_n=5_comments.csv', n_pages=8):
    """Scrape reviews for the movies in ``df`` chunk by chunk, appending each chunk to a CSV.

    Writing after every chunk means an interrupted run keeps what was already
    scraped; pass ``begin`` to resume from a later row.

    Args:
        df: Frame with a ``'Movie'`` column of movie mappings accepted by
            ``fk_movie_popular_reviews``.
        chunksize: Number of rows scraped between two writes.
        begin: Row position to start from.
        length: Row position to stop at (exclusive). ``None`` means the end
            of ``df``.
        output_file: CSV path to append to. The header row is written only
            when the file does not exist yet.
        n_pages: Page count passed to ``fk_movie_popular_reviews`` for every
            movie.
    """
    if length is None:
        # range() cannot take None as its stop value; default to every row.
        length = len(df)

    for start in range(begin, length, chunksize):
        # Clamp so the last chunk does not run past the requested end.
        stop = min(start + chunksize, length)
        print(f"started {start} to {stop}")
        chunk = df.iloc[start:stop].copy()
        chunk['comments'] = chunk['Movie'].apply(lambda x: fk_movie_popular_reviews(x, n_pages))
        chunk.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)
        print(f"finished {start} to {stop}")
