In [26]:
import ast
import pandas as pd
import re
import os
from bs4 import BeautifulSoup
from letterboxdpy.movie import Movie
from datetime import datetime
import concurrent.futures
import requests
import numpy as np
import random
import time

# Setup git root
def find_repo_root(start_path):
    """Walk upwards from ``start_path`` and return the first repository-like directory.

    A directory qualifies when it contains a ``.git`` directory or a
    ``README.md`` file. The search starts at the absolute form of
    ``start_path`` and stops at the filesystem root.

    Returns:
        The absolute path of the first qualifying directory, or ``None`` when
        no ancestor qualifies.
    """
    candidate = os.path.abspath(start_path)
    previous = None
    # dirname() of the filesystem root is the root itself, which ends the loop.
    while candidate != previous:
        has_git_dir = os.path.isdir(os.path.join(candidate, '.git'))
        if has_git_dir or os.path.isfile(os.path.join(candidate, 'README.md')):
            return candidate
        previous, candidate = candidate, os.path.dirname(candidate)
    return None

# Resolve the repository root from the current working directory; every data
# path in this notebook is built relative to it.
root = find_repo_root(os.getcwd())
if root is None:
    # find_repo_root returns None when no ancestor qualifies; fail with a clear
    # message instead of an AttributeError from None.replace().
    raise FileNotFoundError(
        "Could not locate the repository root (a '.git' directory or a "
        f"'README.md' file) at or above {os.getcwd()!r}"
    )
# Normalise Windows separators so the '/'-joined paths below are uniform.
root = root.replace("\\", "/")

# Define Functions
def fk_apply_literal(x):
    """Evaluate ``x`` as a Python literal, returning ``None`` when that fails.

    Any exception raised by ``ast.literal_eval`` (malformed text, non-string
    input such as NaN, ...) is swallowed on purpose so the function can be
    applied to a whole column.
    """
    try:
        parsed = ast.literal_eval(x)
    except Exception:
        parsed = None
    return parsed
    
def get_parsed_page(url: str, timeout: float = 30.0) -> "BeautifulSoup":
    """Download ``url`` and return its body parsed with the ``lxml`` parser.

    Browser-like ``referer`` and ``user-agent`` headers are sent because
    requests without them were blocked by Cloudflare.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        A ``BeautifulSoup`` document built from the response text. The HTTP
        status is not checked, so error pages are parsed like any other page.
    """
    headers = {
        "referer": "https://letterboxd.com",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

In [27]:
## Load the scraped data ##
# Both files live under the repository's Scraped_Data folder: one holds the
# scraped comments per movie, the other the actors / roles / studios.
df_comments = pd.read_csv(
    f"{root}/Data/2020_trope_data/Scraped_Data/movie_n=10_comments.csv"
)
df_actors = pd.read_csv(
    f"{root}/Data/2020_trope_data/Scraped_Data/actors_roles_studios.csv"
)


In [28]:
# Left-join the cast/studio table onto the comments table so every row of
# df_comments is kept, matched on the search URL and the stringified movie dict.
df = df_comments.merge(df_actors, on=['letterboxd_search', 'Movie'], how='left')

In [29]:
# Preview the first two merged rows.
df.iloc[:2]

Unnamed: 0,letterboxd_search,Movie,Budget,comments,url,actors,roles,studio
0,https://letterboxd.com/search/ABCs+of+Death+2+...,"{'title': 'abcs-of-death-2', 'url': 'https://l...","BUDGET; UNKNOWN Gross worldwide $7,171","[{'stars': '★★★', 'review': 'To the creators o...",https://letterboxd.com/film/abcs-of-death-2/,"['Eric Jacobus', 'Julian Barratt', 'Ian Virgo'...","['Assassin (Segment ""Amateur"")', 'Peter Toland...","['Drafthouse Films', 'Timpson Films']"
1,https://letterboxd.com/search/A+Beautiful+Day+...,{'title': 'a-beautiful-day-in-the-neighborhood...,"BUDGET: $25,000,000 / €23,011,567 BOX OFFICE (...","[{'stars': '★★★', 'review': 'therapy: expensiv...",https://letterboxd.com/film/a-beautiful-day-in...,"['Matthew Rhys', 'Tom Hanks', 'Chris Cooper', ...","['Lloyd Vogel', 'Fred Rogers', 'Jerry Vogel', ...","['TriStar Pictures', 'Tencent Pictures', 'Big ..."


In [30]:
# Parse the stringified movie dicts loaded from CSV. Values that are already
# dicts are kept as they are, so re-running this cell no longer turns the whole
# column into None (ast.literal_eval rejects input that is neither a string nor
# an AST node, and fk_apply_literal maps that failure to None).
df['Movie'] = df['Movie'].apply(
    lambda movie: movie if isinstance(movie, dict) else fk_apply_literal(movie)
)


In [31]:
def prune_spoilers(reviews):
    """Return a new list of reviews with the spoiler-warning banner text removed."""
    spoiler_banner = "This review may contain spoilers.I can handle the truth."
    cleaned = []
    for text in reviews:
        cleaned.append(text.replace(spoiler_banner, ""))
    return cleaned

def extract_reviews(reviews_str):
    """Return the ``'review'`` text of every entry in a stringified list of dicts.

    ``reviews_str`` is evaluated with ``ast.literal_eval``. When that raises
    ``ValueError`` or ``SyntaxError`` (unparseable text, NaN, ...) an empty
    list is returned. Other errors, e.g. an entry without a ``'review'`` key,
    propagate to the caller.
    """
    try:
        parsed_reviews = ast.literal_eval(reviews_str)
        texts = []
        for entry in parsed_reviews:
            texts.append(entry['review'])
        return texts
    except (ValueError, SyntaxError):
        return []
    
def convert_to_1_10(rating_str):
    """Convert a star string such as ``'★★★½'`` to an integer score.

    Each ``'★'`` counts as one star and the presence of ``'½'`` adds half a
    star; the star total is doubled and returned as an ``int`` (so a string
    without any stars yields 0).
    """
    stars = rating_str.count('★')
    if '½' in rating_str:
        stars += 0.5
    return int(stars * 2)

def extract_rating(reviews_str):
    """Return the converted star rating of every entry in a stringified review list.

    ``reviews_str`` is evaluated with ``ast.literal_eval``; each entry's
    ``'stars'`` value is converted with ``convert_to_1_10``, and entries whose
    ``'stars'`` value is empty become ``pd.NA``.

    Returns:
        A list with one item per review, or ``[]`` when the input cannot be
        processed (unparseable text, NaN, an entry without ``'stars'``, ...).
    """
    try:
        # Convert the string representation of the list to an actual list
        reviews_list = ast.literal_eval(reviews_str)
        # Extract 'stars' from each dictionary
        stars = [review['stars'] for review in reviews_list]
        return [convert_to_1_10(star) if star else pd.NA for star in stars]
    except Exception:
        # Best-effort by design, but a bare ``except:`` would also swallow
        # KeyboardInterrupt/SystemExit and make a long apply() uninterruptible.
        return []


def extract_date(reviews_str):
    """Return the ``'date'`` of every entry in a stringified list of review dicts.

    ``reviews_str`` is evaluated with ``ast.literal_eval``. Entries without a
    ``'date'`` key contribute the placeholder ``'No date available'``.

    Returns:
        A list with one item per review. On any processing error the problem
        is printed and an empty list is returned, matching ``extract_reviews``
        and ``extract_rating`` (previously this path returned ``None`` even
        though its comment promised an empty list).
    """
    try:
        reviews_list = ast.literal_eval(reviews_str)

        # Each entry is expected to be a dict; .get supplies the placeholder
        # when the 'date' key is absent.
        return [review.get('date', 'No date available') for review in reviews_list]

    except Exception as e:
        print(f"Error processing reviews string: {reviews_str}\nException: {e}")
        return []  # Return an empty list if there's an error


# Derive three list-valued columns from the raw 'comments' strings:
# review texts (with the spoiler banner stripped), star ratings and review dates.
df['reviews_extracted'] = df['comments'].apply(extract_reviews).apply(prune_spoilers)
df['review_stars'] = df['comments'].apply(extract_rating)
df['review_dates'] = df['comments'].apply(extract_date)


df.head()

Unnamed: 0,letterboxd_search,Movie,Budget,comments,url,actors,roles,studio,reviews_extracted,review_stars,review_dates
0,https://letterboxd.com/search/ABCs+of+Death+2+...,"{'title': 'abcs-of-death-2', 'url': 'https://l...","BUDGET; UNKNOWN Gross worldwide $7,171","[{'stars': '★★★', 'review': 'To the creators o...",https://letterboxd.com/film/abcs-of-death-2/,"['Eric Jacobus', 'Julian Barratt', 'Ian Virgo'...","['Assassin (Segment ""Amateur"")', 'Peter Toland...","['Drafthouse Films', 'Timpson Films']","[To the creators of ""P is for P-P-P-P SCARY!"":...","[6, 2, 7, 4, 9, 7, 5, 6, 4, 7, 6, 4, 6, 7, 2, ...","[14 Feb 2024, 16 Apr 2022, 17 Oct 2018, 24 Jan..."
1,https://letterboxd.com/search/A+Beautiful+Day+...,{'title': 'a-beautiful-day-in-the-neighborhood...,"BUDGET: $25,000,000 / €23,011,567 BOX OFFICE (...","[{'stars': '★★★', 'review': 'therapy: expensiv...",https://letterboxd.com/film/a-beautiful-day-in...,"['Matthew Rhys', 'Tom Hanks', 'Chris Cooper', ...","['Lloyd Vogel', 'Fred Rogers', 'Jerry Vogel', ...","['TriStar Pictures', 'Tencent Pictures', 'Big ...",[therapy: expensivetom hanks as mr rogers brea...,"[6, 7, 7, 6, 7, 8, 8, 8, 8, 6, 6, 8, 8, 8, 8, ...","[13 Oct 2019, 27 Nov 2019, 22 Jan 2020, 14 Dec..."
2,https://letterboxd.com/search/A+Beautiful+Mind...,"{'title': 'a-beautiful-mind', 'url': 'https://...","BUDGET: $58,000,000 / 53,243,768 EUR BOX OFFIC...","[{'stars': '★★★★', 'review': 'ugh, his mind', ...",https://letterboxd.com/film/a-beautiful-mind/,"['Russell Crowe', 'Jennifer Connelly', 'Ed Har...","['John Nash', 'Alicia Nash', 'William Parcher'...","['Universal Pictures', 'DreamWorks Pictures', ...","[ugh, his mind, I guess you could say Paul Bet...","[8, 10, 4, 6, 4, 3, 2, 8, 3, 6, 6, 4, 8, 4, 7,...","[12 Mar 2018, 24 Sep 2020, 15 Jul 2018, 22 Feb..."
3,https://letterboxd.com/search/A+Better+Tomorro...,"{'title': 'a-better-tomorrow', 'url': 'https:/...","BUDGET: UNKNOWN Gross worldwide $4,999,517","[{'stars': '★★★★', 'review': 'Nobody understan...",https://letterboxd.com/film/a-better-tomorrow/,"['Ti Lung', 'Chow Yun-fat', 'Leslie Cheung', '...","['Sung Chi Ho', ""Mark 'Gor' Lee"", 'Kit', 'Jack...","['Cinema City', 'Film Workshop']",[Nobody understands the male friendship & brot...,"[8, 10, 8, 10, 8, 10, 9, 8, 8, 7, 9, 8, 9, 10,...","[30 Apr 2021, 22 May 2021, 07 Mar 2021, 25 Jun..."
4,https://letterboxd.com/search/A+Birder's+Guide...,"{'title': 'a-birders-guide-to-everything', 'ur...",,"[{'stars': '★½', 'review': ""Cute as a bug's re...",https://letterboxd.com/film/a-birders-guide-to...,"['Kodi Smit-McPhee', 'James Le Gros', 'Daniela...","['David Portnoy', 'Donald Portnoy', 'Juliana S...","['There We Go Films', 'dreamFly Productions', ...",[Cute as a bug's rear!A heartwarming coming of...,"[3, 6, 7, 7, 6, 7, 6, 6, 6, 6, 5, 10, 6, 8, 6,...","[28 May 2014, 09 Jul 2014, 09 Jun 2014, 14 Mar..."


In [32]:
def fk_get_part(movie, part):
    """Look up ``part`` on ``movie`` via ``movie.get``.

    Returns whatever ``movie.get(part)`` returns. When the call raises (for
    example because ``movie`` is ``None`` and has no ``get``), the movie and
    the error are printed and the string ``"None"`` is returned instead.
    """
    try:
        value = movie.get(part)
    except Exception as err:
        print(movie, err)
        value = "None"
    return value

# Lift selected fields out of the parsed 'Movie' dicts into their own columns;
# the field name is forwarded to fk_get_part as its second positional argument.
df['letterboxd_directors'] = df['Movie'].apply(fk_get_part, args=('directors',))
df['letterboxd_year'] = df['Movie'].apply(fk_get_part, args=('year',))
df['letterboxd_rating'] = df['Movie'].apply(fk_get_part, args=('rating',))
df['letterboxd_genres'] = df['Movie'].apply(fk_get_part, args=('genres',))


df.head()

Unnamed: 0,letterboxd_search,Movie,Budget,comments,url,actors,roles,studio,reviews_extracted,review_stars,review_dates,letterboxd_directors,letterboxd_year,letterboxd_rating,letterboxd_genres
0,https://letterboxd.com/search/ABCs+of+Death+2+...,"{'title': 'abcs-of-death-2', 'url': 'https://l...","BUDGET; UNKNOWN Gross worldwide $7,171","[{'stars': '★★★', 'review': 'To the creators o...",https://letterboxd.com/film/abcs-of-death-2/,"['Eric Jacobus', 'Julian Barratt', 'Ian Virgo'...","['Assassin (Segment ""Amateur"")', 'Peter Toland...","['Drafthouse Films', 'Timpson Films']","[To the creators of ""P is for P-P-P-P SCARY!"":...","[6, 2, 7, 4, 9, 7, 5, 6, 4, 7, 6, 4, 6, 7, 2, ...","[14 Feb 2024, 16 Apr 2022, 17 Oct 2018, 24 Jan...","[Rodney Ascher, Bill Plympton, Erik Matti, Lan...",2014,2.74 out of 5,[Horror]
1,https://letterboxd.com/search/A+Beautiful+Day+...,{'title': 'a-beautiful-day-in-the-neighborhood...,"BUDGET: $25,000,000 / €23,011,567 BOX OFFICE (...","[{'stars': '★★★', 'review': 'therapy: expensiv...",https://letterboxd.com/film/a-beautiful-day-in...,"['Matthew Rhys', 'Tom Hanks', 'Chris Cooper', ...","['Lloyd Vogel', 'Fred Rogers', 'Jerry Vogel', ...","['TriStar Pictures', 'Tencent Pictures', 'Big ...",[therapy: expensivetom hanks as mr rogers brea...,"[6, 7, 7, 6, 7, 8, 8, 8, 8, 6, 6, 8, 8, 8, 8, ...","[13 Oct 2019, 27 Nov 2019, 22 Jan 2020, 14 Dec...",[Marielle Heller],2019,3.60 out of 5,"[History, Drama]"
2,https://letterboxd.com/search/A+Beautiful+Mind...,"{'title': 'a-beautiful-mind', 'url': 'https://...","BUDGET: $58,000,000 / 53,243,768 EUR BOX OFFIC...","[{'stars': '★★★★', 'review': 'ugh, his mind', ...",https://letterboxd.com/film/a-beautiful-mind/,"['Russell Crowe', 'Jennifer Connelly', 'Ed Har...","['John Nash', 'Alicia Nash', 'William Parcher'...","['Universal Pictures', 'DreamWorks Pictures', ...","[ugh, his mind, I guess you could say Paul Bet...","[8, 10, 4, 6, 4, 3, 2, 8, 3, 6, 6, 4, 8, 4, 7,...","[12 Mar 2018, 24 Sep 2020, 15 Jul 2018, 22 Feb...",[Ron Howard],2001,3.84 out of 5,"[Drama, Romance]"
3,https://letterboxd.com/search/A+Better+Tomorro...,"{'title': 'a-better-tomorrow', 'url': 'https:/...","BUDGET: UNKNOWN Gross worldwide $4,999,517","[{'stars': '★★★★', 'review': 'Nobody understan...",https://letterboxd.com/film/a-better-tomorrow/,"['Ti Lung', 'Chow Yun-fat', 'Leslie Cheung', '...","['Sung Chi Ho', ""Mark 'Gor' Lee"", 'Kit', 'Jack...","['Cinema City', 'Film Workshop']",[Nobody understands the male friendship & brot...,"[8, 10, 8, 10, 8, 10, 9, 8, 8, 7, 9, 8, 9, 10,...","[30 Apr 2021, 22 May 2021, 07 Mar 2021, 25 Jun...",[John Woo],1986,3.88 out of 5,"[Drama, Crime, Action]"
4,https://letterboxd.com/search/A+Birder's+Guide...,"{'title': 'a-birders-guide-to-everything', 'ur...",,"[{'stars': '★½', 'review': ""Cute as a bug's re...",https://letterboxd.com/film/a-birders-guide-to...,"['Kodi Smit-McPhee', 'James Le Gros', 'Daniela...","['David Portnoy', 'Donald Portnoy', 'Juliana S...","['There We Go Films', 'dreamFly Productions', ...",[Cute as a bug's rear!A heartwarming coming of...,"[3, 6, 7, 7, 6, 7, 6, 6, 6, 6, 5, 10, 6, 8, 6,...","[28 May 2014, 09 Jul 2014, 09 Jun 2014, 14 Mar...",[Rob Meyer],2013,3.19 out of 5,[Comedy]


In [33]:
# Show only the first row, including the newly added letterboxd_* columns.
df.head(1)

Unnamed: 0,letterboxd_search,Movie,Budget,comments,url,actors,roles,studio,reviews_extracted,review_stars,review_dates,letterboxd_directors,letterboxd_year,letterboxd_rating,letterboxd_genres
0,https://letterboxd.com/search/ABCs+of+Death+2+...,"{'title': 'abcs-of-death-2', 'url': 'https://l...","BUDGET; UNKNOWN Gross worldwide $7,171","[{'stars': '★★★', 'review': 'To the creators o...",https://letterboxd.com/film/abcs-of-death-2/,"['Eric Jacobus', 'Julian Barratt', 'Ian Virgo'...","['Assassin (Segment ""Amateur"")', 'Peter Toland...","['Drafthouse Films', 'Timpson Films']","[To the creators of ""P is for P-P-P-P SCARY!"":...","[6, 2, 7, 4, 9, 7, 5, 6, 4, 7, 6, 4, 6, 7, 2, ...","[14 Feb 2024, 16 Apr 2022, 17 Oct 2018, 24 Jan...","[Rodney Ascher, Bill Plympton, Erik Matti, Lan...",2014,2.74 out of 5,[Horror]


In [34]:
def convert_rating_to_10_scale(rating_text):
    """Rescale a rating written as ``"X out of Y"`` to a 0-10 scale.

    Args:
        rating_text: Text such as ``'3.84 out of 5'``. The placeholders
            ``'None found'`` and ``'None'`` as well as missing values
            (``None`` / NaN) are treated as "no rating".

    Returns:
        ``X / Y * 10`` rounded to two decimals, or ``None`` when there is no
        rating, the text does not start with the expected pattern, or ``Y``
        is zero. Unparseable text is printed before ``None`` is returned.
    """
    # A missing dict key surfaces as a real None (dict.get) and a missing cell
    # as NaN; passing either to re.match would raise TypeError.
    if rating_text is None:
        return None
    if isinstance(rating_text, float) and rating_text != rating_text:  # NaN check
        return None
    if rating_text == 'None found' or rating_text == 'None':
        return None

    # Find the pattern "X out of Y" at the start of the text
    match = re.match(r'(\d+(\.\d+)?)\s*out of\s*(\d+)', rating_text)
    if not match:
        print(rating_text, match)
        return None

    # Extract the rating (X) and the max rating (Y)
    rating = float(match.group(1))
    max_rating = float(match.group(3))
    if max_rating == 0:
        # "X out of 0" cannot be rescaled; report it like other bad input
        # instead of raising ZeroDivisionError.
        print(rating_text, match)
        return None

    # Convert to the 10 scale, rounded to 2 decimal places
    return round((rating / max_rating) * 10, 2)

# Replace the "X out of 5" texts by their value on a 0-10 scale.
df['letterboxd_rating'] = df['letterboxd_rating'].map(convert_rating_to_10_scale)


In [35]:
# Inspect one parsed movie dict (third row) to see which keys are available.
df['Movie'].iat[2]

{'title': 'a-beautiful-mind',
 'url': 'https://letterboxd.com/film/a-beautiful-mind/',
 'directors': ['Ron Howard'],
 'rating': '3.84 out of 5',
 'year': '2001',
 'genres': ['Drama', 'Romance']}

In [36]:
# Load the Letterboxd-search lookup table, label its IMDB columns, and drop its
# 'Year' column before joining.
df_lettersearch = (
    pd.read_csv(f"{root}/Data/2020_trope_data/letterboxd_search.csv", index_col=0)
    .rename(columns={
        'Id': "IMDB_ID",
        "Rating": "IMDB_rating",
        # NOTE(review): the merged output in this notebook still shows a
        # 'NameIMDB' column, so this key apparently matches no column and the
        # rename to 'IMDB_Title' never happens. Confirm the intended source
        # name before changing it; consumers of the saved CSV may rely on
        # 'NameIMDB'.
        "NAMEIDMB" : "IMDB_Title",
    })
    .drop(columns="Year")
)
# Left-join on the search URL so every scraped movie is kept, then drop the
# raw 'Movie' and 'comments' columns whose contents were already extracted.
df_merged = (
    df.merge(df_lettersearch, on="letterboxd_search", how='left')
    .drop(columns=["Movie", "comments"])
)

In [37]:
# Eyeball the raw 'Budget' strings (at most the first 1000) to see which
# formats the parser has to handle.
for budget_text in df_merged['Budget'].head(1000).tolist():
    print(budget_text)

BUDGET; UNKNOWN Gross worldwide $7,171
BUDGET: $25,000,000 / €23,011,567 BOX OFFICE (US) $61,704,055 BOX OFFICE (WW) $67,925,733
BUDGET: $58,000,000 / 53,243,768 EUR BOX OFFICE (US) $170,742,341 BOX OFFICE (WW) $316,791,257
BUDGET: UNKNOWN Gross worldwide $4,999,517
nan
BUDGET: UNKNOWN
BUDGET: $400,000
BUDGET: UNKNOWN
nan
BUDGET: $27,000,000 / €24,922,171 BOX OFFICE (US) $50,750,000
BUDGET: $1,000,000 / 919,421 EUR
nan
BUDGET: $10,000,000 / 9,208,816 EUR BOX OFFICE (US) $17,287,898
BUDGET: $50,000
nan
BUDGET: $650,000 / €604,876
nan
nan
BUDGET: $5,600,000 / €5,193,409
nan
BUDGET: UNKNOWN
BUDGET: UNKNOWN
nan
nan
BUDGET: $17,000,000 / €15,849,931
BUDGET: UNKNOWN
BUDGET: UNKNOWN
BUDGET: $3,300,000 / €3,047,739 BOX OFFICE (US) $20,778,141 BOX OFFICE (WW) $20,791,797
BUDGET: $19,000,000 / €17,498,714 BOX OFFICE (US) $51,438,175 BOX OFFICE (WW) $70,080,371
BUDGET: $2,200,000 / €2,042,411 BOX OFFICE (US): $26,617,553 BOX OFFICE (WW): $27,023,100
BUDGET: UNKNOWN
nan
nan
nan
BUDGET: $40,000,000

In [38]:
import re
import numpy as np

def _search_amount(pattern, text):
    """Return the first capture group of ``pattern`` in ``text`` as an int, or None."""
    found = re.search(pattern, text)
    if found is None:
        return None
    return int(found.group(1).replace(',', ''))


def extract_financial_data(financial_str):
    """Parse budget and box-office figures out of a scraped 'Budget' string.

    Matching is case-insensitive. Recognised fragments (the separator after a
    label may be ':' or, for the budget, also ';'):

    * ``BUDGET: $25,000,000 / ...``            -> budget in USD
    * ``BOX OFFICE (US) $...`` / ``Gross US[ & Canada] $...`` -> US gross
    * ``BOX OFFICE (WW) $...`` / ``Gross worldwide $...``     -> worldwide gross

    Args:
        financial_str: Raw text, e.g.
            ``'BUDGET: $400,000 BOX OFFICE (US): $1,000,000'``. Missing values
            may be passed as ``None``/NaN or as the string ``'nan'``.

    Returns:
        tuple: ``(budget_usd, box_office_us, box_office_ww)``; each item is an
        ``int`` or ``None`` when that figure is absent (e.g. ``BUDGET: UNKNOWN``,
        which simply does not match the dollar-amount pattern).
    """
    # Real NaN/None would fail on .lower(); treat them like the 'nan' string.
    if not isinstance(financial_str, str):
        return (None, None, None)

    # Lowercase once for case-insensitive matching
    text = financial_str.lower()
    if text.strip() in ('', 'nan'):
        return (None, None, None)

    # Digits with optional thousands separators. ``\d+`` (rather than
    # ``\d{1,3}``) keeps unseparated amounts such as "61704055" intact instead
    # of truncating them to their first three digits.
    amount = r'(\d+(?:,\d{3})*)'

    # Budget in USD; must be followed by " /", whitespace or end of string so a
    # partially matched number is never accepted.
    budget_usd = _search_amount(
        r'budget\s*[:;]\s*\$' + amount + r'(?: /|\s|$)', text
    )

    # US gross. An optional ':' after the label covers strings such as
    # "BOX OFFICE (US): $26,617,553".
    box_office_us = _search_amount(
        r'(?:box office \(us\)|gross us & canada|gross us)\s*:?\s*\$?' + amount, text
    )

    # Worldwide gross, same label handling.
    box_office_ww = _search_amount(
        r'(?:box office \(ww\)|gross worldwide)\s*:?\s*\$?' + amount, text
    )

    return (budget_usd, box_office_us, box_office_ww)

# Run the parser over the first (at most) 1000 raw budget values, stringified
# first so missing values arrive as 'nan', and print input next to the result.
separator = "-" * 50
for raw_budget in map(str, df_merged['Budget'].head(1000)):
    budget, box_office_us, box_office_ww = extract_financial_data(raw_budget)
    print(f"Input: {raw_budget}")
    print(f"Budget (USD): {budget}, Box Office (US): {box_office_us}, Box Office (WW): {box_office_ww}")
    print(separator)


Input: BUDGET; UNKNOWN Gross worldwide $7,171
Budget (USD): None, Box Office (US): None, Box Office (WW): 7171
--------------------------------------------------
Input: BUDGET: $25,000,000 / €23,011,567 BOX OFFICE (US) $61,704,055 BOX OFFICE (WW) $67,925,733
Budget (USD): 25000000, Box Office (US): 61704055, Box Office (WW): 67925733
--------------------------------------------------
Input: BUDGET: $58,000,000 / 53,243,768 EUR BOX OFFICE (US) $170,742,341 BOX OFFICE (WW) $316,791,257
Budget (USD): 58000000, Box Office (US): 170742341, Box Office (WW): 316791257
--------------------------------------------------
Input: BUDGET: UNKNOWN Gross worldwide $4,999,517
Budget (USD): None, Box Office (US): None, Box Office (WW): 4999517
--------------------------------------------------
Input: nan
Budget (USD): None, Box Office (US): None, Box Office (WW): None
--------------------------------------------------
Input: BUDGET: UNKNOWN
Budget (USD): None, Box Office (US): None, Box Office (WW): No

In [39]:
# Stringify 'Budget' (missing values become 'nan'), parse every value, and
# spread the resulting 3-tuples over three new columns aligned on the index.
df_merged['Budget'] = df_merged['Budget'].astype(str)
df_merged[['letter_USD_Budget', 'letter_US_Gross', 'letter_WW_Gross']] = pd.DataFrame(
    [extract_financial_data(budget_text) for budget_text in df_merged['Budget']],
    index=df_merged.index,
)

In [40]:
# The raw 'Budget' text has been parsed into the letter_* columns; drop it.
df_merged = df_merged.drop(columns='Budget')
df_merged.head(2)

Unnamed: 0,letterboxd_search,url,actors,roles,studio,reviews_extracted,review_stars,review_dates,letterboxd_directors,letterboxd_year,letterboxd_rating,letterboxd_genres,NameIMDB,IMDB_rating,IMDB_ID,letter_USD_Budget,letter_US_Gross,letter_WW_Gross
0,https://letterboxd.com/search/ABCs+of+Death+2+...,https://letterboxd.com/film/abcs-of-death-2/,"['Eric Jacobus', 'Julian Barratt', 'Ian Virgo'...","['Assassin (Segment ""Amateur"")', 'Peter Toland...","['Drafthouse Films', 'Timpson Films']","[To the creators of ""P is for P-P-P-P SCARY!"":...","[6, 2, 7, 4, 9, 7, 5, 6, 4, 7, 6, 4, 6, 7, 2, ...","[14 Feb 2024, 16 Apr 2022, 17 Oct 2018, 24 Jan...","[Rodney Ascher, Bill Plympton, Erik Matti, Lan...",2014,5.48,[Horror],ABCs of Death 2,5.4,tt2926810,,,7171.0
1,https://letterboxd.com/search/A+Beautiful+Day+...,https://letterboxd.com/film/a-beautiful-day-in...,"['Matthew Rhys', 'Tom Hanks', 'Chris Cooper', ...","['Lloyd Vogel', 'Fred Rogers', 'Jerry Vogel', ...","['TriStar Pictures', 'Tencent Pictures', 'Big ...",[therapy: expensivetom hanks as mr rogers brea...,"[6, 7, 7, 6, 7, 8, 8, 8, 8, 6, 6, 8, 8, 8, 8, ...","[13 Oct 2019, 27 Nov 2019, 22 Jan 2020, 14 Dec...",[Marielle Heller],2019,7.2,"[History, Drama]",A Beautiful Day in the Neighborhood,0.0,tt3224458,25000000.0,61704055.0,67925733.0


In [41]:
# Persist the merged frame under Scraped_Data. No `index` argument is given, so
# pandas' default applies and the row index is written as an unnamed first column.
df_merged.to_csv(f"{root}/Data/2020_trope_data/Scraped_Data/merged_NER_scraped_data.csv")