# Fetch film data

## Import modules

In [1]:
import ast
import datetime as dt
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import requests

from bs4 import BeautifulSoup
from pathlib import Path

## Choose data source

In [2]:
source_type = "rating" # Choose between "rating", "popular" and "list"
if source_type == "rating":
    list_url = "https://letterboxd.com/films/ajax/by/rating/"
elif source_type == "popular":
    list_url = "https://letterboxd.com/films/ajax/popular/"
elif source_type == "list":
    # URL of the target Letterboxd list
    pass 
else:
    pass

## Choose how many pages to scrap, or find the amount of pages of a list

In [3]:
if source_type in ["rating", "popular"]:
    total_pages = 10
elif source_type == "list":
    session = requests.session()
    response = session.get(f"{list_url}?esiAllowFilters=true")
    cookies = requests.utils.dict_from_cookiejar(session.cookies)
    # Use cookies to enable search filters
    # Filters; combine into a string with "%20" in between options
    cookies['filmFilter'] = 'hide-tv%20hide-shorts%20hide-unreleased%20hide-docs'
    Path("cookies.json").write_text(json.dumps(cookies))

    session = requests.session()
    cookies = json.loads(Path("cookies.json").read_text())
    cookies = requests.utils.cookiejar_from_dict(cookies)
    session.cookies.update(cookies)
    response = session.get(f"{list_url}?esiAllowFilters=true")
    soup = BeautifulSoup(response.text)
    total_pages = int(soup.find("div", class_="paginate-pages").find_all("a", href=True)[-1].text)
else:
    pass

## Fetch the data

In [4]:
film_df = pd.DataFrame(columns=["Title", "Year", "ReleaseDate", "Director", "Writer", "Cast", "Runtime", "Genre", "Country", "Language", "Budget", "BoxOffice",
                                "IMDbScore", "IMDbVotes", "IMDbReviews", "IMDbURL", 
                                "LetterboxdScore", "LetterboxdVotes", "LetterboxdWatches", "LetterboxdReviews", "LetterboxdURL",
                                "MetacriticCriticScore", "MetacriticCriticReviews", "MetacriticUserScore", "MetacriticUserVotes", "MetacriticUserReviews", "MetacriticURL"])

for page_n in range(1, total_pages+1):
    session = requests.session()
    cookies = json.loads(Path("cookies.json").read_text())
    cookies = requests.utils.cookiejar_from_dict(cookies)
    session.cookies.update(cookies)
    response = session.get(f"{list_url}page/{page_n}/?esiAllowFilters=true")
    soup = BeautifulSoup(response.text)
    film_list = soup.find_all("li", class_="listitem")

    for film_item in film_list:
        tmp_data = pd.Series(data=None, dtype=object)
        for col in film_df.columns:
            tmp_data[col] = np.nan
        # Letterboxd
        lbxd_film_handle = film_item.find("div")['data-film-slug']
        lbxd_full_url = f"https://letterboxd.com{lbxd_film_handle}"
        tmp_data["LetterboxdURL"] = str(lbxd_full_url)
        response = requests.get(lbxd_full_url, headers={'User-agent': 'Mozilla/5.0'}, timeout=30)
        soup = BeautifulSoup(response.text)
        
        try:
            title = soup.find("div", id="content").find("section", id="featured-film-header").find("h1").text
            tmp_data["Title"] = str(title)
        except:
            print("\tTitle not found")
            
        try:
            year = soup.find("section", id="featured-film-header").find("a").text
            tmp_data["Year"] = int(year)
        except:
            print("\tYear not found")
            
        try:
            director = soup.find("meta", attrs={"name":"twitter:data1"})['content']
            tmp_data["Director"] = str(director)
        except:
            print("\tDirector not found")

        try:
            lbxd_rating = soup.find("meta", attrs={"name":"twitter:data2"})['content'].split()[0]
            tmp_data["LetterboxdScore"] = float(lbxd_rating)
        except:
            print("\tLetterboxd rating not found")
            
        try:
            imdb_url = soup.find("a", attrs={"data-track-action": "IMDb"}, href=True)['href']
            imdb_url = imdb_url.replace("maindetails","")
            tmp_data["IMDbURL"] = str(imdb_url)
        except:
            print("\tIMDb URL not found")
            
        try:
            runtime = soup.find("p", class_="text-link text-footer").text.split()[0]
            tmp_data["Runtime"] = int(runtime)
        except:
            print("\tRuntime not found")
            
        try:
            premiere_date = soup.find("div", class_="release-table -bydate").find("h5").text
            tmp_data["ReleaseDate"] = str(premiere_date)
        except:
            print("\tRelease date not found")
        
        try:
            idx = soup.prettify().find('"ratingCount":')
            idx2 = soup.prettify().find(',"worstRating')
            lbxd_ratings = float(soup.prettify()[idx:idx2].replace('"ratingCount":', ''))
            tmp_data["LetterboxdVotes"] = int(lbxd_ratings)
        except:
            print("\tLetterboxd votes not found")
            
        response = requests.get(f"{lbxd_full_url}members", headers={'User-agent': 'Mozilla/5.0'}, timeout=30)
        soup = BeautifulSoup(response.text)
        try:
            lbxd_watches = soup.find("li", class_="js-route-watches").find("a")['title'].split()
            tmp_data["LetterboxdWatches"] = int(lbxd_watches[0].replace(",",""))
        except:
            print("\tLetterboxd watches not found")
        
        try:
            lbxd_reviews = soup.find("li", class_="js-route-reviews").find("a")['title'].split()
            tmp_data["LetterboxdReviews"] = int(lbxd_reviews[0].replace(",",""))
        except:
            print("\tLetterboxd reviews not found")
            
        # IMDb
        response = requests.get(imdb_url, headers={'User-agent': 'Mozilla/5.0'}, timeout=30)
        soup = BeautifulSoup(response.text)
        try:
            idx = soup.prettify().find('{"@type":"AggregateRating"')
            idx2 = soup.prettify().find(',"contentRating"')
            tmp = soup.prettify()[idx:idx2]
            tmp2 = ast.literal_eval(tmp)
            IMDbScore = tmp2['ratingValue']
            tmp_data["IMDbScore"] = float(IMDbScore)
            IMDbVotes = tmp2['ratingCount']
            tmp_data["IMDbVotes"] = int(IMDbVotes)
        except:
            print("\tIMDb rating not found")
            
        try:
            tmp = soup.find("li", attrs={"data-testid":"title-boxoffice-budget"})
            tmp2 = tmp.find("span", class_="ipc-metadata-list-item__list-content-item").text
            if "$" in tmp2:
                budget = float(tmp2.replace("$", "").replace("(estimated)", "").replace(",", ""))
            elif "€" in tmp2:
                budget = 1.11 * float(tmp2.replace("€", "").replace("(estimated)", "").replace(",", ""))
            elif "£" in tmp2:
                budget = 1.27 * float(tmp2.replace("£", "").replace("(estimated)", "").replace(",", ""))
            elif "¥" in tmp2:
                budget = 0.007 * float(tmp2.replace("¥", "").replace("(estimated)", "").replace(",", ""))
            tmp_data["Budget"] = budget
        except:
            print("\tBudget not found")
            
        try:
            tmp = soup.find("li", attrs={"data-testid":"title-boxoffice-cumulativeworldwidegross"})
            tmp2 = tmp.find("span", class_="ipc-metadata-list-item__list-content-item").text
            if "$" in tmp2:
                box_office = float(tmp2.replace("$", "").replace("(estimated)", "").replace(",", ""))
            elif "€" in tmp2:
                box_office = 1.11 * float(tmp2.replace("€", "").replace("(estimated)", "").replace(",", ""))
            elif "£" in tmp2:
                box_office = 1.27 * float(tmp2.replace("£", "").replace("(estimated)", "").replace(",", ""))
            elif "¥" in tmp2:
                box_office = 0.007 * float(tmp2.replace("¥", "").replace("(estimated)", "").replace(",", ""))
            tmp_data["BoxOffice"] = box_office
        except:
            print("\tBox office not found")
            
        
        response = requests.get(f"{imdb_url}reviews/", headers={'User-agent': 'Mozilla/5.0'}, timeout=30)
        soup = BeautifulSoup(response.text)
        try:
            tmp = [i.prettify() for i in soup.find_all("div", class_="header")]
            tmp2 = ";".join(tmp)
            res = re.search('[0-9]+ Reviews', tmp2)
            res2 = re.search('[0-9]+', res.group())
            IMDbReviews = res2.group()
            tmp_data["IMDbReviews"] = IMDbReviews
        except:
            print("\tIMDb reviews not found")
        
        # Metacritic
        response = requests.get(f"{imdb_url}criticreviews", headers={'User-agent': 'Mozilla/5.0'}, timeout=30)
        soup = BeautifulSoup(response.text)
        try:
            tmp = re.search("(?P<url>https?://www.metacritic[^\s]+)", soup.prettify()).group("url")
            metacritic_url = f"{tmp.split('?ftag')[0]}/"
            #print(str(metacritic_url))
            tmp_data["MetacriticURL"] = str(metacritic_url)
            response = requests.get(metacritic_url, headers={'User-agent': 'Mozilla/5.0'})
            soup = BeautifulSoup(response.text)
            try:
                raw_scores = soup.find_all("a", class_="metascore_anchor")
                tmp_data["MetacriticCriticScore"] = float(raw_scores[0].text)/10.0
                tmp_data["MetacriticUserScore"] = float(raw_scores[1].text)
            except:
                print("\tMetacritic scores not found")
            
            try:
                tmp = soup.find_all("a", class_="data", href=True)
                for el in tmp:
                    if 'user' in el['href']:
                        metacritic_user_votes = re.search('[0-9]+', el.text).group()
                    elif 'critic' in el['href']:
                        metacritic_critic_reviews = re.search('[0-9]+', el.text).group()
                tmp_data["MetacriticCriticReviews"] = int(metacritic_critic_reviews)
                tmp_data["MetacriticUserVotes"] = int(metacritic_user_votes)
            except:
                print("\tMetacritic reviews not found")
            
            try:
                tmp = soup.find_all("a", class_="see_all")
                for el in tmp:
                    if 'user' in el['href']:
                        metacritic_user_reviews = re.search('[0-9]+', el.text).group()
                    else:
                        pass
                tmp_data["MetacriticUserReviews"] = int(metacritic_user_reviews)
            except:
                print("\tMetacritic user reviews not found")
                
            response = requests.get(f"{metacritic_url}details", headers={'User-agent': 'Mozilla/5.0'})
            soup = BeautifulSoup(response.text)
            try:
                tmp = soup.find("table", class_="details").find("tr", class_="genres").text
                genre = re.sub("\n", "", re.sub(" +", " ", re.sub("Genres:", "", re.sub("Genre:", "", tmp))))
                tmp_data["Genre"] = str(genre)
            except:
                print("\tGenre not found")

            try:
                tmp = soup.find("table", class_="details").find("tr", class_="countries").text
                country = re.sub("\n", "", re.sub(" +", " ", re.sub("Countries:", "", re.sub("Country:", "", tmp))))
                tmp_data["Country"] = str(country)
            except:
                print("\tCountry not found")

            try:
                tmp = soup.find("table", class_="details").find("tr", class_="languages").text
                language = re.sub("\n", "", re.sub(" +", " ", re.sub("Languages:", "", re.sub("Language:", "", tmp))))
                tmp_data["Language"] = str(language)
            except:
                print("\tLanguage not found")

            try:
                tmp = soup.find_all("table", class_="credits")
                director = []
                writer = []
                cast = []
                for el in tmp:
                    credits = el['summary'].lower()
                    if 'director credits' in credits:
                        for it in el.find_all("td", class_="person"):
                            tmp = it.find("a").text
                            director.append(re.sub("\n", "", re.sub("  ", "", tmp)))
                    elif 'writer credits' in credits:
                        for it in el.find_all("td", class_="person"):
                            tmp = it.find("a").text
                            writer.append(re.sub("\n", "", re.sub("  ", "", tmp)))
                    elif 'principal cast credits' in credits:
                        for it in el.find_all("td", class_="person"):
                            tmp = it.find("a").text
                            cast.append(re.sub("\n", "", re.sub("  ", "", tmp)))
                director = ",".join(director)
                tmp_data["Director"] = str(director)
                writer = ",".join(writer)
                tmp_data["Writer"] = str(writer)
                cast = ",".join(cast)
                tmp_data["Cast"] = str(cast)
            except:
                print("\tCast and crew not found")
        except:
            print("\tMetacritic URL not found")
        
        film_df = pd.concat([film_df, tmp_data.to_frame().T], ignore_index=True)

FileNotFoundError: [Errno 2] No such file or directory: 'cookies.json'

## Try and filter out unwanted entries (TV shows, music documentaries)

In [None]:
film_df.drop(index=film_df[film_df["Genre"] == "Music"].index, inplace=True)
film_df.drop(index=film_df[film_df["Runtime"] >= 250].index, inplace=True)
film_df.dropna(axis="index", subset="Runtime", inplace=True)
film_df.reset_index(inplace=True)

## Clean up country information

In [None]:
lang_dict = {"AT":"Austria", "AU":"Australia", "BA":"Bosnia", "BE":"Belgium", "BG":"Bulgaria", "BR":"Brazil", 
            "CA":"Canada", "CH":"Switzerland", "CL":"Chile", "CN":"China", "CO":"Colombia", "CZ":"Czech Republic", 
            "DE":"Germany", "DK":"Denmark", "DZ":"Algeria", "ES":"Spain", "FI":"Finland", "FR":"France", 
            "GB":"United Kingdom", "GE":"Georgia", "GR":"Greece", "HK":"Hong Kong", "HU":"Hungary", 
            "IE":"Ireland", "IN":"India", "IR":"Iran", "IT":"Italy", "JPN":"Japan", "JP":"Japan",  
            "KR":"South Korea", "LB":"Lebanon", "LU": "Luxembourg", "MA":"Macedonia", "MC":"Monaco", "MT":"Malta", "MX":"Mexico", 
            "NL":"Netherlands", "NO":"Norway", "NZ":"New Zealand", "PK":"Pakistan", "PL":"Poland", "PT":"Portugal",
            "RO":"Romania", "RU":"Russia", "SE":"Sweden", "SUHH":"Soviet Union", 
            "TR":"Turkey", "TW":"Taiwan", "UK":"United Kingdom",  "USA":"United States", "US":"United States","USSR":"Soviet Union", 
            "XWG":"West Germany", "YUCS":"Yugoslavia"}

# film_df.info()
for idx in range(0, len(film_df)):
    if type(film_df.loc[idx, "Country"]) == str:
        for code, lang in lang_dict.items():
            if lang not in film_df.loc[idx, "Country"]:
                film_df.loc[idx, "Country"] = film_df.loc[idx, "Country"].replace(code, lang)
            else:
                film_df.loc[idx, "Country"] = film_df.loc[idx, "Country"].replace(code, "").replace(" ,", "").replace(", ", ",")

## Save dataset

In [None]:
tmp = dt.datetime.now(dt.timezone.utc)
timestamp = f"{tmp.year:04}{tmp.month:02}{tmp.day:02}"
film_df.to_csv(f"{source_type}_{len(film_df)}_{timestamp}.csv")