In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
import random
from bs4 import BeautifulSoup
import time
import glob

In [11]:
def getNumericalData(soup):
    release_year = getReleaseYear(soup)
    duration = getDuration(soup)
    average_rating = getAverageRating(soup)
    rater_count = getRaterCount(soup)
    fan_count = getFanCount(soup)
    watch_count = getWatchCount(soup)
    list_count = getListCount(soup)
    like_count = getLikeCount(soup)
    return {
        "Release Year": release_year,
        "Duration": duration,
        "Avg Rating": average_rating,
        "Raters": rater_count,
        "Fans": fan_count,
        "Watched": watch_count,
        "Lists": list_count,
        "Likes": like_count
    }

def getFanCount(soup):
    fan_count_a = soup.find('a', class_='all-link more-link')
    fan_count = fan_count_a.text if fan_count_a else 0
    return fan_count

def getLikeCount(soup):
    like_count_li = soup.find('li', class_='stat filmstat-likes')
    like_count_a = like_count_li.find('a', class_='has-icon') if like_count_li else None
    like_count_text = like_count_a['data-original-title'] if like_count_a else None
    like_count = like_count_text.split(" ")[2] if like_count_text else "N/A"
    return like_count

def getListCount(soup):
    list_count_li = soup.find('li', class_='stat filmstat-lists')
    list_count_a = list_count_li.find('a', class_='has-icon') if list_count_li else None
    list_count_text = list_count_a['data-original-title'] if list_count_a else None
    list_count = list_count_text.split(" ")[2] if list_count_text else "N/A"
    return list_count

def getWatchCount(soup):
    watch_count_li = soup.find('li', class_='stat filmstat-watches')
    watch_count_a = watch_count_li.find('a', class_='has-icon') if watch_count_li else None
    watch_count_text = watch_count_a['data-original-title'] if watch_count_a else None
    watch_count = watch_count_text.split(" ")[2] if watch_count_text else "N/A"
    return watch_count

def getRaterCount(soup):
    average_rating_span = soup.find('span', class_='average-rating')
    average_rating_span_a = average_rating_span.find('a', class_='tooltip display-rating') if average_rating_span else None
    if(average_rating_span_a == None):
        average_rating_span_a = average_rating_span.find('a', class_='tooltip display-rating -highlight') if average_rating_span else None
    average_rating_span_text = average_rating_span_a['data-original-title'] if average_rating_span_a else None
    rater_count = average_rating_span_text.split(" ")[6] if average_rating_span_text else "N/A"
    return rater_count

def getAverageRating(soup):
    average_rating_span = soup.find('span', class_='average-rating')
    average_rating_span_a = average_rating_span.find('a', class_='tooltip display-rating') if average_rating_span else None
    if(average_rating_span_a == None):
        average_rating_span_a = average_rating_span.find('a', class_='tooltip display-rating -highlight') if average_rating_span else None
    average_rating_span_text = average_rating_span_a['data-original-title'] if average_rating_span_a else None
    average_rating = average_rating_span_text.split(" ")[3] if average_rating_span_text else "N/A"
    return average_rating

def getDuration(soup):
    duration_p = soup.find('p', class_='text-link text-footer')
    duration = duration_p.text.split(" ")[0] if duration_p else "N/A"
    return duration

def getReleaseYear(soup):
    release_year_div = soup.find('div', class_='releaseyear')
    release_year = release_year_div.a.text if release_year_div else "N/A"
    return release_year


In [12]:
def getCategoricalData(soup):
    movie_name = getMovieTitle(soup)
    genre = getGenre(soup)
    details_divs = getDetailsDivs(soup)
    sluglist_divs = getDetailsSlugList(details_divs)
    details_headers = getDetailHeaders(details_divs)
    studio = getStudio(sluglist_divs, details_headers) 
    country = getCountry(sluglist_divs, details_headers) 
    primary_language = getPrimaryLanguage(sluglist_divs, details_headers) 
    return {
        "Movie Name": movie_name,
        "Genre": genre,
        "Studio": studio,
        "Country": country,
        "Primary Language": primary_language
    }

def getStudio(sluglist_divs, details_headers):
    try:
        if(details_headers[0] == "Studios" or details_headers[0] == "Studio"):
            studio = sluglist_divs[0].find('a', class_='text-slug').text
            return studio
        return "N/A"
    except IndexError:
        return "N/A"

def getCountry(sluglist_divs, details_headers):
    try:
        if(details_headers[0] == "Countries" or details_headers[0] == "Country"):
            country = sluglist_divs[0].find('a', class_='text-slug').text
            return country
        elif(details_headers[1] == "Countries" or details_headers[1] == "Country"):
            country = sluglist_divs[1].find('a', class_='text-slug').text
            return country
        return "N/A"
    except IndexError:
        return "N/A"

def getPrimaryLanguage(sluglist_divs, details_headers):
    try:
        if(details_headers[0] == "Primary Language" or details_headers[0] == "Language"):
            primary_language = sluglist_divs[0].find('a', class_='text-slug').text
            return primary_language
        elif(details_headers[1] == "Primary Language" or details_headers[1] == "Language"):
            primary_language = sluglist_divs[1].find('a', class_='text-slug').text
            return primary_language
        elif(details_headers[2] == "Primary Language" or details_headers[2] == "Language"):
            primary_language = sluglist_divs[2].find('a', class_='text-slug').text
            return primary_language
        return "N/A"
    except IndexError:
        return "N/A"

def getGenre(soup):
    try:
        div_genres = soup.find('div', id='tab-genres')
        h3_tag = div_genres.find('h3').span.text if div_genres else None
        if(h3_tag == "Genres" or h3_tag == "Genre"):
            sluglist_div = div_genres.find('div', class_='text-sluglist')
            genre = sluglist_div.find('a', class_='text-slug').text
            return genre
        return "N/A"
    except IndexError:
        return "N/A"

def getMovieTitle(soup):
    movie_title_span = soup.find("span", class_="name js-widont prettify")
    movie_name = movie_title_span.text.strip() if movie_title_span else "N/A"
    return movie_name

def getDetailsDivs(soup):
    details_divs = soup.find('div', id='tab-details')
    return details_divs

def getDetailsSlugList(details_divs):
    sluglist_divs = details_divs.find_all('div', class_='text-sluglist') if details_divs else []
    return sluglist_divs

def getDetailHeaders(details_divs):
    h3_tags = details_divs.find_all('h3') if details_divs else []
    details_headers = [h3.span.text for h3 in h3_tags if h3.span]
    return details_headers

In [13]:
def extractLetterBoxdData(driver, movie_urls, start_index = 0, checkpoint_interval = 300):
    movie_data = []
    for checkpoint_count, url in movie_urls[start_index:].iterrows():
        href = url['URL']
        soup = getSoup(driver, href)
        categorical_data = getCategoricalData(soup)
        numerical_data = getNumericalData(soup)
        movie_data.append({**categorical_data, **numerical_data})
        if checkpoint_count % checkpoint_interval == 0:
            saveCheckpoint(movie_data, f'movie_data_checkpoint_{checkpoint_count // checkpoint_interval}.csv')
            movie_data = []

    saveCheckpoint(movie_data, f'movie_data_checkpoint_last_batch.csv')
    driver.quit()
    return movie_data

def saveCheckpoint(data, filename):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)

def getSoup(driver, href):
    movie_url = "https://letterboxd.com" + href
    driver.get(movie_url)    
    waitForElements(driver)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")
    return soup

def waitForElements(driver):
    try:
        wait = WebDriverWait(driver, 2)
        wait.until(EC.presence_of_element_located((By.ID, "tab-genres")))
        wait.until(EC.presence_of_element_located((By.ID, "tab-details")))
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.average-rating')))
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li.stat.filmstat-watches')))
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li.stat.filmstat-lists')))
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li.stat.filmstat-likes')))
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a.all-link.more-link')))
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'p.text-link.text-footer')))
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.releaseyear')))
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.name.js-widont.prettify')))
    except (TimeoutException, NoSuchElementException):
        return
        

In [14]:
def getMovieUrls(): 
    return pd.read_csv('movie_url_final.csv')

In [15]:
movie_urls = getMovieUrls()

In [16]:
options = webdriver.ChromeOptions()
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(options=options)
movie_data = extractLetterBoxdData(driver, movie_urls, 0, 300)

In [8]:
csv_files = glob.glob('movie_data_checkpoint_*.csv')

dataframes = [pd.read_csv(file) for file in csv_files]
merged_df = pd.concat(dataframes, ignore_index=True)

merged_df.to_csv('letterbxd_scrapped.csv', index=False)

EmptyDataError: No columns to parse from file