In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import random
from bs4 import BeautifulSoup
import time
import pandas as pd
import glob

In [3]:
def toggleFilters(driver, url):
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    filters_to_hide = ['shorts', 'tv', 'docs', 'unreleased']

    for filter_category in filters_to_hide:
        hoverOverFilterMenu(driver, wait)
        clickOnFilter(wait, filter_category)
        time.sleep(1) 

def clickOnFilter(wait, filter_category):
    hide_url = wait.until(EC.element_to_be_clickable((By.XPATH, f"//li[@data-category='{filter_category}' and @data-type='hide']/a")))
    hide_url.click()

def hoverOverFilterMenu(driver, wait):
    menu_icon = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.smenu > label > span.ir.s.hide-toggle-icon')))
    actions = ActionChains(driver)
    actions.move_to_element(menu_icon).perform()

In [4]:
def extractLetterBoxdFilmUrls(driver, url, num_of_pages=3000, checkpoint_interval=100, start_page = 1):
    all_movie_urls = []

    for page_num in range(start_page, num_of_pages + 1):
        movie_urls = getMovieUrls(driver, url, page_num)
        random.seed(page_num)
        random.shuffle(movie_urls)
        all_movie_urls.extend(movie_urls[:3])
        
        if page_num % checkpoint_interval == 0:
            checkpoint_filename = f'urls_checkpoint_{page_num}.csv'
            save_to_csv(all_movie_urls, checkpoint_filename)
            print(f"Saved checkpoint at page {page_num} to {checkpoint_filename}")
            all_movie_urls.clear()

    driver.quit()
    print("Scraping completed.")

    return all_movie_urls

def getMovieUrls(driver, url, page_num):
    driver.get(f"{url}/page/{page_num}")
    wait = WebDriverWait(driver, 300)
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.frame")))
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")
    movie_urls = [url.get('href') for url in soup.find_all("a", class_="frame")]
    return movie_urls

def save_to_csv(urls, filename):
    df = pd.DataFrame(urls, columns=['URL'])
    df.to_csv(filename, index=False)

In [5]:
options = webdriver.ChromeOptions()
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(options=options)
url = "https://letterboxd.com/films/popular"
driver.get(url)
toggleFilters(driver, url)
extractLetterBoxdFilmUrls(driver, url, 3000, 100, 1) 

Saved checkpoint at page 2600 to urls_checkpoint_2600.csv
Saved checkpoint at page 2700 to urls_checkpoint_2700.csv
Saved checkpoint at page 2800 to urls_checkpoint_2800.csv
Saved checkpoint at page 2900 to urls_checkpoint_2900.csv
Saved checkpoint at page 3000 to urls_checkpoint_3000.csv
Scraping completed.


[]

In [4]:
csv_files = glob.glob('*.csv')

dataframes = [pd.read_csv(file) for file in csv_files]
merged_df = pd.concat(dataframes, ignore_index=True)

merged_df.to_csv('movie_url_final.csv', index=False)