In [2]:
%pip install selenium

Collecting selenium
  Using cached selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Using cached trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Using cached trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting websocket-client~=1.8 (from selenium)
  Using cached websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting sniffio>=1.3.0 (from trio~=0.17->selenium)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Using cached wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3,>=1.26->seleni

In [None]:
import os
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# OMDb API key. Prefer the OMDB_API_KEY environment variable so the credential
# does not have to live in the notebook; the literal is kept only as a fallback
# so existing runs behave exactly as before when the variable is unset.
# NOTE(review): this key is visible in the notebook source — rotate it and
# remove the fallback once the environment variable is in place.
OMDB_API_KEY = os.environ.get("OMDB_API_KEY", "60fded84")

# Configure a headless Chrome session.
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # no visible browser window
# The next two flags are intended to make the automated browser look more like
# a regular user session.
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")

# ChromeDriverManager().install() returns the path of a chromedriver binary,
# which is handed to Selenium through a Service object.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Load the Letterboxd "popular films" listing and give its JavaScript a fixed
# 5 seconds to render the posters before they are queried.
url = "https://letterboxd.com/films/popular/"
driver.get(url)
time.sleep(5)

# Scrape each poster on the listing page, enrich it with OMDb metadata, and
# write the collected rows to letterboxd_movies.csv.

OMDB_ENDPOINT = "https://www.omdbapi.com/"  # https so the API key is not sent in clear text
MAX_MOVIES = 250  # upper bound only; the slice yields fewer if the page lists fewer posters


def _text_or_na(by, selector, transform=str.strip):
    """Return the transformed text of the first matching element, or "N/A".

    The lookup runs in whichever window ``driver`` is currently focused on.
    Any failure (element missing, transform error) is treated as "field not
    available" so one missing detail does not discard the whole movie.
    ``except Exception`` is used instead of a bare ``except`` so that
    KeyboardInterrupt still stops the notebook.
    """
    try:
        return transform(driver.find_element(by, selector).text)
    except Exception:
        return "N/A"


def _fetch_omdb(title, year):
    """Look a film up on OMDb by title (plus year when known) and return the JSON dict.

    ``requests`` builds and URL-encodes the query string, so titles containing
    characters such as ``&`` or ``#`` no longer corrupt the request. The year
    filter is omitted when the year could not be scraped. A timeout prevents a
    stalled connection from hanging the whole run.
    """
    params = {"t": title, "apikey": OMDB_API_KEY}
    if year != "N/A":
        params["y"] = year
    return requests.get(OMDB_ENDPOINT, params=params, timeout=10).json()


movies = []
movie_elements = driver.find_elements(By.CSS_SELECTOR, ".poster-container")
main_window = driver.current_window_handle  # listing tab; always returned to after each film

try:
    for movie in movie_elements[:MAX_MOVIES]:
        try:
            title = movie.find_element(By.TAG_NAME, "img").get_attribute("alt").strip()  # Movie title
            link = movie.find_element(By.TAG_NAME, "a").get_attribute("href")  # Movie link

            # Open the film page in its own tab so the listing page (and the
            # elements already collected from it) stays loaded.
            driver.switch_to.new_window("tab")
            try:
                driver.get(link)
                time.sleep(3)

                year = _text_or_na(By.CSS_SELECTOR, "a[href*='/films/year/']")  # release year
                rating = _text_or_na(By.CSS_SELECTOR, ".average-rating")  # Letterboxd rating
                runtime = _text_or_na(
                    By.XPATH,
                    "//p[contains(text(),'mins')]",
                    transform=lambda text: text.split(" ")[0],  # first token of the "... mins" line
                )
                synopsis = _text_or_na(By.CSS_SELECTOR, ".truncate")  # plot summary
            finally:
                # Always close the detail tab and refocus the listing, even if
                # loading the page failed; otherwise tabs accumulate and every
                # later poster lookup runs against the wrong window.
                driver.close()
                driver.switch_to.window(main_window)

            # OMDb supplies genre, country, language, studio, IMDb rating,
            # box office and awards.
            response = _fetch_omdb(title, year)

            if "Error" in response:
                print(f"🚨 OMDb Error for {title}: {response['Error']}")

            genre = response.get("Genre", "N/A")
            country = response.get("Country", "N/A")
            language = response.get("Language", "N/A")
            studio = response.get("Production", "N/A")  # Production company from OMDb
            imdb_rating = response.get("imdbRating", "N/A")
            # Previously this read "BoxOffice", so the Budget column was always a
            # copy of Box Office Gross. NOTE(review): OMDb does not appear to
            # return a budget field, so expect "N/A" here until another source
            # is added.
            budget = response.get("Budget", "N/A")
            box_office = response.get("BoxOffice", "N/A")
            awards = response.get("Awards", "N/A")

            # One row per film; keys become the CSV column headers.
            movies.append({
                "Movie Title": title,
                "Genre(s)": genre,
                "Year of Release": year,
                "Letterboxd Rating": rating,
                "IMDb Rating": imdb_rating,
                "Runtime (mins)": runtime,
                "Synopsis": synopsis,
                "Country": country,
                "Original Language": language,
                "Production Studio": studio,
                "Budget": budget,
                "Box Office Gross": box_office,
                "Awards": awards
            })

            print(f"✅ Scraped: {title} | IMDb: {imdb_rating} | Box Office: {box_office}")

        except Exception as e:
            # Best effort: report the failure and move on to the next film.
            print(f"❌ Error scraping a movie: {e}")
finally:
    # Shut the browser down even if the run is interrupted part-way.
    driver.quit()

# Save data to CSV
df = pd.DataFrame(movies)
df.to_csv("letterboxd_movies.csv", index=False)

print("\n🎉 Scraping complete! Data saved as letterboxd_movies.csv ✅")


✅ Scraped: Barbie | IMDb: 6.8 | Box Office: $636,238,421
✅ Scraped: Parasite | IMDb: 8.5 | Box Office: $53,369,749
✅ Scraped: Interstellar | IMDb: 8.7 | Box Office: $203,227,580
✅ Scraped: Fight Club | IMDb: 8.8 | Box Office: $37,030,102
✅ Scraped: La La Land | IMDb: 8.0 | Box Office: $151,101,803
✅ Scraped: Everything Everywhere All at Once | IMDb: 7.8 | Box Office: $77,191,785
✅ Scraped: Oppenheimer | IMDb: 8.3 | Box Office: $330,050,270
✅ Scraped: Whiplash | IMDb: 8.5 | Box Office: $14,003,391
✅ Scraped: Pulp Fiction | IMDb: 8.9 | Box Office: $107,928,762
✅ Scraped: Joker | IMDb: 8.3 | Box Office: $335,477,657
✅ Scraped: Dune | IMDb: 8.0 | Box Office: $108,897,830
✅ Scraped: The Substance | IMDb: 7.3 | Box Office: $17,539,788
✅ Scraped: Get Out | IMDb: 7.8 | Box Office: $176,196,665
✅ Scraped: Midsommar | IMDb: 7.1 | Box Office: $27,426,361
✅ Scraped: Spider-Man: Into the Spider-Verse | IMDb: 8.4 | Box Office: $190,241,310
✅ Scraped: The Truman Show | IMDb: 8.2 | Box Office: $125,61