In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Setup Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run without opening browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# IMDb Top 1000 Movies URL
url = "https://www.imdb.com/search/title/?groups=top_1000&count=100"
driver.get(url)

# Allow time for JavaScript to load
time.sleep(5)

# Extract movie details
movies = []
movie_elements = driver.find_elements(By.CSS_SELECTOR, ".lister-item-content")

for movie in movie_elements:
    try:
        title = movie.find_element(By.TAG_NAME, "h3").text.split(". ")[1]  # Get movie title
        year = movie.find_element(By.CLASS_NAME, "lister-item-year").text.strip("()")  # Year
        genre = movie.find_element(By.CLASS_NAME, "genre").text.strip()  # Genre(s)
        rating = movie.find_element(By.CLASS_NAME, "ratings-imdb-rating").text.strip()  # IMDb rating
        runtime = movie.find_element(By.CLASS_NAME, "runtime").text.strip(" min")  # Runtime

        # Fetch movie details (Director, Cast, Budget, Box Office) - deeper scraping needed
        link = movie.find_element(By.TAG_NAME, "a").get_attribute("href")
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[1])
        driver.get(link)
        time.sleep(3)  # Allow page to load

        try:
            director = driver.find_element(By.XPATH, "//span[text()='Director:']/following-sibling::a").text
        except:
            director = "N/A"

        try:
            cast_elements = driver.find_elements(By.CSS_SELECTOR, ".cast_list tr td:nth-child(2) a")
            cast = ", ".join([actor.text for actor in cast_elements[:5]])  # Get first 5 actors
        except:
            cast = "N/A"

        try:
            synopsis = driver.find_element(By.CSS_SELECTOR, ".plot_summary .summary_text").text.strip()
        except:
            synopsis = "N/A"

        try:
            budget = driver.find_element(By.XPATH, "//h4[text()='Budget:']/following-sibling::span").text
        except:
            budget = "N/A"

        try:
            box_office = driver.find_element(By.XPATH, "//h4[text()='Cumulative Worldwide Gross:']/following-sibling::span").text
        except:
            box_office = "N/A"

        try:
            awards = driver.find_element(By.XPATH, "//a[contains(text(),'wins')]").text
        except:
            awards = "N/A"

        # Close tab and switch back
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

        # Append to dataset
        movies.append({
            "Movie Title": title,
            "Genre(s)": genre,
            "Director": director,
            "Cast (Main Actors)": cast,
            "Year of Release": year,
            "IMDb Rating": rating,
            "Runtime (mins)": runtime,
            "Synopsis": synopsis,
            "Budget": budget,
            "Box Office Gross": box_office,
            "Awards": awards
        })

    except Exception as e:
        print(f"Error scraping a movie: {e}")

# Close browser
driver.quit()

# Save data to CSV
df = pd.DataFrame(movies)
df.to_csv("imdb_movies.csv", index=False)

print("Scraping complete! Data saved as imdb_movies.csv")


Scraping complete! Data saved as imdb_movies.csv


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Setup Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode (faster)
options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid bot detection
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")  # Mimic real user

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# IMDb Top 1000 Movies URL
url = "https://www.imdb.com/search/title/?groups=top_1000&count=100"
driver.get(url)
time.sleep(5)  # Allow JavaScript to load

# Extract movie details
movies = []
movie_elements = driver.find_elements(By.CLASS_NAME, "lister-item-content")

for i, movie in enumerate(movie_elements[:100]):  # Scraping only first 100 movies
    try:
        title = movie.find_element(By.TAG_NAME, "h3").text.split(". ")[1]  # Extract movie title
        year = movie.find_element(By.CLASS_NAME, "lister-item-year").text.strip("()")  # Year
        genre = movie.find_element(By.CLASS_NAME, "genre").text.strip()  # Genre(s)
        rating = movie.find_element(By.CLASS_NAME, "ratings-imdb-rating").text.strip()  # IMDb rating
        runtime = movie.find_element(By.CLASS_NAME, "runtime").text.strip(" min")  # Runtime

        # Click on movie link to fetch deeper details (Director, Cast, Budget, Box Office)
        link = movie.find_element(By.TAG_NAME, "a").get_attribute("href")
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[1])
        driver.get(link)
        time.sleep(3)

        try:
            director = driver.find_element(By.XPATH, "//span[text()='Director:']/following-sibling::a").text
        except:
            director = "N/A"

        try:
            cast_elements = driver.find_elements(By.CSS_SELECTOR, ".cast_list tr td:nth-child(2) a")
            cast = ", ".join([actor.text for actor in cast_elements[:5]])  # Get first 5 actors
        except:
            cast = "N/A"

        try:
            synopsis = driver.find_element(By.CSS_SELECTOR, ".plot_summary .summary_text").text.strip()
        except:
            synopsis = "N/A"

        try:
            budget = driver.find_element(By.XPATH, "//h4[text()='Budget:']/following-sibling::span").text
        except:
            budget = "N/A"

        try:
            box_office = driver.find_element(By.XPATH, "//h4[text()='Cumulative Worldwide Gross:']/following-sibling::span").text
        except:
            box_office = "N/A"

        try:
            awards = driver.find_element(By.XPATH, "//a[contains(text(),'wins')]").text
        except:
            awards = "N/A"

        # Close movie tab & return to main list
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

        # Append to dataset
        movies.append({
            "Movie Title": title,
            "Genre(s)": genre,
            "Director": director,
            "Cast (Main Actors)": cast,
            "Year of Release": year,
            "IMDb Rating": rating,
            "Runtime (mins)": runtime,
            "Synopsis": synopsis,
            "Budget": budget,
            "Box Office Gross": box_office,
            "Awards": awards
        })

        print(f"✅ Scraped: {title}")

    except Exception as e:
        print(f"❌ Error scraping a movie: {e}")

# Close browser
driver.quit()

# Save data to CSV
df = pd.DataFrame(movies)
df.to_csv("imdb_movies.csv", index=False)

print("\n🎉 Scraping complete! Data saved as imdb_movies.csv ✅")



🎉 Scraping complete! Data saved as imdb_movies.csv ✅


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Setup Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run without opening browser (faster)
options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid bot detection
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")  # Mimic real user

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Letterboxd Popular Movies URL
url = "https://letterboxd.com/films/popular/"
driver.get(url)
time.sleep(5)  # Allow JavaScript to load

# Extract movie details
movies = []
movie_elements = driver.find_elements(By.CSS_SELECTOR, ".poster-container")

for i, movie in enumerate(movie_elements[:100]):  # Scraping only first 100 movies
    try:
        title = movie.find_element(By.TAG_NAME, "img").get_attribute("alt")  # Extract movie title
        link = movie.find_element(By.TAG_NAME, "a").get_attribute("href")  # Movie link

        # Open movie page in new tab
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[1])
        driver.get(link)
        time.sleep(3)

        try:
            year = driver.find_element(By.CSS_SELECTOR, ".number").text.strip()  # Year
        except:
            year = "N/A"

        try:
            genre = ", ".join([g.text for g in driver.find_elements(By.CSS_SELECTOR, "a.text-slug")])  # Genre(s)
        except:
            genre = "N/A"

        try:
            director = driver.find_element(By.XPATH, "//a[contains(@href, '/director/')]").text.strip()  # Director
        except:
            director = "N/A"

        try:
            cast_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/actor/')]")
            cast = ", ".join([actor.text for actor in cast_elements[:5]])  # First 5 actors
        except:
            cast = "N/A"

        try:
            rating = driver.find_element(By.CSS_SELECTOR, ".average-rating").text.strip()  # IMDb-style rating
        except:
            rating = "N/A"

        try:
            runtime = driver.find_element(By.XPATH, "//p[contains(text(),'mins')]").text.split(" ")[0]  # Runtime
        except:
            runtime = "N/A"

        try:
            synopsis = driver.find_element(By.CSS_SELECTOR, ".truncate").text.strip()  # Plot Summary
        except:
            synopsis = "N/A"

        # Budget & Box Office not available on Letterboxd (set as "N/A")
        budget = "N/A"
        box_office = "N/A"

        try:
            awards = driver.find_element(By.XPATH, "//a[contains(text(),'win')]").text  # Awards
        except:
            awards = "N/A"

        # Close movie tab & return to main list
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

        # Append to dataset
        movies.append({
            "Movie Title": title,
            "Genre(s)": genre,
            "Director": director,
            "Cast (Main Actors)": cast,
            "Year of Release": year,
            "IMDb Rating": rating,
            "Runtime (mins)": runtime,
            "Synopsis": synopsis,
            "Budget": budget,
            "Box Office Gross": box_office,
            "Awards": awards
        })

        print(f"✅ Scraped: {title}")

    except Exception as e:
        print(f"❌ Error scraping a movie: {e}")

# Close browser
driver.quit()

# Save data to CSV
df = pd.DataFrame(movies)
df.to_csv("letterboxd_movies.csv", index=False)

print("\n🎉 Scraping complete! Data saved as letterboxd_movies.csv ✅")


✅ Scraped: Barbie
✅ Scraped: Parasite
✅ Scraped: Interstellar
✅ Scraped: Fight Club
✅ Scraped: La La Land
✅ Scraped: Everything Everywhere All at Once
✅ Scraped: Oppenheimer
✅ Scraped: Whiplash
✅ Scraped: Pulp Fiction
✅ Scraped: Joker
✅ Scraped: Dune
✅ Scraped: The Substance
✅ Scraped: Get Out
✅ Scraped: Midsommar
✅ Scraped: Spider-Man: Into the Spider-Verse
✅ Scraped: The Truman Show
✅ Scraped: The Batman
✅ Scraped: Eternal Sunshine of the Spotless Mind
✅ Scraped: Knives Out
✅ Scraped: The Dark Knight
✅ Scraped: Inception
✅ Scraped: Dune: Part Two
✅ Scraped: American Psycho
✅ Scraped: Saltburn
✅ Scraped: Spider-Man: Across the Spider-Verse
✅ Scraped: Poor Things
✅ Scraped: Lady Bird
✅ Scraped: Challengers
✅ Scraped: Spirited Away
✅ Scraped: The Wolf of Wall Street
✅ Scraped: Nosferatu
✅ Scraped: The Grand Budapest Hotel
✅ Scraped: The Menu
✅ Scraped: 10 Things I Hate About You
✅ Scraped: Black Swan
✅ Scraped: The Shining
✅ Scraped: Gone Girl
✅ Scraped: Se7en
✅ Scraped: Spider-Man: No 

In [4]:
df.head(7)

Unnamed: 0,Movie Title,Genre(s),Director,Cast (Main Actors),Year of Release,IMDb Rating,Runtime (mins),Synopsis,Budget,Box Office Gross,Awards
0,Barbie,"Margot Robbie, Ryan Gosling, America Ferrera, ...",Greta Gerwig,"Margot Robbie, Ryan Gosling, America Ferrera, ...",,3.8,114,Barbie and Ken are having the time of their li...,,,Connor Swindells
1,Parasite,"Song Kang-ho, Lee Sun-kyun, Cho Yeo-jeong, Cho...",Bong Joon Ho,"Song Kang-ho, Lee Sun-kyun, Cho Yeo-jeong, Cho...",,4.5,133,"All unemployed, Ki-taek’s family takes peculia...",,,
2,Interstellar,"Matthew McConaughey, Anne Hathaway, Michael Ca...",Christopher Nolan,"Matthew McConaughey, Anne Hathaway, Michael Ca...",,4.4,169,The adventures of a group of explorers who mak...,,,Bill Irwin
3,Fight Club,"Edward Norton, Brad Pitt, Helena Bonham Carter...",David Fincher,"Edward Norton, Brad Pitt, Helena Bonham Carter...",,4.3,139,A ticking-time-bomb insomniac and a slippery s...,,,
4,La La Land,"Ryan Gosling, Emma Stone, John Legend, Rosemar...",Damien Chazelle,"Ryan Gosling, Emma Stone, John Legend, Rosemar...",,4.1,129,"Mia, an aspiring actress, serves lattes to mov...",,,
5,Everything Everywhere All at Once,"Michelle Yeoh, Stephanie Hsu, Ke Huy Quan, Jam...",Daniel Scheinert,"Michelle Yeoh, Stephanie Hsu, Ke Huy Quan, Jam...",,4.3,140,An aging Chinese immigrant is swept up in an i...,,,
6,Oppenheimer,"Cillian Murphy, Emily Blunt, Matt Damon, Rober...",Christopher Nolan,"Cillian Murphy, Emily Blunt, Matt Damon, Rober...",,4.2,181,The story of J. Robert Oppenheimer’s role in t...,,,


In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Setup Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run without opening browser (faster)
options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid bot detection
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")  # Mimic real user

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Letterboxd Popular Movies URL
url = "https://letterboxd.com/films/popular/"
driver.get(url)
time.sleep(5)  # Allow JavaScript to load

# Extract movie details
movies = []
movie_elements = driver.find_elements(By.CSS_SELECTOR, ".poster-container")

for i, movie in enumerate(movie_elements[:159]):  # Scraping first 159 movies
    try:
        title = movie.find_element(By.TAG_NAME, "img").get_attribute("alt")  # Extract movie title
        link = movie.find_element(By.TAG_NAME, "a").get_attribute("href")  # Movie link

        # Open movie page in new tab
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[1])
        driver.get(link)
        time.sleep(3)

        try:
            year = driver.find_element(By.CSS_SELECTOR, "a[href*='/films/year/']").text.strip()  # Actual Release Year
        except:
            year = "N/A"

        try:
            genre_elements = driver.find_elements(By.CSS_SELECTOR, "a[href*='/films/genre/']")  # Extract all genres
            genres = ", ".join([g.text for g in genre_elements])
        except:
            genres = "N/A"

        try:
            director = driver.find_element(By.XPATH, "//a[contains(@href, '/director/')]").text.strip()  # Director
        except:
            director = "N/A"

        try:
            cast_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/actor/')]")
            cast = ", ".join([actor.text for actor in cast_elements[:5]])  # First 5 actors
        except:
            cast = "N/A"

        try:
            rating = driver.find_element(By.CSS_SELECTOR, ".average-rating").text.strip()  # Letterboxd-style rating
        except:
            rating = "N/A"

        try:
            runtime = driver.find_element(By.XPATH, "//p[contains(text(),'mins')]").text.split(" ")[0]  # Runtime
        except:
            runtime = "N/A"

        try:
            synopsis = driver.find_element(By.CSS_SELECTOR, ".truncate").text.strip()  # Plot Summary
        except:
            synopsis = "N/A"

        try:
            country = driver.find_element(By.XPATH, "//a[contains(@href, '/films/country/')]").text.strip()  # Country of origin
        except:
            country = "N/A"

        try:
            language = driver.find_element(By.XPATH, "//a[contains(@href, '/films/language/')]").text.strip()  # Original Language
        except:
            language = "N/A"

        try:
            studio = driver.find_element(By.XPATH, "//a[contains(@href, '/films/studio/')]").text.strip()  # Production Studio
        except:
            studio = "N/A"

        # Budget & Box Office are not available on Letterboxd (set as "N/A")
        budget = "N/A"
        box_office = "N/A"

        try:
            awards = driver.find_element(By.XPATH, "//a[contains(text(),'win')]").text  # Awards
        except:
            awards = "N/A"

        # Close movie tab & return to main list
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

        # Append to dataset
        movies.append({
            "Movie Title": title,
            "Genre(s)": genres,
            "Director": director,
            "Cast (Main Actors)": cast,
            "Year of Release": year,
            "IMDb Rating": rating,
            "Runtime (mins)": runtime,
            "Synopsis": synopsis,
            "Country": country,
            "Original Language": language,
            "Production Studio": studio,
            "Budget": budget,
            "Box Office Gross": box_office,
            "Awards": awards
        })

        print(f"✅ Scraped: {title}")

    except Exception as e:
        print(f"❌ Error scraping a movie: {e}")

# Close browser
driver.quit()

# Save data to CSV
df = pd.DataFrame(movies)
df.to_csv("letterboxd_movies1.csv", index=False)

print("\n🎉 Scraping complete! Data saved as letterboxd_movies1.csv ✅")


✅ Scraped: Barbie
✅ Scraped: Parasite
✅ Scraped: Interstellar
✅ Scraped: Fight Club
✅ Scraped: La La Land
✅ Scraped: Everything Everywhere All at Once
✅ Scraped: Oppenheimer
✅ Scraped: Whiplash
✅ Scraped: Pulp Fiction
✅ Scraped: Joker
✅ Scraped: Dune
✅ Scraped: The Substance
✅ Scraped: Get Out
✅ Scraped: Midsommar
✅ Scraped: Spider-Man: Into the Spider-Verse
✅ Scraped: The Truman Show
✅ Scraped: The Batman
✅ Scraped: Eternal Sunshine of the Spotless Mind
✅ Scraped: Knives Out
✅ Scraped: The Dark Knight
✅ Scraped: Inception
✅ Scraped: Dune: Part Two
✅ Scraped: American Psycho
✅ Scraped: Saltburn
✅ Scraped: Spider-Man: Across the Spider-Verse
✅ Scraped: Poor Things
✅ Scraped: Lady Bird
✅ Scraped: Challengers
✅ Scraped: Spirited Away
✅ Scraped: The Wolf of Wall Street
✅ Scraped: Nosferatu
✅ Scraped: The Grand Budapest Hotel
✅ Scraped: The Menu
✅ Scraped: 10 Things I Hate About You
✅ Scraped: Black Swan
✅ Scraped: The Shining
✅ Scraped: Gone Girl
✅ Scraped: Se7en
✅ Scraped: Spider-Man: No 

In [None]:
import requests
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# OMDb API Key 
OMDB_API_KEY = "http://www.omdbapi.com/?i=tt3896198&apikey=60fded84"

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run without opening browser (faster)
options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid bot detection
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")  # Mimic real user

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Letterboxd Popular Movies URL
url = "https://letterboxd.com/films/popular/"
driver.get(url)
time.sleep(5)  # Allow JavaScript to load

# Extract movie details
movies = []
movie_elements = driver.find_elements(By.CSS_SELECTOR, ".poster-container")

for i, movie in enumerate(movie_elements[:159]):  # Scraping first 159 movies
    try:
        title = movie.find_element(By.TAG_NAME, "img").get_attribute("alt")  # Movie title
        link = movie.find_element(By.TAG_NAME, "a").get_attribute("href")  # Movie link

        # Open movie page in new tab
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[1])
        driver.get(link)
        time.sleep(3)

        try:
            year = driver.find_element(By.CSS_SELECTOR, "a[href*='/films/year/']").text.strip()  # Actual Release Year
        except:
            year = "N/A"

        try:
            genre_elements = driver.find_elements(By.CSS_SELECTOR, "a[href*='/films/genre/']")
            genres = ", ".join([g.text for g in genre_elements])  # Extract all genres
        except:
            genres = "N/A"

        try:
            director = driver.find_element(By.XPATH, "//a[contains(@href, '/director/')]").text.strip()  # Director
        except:
            director = "N/A"

        try:
            cast_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/actor/')]")
            cast = ", ".join([actor.text for actor in cast_elements[:5]])  # First 5 actors
        except:
            cast = "N/A"

        try:
            rating = driver.find_element(By.CSS_SELECTOR, ".average-rating").text.strip()  # Letterboxd-style rating
        except:
            rating = "N/A"

        try:
            runtime = driver.find_element(By.XPATH, "//p[contains(text(),'mins')]").text.split(" ")[0]  # Runtime
        except:
            runtime = "N/A"

        try:
            synopsis = driver.find_element(By.CSS_SELECTOR, ".truncate").text.strip()  # Plot Summary
        except:
            synopsis = "N/A"

        try:
            country = driver.find_element(By.XPATH, "//a[contains(@href, '/films/country/')]").text.strip()  # Country
        except:
            country = "N/A"

        try:
            language = driver.find_element(By.XPATH, "//a[contains(@href, '/films/language/')]").text.strip()  # Original Language
        except:
            language = "N/A"

        try:
            studio = driver.find_element(By.XPATH, "//a[contains(@href, '/films/studio/')]").text.strip()  # Production Studio
        except:
            studio = "N/A"

        # Close movie tab & return to main list
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

        # Use OMDb API to get Budget, Box Office, Awards, and IMDb Rating
        omdb_url = f"http://www.omdbapi.com/?t={title.replace(' ', '+')}&y={year}&apikey={OMDB_API_KEY}"
        response = requests.get(omdb_url).json()

        try:
            imdb_rating = response.get("imdbRating", "N/A")  # Actual IMDb Rating
        except:
            imdb_rating = "N/A"

        try:
            budget = response.get("BoxOffice", "N/A")  # OMDb API sometimes provides budget as BoxOffice
        except:
            budget = "N/A"

        try:
            box_office = response.get("BoxOffice", "N/A")  # Box Office Earnings
        except:
            box_office = "N/A"

        try:
            awards = response.get("Awards", "N/A")  # Awards won
        except:
            awards = "N/A"

        # Append to dataset
        movies.append({
            "Movie Title": title,
            "Genre(s)": genres,
            "Director": director,
            "Cast (Main Actors)": cast,
            "Year of Release": year,
            "Letterboxd Rating": rating,
            "IMDb Rating": imdb_rating,
            "Runtime (mins)": runtime,
            "Synopsis": synopsis,
            "Country": country,
            "Original Language": language,
            "Production Studio": studio,
            "Budget": budget,
            "Box Office Gross": box_office,
            "Awards": awards
        })

        print(f"✅ Scraped: {title} | IMDb: {imdb_rating} | Box Office: {box_office}")

    except Exception as e:
        print(f"❌ Error scraping a movie: {e}")

# Close browser
driver.quit()

# Save data to CSV
df = pd.DataFrame(movies)
df.to_csv("letterboxd_movies_extended.csv", index=False)

print("\n🎉 Scraping complete! Data saved as letterboxd_movies_extended.csv ✅")


✅ Scraped: Barbie | IMDb: N/A | Box Office: N/A
✅ Scraped: Parasite | IMDb: N/A | Box Office: N/A
✅ Scraped: Interstellar | IMDb: N/A | Box Office: N/A
✅ Scraped: Fight Club | IMDb: N/A | Box Office: N/A
✅ Scraped: La La Land | IMDb: N/A | Box Office: N/A
✅ Scraped: Everything Everywhere All at Once | IMDb: N/A | Box Office: N/A
✅ Scraped: Oppenheimer | IMDb: N/A | Box Office: N/A
✅ Scraped: Whiplash | IMDb: N/A | Box Office: N/A
✅ Scraped: Pulp Fiction | IMDb: N/A | Box Office: N/A
✅ Scraped: Joker | IMDb: N/A | Box Office: N/A
✅ Scraped: Dune | IMDb: N/A | Box Office: N/A
✅ Scraped: The Substance | IMDb: N/A | Box Office: N/A
✅ Scraped: Get Out | IMDb: N/A | Box Office: N/A
✅ Scraped: Midsommar | IMDb: N/A | Box Office: N/A
✅ Scraped: Spider-Man: Into the Spider-Verse | IMDb: N/A | Box Office: N/A
✅ Scraped: The Truman Show | IMDb: N/A | Box Office: N/A
✅ Scraped: The Batman | IMDb: N/A | Box Office: N/A
✅ Scraped: Eternal Sunshine of the Spotless Mind | IMDb: N/A | Box Office: N/A
✅

In [1]:
import requests
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# ✅ Corrected OMDb API Key 
OMDB_API_KEY = "60fded84"

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run without opening browser (faster)
options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid bot detection
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")  # Mimic real user

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Letterboxd Popular Movies URL
url = "https://letterboxd.com/films/popular/"
driver.get(url)
time.sleep(5)  # Allow JavaScript to load

# Extract movie details
movies = []
movie_elements = driver.find_elements(By.CSS_SELECTOR, ".poster-container")

for i, movie in enumerate(movie_elements[:159]):  # Scraping first 159 movies
    try:
        title = movie.find_element(By.TAG_NAME, "img").get_attribute("alt")  # Movie title
        link = movie.find_element(By.TAG_NAME, "a").get_attribute("href")  # Movie link

        # Open movie page in new tab
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[1])
        driver.get(link)
        time.sleep(3)

        try:
            year = driver.find_element(By.CSS_SELECTOR, "a[href*='/films/year/']").text.strip()  # Actual Release Year
        except:
            year = "N/A"

        try:
            genre_elements = driver.find_elements(By.CSS_SELECTOR, "a[href*='/films/genre/']")
            genres = ", ".join([g.text for g in genre_elements])  # Extract all genres
        except:
            genres = "N/A"

        try:
            director = driver.find_element(By.XPATH, "//a[contains(@href, '/director/')]").text.strip()  # Director
        except:
            director = "N/A"

        try:
            cast_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/actor/')]")
            cast = ", ".join([actor.text for actor in cast_elements[:5]])  # First 5 actors
        except:
            cast = "N/A"

        try:
            rating = driver.find_element(By.CSS_SELECTOR, ".average-rating").text.strip()  # Letterboxd-style rating
        except:
            rating = "N/A"

        try:
            runtime = driver.find_element(By.XPATH, "//p[contains(text(),'mins')]").text.split(" ")[0]  # Runtime
        except:
            runtime = "N/A"

        try:
            synopsis = driver.find_element(By.CSS_SELECTOR, ".truncate").text.strip()  # Plot Summary
        except:
            synopsis = "N/A"

        try:
            country = driver.find_element(By.XPATH, "//a[contains(@href, '/films/country/')]").text.strip()  # Country
        except:
            country = "N/A"

        try:
            language = driver.find_element(By.XPATH, "//a[contains(@href, '/films/language/')]").text.strip()  # Original Language
        except:
            language = "N/A"

        try:
            studio = driver.find_element(By.XPATH, "//a[contains(@href, '/films/studio/')]").text.strip()  # Production Studio
        except:
            studio = "N/A"

        # Close movie tab & return to main list
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

        # ✅ Use OMDb API to get IMDb Rating, Budget, Box Office, and Awards
        omdb_url = f"http://www.omdbapi.com/?t={title.replace(' ', '+')}&y={year}&apikey={OMDB_API_KEY}"
        response = requests.get(omdb_url).json()

        if "Error" in response:
            print(f"🚨 OMDb Error for {title}: {response['Error']}")

        imdb_rating = response.get("imdbRating", "N/A")
        budget = response.get("BoxOffice", "N/A")
        box_office = response.get("BoxOffice", "N/A")
        awards = response.get("Awards", "N/A")

        # Append to dataset
        movies.append({
            "Movie Title": title,
            "Genre(s)": genres,
            "Director": director,
            "Cast (Main Actors)": cast,
            "Year of Release": year,
            "Letterboxd Rating": rating,
            "IMDb Rating": imdb_rating,
            "Runtime (mins)": runtime,
            "Synopsis": synopsis,
            "Country": country,
            "Original Language": language,
            "Production Studio": studio,
            "Budget": budget,
            "Box Office Gross": box_office,
            "Awards": awards
        })

        print(f"✅ Scraped: {title} | IMDb: {imdb_rating} | Box Office: {box_office}")

    except Exception as e:
        print(f"❌ Error scraping a movie: {e}")

# Close browser
driver.quit()

# Save data to CSV
df = pd.DataFrame(movies)
df.to_csv("letterboxd_movies_extended1.csv", index=False)

print("\n🎉 Scraping complete! Data saved as letterboxd_movies_extended1.csv ✅")


✅ Scraped: Barbie | IMDb: 6.8 | Box Office: $636,238,421
✅ Scraped: Parasite | IMDb: 8.5 | Box Office: $53,369,749
✅ Scraped: Interstellar | IMDb: 8.7 | Box Office: $203,227,580
✅ Scraped: Fight Club | IMDb: 8.8 | Box Office: $37,030,102
✅ Scraped: La La Land | IMDb: 8.0 | Box Office: $151,101,803
✅ Scraped: Everything Everywhere All at Once | IMDb: 7.8 | Box Office: $77,191,785
✅ Scraped: Oppenheimer | IMDb: 8.3 | Box Office: $330,050,270
✅ Scraped: Whiplash | IMDb: 8.5 | Box Office: $14,003,391
✅ Scraped: Pulp Fiction | IMDb: 8.9 | Box Office: $107,928,762
✅ Scraped: Joker | IMDb: 8.3 | Box Office: $335,477,657
✅ Scraped: Dune | IMDb: 8.0 | Box Office: $108,897,830
✅ Scraped: The Substance | IMDb: 7.3 | Box Office: $17,539,788
✅ Scraped: Get Out | IMDb: 7.8 | Box Office: $176,196,665
✅ Scraped: Midsommar | IMDb: 7.1 | Box Office: $27,426,361
✅ Scraped: Spider-Man: Into the Spider-Verse | IMDb: 8.4 | Box Office: $190,241,310
✅ Scraped: The Truman Show | IMDb: 8.2 | Box Office: $125,61

In [None]:
import requests
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# ✅ Corrected OMDb API Key 
OMDB_API_KEY = "60fded84"

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run without opening browser (faster)
options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid bot detection
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")  # Mimic real user

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Letterboxd Popular Movies URL
url = "https://letterboxd.com/films/popular/"
driver.get(url)
time.sleep(5)  # Allow JavaScript to load

# Extract movie details
movies = []
movie_elements = driver.find_elements(By.CSS_SELECTOR, ".poster-container")

for i, movie in enumerate(movie_elements[:250]):  # Scraping first 250 movies
    try:
        title = movie.find_element(By.TAG_NAME, "img").get_attribute("alt").strip()  # Movie title
        link = movie.find_element(By.TAG_NAME, "a").get_attribute("href")  # Movie link

        # Open movie page in new tab
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[1])
        driver.get(link)
        time.sleep(3)

        try:
            year = driver.find_element(By.CSS_SELECTOR, "a[href*='/films/year/']").text.strip()  # Actual Release Year
        except:
            year = "N/A"

        try:
            rating = driver.find_element(By.CSS_SELECTOR, ".average-rating").text.strip()  # Letterboxd-style rating
        except:
            rating = "N/A"

        try:
            runtime = driver.find_element(By.XPATH, "//p[contains(text(),'mins')]").text.split(" ")[0]  # Runtime
        except:
            runtime = "N/A"

        try:
            synopsis = driver.find_element(By.CSS_SELECTOR, ".truncate").text.strip()  # Plot Summary
        except:
            synopsis = "N/A"

        # Close movie tab & return to main list
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

        # ✅ Use OMDb API to get **Genre, Country, Language, Production Studio, IMDb Rating, Budget, Box Office, and Awards**
        omdb_url = f"http://www.omdbapi.com/?t={title.replace(' ', '+')}&y={year}&apikey={OMDB_API_KEY}"
        response = requests.get(omdb_url).json()

        if "Error" in response:
            print(f"🚨 OMDb Error for {title}: {response['Error']}")

        genre = response.get("Genre", "N/A")
        country = response.get("Country", "N/A")
        language = response.get("Language", "N/A")
        studio = response.get("Production", "N/A")  # Production company from OMDb
        imdb_rating = response.get("imdbRating", "N/A")
        budget = response.get("BoxOffice", "N/A")
        box_office = response.get("BoxOffice", "N/A")
        awards = response.get("Awards", "N/A")

        # Append to dataset
        movies.append({
            "Movie Title": title,
            "Genre(s)": genre,
            "Year of Release": year,
            "Letterboxd Rating": rating,
            "IMDb Rating": imdb_rating,
            "Runtime (mins)": runtime,
            "Synopsis": synopsis,
            "Country": country,
            "Original Language": language,
            "Production Studio": studio,
            "Budget": budget,
            "Box Office Gross": box_office,
            "Awards": awards
        })

        print(f"✅ Scraped: {title} | IMDb: {imdb_rating} | Box Office: {box_office}")

    except Exception as e:
        print(f"❌ Error scraping a movie: {e}")

# Close browser
driver.quit()

# Save data to CSV
df = pd.DataFrame(movies)
df.to_csv("letterboxd_movies_extended_250.csv", index=False)

print("\n🎉 Scraping complete! Data saved as letterboxd_movies_extended_250.csv ✅")


✅ Scraped: Barbie | IMDb: 6.8 | Box Office: $636,238,421
✅ Scraped: Parasite | IMDb: 8.5 | Box Office: $53,369,749
✅ Scraped: Interstellar | IMDb: 8.7 | Box Office: $203,227,580
✅ Scraped: Fight Club | IMDb: 8.8 | Box Office: $37,030,102
✅ Scraped: La La Land | IMDb: 8.0 | Box Office: $151,101,803
✅ Scraped: Everything Everywhere All at Once | IMDb: 7.8 | Box Office: $77,191,785
✅ Scraped: Oppenheimer | IMDb: 8.3 | Box Office: $330,050,270
✅ Scraped: Whiplash | IMDb: 8.5 | Box Office: $14,003,391
✅ Scraped: Pulp Fiction | IMDb: 8.9 | Box Office: $107,928,762
✅ Scraped: Joker | IMDb: 8.3 | Box Office: $335,477,657
✅ Scraped: Dune | IMDb: 8.0 | Box Office: $108,897,830
✅ Scraped: The Substance | IMDb: 7.3 | Box Office: $17,539,788
✅ Scraped: Get Out | IMDb: 7.8 | Box Office: $176,196,665
✅ Scraped: Midsommar | IMDb: 7.1 | Box Office: $27,426,361
✅ Scraped: Spider-Man: Into the Spider-Verse | IMDb: 8.4 | Box Office: $190,241,310
✅ Scraped: The Truman Show | IMDb: 8.2 | Box Office: $125,61