# IMDb Analysis

In [22]:
"""
IMDb Top 250 Movies Scraper

This script uses Selenium to load the IMDb Top 250 page and BeautifulSoup to parse the HTML.
For each movie, it fetches additional details from the individual movie page using requests.
The final dataset is stored in a CSV file named 'imdb_top_movies.csv'.

Requirements:
- Selenium WebDriver (ensure ChromeDriver is installed and the path is set correctly)
- BeautifulSoup4
- pandas
- requests
"""

"\nIMDb Top 250 Movies Scraper\n\nThis script uses Selenium to load the IMDb Top 250 page and BeautifulSoup to parse the HTML.\nFor each movie, it fetches additional details from the individual movie page using requests.\nThe final dataset is stored in a CSV file named 'imdb_top_movies.csv'.\n\nRequirements:\n- Selenium WebDriver (ensure ChromeDriver is installed and the path is set correctly)\n- BeautifulSoup4\n- pandas\n- requests\n"

In [23]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests

In [24]:
# URL of the IMDb Top 250 chart page; this is the page loaded through the
# Selenium driver in scrape_imdb_movies_with_selenium().
BASE_URL = "https://www.imdb.com/chart/top/"
# Headers sent with every requests.get() call made by get_soup().
# NOTE(review): the browser-like User-Agent is presumably there so IMDb does
# not reject the default python-requests agent -- confirm.
HEADERS = {"User-Agent": "Mozilla/5.0"}

In [25]:
def setup_driver(chromedriver_path=None):
    """
    Initialize and return a Selenium Chrome WebDriver instance.

    Parameters:
        chromedriver_path (str | None): Path to the ChromeDriver binary.
            When omitted, the original hard-coded Windows install location
            is used, so existing callers behave exactly as before.

    Returns:
        selenium.webdriver.Chrome: A started Chrome driver. The caller owns
        it and is responsible for calling ``quit()`` when finished.
    """
    if chromedriver_path is None:
        # Default kept for backward compatibility; pass an explicit path on
        # machines where ChromeDriver lives somewhere else.
        chromedriver_path = r"C:\Program Files (x86)\chromedriver-win64\chromedriver-win64\chromedriver.exe"
    service = Service(chromedriver_path)
    return webdriver.Chrome(service=service)

In [26]:
def get_soup(url, timeout=10):
    """
    Retrieve the HTML content of a webpage and parse it with BeautifulSoup.

    Parameters:
        url (str): URL of the webpage to fetch.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout, requests may block indefinitely on a stalled
            connection.

    Returns:
        BeautifulSoup: Parsed HTML if the request succeeds with status 200;
        otherwise None (network error, timeout, or non-200 status).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code so a single failing page does not abort the whole run.
        print(f"Failed to retrieve page: {url} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page: {url} (Status Code: {response.status_code})")
        return None

    return BeautifulSoup(response.text, "html.parser")

In [27]:
def extract_movie_details(movie_soup):
    """
    Extract detailed information from a movie's detail page.

    Parameters:
        movie_soup (BeautifulSoup): Parsed HTML of the movie detail page.

    Returns:
        tuple: ``(genre, directors, box_office_revenue, lead_actors)``, each a
        string. Multi-valued fields are joined with ", " in page order; any
        field that cannot be found is the string "Unknown".
    """
    # Genres: every link inside the chip list, kept in page order.
    genre_elements = movie_soup.select(".ipc-chip-list__scroller a")
    genre = ", ".join(g.text.strip() for g in genre_elements) if genre_elements else "Unknown"

    # Directors: links whose href contains '/name/' inside metadata content
    # containers.
    # NOTE(review): this selector is not restricted to the "Director" row, so
    # it may also pick up other credited people -- confirm against live HTML.
    director_elements = movie_soup.select(".ipc-metadata-list-item__content-container a[href*='/name/']")
    directors = _join_unique_names(director_elements)

    # Box office: the metadata item whose text contains "Gross worldwide".
    box_office_label = "Gross worldwide"
    box_office_revenue = "Unknown"
    box_office_element = movie_soup.select_one(".ipc-metadata-list__item:-soup-contains('Gross worldwide')")
    if box_office_element:
        value = box_office_element.text.strip().split(":")[-1].strip()
        # When label and value are not separated by a colon, the split leaves
        # the label in front of the amount; drop it so only the value remains.
        if value.startswith(box_office_label):
            value = value[len(box_office_label):].strip()
        box_office_revenue = value or "Unknown"

    # Lead actors: first metadata item containing the exact string "Stars".
    lead_actors = "Unknown"
    for section in movie_soup.find_all("li", class_="ipc-metadata-list__item"):
        if section.find(string="Stars"):
            actor_elements = section.find_all("a", href=lambda x: x and "/name/" in x)
            lead_actors = _join_unique_names(actor_elements)
            break

    return genre, directors, box_office_revenue, lead_actors


def _join_unique_names(elements):
    """Join the stripped text of *elements* with ", ", de-duplicated in first-seen order."""
    # dict.fromkeys keeps insertion order, unlike a set, so the output is
    # deterministic and follows the order on the page.
    names = [name for name in dict.fromkeys(el.text.strip() for el in elements) if name]
    return ", ".join(names) if names else "Unknown"

In [28]:
def scrape_imdb_movies_with_selenium():
    """
    Scrape IMDb Top 250 movies and save the results to a CSV file.

    Steps:
    1. Initialize Selenium WebDriver and load the IMDb Top 250 page.
    2. Parse the loaded page with BeautifulSoup.
    3. Iterate over each movie entry to extract basic details.
    4. For each movie, fetch additional details from its individual page.
    5. Compile the data into a pandas DataFrame and export it to 'imdb_top_movies.csv'.

    Returns:
        None. The result is the CSV file written to the current working
        directory; if no movie rows are found, a message is printed and no
        file is written.
    """
    # Set up Selenium driver and load the IMDb Top 250 page
    driver = setup_driver()
    try:
        driver.get(BASE_URL)
        time.sleep(3)  # Wait for dynamic content to load

        # Parse the page source using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always shut the browser down, even if loading the page fails;
        # otherwise the Chrome/ChromeDriver processes are left running.
        driver.quit()

    movies_data = []
    # Select movie entries (update the CSS selector if IMDb changes their layout)
    movie_rows = soup.select(".ipc-metadata-list-summary-item")
    if not movie_rows:
        print("No movie data found. Check the HTML structure.")
        return

    for movie in movie_rows:
        # Extract the movie title
        title_element = movie.select_one("h3")
        title = title_element.text.strip() if title_element else "Unknown"

        # Extract the release year (first element carrying this class).
        # NOTE(review): ".sc-f30335b4-7" looks like a generated class name and
        # may change between IMDb deployments -- verify if years come back "Unknown".
        year_elements = movie.select(".sc-f30335b4-7")
        year = year_elements[0].text.strip() if year_elements else "Unknown"

        # Extract IMDb rating
        rating_element = movie.select_one(".ipc-rating-star--imdb")
        rating = rating_element.text.strip() if rating_element else "Unknown"

        # Construct the full URL for the movie detail page
        movie_link_element = movie.select_one("a.ipc-title-link-wrapper")
        movie_link = "https://www.imdb.com" + movie_link_element["href"] if movie_link_element else ""

        # Initialize detailed movie fields with default values
        genre, directors, box_office_revenue, lead_actors = "Unknown", "Unknown", "Unknown", "Unknown"

        # Fetch and extract additional movie details if a valid link is found
        if movie_link:
            movie_soup = get_soup(movie_link)
            if movie_soup:
                genre, directors, box_office_revenue, lead_actors = extract_movie_details(movie_soup)

        # Append the collected data for the current movie
        movies_data.append({
            "Title": title,
            "Year": year,
            "Rating": rating,
            "Genre": genre,
            "Director(s)": directors,
            "Box Office Revenue": box_office_revenue,
            "Lead Actors": lead_actors,
        })

    # Convert the list of movies into a DataFrame and save as CSV
    df = pd.DataFrame(movies_data)
    df.to_csv("imdb_top_movies.csv", index=False)
    print("Scraping complete. Data saved to 'imdb_top_movies.csv'.")

# Entry point: run the scraper only when this code is executed directly,
# not when it is imported from another module.
if __name__ == "__main__":
    scrape_imdb_movies_with_selenium()

Scraping complete. Data saved to 'imdb_top_movies.csv'.
