In [None]:
# Web Scraping IMDb Top 250 Movies using Selenium and BeautifulSoup
# ChromeDriver is installed using the webdriver_manager module to automatically manage the ChromeDriver version.
# This script loads the IMDb Top 250 page using Selenium, and then parses the HTML content using BeautifulSoup to extract movie data.
# Finally, the scrape function is verified before it is used for further analysis.

In [1]:
!pip install selenium webdriver-manager

Defaulting to user installation because normal site-packages is not writeable


In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

# Define IMDb Top 250 URL
IMDB_URL = "https://www.imdb.com/chart/top/"

def scrape_imdb_movies_with_selenium():
    """Open the IMDb Top 250 chart in Chrome and return it as a BeautifulSoup tree.

    ChromeDriver is resolved through ``webdriver_manager``, the page at
    ``IMDB_URL`` is loaded with a custom user-agent, and after a fixed
    3-second pause the browser's page source is parsed with ``html.parser``.

    Returns:
        BeautifulSoup: parse tree of the page source captured after the pause.

    Raises:
        Any exception raised by Selenium while loading the page propagates to
        the caller; the browser is closed before it does.
    """
    # Automatically install and manage ChromeDriver
    service = Service(ChromeDriverManager().install())

    # Set user-agent to avoid bot detection
    options = Options()
    options.add_argument("user-agent=Mozilla/5.0")

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(IMDB_URL)
        time.sleep(3)  # Fixed pause to give the page time to finish loading
        page_source = driver.page_source
    finally:
        # Always close the browser, even when loading fails, so a Chrome
        # process is never left running behind the notebook.
        driver.quit()

    # Parse after the browser is gone; only the captured HTML string is needed
    return BeautifulSoup(page_source, "html.parser")

# Smoke-test the scraper: fetch the chart page once and show its <title> text.
# `soup` is kept at module level because the extraction cell below reads it.
soup = scrape_imdb_movies_with_selenium()
page_title = soup.title.text
print(page_title)  # Should print IMDb page title

IMDb Top 250 Movies


In [None]:
# Extracting the dataset: collect per-movie details and save them to a CSV file

In [17]:
import requests
import pandas as pd

# Extract movie data
# For every row of the chart page held in `soup`, read the title, year and
# rating from the row itself, then fetch the movie's own page for genre,
# directors, box office and lead actors. Each movie becomes one dict in
# `movies_data`, which is finally written to imdb_top_movies.csv.
movies_data = []
movie_rows = soup.select(".ipc-metadata-list-summary-item")  # Updated selector for movie rows

if not movie_rows:
    print("No movie data found. Check the HTML structure.")
else:
    for movie in movie_rows:
        # Extract movie title
        title_element = movie.select_one("h3")
        title = title_element.text.strip() if title_element else "Unknown"

        # Extract release year
        # NOTE(review): ".sc-f30335b4-7" looks like a generated class name and
        # may change between site builds - confirm it still matches.
        year_element = movie.select(".sc-f30335b4-7")
        year = year_element[0].text.strip() if year_element else "Unknown"

        # Extract IMDb rating
        rating_element = movie.select_one(".ipc-rating-star--imdb")
        rating = rating_element.text.strip() if rating_element else "Unknown"

        # Extract movie link (.get avoids a KeyError if the anchor has no href)
        movie_link_element = movie.select_one("a.ipc-title-link-wrapper")
        href = movie_link_element.get("href") if movie_link_element else None
        movie_link = "https://www.imdb.com" + href if href else ""

        # Initialize extra details
        genre, directors, box_office_revenue, lead_actors = "Unknown", "Unknown", "Unknown", "Unknown"

        # Fetch individual movie page for more details
        movie_response = None
        if movie_link:
            try:
                # A timeout keeps one stalled connection from hanging the whole run.
                movie_response = requests.get(
                    movie_link,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=10,
                )
            except requests.RequestException as exc:
                # Keep the row with "Unknown" details instead of losing
                # everything scraped so far to a single network failure.
                print(f"Could not fetch details for {title!r}: {exc}")

        if movie_response is not None and movie_response.status_code == 200:
            movie_soup = BeautifulSoup(movie_response.text, "html.parser")

            # Extract genre
            genre_elements = movie_soup.select(".ipc-chip-list__scroller a")
            genre = ", ".join([g.text.strip() for g in genre_elements]) if genre_elements else "Unknown"

            # Extract directors (Avoid duplication)
            # NOTE(review): this selector matches every "/name/" link inside a
            # metadata content container, which presumably also covers
            # writers and stars - confirm against the page markup.
            director_elements = movie_soup.select(".ipc-metadata-list-item__content-container a[href*='/name/']")
            # dict.fromkeys de-duplicates while keeping page order, so the
            # joined string is the same on every run (a set's order is not).
            director_names = list(dict.fromkeys(d.text.strip() for d in director_elements))
            directors = ", ".join(director_names) if director_names else "Unknown"

            # Extract box office revenue
            box_office_element = movie_soup.select_one(".ipc-metadata-list__item:-soup-contains('Gross worldwide')")
            if box_office_element:
                # The matched item's text contains the "Gross worldwide" label
                # (that is what the selector matches on); drop it so only the
                # amount is stored, whether or not a colon separates them.
                box_office_text = box_office_element.text.strip().split(":")[-1]
                box_office_revenue = box_office_text.replace("Gross worldwide", "", 1).strip()

            # Extract lead actors (Updated method)
            metadata_sections = movie_soup.find_all("li", class_="ipc-metadata-list__item")
            for section in metadata_sections:
                if section.find(string="Stars"):
                    actor_elements = section.find_all("a", href=lambda x: x and "/name/" in x)
                    # Order-preserving de-duplication, as for the directors
                    actor_names = list(dict.fromkeys(actor.text.strip() for actor in actor_elements))
                    lead_actors = ", ".join(actor_names) if actor_names else "Unknown"
                    break

        movies_data.append({
            "Title": title,
            "Year": year,
            "Rating": rating,
            "Genre": genre,
            "Director(s)": directors,
            "Box Office Revenue": box_office_revenue,
            "Lead Actors": lead_actors,
        })

    # Save to a DataFrame
    df = pd.DataFrame(movies_data)
    df.to_csv("imdb_top_movies.csv", index=False)
    print("Completed the webscraping and saved the dataset.")


Completed the webscraping and saved the dataset.
