In [1]:
# %pip install beautifulsoup4

In [2]:
def convert_duration_to_minutes(duration):
    """Convert a runtime string such as '2h 22m' into a total number of minutes.

    Components are recognised by their unit suffix, so a minutes-only value
    ('45m') is not mistaken for hours and a compact value without a space
    ('2h22m') is still split correctly.

    Args:
        duration: Runtime text, e.g. '2h 22m', '3h', '45m' or ''.

    Returns:
        The duration as an integer number of minutes; 0 for an empty string.

    Raises:
        ValueError: If the text has neither the '<n>h <n>m' shape nor the
            unit-less positional shape handled by the fallback below.
    """
    import re  # local import so this cell does not depend on a later import cell

    # Preferred path: optional "<n>h" followed by optional "<n>m".
    unit_match = re.fullmatch(
        r'\s*(?:(\d+)\s*h)?\s*(?:(\d+)\s*m)?\s*', duration)
    if unit_match:
        hours = int(unit_match.group(1) or 0)
        minutes = int(unit_match.group(2) or 0)
        return hours * 60 + minutes

    # Fallback kept for backward compatibility with unit-less input such as
    # '2 22': first number is hours, second (if any) is minutes.
    parts = duration.replace('h', '').replace('m', '').split()
    hours = int(parts[0]) if parts else 0
    minutes = int(parts[1]) if len(parts) > 1 else 0
    return hours * 60 + minutes


def convert_number_of_votes_to_millions(votes):
    """Normalise a vote-count string so thousands are expressed in millions.

    Parentheses are removed first. If the remaining text contains 'K', the
    numeric part is divided by 1000 and returned with an 'M' suffix; any
    other text is returned as-is (minus the parentheses).

    Args:
        votes: Vote-count text, e.g. '(2.9M)' or '950K'.

    Returns:
        A string such as '2.9M' or '0.95M'.
    """
    cleaned = votes.translate(str.maketrans('', '', '()'))

    # Nothing to rescale unless the count is given in thousands.
    if 'K' not in cleaned:
        return cleaned

    thousands = float(cleaned.replace('K', ''))
    return f"{thousands / 1000}M"

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# URL of the IMDb Top 250 Movies page
url = "https://www.imdb.com/chart/top"

# Set a user-agent header to mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Seconds to wait for the server before giving up, so the cell cannot hang
REQUEST_TIMEOUT_SECONDS = 10

# Column order of the output table; also gives an empty result a stable schema
MOVIE_COLUMNS = [
    "Movie Title",
    "Movie Rating",
    "Number of Votes M",
    "Release Date",
    "Duration (Minutes)",
    "Category",
]


def _strip_rank_prefix(heading_text):
    """Drop a leading '<rank>.' from a heading, whatever the rank's digit count."""
    text = heading_text.strip()
    rank, separator, remainder = text.partition('.')
    if separator and rank.strip().isdigit():
        return remainder.strip()
    # No numeric rank prefix found: keep the heading unchanged.
    return text


def _parse_movie(movie):
    """Build one output record from a single chart list item.

    Args:
        movie: The Beautiful Soup tag for one 'li.cli-parent' entry.

    Returns:
        A dict keyed by MOVIE_COLUMNS, or None (after printing the reason)
        when a required element is missing from the entry.
    """
    heading = movie.find('h3')
    if heading is None:
        print("Could not extract movie title.")
        return None
    title = _strip_rank_prefix(heading.text)

    # Match on a single class token: passing the full multi-class string only
    # matches when the element's class attribute is exactly that string.
    rating_element = movie.find('span', class_='ratingGroup--imdb-rating')
    # Expects the rating and the vote count separated by a non-breaking space.
    rating_parts = rating_element.text.split('\xa0') if rating_element else []
    if len(rating_parts) < 2:
        print("Could not extract rating and number of votes.")
        return None
    movie_rating = rating_parts[0]
    number_of_votes = rating_parts[1].strip('()')
    unified_votes = convert_number_of_votes_to_millions(number_of_votes)

    # Same reasoning as above: avoid the build-generated class tokens.
    metadata_elements = movie.find_all('span', class_='cli-title-metadata-item')
    if len(metadata_elements) < 3:
        print("Could not extract release date, duration, and category.")
        return None
    release_date, duration, category = (
        element.text.strip() for element in metadata_elements[:3])

    return {
        "Movie Title": title,
        "Movie Rating": movie_rating,
        "Number of Votes M": unified_votes,
        "Release Date": release_date,
        "Duration (Minutes)": convert_duration_to_minutes(duration),
        "Category": category,
    }


# Initialize an empty list to store data
data_list = []

try:
    # Send a GET request to the URL with headers
    with requests.Session() as session:
        response = session.get(
            url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

    if response.status_code == 200:
        # Parse the HTML content of the page using Beautiful Soup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the elements containing the movie information
        movie_list = soup.find_all('li', class_='cli-parent')

        for movie in movie_list:
            # Handle failures per entry so one malformed movie does not
            # discard every movie that follows it.
            try:
                record = _parse_movie(movie)
            except (AttributeError, ValueError) as e:
                print("Could not parse movie entry:", e)
                continue
            if record is not None:
                data_list.append(record)
    else:
        print("Request failed with status code:", response.status_code)

except requests.RequestException as e:
    print("An error occurred:", e)

# Convert the list to a DataFrame
movie_data = pd.DataFrame(data_list, columns=MOVIE_COLUMNS)

# Save the DataFrame to a CSV file, but never replace an existing file with
# an empty one when the request or the parsing produced nothing.
if data_list:
    movie_data.to_csv("imdb_top_250_movies.csv", index=False)
else:
    print("No movie data extracted; CSV file was not written.")

Could not extract release date, duration, and category.
