# Scrape IMDb movie ratings and details into CSV files

## Check JSON Structure

In [19]:
import requests
from bs4 import BeautifulSoup
import json

# URL of IMDb Top 250 movies page
url = 'https://www.imdb.com/chart/top/'
# Browser-like User-Agent sent with the request.
# NOTE(review): presumably IMDb rejects the default python-requests agent - confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Apple Silicon Mac OS X 12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}

# Sending the request. The explicit timeout keeps a stalled connection from
# blocking the cell forever; without it requests waits indefinitely.
result = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(result.content, 'html.parser')

# Extracting movie data from the script tag
movieData = soup.find('script', id='__NEXT_DATA__')

# Parsing JSON data if the script is found and actually carries text.
# Tag.string is None when the tag is empty or has several children, and
# json.loads(None) would raise TypeError.
if movieData is not None and movieData.string:
    jsonData = json.loads(movieData.string)

    # Pretty printing the entire JSON structure to inspect it
    print(json.dumps(jsonData, indent=4))

else:
    print("ERROR: Could not find movie data.")


{
    "props": {
        "pageProps": {
            "initialRefinerQueryInfo": {
                "queryContext": {
                    "personalized": false,
                    "serverSideCacheable": true
                },
                "queryVariables": {}
            },
            "pageData": {
                "chartTitles": {
                    "edges": [
                        {
                            "currentRank": 1,
                            "node": {
                                "id": "tt0111161",
                                "titleText": {
                                    "text": "Nh\u00e0 t\u00f9 Shawshank",
                                    "__typename": "TitleText"
                                },
                                "titleType": {
                                    "id": "movie",
                                    "text": "Movie",
                                    "canHaveEpisodes": false,
                                    "disp

## Extract from a user watchlist

In [17]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

# URL of IMDb watchlist (replace with your desired URL)
url = 'https://www.imdb.com/user/ur174609609/watchlist/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Apple Silicon Mac OS X 12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}


def _dig(obj, *keys, default=None):
    """Return the value reached by following ``keys`` through nested dicts.

    ``default`` is returned when any level is missing, is not a dict, or the
    final value is ``None``. Chained ``dict.get(key, {})`` calls only cover a
    missing key: a key that is present with a JSON ``null`` value yields
    ``None`` and the next ``.get`` raises ``AttributeError``.
    """
    for key in keys:
        if not isinstance(obj, dict):
            return default
        obj = obj.get(key)
    return default if obj is None else obj


def _credit_names(credit_group):
    """Return the comma-separated person names of one ``principalCredits`` entry."""
    names = [
        _dig(credit, 'name', 'nameText', 'text')
        for credit in _dig(credit_group, 'credits', default=[])
    ]
    # Entries without a readable name are skipped instead of raising KeyError.
    return ", ".join(name for name in names if name)


# Sending the request. The explicit timeout keeps a stalled connection from
# blocking the cell forever; without it requests waits indefinitely.
result = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(result.content, 'html.parser')

# Extracting movie data from the script tag
movieData = soup.find('script', id='__NEXT_DATA__')

# Lists to store movie details (one entry per movie, kept in lockstep)
movieName = []
movieYear = []
rating = []
originalTitle = []
genres = []
runtime = []
plot = []
directors = []
cast = []
certificate = []
metascore = []
imageURL = []
voteCount = []
haveEpisode = []

# Parsing JSON data if the script is found and actually carries text
# (Tag.string is None for an empty tag, which json.loads cannot parse).
movies = []
if movieData is not None and movieData.string:
    jsonData = json.loads(movieData.string)
    # A changed page layout yields an empty list here instead of a KeyError.
    movies = _dig(
        jsonData,
        'props', 'pageProps', 'mainColumnData', 'predefinedList',
        'titleListItemSearch', 'edges',
        default=[],
    )

if not movies:
    print("ERROR: Could not find movie data.")

# Iterating over the movies to extract details
for movie in movies:
    list_item = _dig(movie, 'listItem', default={})

    # Title, release year and original title
    movieName.append(_dig(list_item, 'titleText', 'text', default='N/A'))
    movieYear.append(_dig(list_item, 'releaseYear', 'year', default='N/A'))
    originalTitle.append(_dig(list_item, 'originalTitleText', 'text', default='N/A'))

    # Rating stays None when absent; vote count falls back to 0
    rating.append(_dig(list_item, 'ratingsSummary', 'aggregateRating'))
    voteCount.append(_dig(list_item, 'ratingsSummary', 'voteCount', default=0))

    # Runtime in whole minutes (source value is in seconds); 0 when unknown
    runtime.append(_dig(list_item, 'runtime', 'seconds', default=0) // 60)

    # Certificate
    certificate.append(_dig(list_item, 'certificate', 'rating', default='N/A'))

    # Genres as a comma-separated string (empty string when there are none)
    genre_names = [
        _dig(genre_entry, 'genre', 'text')
        for genre_entry in _dig(list_item, 'titleGenres', 'genres', default=[])
    ]
    genres.append(", ".join(name for name in genre_names if name))

    # Whether the title can have episodes. The Top 250 payload inspected in
    # this notebook nests the flag under 'titleType'; the top-level key is
    # still checked first so the previous lookup keeps working.
    # NOTE(review): assumes watchlist items share that shape - confirm.
    have_episode = list_item.get('canHaveEpisodes')
    if have_episode is None:
        have_episode = _dig(list_item, 'titleType', 'canHaveEpisodes', default='N/A')
    haveEpisode.append(have_episode)

    # Plot text
    plot.append(_dig(list_item, 'plot', 'plotText', 'plainText', default='N/A'))

    # Directors and cast are read positionally from principalCredits.
    # NOTE(review): assumes entry 0 is directors and entry 1 is cast - confirm.
    principal_credits = _dig(list_item, 'principalCredits', default=[])
    directors.append(_credit_names(principal_credits[0]) if len(principal_credits) > 0 else 'N/A')
    cast.append(_credit_names(principal_credits[1]) if len(principal_credits) > 1 else 'N/A')

    # Metacritic score; 0 represents a missing score
    metascore.append(_dig(list_item, 'metacritic', 'metascore', 'score', default=0))

    # Poster image URL
    imageURL.append(_dig(list_item, 'primaryImage', 'url', default='N/A'))

# Creating a DataFrame from the lists
data = {
    "Movie Name": movieName,
    "Release Year": movieYear,
    "Rating": rating,
    "Original Title": originalTitle,
    "Genres": genres,
    "Runtime (min)": runtime,
    "Plot": plot,
    "Directors": directors,
    "Cast": cast,
    "Certificate": certificate,
    "Metascore": metascore,
    "Image URL": imageURL,
    "Vote Count": voteCount,
    "Have Episodes": haveEpisode
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to CSV file, but never overwrite it with an empty table or claim
# success after a failed extraction.
if df.empty:
    print("No movies extracted; imdb_watchlist.csv was not written.")
else:
    df.to_csv('imdb_watchlist.csv', index=False)

    print("CSV file created successfully: imdb_watchlist.csv")


CSV file created successfully: imdb_watchlist.csv


## Extract from the Top 250 chart

In [16]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

# URL of IMDb Top 250 movies page
url = 'https://www.imdb.com/chart/top/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Apple Silicon Mac OS X 12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}


def _dig(obj, *keys, default=None):
    """Return the value reached by following ``keys`` through nested dicts.

    ``default`` is returned when any level is missing, is not a dict, or the
    final value is ``None``. Chained ``dict.get(key, {})`` calls only cover a
    missing key: a key that is present with a JSON ``null`` value yields
    ``None`` and the next ``.get`` raises ``AttributeError``.
    """
    for key in keys:
        if not isinstance(obj, dict):
            return default
        obj = obj.get(key)
    return default if obj is None else obj


def _credit_names(credit_group):
    """Return the comma-separated person names of one ``principalCredits`` entry."""
    names = [
        _dig(credit, 'name', 'nameText', 'text')
        for credit in _dig(credit_group, 'credits', default=[])
    ]
    # Entries without a readable name are skipped instead of raising KeyError.
    return ", ".join(name for name in names if name)


# Sending the request. The explicit timeout keeps a stalled connection from
# blocking the cell forever; without it requests waits indefinitely.
result = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(result.content, 'html.parser')

# Extracting movie data from the script tag
movieData = soup.find('script', id='__NEXT_DATA__')

# Lists to store movie details (one entry per movie, kept in lockstep)
movieName = []
movieYear = []
rating = []
originalTitle = []
genres = []
runtime = []
plot = []
directors = []
cast = []
certificate = []
metascore = []
imageURL = []
voteCount = []
haveEpisode = []

# Parsing JSON data if the script is found and actually carries text
# (Tag.string is None for an empty tag, which json.loads cannot parse).
movies = []
if movieData is not None and movieData.string:
    jsonData = json.loads(movieData.string)
    # A changed page layout yields an empty list here instead of a KeyError.
    movies = _dig(
        jsonData,
        'props', 'pageProps', 'pageData', 'chartTitles', 'edges',
        default=[],
    )

if not movies:
    print("ERROR: Could not find movie data.")

# Iterating over the movies to extract details
for movie in movies:
    list_item = _dig(movie, 'node', default={})

    # Title, release year and original title
    movieName.append(_dig(list_item, 'titleText', 'text', default='N/A'))
    movieYear.append(_dig(list_item, 'releaseYear', 'year', default='N/A'))
    originalTitle.append(_dig(list_item, 'originalTitleText', 'text', default='N/A'))

    # Rating stays None when absent; vote count falls back to 0
    rating.append(_dig(list_item, 'ratingsSummary', 'aggregateRating'))
    voteCount.append(_dig(list_item, 'ratingsSummary', 'voteCount', default=0))

    # Runtime in whole minutes (source value is in seconds); 0 when unknown
    runtime.append(_dig(list_item, 'runtime', 'seconds', default=0) // 60)

    # Certificate
    certificate.append(_dig(list_item, 'certificate', 'rating', default='N/A'))

    # Genres as a comma-separated string (empty string when there are none)
    genre_names = [
        _dig(genre_entry, 'genre', 'text')
        for genre_entry in _dig(list_item, 'titleGenres', 'genres', default=[])
    ]
    genres.append(", ".join(name for name in genre_names if name))

    # Whether the title can have episodes. In the chart payload the flag is
    # nested under node['titleType'], so reading it only from the top level
    # always produced 'N/A'. The top-level key is still checked first so the
    # previous lookup keeps working if it ever appears there.
    have_episode = list_item.get('canHaveEpisodes')
    if have_episode is None:
        have_episode = _dig(list_item, 'titleType', 'canHaveEpisodes', default='N/A')
    haveEpisode.append(have_episode)

    # Plot text
    plot.append(_dig(list_item, 'plot', 'plotText', 'plainText', default='N/A'))

    # Directors and cast are read positionally from principalCredits.
    # NOTE(review): assumes entry 0 is directors and entry 1 is cast - confirm.
    principal_credits = _dig(list_item, 'principalCredits', default=[])
    directors.append(_credit_names(principal_credits[0]) if len(principal_credits) > 0 else 'N/A')
    cast.append(_credit_names(principal_credits[1]) if len(principal_credits) > 1 else 'N/A')

    # Metacritic score; 0 represents a missing score
    metascore.append(_dig(list_item, 'metacritic', 'metascore', 'score', default=0))

    # Poster image URL
    imageURL.append(_dig(list_item, 'primaryImage', 'url', default='N/A'))

# Creating a DataFrame from the lists
data = {
    "Movie Name": movieName,
    "Release Year": movieYear,
    "Rating": rating,
    "Original Title": originalTitle,
    "Genres": genres,
    "Runtime (min)": runtime,
    "Plot": plot,
    "Directors": directors,
    "Cast": cast,
    "Certificate": certificate,
    "Metascore": metascore,
    "Image URL": imageURL,
    "Vote Count": voteCount,
    "Have Episodes": haveEpisode
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to CSV file, but never overwrite it with an empty table or claim
# success after a failed extraction.
if df.empty:
    print("No movies extracted; imdb_top_movies.csv was not written.")
else:
    df.to_csv('imdb_top_movies.csv', index=False)

    print("CSV file created successfully: imdb_top_movies.csv")

CSV file created successfully: imdb_top_movies.csv
