In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import json

# Start Chrome through an explicit Service object. Selenium 4 expects the
# chromedriver location to be wrapped in a Service and passed by keyword;
# passing the path positionally is the legacy Selenium 3 call form.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    # Open the IMDb Top 250 chart page
    driver.get("https://www.imdb.com/chart/top")

    # The chart data is embedded in the page as JSON inside the element
    # whose id is "__NEXT_DATA__"; read its raw contents.
    script_tag = driver.find_element(By.XPATH, '//*[@id="__NEXT_DATA__"]')
    json_data = script_tag.get_attribute('innerHTML')
finally:
    # Always shut the browser down, even if navigation or the element
    # lookup raised, so no Chrome/chromedriver process is left behind.
    driver.quit()

# Show only a short preview to verify the payload looks right; the full
# document is far too large to be useful as notebook output.
print(json_data[:500])
print(f"... ({len(json_data)} characters in total)")

# Save the raw JSON for the parsing step. The encoding is fixed to UTF-8 so
# non-ASCII characters are written the same way on every platform.
with open('movies_data.json', 'w', encoding='utf-8') as file:
    file.write(json_data)





In [13]:
import json
import pandas as pd

# Load the raw JSON text saved to movies_data.json.
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, so non-ASCII titles and plots are read consistently.
with open('movies_data.json', 'r', encoding='utf-8') as file:
    json_data = file.read()

# Parse the JSON text into nested Python dicts/lists
data = json.loads(json_data)

# Walk down to the list of chart entries; each entry is later read for its
# 'currentRank' and the title details stored under 'node'.
movies = data['props']['pageProps']['pageData']['chartTitles']['edges']

# Accumulate one flat record (dict) per chart entry
rows = []

# Each chart entry keeps its position at the top level and every other
# detail (title, year, rating, genres, plot) under its 'node' key.
for edge in movies:
    rank = edge['currentRank']
    node = edge['node']

    # Flatten the nested fields into a single record; the list of genre
    # names is collapsed into one comma-separated string.
    rows.append({
        'Rank': rank,
        'Title': node['titleText']['text'],
        'Release Year': node['releaseYear']['year'],
        'Rating': node['ratingsSummary']['aggregateRating'],
        'Genres': ', '.join(
            [item['genre']['text'] for item in node['titleGenres']['genres']]
        ),
        'Plot': node['plot']['plotText']['plainText'],
    })

# Build the table from the collected records: one row per movie
df = pd.DataFrame(rows)

# Preview the first rows to confirm the columns were populated as expected
print(df.head())

# Save the table as CSV in the current working directory (the same place
# movies_data.json is read from) rather than a user-specific absolute path,
# so the notebook runs on any machine. index=False omits the row index.
df.to_csv('IMDb_Top_250_Movies.csv', index=False)


   Rank                     Title  Release Year  Rating                Genres  \
0     1  The Shawshank Redemption          1994     9.3                 Drama   
1     2             The Godfather          1972     9.2          Crime, Drama   
2     3           The Dark Knight          2008     9.0  Action, Crime, Drama   
3     4     The Godfather Part II          1974     9.0          Crime, Drama   
4     5              12 Angry Men          1957     9.0          Crime, Drama   

                                                Plot  
0  Over the course of several years, two convicts...  
1  The aging patriarch of an organized crime dyna...  
2  When the menace known as the Joker wreaks havo...  
3  The early life and career of Vito Corleone in ...  
4  The jury in a New York City murder trial is fr...  
