Install the required libraries

In [None]:
!pip3 install pandas
!pip3 install plotly
!pip3 install selenium
!pip3 install webdriver-manager

Load the required libraries

In [13]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import time
import re

In [14]:
# Start a Chrome session in three explicit steps: let webdriver-manager
# install the ChromeDriver, wrap the result in a Service, then launch Chrome.
chromedriver_path = ChromeDriverManager().install()
chrome_service = ChromeService(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service)

[WDM] - Downloading: 100%|████████████████████████████████████████████████████████████████████████| 6.30M/6.30M [00:04<00:00, 1.61MB/s]


Create a DataFrame with the basic information for the top 250 movies

In [15]:
# IMDb title search restricted to the "top_250" group; this is the listing
# that the scraping loop further down reads page by page.
url = 'https://www.imdb.com/search/title/?groups=top_250'
# Load the listing in the browser session controlled by `driver`.
driver.get(url)

In [16]:
# One independent accumulator list per scraped movie field; the scraping loop
# appends to all six in lockstep, so they always have the same length.
indexes, titles, links = [], [], []
years, reviews_links, genres = [], [], []

In [17]:
def keep_only_numbers(input_string):
    """Return only the digit characters of ``input_string``, in their original order.

    Every non-digit character is dropped, so ``'(1994)'`` becomes ``'1994'``
    and a string that contains no digits becomes ``''``.
    """
    # Collect each digit individually and glue them back together
    digits = re.findall(r'\d', input_string)
    return ''.join(digits)

# Walk through every result page and collect the details of each listed movie.
#
# NoSuchElementException is handled separately for each lookup. Previously one
# handler wrapped the whole loop body, so a single movie block missing any
# field was mistaken for "there is no next page" and silently ended the scrape.
while True:
    # Find all movie blocks on the current page
    movie_blocks = driver.find_elements(By.CLASS_NAME, 'lister-item')

    # Extract the movie details on the current page
    for block in movie_blocks:
        try:
            # The header text contains the index, the title and the year together
            title_element = block.find_element(By.CLASS_NAME, 'lister-item-header')
            title = title_element.text.strip()

            # Extract the index (e.g. "1.") and the year (digits only)
            index = block.find_element(By.CLASS_NAME, 'lister-item-index').text
            year = keep_only_numbers(block.find_element(By.CLASS_NAME, 'lister-item-year').text.strip('()'))

            # Extract the link to the movie page
            link_element = title_element.find_element(By.TAG_NAME, 'a')
            link = link_element.get_attribute('href')
        except NoSuchElementException as exc:
            # A malformed entry is reported and skipped; it must not stop pagination
            print(f"Skipping a movie block with a missing field: {exc}")
            continue

        # Remove the index and year from the title
        title = title.replace(index, '').replace(year, '').strip().replace("(", "").replace(")", "")

        # Remove the "." from the index
        index = index.replace(".", "")

        # Construct the URL for the reviews page
        review_link = link.replace('?ref_=adv', 'reviews?ref_=tt_urv')

        # Extract the genre; fall back to the same 'N/A' placeholder the
        # reviews scraper uses for a missing rating
        try:
            genre = block.find_element(By.CLASS_NAME, 'genre').text.strip()
        except NoSuchElementException:
            genre = 'N/A'

        # Append the extracted values to the respective lists
        indexes.append(index)
        titles.append(title)
        links.append(link)
        years.append(year)
        reviews_links.append(review_link)
        genres.append(genre)

    # Go to the next page; a missing "next" button means the last page was reached
    try:
        next_button = driver.find_element(By.CLASS_NAME, 'lister-page-next')
    except NoSuchElementException:
        break
    driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
    next_button.click()

    # Wait for a short duration to allow the page to load
    time.sleep(2)  # Adjust the duration as needed

In [18]:
# Assemble the scraped columns, keyed by their output header, into one table
data = {
    'Position in IMDB Top 250': indexes,
    'Title': titles,
    'Year': years,
    'Genre': genres,
    'Link': links,
    'Reviews Link': reviews_links,
}
df_top_250_movies = pd.DataFrame(data=data)

In [19]:
# Persist the movie table (including its default integer index) as CSV
df_top_250_movies.to_csv(path_or_buf='250_top_movies.csv')

Export Reviews from all the top 250 movies

In [8]:
# Pull the reviews-page URL of every movie out of the table as a plain list
reviews_link_column = df_top_250_movies["Reviews Link"]
review_urls = reviews_link_column.tolist()

In [9]:
# One independent accumulator list per scraped review field; the review loop
# appends to all four in lockstep, one entry per review.
review_titles, review_texts = [], []
review_ratings, movie_titles = [], []

In [10]:
# Visit the reviews page of every movie and harvest all of its reviews
for review_url in review_urls:
    driver.get(review_url)

    # The movie title is whatever precedes the first "(" in the page title
    movie_title = driver.title.split('(')[0].strip()

    # Expand the review list: press "Load More" until the button disappears
    # or a press no longer increases the number of loaded reviews.
    prev_review_count = 0
    while True:
        try:
            load_more_button = driver.find_element(By.CLASS_NAME, 'ipl-load-more__button')

            # Scroll to the bottom, then trigger the click through JavaScript
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            driver.execute_script("arguments[0].click();", load_more_button)

            # Give the newly requested reviews time to load
            time.sleep(7)  # Adjust the duration as needed

            curr_review_count = len(driver.find_elements(By.CLASS_NAME, 'review-container'))
        except NoSuchElementException:
            # No "Load More" button: nothing further to expand
            break

        if curr_review_count == prev_review_count:
            # The last press loaded nothing new
            break
        prev_review_count = curr_review_count

    # Read title, text and rating out of every review now on the page
    for review_element in driver.find_elements(By.CLASS_NAME, 'review-container'):
        review_title = review_element.find_element(By.CLASS_NAME, 'title').text.strip()
        review_text = review_element.find_element(By.CLASS_NAME, 'text').text.strip()

        # The rating is optional; reviews without one are stored as 'N/A'
        try:
            rating_text = review_element.find_element(By.CLASS_NAME, 'rating-other-user-rating').text
        except NoSuchElementException:
            rating = 'N/A'
        else:
            # Keep only the part before the "/" (drops the "/10" suffix)
            rating = rating_text.strip().split('/')[0]

        # Record the review together with the movie it belongs to
        review_titles.append(review_title)
        review_texts.append(review_text)
        review_ratings.append(rating)
        movie_titles.append(movie_title)

In [11]:
# Pair each output column header with its accumulator list, then build the table
review_columns = ('Review Title', 'Review Text', 'Rating', 'Movie Title')
review_values = (review_titles, review_texts, review_ratings, movie_titles)
data = dict(zip(review_columns, review_values))
df_reviews_top_250 = pd.DataFrame(data)

# Show the plain-text rendering of the table
print(df_reviews_top_250)

                                         Review Title  \
0                         One Step Above Masterpiece!   
1                           Masterful IMAX Spider-Man   
2       One of the best sequels to anything ever made   
3                           A new Empire Strikes Back   
4                                   Okay This Is Peak   
...                                               ...   
344560                                  A masterpiece   
344561                                        AWESOME   
344562                                    cry & smile   
344563                                    Touching !!   
344564                       An emotional masterpiece   

                                              Review Text Rating  \
0       It's honestly absurd how good the "Spider-Vers...     10   
1       The animation, flow of everything, genius char...     10   
2       If it wasn't already obvious in the first film...     10   
3       This film is a visual concert. The 

In [12]:
# Persist the raw reviews table (including its default integer index) as CSV
df_reviews_top_250.to_csv(path_or_buf='250_top_movies_reviews_raw.csv')