# IMDb Data Scraping Project

## Erin Schultz & Reed Ulses

Scraping the IMDb website (https://www.imdb.com/search/title/?title_type=feature&release_date=2000-01-01,2021-12-31&countries=US&count=250&sort=release_date,desc) for the title, release year, and IMDb rating for these movies to eventually merge with a Kaggle dataset. 

In [115]:
# Run this first and wait for it to complete. 

In [68]:
import pandas as pd
 
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException 
from webdriver_manager.chrome import ChromeDriverManager
import time
import random

In [70]:
# Run this second and wait for it to complete. 

In [72]:
# function to scroll from the top to the bottom of the web page
def random_scroll(browser, total_wait_tie):
    # browser.execute_script("window.focus();")
    
    # get the total height of the page
    total_height = browser.execute_script("return document.body.scrollHeight")
    
    # number of steps to scroll (you can adjust this number)
    scroll_steps = random.randint(3, 10) # randomize how many scroll steps we will use
    
    # calculate the height to scroll on each step
    scroll_increment = total_height // scroll_steps

    # calculate the total time available for scrolling each step
    time_per_step = total_wait_time / scroll_steps
    
    # random scrolling across time
    for step in range(scroll_steps):
        # scroll by the increment (dividing total height by number of steps)
        browser.execute_script(f"window.scrollBy(0, {scroll_increment});")
        
        # random wait time between scrolls to simulate varying speed
        random_wait = random.uniform(0.5 * time_per_step, 1.5 * time_per_step)  # randomize the wait within a range
        time.sleep(random_wait)
        
    # final scroll to make sure you are at the very bottom (in case it didn't exactly match)
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [74]:
# Run this third and wait for it to complete

In [76]:
def scrape_movies(browser):
    print("scraping data")
    movies = browser.find_elements(By.XPATH, "//div[@class='ipc-metadata-list-summary-item__c']")
 
    for movie in movies:
        # TITLE
        title_elements = movie.find_elements(By.XPATH, ".//h3[@class='ipc-title__text']")
        #title_elements = movie.find_elements(By.CLASS_NAME, 'ipc-title__text')
        for title in title_elements:
            movie_title = title.text

        # YEAR
        year_elements = movie.find_elements(By.XPATH, ".//span[@class='sc-5bc66c50-6 OOdsw dli-title-metadata-item' and contains(text(), '20')]")
        if year_elements:  # Check if year_elements is not empty
            movie_year = year_elements[0].text  # Use the first matching year
        else:
            movie_year = ""  # Default to an empty string if no year is found

        # RATING
        imdb_elements = movie.find_elements(By.XPATH, ".//span[@class='ipc-rating-star--rating']")
        if imdb_elements:  # Check if year_elements is not empty
            imdb_ratings = imdb_elements[0].text  # Use the first matching year
        else:
            imdb_ratings = ""  # Default to an empty string if no year is found    

        #save the data as a list of dictionaries
        all_movies.append({
            "title": movie_title,
            "year": movie_year,
            "imdb_ratings": imdb_ratings
        })
       
    return

In [78]:
# Run this fourth and wait for it to complete. 

In [80]:
def save_data(loop, all_movies):
    df_name = f"imdb_{loop}_df"
    csv_name = f"imdb_{loop}.csv"

    # convert lists to a pandas datadataframe
    print("building the dataframe")
    df_name = pd.DataFrame(all_movies)
    
    # perist data in a CSV file
    print(f"saving the data as {csv_name}")
    df_name.to_csv(csv_name, header=True, index=False, sep=",", encoding='utf-8')

    return

In [82]:
# Run this fifth and wait for it to complete. 

In [102]:
# Initialize the Selenium web driver (Chrome)
browser = webdriver.Chrome()
# browser = webdriver.Chrome(service=Service(ChromeDriverManager().install())) # this occasionally causes "Status code was: -9" error.

url = 'https://www.imdb.com/search/title/?title_type=feature&release_date=2000-01-01,2021-12-31&countries=US&count=250&sort=release_date,desc'

# navigate to the web page using the URL
browser.get(url)

browser.maximize_window()

In [104]:
# Run this sixth. It is doing all the work. 
# Don't assume it is hung unless it hasn't moved or added a new message for over 4 minutes. 

In [106]:
loop = 1
# all_movies = []

while True:
    browser.execute_script("window.focus();")

    # generating a random wait time
    total_wait_time = random.uniform(15, 30)

    # scrolling down the page
    print("scrolling down the page")
    random_scroll(browser, total_wait_time)

    # there are 89421 records displayed 250 at a time
    print(f"loop {loop} of 358 loops")
        
    ####################
    ## Put your scraping code, or a call to your scraping function in here.
    # when the counter is on the even thousand, scrape and save the movies

    # create an empty all_movies list on each loop so there are not duplicates
    all_movies = []
    
    print("calling function to scrape movies")
    scrape_movies(browser)
    print("calling function to create dataframe and save CSV")
    save_data(loop, all_movies)

    total_records = 89421
    print(f"scraped {len(all_movies)} of {total_records} records")
    print(f"{round(len(all_movies)/total_records, 2)}% complete")
    
    ####################

    print("scrolling up so the '250 more' button is in view")
    # scroll up by 250 pixels so the '250 more' link is in view
    time.sleep(3)
    browser.execute_script("window.scrollBy(0, -250);")
    time.sleep(3)
    
    try:
        time.sleep(5)
        
        # look for the '250 more' button by the 'class' attribute
        next_button = browser.find_element(By.CSS_SELECTOR, 'span[class="ipc-see-more__text"]')
        
        # scroll the button into view
        browser.execute_script("arguments[0].scrollIntoView(true);", next_button)

        # waiting after it is scrolled into view to allow overlay to disappear
        time.sleep(total_wait_time)

        # clicking the button using execute_script
        print("clicked the 'next' button using JavaScript")
        print("-" * 65)
        print("\n")
        browser.execute_script("arguments[0].click();", next_button)  # Using JavaScript to click the button
        time.sleep(10)

        loop = loop + 1

    except NoSuchElementException:
        ####################
        ## Put your scraping code, or a call to your scraping function in here.
        # when the counter is on the even thousand, scrape and save the movies

        print("calling function to scrape movies")
        scrape_movies(browser)
        print("calling function to create dataframe and save CSV")
        save_data(loop, all_movies)
        
        ####################

        # Handle the case where the '250 more' button is not found
        print("No more 'Next' button found. Stopping scrape.")
        break  # This correctly breaks out of the `while` loop

scrolling down the page
loop 1 of 358 loops
calling function to scrape movies
scraping data
calling function to create dataframe and save CSV
building the dataframe
saving the data as imdb_1.csv
scraped 250 of 89421 records
0.0% complete
scrolling up so the '250 more' button is in view
clicked the 'next' button using JavaScript
-----------------------------------------------------------------


scrolling down the page
loop 2 of 358 loops
calling function to scrape movies
scraping data
calling function to create dataframe and save CSV
building the dataframe
saving the data as imdb_2.csv
scraped 500 of 89421 records
0.01% complete
scrolling up so the '250 more' button is in view
clicked the 'next' button using JavaScript
-----------------------------------------------------------------


scrolling down the page
loop 3 of 358 loops
calling function to scrape movies
scraping data
calling function to create dataframe and save CSV
building the dataframe
saving the data as imdb_3.csv
scraped 

KeyboardInterrupt: 

In [112]:
movies_df=pd.DataFrame(all_movies)
movies_df.head(15)

Unnamed: 0,title,year,imdb_ratings
0,1. Oh My Darling,2021,5.8
1,2. Blueberry,2021,6.0
2,3. H.P. Lovecraft's Witch House,2021,2.6
3,4. Malibu Road,2021,4.5
4,5. Apache Leap,2021,
5,6. Not Broken,2021,6.1
6,7. Megaboa,2021,2.7
7,8. The American Connection,2021,6.8
8,9. COVID-19: Invasion,2021,2.0
9,10. Riptide,2021,4.2


In [42]:
browser.quit()

In [None]:
# Save to csv
all_movies.to_csv('imdb_scraped.csv', encoding='utf-8')