# Scrape IMDB data
**Author:** David S. White <br /> 
**Email:** dswhite2012@gmail.com <br /> 
**Date:** 2020-05-04 <br /> 

### Goal
The goal of this notebook is to scrape data from IMDB within a specified range of years. This data will be used to build a model to predict IMDB user ratings. 

This code was developed with great help from a DataQuest tutorial: https://www.dataquest.io/blog/web-scraping-beautifulsoup/. Modifications to that code include:
* grab more variables, such as rating, genre, and run time
* iterate over multiple years 
* for each year, scrape a user-specified number of pages

The data is saved as a .csv file using the user-provided file name.


## User parameters: 

In [None]:
start_year = 1970   # first year to scrape (inclusive)
end_year = 2019     # last year you want data for (inclusive)
pages_per_year = 5  # number of result pages to scrape for each year (50 titles per page)

In [None]:
# Output path without extension, tagged with the first and last year scraped.
file_name = 'data/imdb-data_{}-{}'.format(start_year, end_year)
# Bare expression so the notebook echoes the resulting path.
file_name

### Libraries

In [None]:
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup
from requests import get

### Web Scraping Function using Beautiful Soup 
Note: this function is modified from code obtained at https://www.dataquest.io/blog/web-scraping-beautifulsoup/. 

In [None]:
def _clean_year(str_value):
    # note, money will still contain currency symbol (e.g. $, EUR, ...) and will need to be removed.
    str_value = str_value.replace(' ','')
    if str_value[0] == '(':
        x = len(str_value)
        str_value = int(str_value[x-5:x-1])

    return str_value

In [None]:
def _parse_movie_container(container):
    """Extract one movie's fields from a search-result container.

    Certificate, runtime, genre, IMDB score and vote count are each looked
    up defensively: when the tag is missing a placeholder is stored
    ("Unrated", None or an empty list) instead of raising AttributeError.

    Parameters
    ----------
    container : bs4 Tag
        One 'lister-item mode-advanced' div from the results page.

    Returns
    -------
    dict
        Keys: movie, year, rating, runtime, genre, imdbscore, metascore,
        votes.
    """
    header = container.h3
    details = container.p

    # Name and year come from the title header.
    name = header.a.text
    year = _clean_year(header.find('span', class_='lister-item-year').text)

    # Rating (certificate); titles without one are labelled "Unrated".
    certificate_tag = details.find('span', class_='certificate')
    rating = certificate_tag.text if certificate_tag is not None else "Unrated"

    # Run time: keep only the leading number of text such as "142 min".
    runtime_tag = details.find('span', class_='runtime')
    if runtime_tag is not None:
        run_time = int(runtime_tag.text.split()[0].replace(',', ''))
    else:
        run_time = None

    # Genre: comma-separated text padded with a newline and spaces;
    # strip each entry instead of assuming exactly one leading character.
    genre_tag = details.find('span', class_='genre')
    if genre_tag is not None:
        genre = [part.strip() for part in genre_tag.text.split(',')]
    else:
        genre = []

    # IMDB rating is the text of the first <strong> tag in the container.
    score_tag = container.strong
    imdb_score = float(score_tag.text) if score_tag is not None else None

    # Metascore: the caller only passes containers that have a metascore
    # block; int() tolerates the surrounding whitespace.
    metascore = int(container.find('span', class_='metascore').text)

    # Number of votes, read from the span's data-value attribute.
    votes_tag = container.find('span', attrs={'name': 'nv'})
    votes = int(votes_tag['data-value']) if votes_tag is not None else None

    return {'movie': name,
            'year': year,
            'rating': rating,
            'runtime': run_time,
            'genre': genre,
            'imdbscore': imdb_score,
            'metascore': metascore,
            'votes': votes}


def _grab_from_page(url, timeout=30):
    """Scrape one IMDB search-results page into a DataFrame.

    Only containers that include a 'ratings-metascore' block are kept
    (only movies have metascores).

    Parameters
    ----------
    url : str
        Address of the search-results page.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout
        error; without it a stalled connection could block forever.

    Returns
    -------
    pandas.DataFrame
        One row per kept movie with columns movie, year, rating, runtime,
        genre (list of str), imdbscore, metascore and votes. The frame is
        empty, but has the same columns, when nothing on the page matches.
    """
    response = get(url, timeout=timeout)
    html_soup = BeautifulSoup(response.text, 'html.parser')

    # movie containers (class name identified manually from the page markup)
    movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')

    rows = [
        _parse_movie_container(container)
        for container in movie_containers
        if container.find('div', class_='ratings-metascore') is not None
    ]

    # Explicit column list keeps the column order fixed, even for an empty page.
    columns = ['movie', 'year', 'rating', 'runtime', 'genre',
               'imdbscore', 'metascore', 'votes']
    return pd.DataFrame(rows, columns=columns)

### Perform Web Scraping
Note: this may take a few minutes to run depending on the number of years and pages per year. 

In [None]:
years = np.arange(start_year, end_year + 1)
# 50 movies per page; the url's `start` value is the 1-based offset of the
# first movie on the page: 1, 51, 101, ...
counts = (np.arange(0, pages_per_year) * 50) + 1

# Collect every page and concatenate once at the end, rather than
# re-copying the growing frame on each iteration.
page_frames = []
for year in years:
    print("Year:", year)
    for count in counts:
        # Same url template for every year. The first year used to get a
        # url without `start=`, so all of its pages were copies of page 1.
        url = ("https://www.imdb.com/search/title/?title_type=feature&year="
               + str(year) + "-01-01," + str(year) + "-12-31&start="
               + str(count) + "&ref_=adv_nxt")
        page_frames.append(_grab_from_page(url))

if not page_frames:
    # Empty year range or zero pages: fail here with a clear message
    # instead of a NameError on `imdb` in a later cell.
    raise ValueError("Nothing was scraped: check start_year, end_year and pages_per_year.")

# ignore_index gives the combined table one continuous row index.
imdb = pd.concat(page_frames, axis=0, sort=False, ignore_index=True)

### Quick look at the data

In [None]:
# Display the first rows of the scraped table as a quick sanity check.
imdb.head()

In [None]:
# Display summary statistics of the scraped data.
imdb.describe()

### Save data

In [None]:
import os

csv_path = file_name + ".csv"

# to_csv does not create missing folders, so make sure the target
# directory (e.g. 'data/') exists before writing; otherwise the save
# fails after the whole scrape has already run.
out_dir = os.path.dirname(csv_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# index=False: the row index carries no information worth saving.
imdb.to_csv(csv_path, index=False)
print(file_name+".csv Saved.")