## WebScraping IMDB

### Libraries to import:

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import requests
import json
import re

from bs4 import BeautifulSoup
soup = BeautifulSoup()
from urllib.request import urlopen

### Defining Functions:

In [2]:
def individual_url(year,search_number):
    '''
    Build the IMDB feature-film search url for a single year and result offset.

    The year fills both ends of the date range (Jan 1 to Dec 31 of that year)
    and search_number fills the `start` query parameter used for pagination.
    Each url returns 50 entries.
    Example url: https://www.imdb.com/search/title/?title_type=feature&year=2006-01-01,2006-12-31&start=1&ref_=adv_nxt
    '''
    url_template = 'https://www.imdb.com/search/title/?title_type=feature&year={}-01-01,{}-12-31&start={}&ref_=adv_nxt'
    return url_template.format(year, year, search_number)

In [3]:
def search_list(list_of_years,list_of_searches):
    '''
    Pair every year with every search offset and build the matching search url.

    Uses individual_url() for each (year, offset) combination, iterating years
    in the outer position and offsets in the inner position.
    Returns a list of (url, year) tuples.
    Example: https://www.imdb.com/search/title/?title_type=feature&year=LIST_OF_YEARS&start=LIST_OFSEARCHES&ref_=adv_nxt
    '''
    return [
        (individual_url(year, start), year)
        for year in list_of_years
        for start in list_of_searches
    ]

In [4]:
# Build an example search list for the year 2004 starting at result 1.
# search_list() returns a list of (url, year) tuples.
time_search = search_list([2004],[1])
time_search  # bare expression so the notebook displays the value

[('https://www.imdb.com/search/title/?title_type=feature&year=2004-01-01,2004-12-31&start=1&ref_=adv_nxt',
  2004)]

In [5]:
def list_individual_urls(list_of_years,list_of_searches):
    '''
    Collect the title-page link of every movie listed on the requested search pages.

    Builds the search urls with search_list(), downloads each search page and
    reads every <h3 class="lister-item-header"> element found on it.

    Parameters:
        list_of_years: years to search.
        list_of_searches: pagination start offsets. Should begin at 1 and
            increase in increments of 50 (ex. 1,51,101...) to avoid duplicates.

    Returns:
        A list of (url, title, year) tuples, where url is 'https://www.imdb.com'
        followed by the href of the header's link, title is the link text and
        year is the year the search page was requested for.

    Raises:
        Any error raised by urlopen()/read() when a search page cannot be fetched.
    '''

    _list = []

    for link, year in search_list(list_of_years,list_of_searches):
        # The with-block closes the connection even when read() fails part-way.
        with urlopen(link) as uClient:
            page_html = uClient.read()

        page_soup = BeautifulSoup(page_html, 'html.parser')

        # Each lister-item-header wraps the link to one movie's own page.
        containers = page_soup.find_all("h3",{"class":"lister-item-header"})

        for container in containers:
            _string = 'https://www.imdb.com' + container.a['href']
            _list.append((_string,container.a.text, year))

    return _list

In [6]:
# Lookup errors that mean "this field is not on the page". Deliberately narrower
# than a bare except so KeyboardInterrupt / SystemExit still stop a long scrape.
_FIELD_MISSING_ERRORS = (IndexError, AttributeError, KeyError, TypeError)


def _first_match(pattern, text, strip_spaces=True):
    '''Return the first match of pattern in text (optionally stripped of spaces), or np.nan if none.'''
    found = re.findall(pattern, text)
    if not found:
        return np.nan
    return found[0].strip(' ') if strip_spaces else found[0]


def _scrape_title_page(link):
    '''
    Download one movie page and scrape its fields.

    Returns a tuple of
    (link, genre, gross_usa, budget, worldwide_gross, pg_rated, rating, rating_pop).
    Any field that cannot be found on the page is np.nan.
    '''
    # The with-block closes the connection even when read() fails part-way.
    with urlopen(link) as uClient:
        page_html = uClient.read()

    page_soup = BeautifulSoup(page_html, 'html.parser')

    # Genre: text of every link inside the last "see-more inline canwrap" div.
    con_genre = page_soup.find_all("div",{"class":"see-more inline canwrap"})
    try:
        _genre = [anchor.text.strip(' ') for anchor in con_genre[-1].find_all('a')]
    except _FIELD_MISSING_ERRORS:
        _genre = np.nan  # No such div on this page.

    # The money and rating fields are pulled by regex out of the txt-block divs,
    # flattened to a single line first.
    con_info = page_soup.find_all("div",{"class":"txt-block"})
    _str_con_info = str(con_info).replace('\n',' ')

    _gross_USA = _first_match(r'(?<=Gross USA:<\/h4>).\S*', _str_con_info)
    _budget = _first_match(r'(?<=Budget:<\/h4>).\S*', _str_con_info)
    _worldwide_gross = _first_match(r'(?<=Worldwide Gross:<\/h4>).\S*', _str_con_info)
    _pg_rated = _first_match(r'(?<=<span>Rated).\S*', _str_con_info)

    # User rating: text of the <strong> inside the first "ratingValue" div.
    con_rating = page_soup.find_all("div",{"class":"ratingValue"})
    try:
        _rating = con_rating[0].strong.text
    except _FIELD_MISSING_ERRORS:
        _rating = np.nan  # No rating element on this page.

    # Rating count: the token following "based on " in that <strong>'s title attribute.
    try:
        _rating_pop = _first_match(r'(?<=based on ).\S*', con_rating[0].strong['title'],
                                   strip_spaces=False)
    except _FIELD_MISSING_ERRORS:
        _rating_pop = np.nan  # No rating element or no title attribute.

    return (link,_genre,_gross_USA,_budget,_worldwide_gross,_pg_rated,_rating,_rating_pop)


def scraped_df(list_of_years,list_of_searches):
    '''
    Returns a merged dataframe from our list of individual urls and the items retrieved from them.

    Parameters:
        list_of_years: years to search, passed to list_individual_urls().
        list_of_searches: pagination start offsets, passed to list_individual_urls().

    Returns:
        A dataframe with columns url, title, year, genre, gross_usa, budget,
        worldwide_gross, pg_rated, rating, rating_pop. Fields missing from a
        movie's page are np.nan. When no urls are found the dataframe is empty
        but still has these columns.

    Raises:
        Any error raised by urlopen()/read() when a page cannot be fetched.
    '''

    # First dataframe: url, title, year. Passing columns= to the constructor
    # (instead of assigning .columns afterwards) also works for an empty list.
    _list_url = pd.DataFrame(list_individual_urls(list_of_years, list_of_searches),
                             columns=['url','title','year'])

    # Second dataframe: the fields scraped from each movie page.
    # Using TQDM to track progress.
    _individual = [_scrape_title_page(link) for link in tqdm(_list_url['url'])]
    _individual_df = pd.DataFrame(
        _individual,
        columns=['url','genre','gross_usa','budget','worldwide_gross','pg_rated','rating','rating_pop'])

    # Merge our first dataframe of url, title, and year with the scraped information.
    _merged = pd.merge(_list_url,_individual_df,on='url',how='outer')

    return _merged

### Scraping Each Decade: 

In [7]:
# A sample run: scrape the year 2004 for the single search page starting at
# result 1 (one page of 50 entries).
sample_albert = scraped_df([2004],[1])
sample_albert  # bare expression so the notebook displays the dataframe

100%|██████████| 50/50 [00:49<00:00,  1.02it/s]


Unnamed: 0,url,title,year,genre,gross_usa,budget,worldwide_gross,pg_rated,rating,rating_pop
0,https://www.imdb.com/title/tt0377092/,Mean Girls,2004,[Comedy],"$86,058,055","$17,000,000","$130,125,829",PG-13,7.0,321703
1,https://www.imdb.com/title/tt0332280/,The Notebook,2004,"[Drama, Romance]","$81,001,787","$29,000,000","$115,882,795",PG-13,7.8,503840
2,https://www.imdb.com/title/tt0304141/,Harry Potter and the Prisoner of Azkaban,2004,"[Adventure, Family, Fantasy, Mystery]","$249,975,996","$130,000,000","$799,972,094",PG,7.9,528467
3,https://www.imdb.com/title/tt0347149/,Howl's Moving Castle,2004,"[Animation, Adventure, Family, Fantasy]","$5,576,743","$24,000,000","$236,212,992",PG,8.2,313939
4,https://www.imdb.com/title/tt0364725/,Dodgeball,2004,"[Comedy, Sport]","$114,326,736","$20,000,000","$168,423,227",PG-13,6.7,224244
5,https://www.imdb.com/title/tt0332452/,Troy,2004,"[Drama, History]","$133,378,256","$175,000,000","$497,409,852",R,7.2,476956
6,https://www.imdb.com/title/tt0338013/,Eternal Sunshine of the Spotless Mind,2004,"[Drama, Romance, Sci-Fi]","$34,400,301","$20,000,000","$74,036,715",R,8.3,882443
7,https://www.imdb.com/title/tt0349903/,Ocean's Twelve,2004,"[Crime, Thriller]","$125,544,280","$110,000,000","$362,744,280",PG-13,6.5,349294
8,https://www.imdb.com/title/tt0265208/,The Girl Next Door,2004,"[Comedy, Drama, Romance]","$14,589,444","$25,000,000","$30,381,722",R,6.7,197444
9,https://www.imdb.com/title/tt0381707/,White Chicks,2004,"[Comedy, Crime]","$70,831,760","$37,000,000","$113,100,873",PG-13,5.6,126918


In [None]:
# Years to iterate through: 1960 through 1970 inclusive (11 years).
_1960TO1970 = list(range(1960,1971))

In [None]:
# Start offsets of six consecutive 50-entry search pages (results 1-300).
_top300 = [1,51,101,151,201,251]

In [None]:
# Scrape every (year, start offset) combination from the two lists above.
_1960_1970 = scraped_df(_1960TO1970,_top300)

In [None]:
# Write the resulting dataframe to a CSV (with a header row, without the index).
# Repeat these cells for each decade; the final product is one CSV per decade
# holding the first 300 search results for each year.
_1960_1970.to_csv('1960_1970.csv',header=True,index=False)