In [2]:
from bs4 import BeautifulSoup
import requests
import re
import pickle
import pandas as pd


### Declare IMDB home site variable and the IMDB root urls, to be used frequently in code

In [None]:
imdb_root = "https://pro.imdb.com"
imdb_home_site = "https://pro.imdb.com/boxoffice/year/world"

# Ensure first site responds: status_code = 200 indicates success
# print(requests.get(imdb_home_site + str(min(year_range))).status_code)

### Set variables to determine range of scraping from Box Office Mojo:
1. Year range (exclusive of upper bound), and 
2. Minimum box office proceeds to limit search

In [None]:
year_range = range(2005,2020)
world_box_min = 5000000

### Function to grab features from the main title summary page on IMDB Pro:
* Utilizes series of try, except statements. If feature doesn't exist on page set to ("")

In [None]:
def grab_title_summary_feats(title_summary_soup):
    """
    Navigate to individual movie's Title Summary page and extract feature data
    Input: BeautifulSoup object of individual movie's web page
    Returns: list of individual movie's Title Summary data
    """
    try:
        budget = title_summary_soup.find(text=re.compile("Budget")).findNext().text.strip()
        budget = int(budget.replace(",","").replace("$","").replace("\n",""))
    except:
        budget = 0
    
    try:
        release_date = title_summary_soup.find(text=re.compile("Release date")).findNext().find("a").text
        release_date = dateutil.parser.parse(release_date)
    except:
        release_date = ""
        
    try:
        rating = title_summary_soup.find(id="certificate").text[:-1].strip()
    except:
        rating = ""    
        
    try:
        # runtime formatted as "x min" on page, so split into list and grab first element to pull minutes
        runtime = int(title_summary_soup.find(id="running_time").text[:-1].split()[0])
    except:
        runtime = ""        
      
    try:
        genres = title_summary_soup.find(id="genres").text
    except:
        genres = ""
        
    try:
        # Director often include "\n" --- replace with blank spaces
        director = title_summary_soup.find(text=re.compile(" Director")).findNext().text.replace("\n","").strip()
    except:
        director = ""
    
    try:
        prod_company = title_summary_soup.find(href=re.compile("/pro.imdb.com/company/")).text
    except:
        prod_company = ""

    try:
        distributor = title_summary_soup.find(text=' Distributor\n').findNext().find(href=re.compile("/company/")).text
    except: 
        distributor = ""
        
    try:
        composer = title_summary_soup.find(text=re.compile("Composer")).findNext().text.replace("\n","").strip()
    except:
        composer = ""

    return [release_date, rating, runtime, genres, director, prod_company, distributor, composer, budget]


### Log into IMDB Pro (free 30-day trial available) with Selenium to begin scraping data

In [None]:
driver = webdriver.Chrome(chromedriver)
driver.get("https://pro.imdb.com/login/auth?u=https%3A%2F%2Fpro.imdb.com%2F")

username = driver.find_element_by_name("email")
username.clear()
# Enter User Name Here
username.send_keys("User_Name")

time.sleep(1)
password = driver.find_element_by_name("password")
password.clear()
# Enter Password Here
password.send_keys("User_Password")

driver.find_element_by_css_selector(".signin-button-text").click() 

### Cycle through IMDB Pro pages by year, creating a list of dictionaries with each movie and its features

In [None]:
movie_dict_list = []
for year in year_range:
    # Once logged in pause for random time to avoid being booted, then navigate to first page with movies
    time.sleep(0.5 + 2 * random.random())
    driver.get(imdb_home_site + "/" + str(year))
    source = driver.page_source
    web_page_soup = BeautifulSoup(source, "html5lib")
    
    # Narrow down HTML to rows with movies using identifier "releasegroup"
    movies_table = web_page_soup.find_all("a", href=re.compile("/releasegroup/"))

    # Iterate through each row and pull movie details    
    for row in movies_table:
        
        # If worldwide box office is < minimum set, skip movie. And since movies are
        # arranged in order of descending box office, break loop to skip to next year
        world_box_office = int(row.findNext().text.replace('$', '').replace(',', ''))
        if world_box_office < world_box_min:
            break
        
        # Pull domestic sales. International is difference between World and Domestic
        try:
            domestic_sales = int(row.findNext().findNext().text.replace('$', '').replace(',', ''))
        except: # If domestic_sales = "-" (zero) a ValueError is thrown trying to convert to int
            domestic_sales = 0
        intl_sales = world_box_office - domestic_sales
        
        # Start list of features (features_list) with movie_title
        movie_title = row.text.title()
        features_list = [movie_title]
        movie_link = imdb_root + row.attrs["href"]
        
        # Direct driver to movie_link and create new soup object for that movie's site
        time.sleep(0.5 + 2 * random.random())
        driver.get(movie_link)
        title_summary_soup = BeautifulSoup(driver.page_source, "html5lib")
        
        # Database holds some TV Episodes. If title is a TV Episode, skip to next title
        if title_summary_soup.find(id="title_type") == None:
            pass
        elif "TV" in title_summary_soup.find(id="title_type").text.upper():
            continue
        
        # Pull movie details from movie's page and add to features_list
        title_summary_features = grab_title_summary_feats(title_summary_soup)
        features_list.extend(title_summary_features)
        
        # After navigating to a movie's page the url shifts, so save new url
        movie_link = driver.current_url

        # Navigate to cast page of movie and pull main actor...
        cast_url = imdb_root + "/title/" + movie_link[movie_link.find("title/")+6:movie_link.find("/boxoffice")] + "/cast"
        
        time.sleep(0.5 + 2 * random.random()) # random pause before changing url
        driver.get(cast_url)
        cast_soup = BeautifulSoup(driver.page_source,"html5lib")
        try:
            lead_actor = cast_soup.find("a",href=re.compile("cst_1")).text
        except:
            lead_actor = ""
        
        # ...and append features to features_list
        features_list.append(lead_actor)
        features_list.append(domestic_sales)
        features_list.append(intl_sales)
        features_list.append(world_box_office)      
        
        field_names = ["Title","ReleaseDate","Rating","Runtime","Genres","Director","Prod_Company", 
               "Distributor","Composer","Budget","Lead_Actor","Domestic_Sales","Intl_Sales","Global_Sales"]
        
        # Create dictionary for movie with features
        movie_dict = dict(zip(field_names, features_list))
        
        # Append movie dictionaries to list on each iteration
        movie_dict_list.append(movie_dict)

### Pickle movie_dict_list and close driver

In [None]:
# Pickle dict for later use
with open("movie_dict_list.pkl","wb") as picklefile:
    pickle.dump(movie_dict_list, picklefile)

In [None]:
# Close driver when finished scraping
driver.close()

### Convert **movie_dict_list** to DataFrame
* Add Title_Year field for use in matching titles later, and 
* Combine domestic and intl sales into Global

In [6]:
with open("movie_dict_list.pkl","rb") as picklefile:
    movie_dict_list = pickle.load(picklefile)

In [7]:
movie_df = pd.DataFrame(movie_dict_list)

In [8]:
movie_df["Title_Year"] = movie_df["Title"] + " " + movie_df["ReleaseDate"].apply(lambda x:"(" + str(x.year) + ")")

In [9]:
movie_df["Global_Sales"] = movie_df["Domestic_Sales"] + movie_df["Intl_Sales"]

In [10]:
movie_df.head()

Unnamed: 0,Title,ReleaseDate,Rating,Runtime,Genres,Director,Prod_Company,Distributor,Composer,Budget,Lead_Actor,Domestic_Sales,Intl_Sales,Global_Sales,Title_Year
0,Harry Potter And The Goblet Of Fire,2005-11-18,PG-13,157,"Adventure, Family, Fantasy",Mike Newell,Warner Bros.,Warner Bros.,Patrick Doyle,150000000,Eric Sykes,290013036,605908000,895921036,Harry Potter And The Goblet Of Fire (2005)
1,Star Wars: Episode Iii - Revenge Of The Sith,2005-05-19,PG-13,140,"Action, Adventure, Fantasy",George Lucas (directed by),Lucasfilm,Twentieth Century Fox,John Williams (music by),113000000,Ewan McGregor,380270577,488081953,868352530,Star Wars: Episode Iii - Revenge Of The Sith (...
2,"The Chronicles Of Narnia: The Lion, The Witch ...",2005-12-09,PG,143,"Adventure, Family, Fantasy",Andrew Adamson,Walt Disney Pictures,Buena Vista Pictures,Harry Gregson-Williams,180000000,Georgie Henley,291710957,453302158,745013115,"The Chronicles Of Narnia: The Lion, The Witch ..."
3,War Of The Worlds,2005-06-29,PG-13,116,"Adventure, Sci-Fi, Thriller",Steven Spielberg,Paramount Pictures,Paramount Pictures,John Williams,132000000,Tom Cruise,234280354,369592765,603873119,War Of The Worlds (2005)
4,King Kong,2005-12-14,PG-13,187,"Action, Adventure, Drama",Peter Jackson,Universal Pictures,Universal Pictures,James Newton Howard,207000000,Naomi Watts,218080025,344283424,562363449,King Kong (2005)


### Use separate (more accurate) site for pulling movie budgets
1. Set range for how many **site_pages** you want to scrape (site has 60 pages total, 100 movies each page)
2. Create dictionary with **key = movie title**, and **value = budget**

In [None]:
budget_site = "https://www.the-numbers.com/movie/budgets/all"
site_pages = [str(x) for x in range(0,60)]
budgets_list = []
titles_list = []
year_list = []

# Iterate through each page, grabbing movie title and budget
for page in site_pages:
    if page == 0:
        budget_html = BeautifulSoup(requests.get(budget_site).text, "html5lib")
    else:
        site = budget_site + "/" + page + "01"
        budget_html = BeautifulSoup(requests.get(site).text,"html5lib")

    # Pull budget data
    budget_soup = budget_html.find_all("td", class_="data")
    
    # Each budget number is separated by three other numbers on page, so skip them
    budget_rows = budget_soup[1::4]
    budgets_list.extend([int(row.text[2:].replace(",","")) for row in budget_rows])
    
    # Pull movie titles --- some movie titles exist on side of page, limit list to 100 to exclude
    movie_soup = budget_html.find_all("a", href=re.compile("#tab=summary"), limit=100)
    titles_list.extend([movie.text.title() for movie in movie_soup])
    
    # Pull years
    date_soup = budget_html.find_all("a",href=re.compile("/box-office-chart/"))
    year_list.extend([date.text[-4:] for date in date_soup])

### Ensure budget and titles lists are aligned and convert lists to DF
* Drop movies where year < lowest year in **movie_df** or unknown,
* then pickle **title_budget_df**

In [None]:
print("titles list len:",len(titles_list))
print("budgets list len:",len(budgets_list))
print("year list len:",len(year_list))

In [None]:
title_budget_df = pd.DataFrame(zip(titles_list,year_list,budgets_list),columns=["Title","Year","Budget"])

In [None]:
title_budget_df = title_budget_df[(title_budget_df["Year"]>=str(min(year_range))) &\
                                  (title_budget_df["Year"]!="nown")]

In [13]:
title_budget_df.sample(5)

Unnamed: 0,Title,Year,Budget
5119,Das Leben Der Anderen,2007,2000000
3865,Motherhood,2009,10000000
1601,Because I Said So,2007,39000000
724,Alvin And The Chipmunks: The Squeakquel,2009,70000000
651,Children Of Men,2006,76000000


In [None]:
# Pickle df
with open("title_budget_df_create.pkl","wb") as picklefile:
    pickle.dump(title_budget_df, picklefile)

In [12]:
# Pickle df
with open("title_budget_df_create.pkl","rb") as picklefile:
    title_budget_df = pickle.load(picklefile)

### Some titles in title_budget_df in have encoding issues, including â\x80\x99 (apostrophe) or â\x80\x94 (dash)
* Clean up keys by replacing with correct values, creating new dict in the process

In [14]:
title_budget_df["Title"] = title_budget_df["Title"].apply(lambda x: x.replace("â\x80\x99","'")
                                                          .replace("â\x80\x94"," - "))


In [19]:
title_budget_df.head(10)

Unnamed: 0,Title,Year,Budget,Title_Year
0,Avengers: Endgame,2019,400000000,Avengers: Endgame (2019)
1,Pirates Of The Caribbean: On Stranger Tides,2011,379000000,Pirates Of The Caribbean: On Stranger Tides (2...
2,Avengers: Age Of Ultron,2015,365000000,Avengers: Age Of Ultron (2015)
3,Star Wars Episode 7: The Force Awakens,2015,306000000,Star Wars Episode 7: The Force Awakens (2015)
4,Avengers: Infinity War,2018,300000000,Avengers: Infinity War (2018)
5,Pirates Of The Caribbean: At World's End,2007,300000000,Pirates Of The Caribbean: At World's End (2007)
6,Justice League,2017,300000000,Justice League (2017)
7,Spectre,2015,300000000,Spectre (2015)
8,Star Wars: The Rise Of Skywalker,2019,275000000,Star Wars: The Rise Of Skywalker (2019)
9,Solo: A Star Wars Story,2018,275000000,Solo: A Star Wars Story (2018)


### And then:
Make movie title fields of **title_budget_df** and **movie_df** uniform so you can merge budget data into **movie_df**, joining on movie title. <br/>
* When **str.title( )** is applied to movie titles in above code ( 't and 's ) get capitalized - make lowercase
* Movies with multiple parts have varying formats (e.g. III vs 3) - make consistent
* Add unique ID of **Title + Year** in **title_budget_df**

In [16]:
def correct_title(string):
    string = string.replace("'S","'s").replace("'T","'t").replace("'Re","'re")
    string = string.replace("Part Iv","Part 4").replace("Iii","3").replace("Ii","2")
    string = string.replace("Episode I","Episode 1").replace("Ep. I","Ep. 1").replace("Part I", "Part 1")
    string = string.replace("Ix","9").replace("Viii","8").replace("Vii","7").replace("Part Vi","Part 6")
    string = string.replace("Vi:","6:").replace("V:","5:").replace("Iv:","4:").replace(" Iv"," 4")
    string = string.replace("Ep.","Episode")
    return string

movie_df["Title"] = movie_df["Title"].apply(correct_title)
title_budget_df["Title"]= title_budget_df["Title"].apply(correct_title)

In [17]:
title_budget_df["Title_Year"] = title_budget_df["Title"] + " (" + title_budget_df["Year"] + ")"  

In [20]:
title_budget_df.head(10)

Unnamed: 0,Title,Year,Budget,Title_Year
0,Avengers: Endgame,2019,400000000,Avengers: Endgame (2019)
1,Pirates Of The Caribbean: On Stranger Tides,2011,379000000,Pirates Of The Caribbean: On Stranger Tides (2...
2,Avengers: Age Of Ultron,2015,365000000,Avengers: Age Of Ultron (2015)
3,Star Wars Episode 7: The Force Awakens,2015,306000000,Star Wars Episode 7: The Force Awakens (2015)
4,Avengers: Infinity War,2018,300000000,Avengers: Infinity War (2018)
5,Pirates Of The Caribbean: At World's End,2007,300000000,Pirates Of The Caribbean: At World's End (2007)
6,Justice League,2017,300000000,Justice League (2017)
7,Spectre,2015,300000000,Spectre (2015)
8,Star Wars: The Rise Of Skywalker,2019,275000000,Star Wars: The Rise Of Skywalker (2019)
9,Solo: A Star Wars Story,2018,275000000,Solo: A Star Wars Story (2018)


### In *movie_df* Director, Composer, and Genres fields sometimes have multiple names. Limit to first one.

In [21]:
def clean_feature(feature):
    if "(" in feature:
        feature = feature[:feature.find("(")]
    if "|" in feature:
        feature = feature[:feature.find("|")]
    
    return feature.strip()

In [22]:
# Apply clean function to Director & Composer fields
movie_df["Director"] = movie_df["Director"].apply(clean_feature)
movie_df["Composer"] = movie_df["Composer"].apply(clean_feature)

In [23]:
# Create column with only first Genre from IMDB Pro list
movie_df["Lead_Genre"] = movie_df["Genres"].apply(lambda x: x.split(",")[0])

In [24]:
movie_df.head(10)

Unnamed: 0,Title,ReleaseDate,Rating,Runtime,Genres,Director,Prod_Company,Distributor,Composer,Budget,Lead_Actor,Domestic_Sales,Intl_Sales,Global_Sales,Title_Year,Lead_Genre
0,Harry Potter And The Goblet Of Fire,2005-11-18,PG-13,157,"Adventure, Family, Fantasy",Mike Newell,Warner Bros.,Warner Bros.,Patrick Doyle,150000000,Eric Sykes,290013036,605908000,895921036,Harry Potter And The Goblet Of Fire (2005),Adventure
1,Star Wars: Episode 3 - Revenge Of The Sith,2005-05-19,PG-13,140,"Action, Adventure, Fantasy",George Lucas,Lucasfilm,Twentieth Century Fox,John Williams,113000000,Ewan McGregor,380270577,488081953,868352530,Star Wars: Episode Iii - Revenge Of The Sith (...,Action
2,"The Chronicles Of Narnia: The Lion, The Witch ...",2005-12-09,PG,143,"Adventure, Family, Fantasy",Andrew Adamson,Walt Disney Pictures,Buena Vista Pictures,Harry Gregson-Williams,180000000,Georgie Henley,291710957,453302158,745013115,"The Chronicles Of Narnia: The Lion, The Witch ...",Adventure
3,War Of The Worlds,2005-06-29,PG-13,116,"Adventure, Sci-Fi, Thriller",Steven Spielberg,Paramount Pictures,Paramount Pictures,John Williams,132000000,Tom Cruise,234280354,369592765,603873119,War Of The Worlds (2005),Adventure
4,King Kong,2005-12-14,PG-13,187,"Action, Adventure, Drama",Peter Jackson,Universal Pictures,Universal Pictures,James Newton Howard,207000000,Naomi Watts,218080025,344283424,562363449,King Kong (2005),Action
5,Madagascar,2005-05-27,PG,86,"Animation, Adventure, Comedy",Eric Darnell,DreamWorks Animation,DreamWorks Distribution,Hans Zimmer,75000000,Ben Stiller,193595521,348468325,542063846,Madagascar (2005),Animation
6,Mr. & Mrs. Smith,2005-06-10,PG-13,120,"Action, Comedy, Crime",Doug Liman,Regency Enterprises,Twentieth Century Fox,John Powell,110000000,Brad Pitt,186336279,300951367,487287646,Mr. & Mrs. Smith (2005),Action
7,Charlie And The Chocolate Factory,2005-07-15,PG,115,"Adventure, Comedy, Family",Tim Burton,Warner Bros.,Warner Bros.,Danny Elfman,150000000,Johnny Depp,206459076,268509687,474968763,Charlie And The Chocolate Factory (2005),Adventure
8,Mission: Impossible 3,2006-05-05,PG-13,126,"Action, Adventure, Thriller",J.J. Abrams,Paramount Pictures,Paramount Pictures,Michael Giacchino,150000000,Tom Cruise,134029801,264449696,398479497,Mission: Impossible Iii (2006),Action
9,Batman Begins,2005-06-15,PG-13,140,"Action, Adventure",Christopher Nolan,Warner Bros.,Warner Bros.,James Newton Howard,150000000,Christian Bale,205343774,166510009,371853783,Batman Begins (2005),Action


### Pickle cleaned DFs

In [None]:
with open("movie_df_cleaned.pkl","wb") as picklefile:
    pickle.dump(movie_df, picklefile)
    
with open("title_budget_df_cleaned.pkl","wb") as picklefile:
    pickle.dump(title_budget_df, picklefile)

### Merge budget data into **movie_df**, joining on movie title.
* Use **fuzzywuzzy** string matching to account for any titles that have slight differences between DataFrames

In [None]:
from fuzzywuzzy import process
title_matched_from_budget_df = []
fuzzy_scores = []
for title in movie_df["Title_Year"]:
    fuzzy_scores.append(process.extract(title, title_budget_df["Title_Year"], limit=1)[0][1])
    title_matched_from_budget_df.append(process.extract(title, title_budget_df["Title_Year"], limit=1)[0][0])        

In [None]:
title_fuzzy_df = pd.DataFrame([list(movie_df["Title_Year"]),title_matched_from_budget_df,fuzzy_scores]).T
title_fuzzy_df.columns=["Title_Year","title_matched_from_budget_df","Fuzzy_score"]

In [None]:
title_fuzzy_df.sample(5)

### Pickle title_fuzzy_df

In [None]:
with open("Title_and_Fuzzy.pkl","wb") as picklefile:
    pickle.dump(title_fuzzy_df, picklefile)

### Join fuzzy score and matched movie title from title_budget_df onto movie_df

In [None]:
movie_df_joined = movie_df.join(title_fuzzy_df.set_index("Title_Year"), on="Title_Year")

In [None]:
movie_df_joined = movie_df_joined.join(title_budget_df.set_index("Title_Year"), 
                                       on='title_matched_from_budget_df', rsuffix="_Numbers")

In [None]:
cols = ['Title_Year','Fuzzy_score','ReleaseDate','Budget_Numbers','Rating',
        'Runtime','Genres','Lead_Genre','Director','Prod_Company','Distributor','Composer','Lead_Actor',
        'Domestic_Sales','Intl_Sales','Global_Sales']

movie_df_joined = movie_df_joined[cols]

* **Rename Budget data from The Numbers to be final Budget column**
* **Add in Net Profit field**

In [None]:
movie_df_joined = movie_df_joined.rename(columns={"Budget_Numbers":"Budget","Title_Year":"Title"})

In [None]:
movie_df_joined["Net_Profit"] = movie_df_joined["Global_Sales"] - movie_df_joined["Budget"]

In [None]:
movie_df_joined.sample(5)

### Narrow down data to those with valid Ratings and a Title matching score >= 95

In [None]:
df_model_data = movie_df_joined[(movie_df_joined["Fuzzy_score"]>=95)
                & (movie_df_joined["Rating"].isin(["G","PG","PG-13","R","NC-17"]))
                & (movie_df_joined["Domestic_Sales"]>0)][["Title","Budget","ReleaseDate",
                                                           "Rating","Runtime","Lead_Genre","Lead_Actor",
                                                           "Director","Prod_Company","Domestic_Sales",
                                                           "Intl_Sales","Global_Sales","Net_Profit"]]

### Scrape top Actor names from The-Numbers.com

In [None]:
Numbers_actors_url = "https://www.the-numbers.com/box-office-star-records/domestic/yearly-acting/highest-grossing-2020-stars"
page_soup = BeautifulSoup(requests.get(Numbers_actors_url).text,"html5lib")

actor_soup = page_soup.find_all("a",href=re.compile("/person/"),limit=106)
actor_list = [line.text for line in actor_soup[6:]]

In [None]:
df_model_data["Is_Pop_Actor"] = df_model_data["Lead_Actor"].apply(lambda x: 1 if x in actor_list else 0)

### Scrape top Director names from The-Numbers.com

In [None]:
Numbers_directors_url = "https://www.the-numbers.com/box-office-star-records/domestic/lifetime-specific-technical-role/director"
page_soup = BeautifulSoup(requests.get(Numbers_directors_url).text,"html5lib")

director_soup = page_soup.find_all("a",href=re.compile("/person/"),limit=106)
director_list = [line.text for line in director_soup[6:]]

In [None]:
df_model_data["Is_Pop_Director"] = df_model_data["Director"].apply(lambda x: 1 if x in director_list else 0)

### Scrape Top Production Companies from The-Numbers.com

In [None]:
Numbers_production_url = "https://www.the-numbers.com/movies/production-companies/"
page_soup = BeautifulSoup(requests.get(Numbers_production_url).text,"html5lib")

In [None]:
prod_soup = page_soup.find_all("a",href=re.compile("production-company"),limit=100)
prod_list = [line.text for line in prod_soup]

In [None]:
df_model_data["Is_Pop_ProdComp"] = df_model_data["Prod_Company"].apply(lambda x: 1 if x in prod_list else 0)

In [None]:
df_model_data.sample(5)