### Add specific features (genre, MPAA rating, runtime, and budget) for each of the 4341 movies
This program uses headless browsing to pull relevant feature information from each movie's main page on BoxOfficeMojo.

In [47]:
import os
import re
import time
import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs4

In [48]:
# Configure a Chrome session that runs without a visible window and with
# browser extensions turned off, then start it via the local chromedriver.
chrome_options = Options()
for _flag in ('--disable-extensions', 'headless'):
    chrome_options.add_argument(_flag)

driver = webdriver.Chrome(
    options=chrome_options,
    executable_path='C:/bin/chromedriver.exe',
)

In [None]:
def add_features_to_list(feature_list,row_list):
    """Look up the value that follows each feature label in a flat text list.

    ``row_list`` is treated as an alternating sequence in which a label is
    immediately followed by its value. For every label in ``feature_list``
    the element right after its first exact match in ``row_list`` is
    collected.

    Args:
        feature_list: Labels to look up, in the order the results are wanted.
        row_list: Flat list of texts to search.

    Returns:
        A list with one entry per label: the text following the label, or
        ``np.nan`` when the label is absent or is the last element (so there
        is no value after it).
    """
    newlist = []
    for f in feature_list:
        try:
            # ValueError: label not present.
            # IndexError: label is the final element, nothing follows it.
            newlist.append(row_list[row_list.index(f) + 1])
        except (ValueError, IndexError):
            newlist.append(np.nan)
    return newlist

# Labels to look up on every movie page; each becomes a column of the output
# file, in this order.
features = ['Budget','Genres','MPAA','Running Time']

for year in range(1977,2020):
    # Read in each movie's URL from the previously scraped list of movies by year:
    df = pd.read_csv(str(year)+'_domestic_releases.csv')

    # One entry (a list of feature values) per movie of this year; reset for
    # every year so it always lines up with the rows of `df`.
    features_list=[]
    for i in range(len(df)):
        if i%20 == 0:
            print(year, i)  # progress marker every 20 movies
        # The column at position 7 is used as the page address without its
        # 'https://www.' prefix.
        moviepage=str(df.iloc[i, 7])
        url='https://www.'+moviepage
        driver.get(url)
        html = driver.page_source
        soup = bs4(html, 'html.parser')

        list_text_rows=[]

        # Drill down to table area of relevance for capturing features:
        divs = soup.find_all('div',{'class':'mojo-summary-values'})
        for div in divs:
            for row in div.find_all('span'):
                # Keep every span except those whose text starts with
                # 'Domestic'.
                if not row.text.startswith('Domestic'):
                    list_text_rows.append(row.text)

        features_list.append(add_features_to_list(features, list_text_rows))

        #=================================
        #wait 1 seconds between iterations
        time.sleep(1)

    # Create one column per feature from this year's scraped values. Every
    # feature is written, and a year with no movies yields empty columns.
    for col, name in enumerate(features):
        df[name] = [movie_values[col] for movie_values in features_list]

    try:
        # Clean up Genres column: missing values become '-', trailing
        # whitespace is removed and line breaks plus the indentation that
        # follows them collapse into a single space.
        genres = df['Genres'].fillna('-')
        df['Genres'] = [re.sub(r'\n\s+', ' ', g.rstrip()) for g in genres]

        # Write new dataframe to .csv file:
        df.to_csv(str(year)+'_movies_features_TEST.csv', index = False)

    except Exception as err:
        # Skip a year if error encountered, re-run again separately if needed.
        # Report it so the skipped year is known; Ctrl-C still interrupts.
        print(year, 'skipped:', repr(err))

In [66]:
def add_features_to_list(feature_list,row_list):
    """Look up the value that follows each feature label in a flat text list.

    ``row_list`` is treated as an alternating sequence in which a label is
    immediately followed by its value. For every label in ``feature_list``
    the element right after its first exact match in ``row_list`` is
    collected.

    Args:
        feature_list: Labels to look up, in the order the results are wanted.
        row_list: Flat list of texts to search.

    Returns:
        A list with one entry per label: the text following the label, or
        ``np.nan`` when the label is absent or is the last element (so there
        is no value after it).
    """
    newlist = []
    for f in feature_list:
        try:
            # ValueError: label not present.
            # IndexError: label is the final element, nothing follows it.
            newlist.append(row_list[row_list.index(f) + 1])
        except (ValueError, IndexError):
            newlist.append(np.nan)
    return newlist

# Labels to look up on every movie page; each becomes a column of the output
# file, in this order.
features = ['Budget','Genres','MPAA','Running Time']

# Previously ran for 1977 --> 2010
for year in range(2010,2020):
    # Read in each movie's URL from the previously scraped list of movies by year:
    df = pd.read_csv(str(year)+'_domestic_releases.csv')

    # One entry (a list of feature values) per movie of this year; reset for
    # every year so it always lines up with the rows of `df`.
    features_list=[]
    for i in range(len(df)):
        if i%20 == 0:
            print(year, i)  # progress marker every 20 movies
        # The column at position 7 is used as the page address without its
        # 'https://www.' prefix.
        moviepage=str(df.iloc[i, 7])
        url='https://www.'+moviepage
        driver.get(url)
        html = driver.page_source
        soup = bs4(html, 'html.parser')

        list_text_rows=[]

        # Drill down to table area of relevance for capturing features:
        divs = soup.find_all('div',{'class':'mojo-summary-values'})
        for div in divs:
            for row in div.find_all('span'):
                # Keep every span except those whose text starts with
                # 'Domestic'.
                if not row.text.startswith('Domestic'):
                    list_text_rows.append(row.text)

        features_list.append(add_features_to_list(features, list_text_rows))

        #=================================
        #wait 1 seconds between iterations
        time.sleep(1)

    # Create one column per feature from this year's scraped values. Every
    # feature is written, and a year with no movies yields empty columns.
    for col, name in enumerate(features):
        df[name] = [movie_values[col] for movie_values in features_list]

    try:
        # Clean up Genres column: missing values become '-', trailing
        # whitespace is removed and line breaks plus the indentation that
        # follows them collapse into a single space.
        genres = df['Genres'].fillna('-')
        df['Genres'] = [re.sub(r'\n\s+', ' ', g.rstrip()) for g in genres]

        # Write new dataframe to .csv file:
        df.to_csv(str(year)+'_movies_features_TEST.csv', index = False)

    except Exception as err:
        # Skip a year if error encountered, re-run again separately if needed.
        # Report it so the skipped year is known; Ctrl-C still interrupts.
        print(year, 'skipped:', repr(err))

2010 0
2010 20
2010 40
2010 60
2010 80
2010 100
2010 120
2010 140
2010 160
2010 180
2010 200
2010 220
2010 240
2010 260
2010 280
2010 300
2010 320
2010 340
2010 360
2010 380
2010 400
2010 420
2010 440
2010 460
2010 480
2010 500
2010 520
2010 540
2010 560
2010 580
2010 600
2010 620
2010 640
2011 0
2011 20
2011 40
2011 60
2011 80
2011 100
2011 120
2011 140
2011 160
2011 180
2011 200
2011 220
2011 240
2011 260
2011 280
2011 300
2011 320
2011 340
2011 360
2011 380
2011 400
2011 420
2011 440
2011 460
2011 480
2011 500
2011 520
2011 540
2011 560
2011 580
2011 600
2011 620
2011 640
2011 660
2011 680
2011 700
2011 720
2012 0
2012 20
2012 40
2012 60
2012 80
2012 100
2012 120
2012 140
2012 160
2012 180
2012 200
2012 220
2012 240
2012 260
2012 280
2012 300
2012 320
2012 340
2012 360
2012 380
2012 400
2012 420
2012 440
2012 460
2012 480
2012 500
2012 520
2012 540
2012 560
2012 580
2012 600
2012 620
2012 640
2012 660
2012 680
2012 700
2012 720
2012 740
2012 760
2012 780
2012 800
2013 0
2013 20
2013

In [1]:
# Kept for posterity: counts missing 'Genres' values, a rarely seen gap for movies on BOM. Handling for it has since been incorporated above.
##df['Genres'].isna().sum()