In [1]:
# Program to scrape the weekly info for the movie links supplied. Also scrapes a unique IMDB movie ID
import time
import regex as re
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs4

In [2]:
# Start a headless Chrome session (extensions disabled) that is used for every page fetch below.
# NOTE(review): the chromedriver location is machine-specific, and `executable_path` is
# deprecated in Selenium 4 in favour of a Service object — confirm the pinned Selenium version.
chrome_options = Options()
for chrome_flag in ('--disable-extensions', 'headless'):
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome(
    options=chrome_options,
    executable_path='C:/bin/chromedriver.exe',
)

In [3]:
# Read in the dataframe containing the movie page links.
# For a quick test run, swap in the smaller file instead:
#df_movies_list = pd.read_csv('00_budgeted_movies_list_TEST.csv')
df_movies_list = pd.read_csv('00_budgeted_movies_list.csv')

In [4]:
def get_weekly(text_rows, n_weeks=15, cells_per_week=10, columns=(2, 3, 4, 5, 6)):
    """Flatten the first ``n_weeks`` weeks of scraped table cells into one list.

    Parameters
    ----------
    text_rows : list
        Flat list of cell texts, treated as ``cells_per_week`` consecutive
        cells per week.
    n_weeks : int, default 15
        Number of weeks to keep.
    cells_per_week : int, default 10
        Stride between the same column in consecutive weeks.
    columns : sequence of int, default (2, 3, 4, 5, 6)
        Cell offsets within a week to extract (presumably the Weekly,
        Theaters, Change, Avg and To Date columns — TODO confirm against
        the page layout).

    Returns
    -------
    list
        ``n_weeks * len(columns)`` values grouped column by column: all weeks
        of the first offset, then all weeks of the second, and so on. When
        fewer than ``n_weeks`` complete weeks are available (including no data
        at all) a list of zeros of the same length is returned, so every movie
        yields an equally long column.
    """
    n_values = n_weeks * len(columns)

    # Not enough complete weeks: kick back zeros to preserve the structure.
    if len(text_rows) // cells_per_week < n_weeks:
        return [0] * n_values

    stop = n_weeks * cells_per_week
    weekly_values = []
    for offset in columns:
        # Every cells_per_week-th cell starting at `offset` is one column's
        # value for each successive week.
        weekly_values.extend(text_rows[offset:stop:cells_per_week])
    return weekly_values
# ==============================
# ==============================
features_list = []
features = ['Weekly','Theaters','Change','Avg','To Date']

# Weekly values keyed by IMDB ID. Collected in a dict and turned into a
# DataFrame once after the loop, which avoids inserting hundreds of columns one
# at a time. A repeated ID overwrites the earlier entry but keeps its position,
# exactly as assigning to an existing DataFrame column would.
weekly_by_imdb_id = {}

# Raw string: the IMDB ID is whatever sits between 'title/' and the next '/'.
pattern = r'title/(.*?)/'

#for i in range(0,len(df_movies_list)):
# files for loopset2of4, 3of4, and 4of4 do the exact same thing, just over different ranges of the total movies_list
for i in range(0, min(1000, len(df_movies_list))): # test range
    if i%20 == 0:
        print(i)

    # ==============================
    # Fetch and parse the weekly page for this movie:
    try:
        movie_page = str(df_movies_list.loc[i, 'URL'])
        url = movie_page+'weekly/'
        driver.get(url)
        html = driver.page_source
        soup = bs4(html, 'html.parser')
    except Exception as exc:
        # Skip this movie: carrying on would re-process the previous
        # iteration's `soup` (or fail with NameError on the first one).
        print(f"1: An exception occurred (row {i}): {exc!r}")
        time.sleep(1)
        continue

    # ==============================
    # Get IMDB unique ID for subsequent scraping:
    first_div = soup.find('div')
    id_match = re.search(pattern, str(first_div)) if first_div is not None else None
    if id_match is None:
        # Without an ID there is no column key; log and move on instead of
        # aborting the whole run.
        print(f"2: No IMDB ID found (row {i}): {url}")
        time.sleep(1)
        continue
    str_IMDB_ID = id_match.group(1)

    # ==============================
    # Get all weekly info for each movie:
    list_text_rows = []
    for div in soup.find_all('div',{'class':'mojo-gutter-sides'}):
        for cell in div.select('tr td'):
            # Keeps every cell except those whose text starts with 'Date'.
            # Same outcome as the truthiness of str.find('Date'), which is 0
            # only for a leading match and -1 (truthy) when absent.
            if not cell.text.startswith('Date'):
                list_text_rows.append(cell.text)

    weekly_by_imdb_id[str_IMDB_ID] = get_weekly(list_text_rows)

    # Pause between requests.
    time.sleep(1)

# One column per IMDB ID, one row per weekly value.
df_weekly_stats = pd.DataFrame(weekly_by_imdb_id)

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980


In [8]:
# QC outputs: over the full movie list we want 4341 IMDB IDs,
# i.e. df_weekly_stats.shape == (75, 4341).
# This notebook is domain-decomposed, so this part should give (75, 1000).
n_rows, n_cols = df_weekly_stats.shape
print('corresponding weekly stats shape:', (n_rows, n_cols))

corresponding weekly stats shape: (75, 793)


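#### The shape of df_weekly_stats is (75, 793), not the (75, 1000) expected for this part (nor (75, 4341) for the full list), meaning not every movie page produced its own column — likely because some pages were not scraped successfully or several rows share the same IMDB ID (a repeated ID overwrites the existing column).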

In [9]:
# Preview the first 10 rows; each column is keyed by an IMDB ID.
df_weekly_stats.head(10)

Unnamed: 0,tt0076759,tt0075860,tt0077631,tt0078346,tt0079945,tt0078748,tt0078723,tt0079753,tt0080684,tt0081505,...,tt0241303,tt0188160,tt0189584,tt0210567,tt0141399,tt0171356,tt0186253,tt0216772,tt0220100,tt0138946
0,0,0,0,0,0,"$4,758,639",0,0,0,0,...,"$249,471",0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,"$3,623,506",0,0,0,0,...,"$2,147,062",0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,"$4,300,753",0,0,0,0,...,"$2,507,221",0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,"$3,783,140",0,0,0,0,...,"$2,256,354",0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,"$8,312,008",0,0,0,0,...,"$2,560,661",0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,"$7,626,254",0,0,0,0,...,"$4,348,195",0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,"$5,334,535",0,0,0,0,...,"$4,691,785",0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,"$4,210,249",0,0,0,0,...,"$4,747,888",0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,"$3,566,295",0,0,0,0,...,"$4,900,265",0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,"$2,860,473",0,0,0,0,...,"$6,892,596",0,0,0,0,0,0,0,0,0


In [10]:
# Save the weekly stats (one column per IMDB ID); index=True also writes the row index as the first CSV column.
df_weekly_stats.to_csv('06_weekly_movies_features_part01.csv', index=True)

In [11]:
# Flip 'df_weekly_stats' so that each row (indexed by IMDB ID) holds the weekly
# stats of one movie, then compare the shapes before and after.
df_new_features = df_weekly_stats.T
for frame in (df_weekly_stats, df_new_features):
    print(frame.shape)
df_new_features.head(10)

(75, 793)
(793, 75)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
tt0076759,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0075860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0077631,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0078346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0079945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0078748,"$4,758,639","$3,623,506","$4,300,753","$3,783,140","$8,312,008","$7,626,254","$5,334,535","$4,210,249","$3,566,295","$2,860,473",...,"$10,236","$7,046","$5,743","$5,131","$4,200","$3,939","$3,468","$3,680","$3,465","$3,441"
tt0078723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0079753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0080684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0081505,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,Release,Calendar Gross,Max Theaters,Total Gross,Release Date,Distributor,URL,Budget,Genres,MPAA
0,Star Wars: Episode IV - A New Hope,"$195,666,111",1750,"$307,263,857",May 25,Twentieth Century Fox\n\n,https://www.boxofficemojo.com/release/rl275903...,11000000,Action Adventure Fantasy Sci-Fi,PG
1,Close Encounters of the Third Kind,"$16,172,445",650,"$116,395,460",Nov 16,Columbia Pictures\n\n,https://www.boxofficemojo.com/release/rl340428...,20000000,Drama Sci-Fi,
2,Grease,"$159,978,870",862,"$159,978,870",16-Jun,Paramount Pictures\n\n,https://www.boxofficemojo.com/release/rl205255...,6000000,Musical Romance,PG
3,Close Encounters of the Third Kind,"$91,700,143",650,"$116,395,460",16-Nov,Columbia Pictures\n\n,https://www.boxofficemojo.com/release/rl340428...,20000000,Drama Sci-Fi,
4,Star Wars: Episode IV - A New Hope,"$30,676,545",1750,"$307,263,857",25-May,Twentieth Century Fox\n\n,https://www.boxofficemojo.com/release/rl275903...,11000000,Action Adventure Fantasy Sci-Fi,PG


In [13]:
# Save the transposed weekly features (one row per movie).
# NOTE(review): index=False drops the row index, which after the transpose holds the
# IMDB IDs, so the saved rows carry no movie identifier — confirm downstream does not need it.
df_new_features.to_csv('07_transposed_weekly_features_part01.csv', index=False)