In [242]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import re
import dateutil.parser
from datetime import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [3]:
def get_movie_value(soup, field_name):
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    # this works for most of the values
    next_element = obj.findNext()
    if next_element:
        return next_element.text 
    else:
        return None

In [4]:
def money_to_int(moneystring):
    try:
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
    except :
        pass
    
def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

In [5]:
def movie_info(url):
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    title_string = soup.find('title').text
    title_string.split('-')
    title = title_string.split('-')[0].strip()
    
    dtg = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='money')[0].text
    
    runtime = get_movie_value(soup,'Run')
    
    rating = get_movie_value(soup,'MPAA')
    
    release_date = get_movie_value(soup,'Release Date')
    release_date = release_date.split('\n')[0]
    
    distributor = get_movie_value(soup,'Distributor')
    distributor = distributor[:-30]
    
    budget = get_movie_value(soup,'Budget')
    budget = money_to_int(budget)
    
    raw_domestic_total_gross = dtg
    domestic_total_gross = money_to_int(raw_domestic_total_gross)

    raw_runtime = get_movie_value(soup,'Running')
    runtime = runtime_to_minutes(raw_runtime)

    raw_release_date = get_movie_value(soup,'Release Date').split('\n')[0]
    release_date = to_date(raw_release_date)
    
    
    
    genre = get_movie_value(soup,'Genres')
    res = [] 
    for sub in genre: 
        res.append(re.sub('\n', '', sub)) 
    res = ''.join(res)
    res = re.sub(' +', ', ', res)
    
    
    headers = ['Movie_Title','Distributor','Domestic_Total_Gross','Budget',
               'Runtime(mins)', 'Rating', 'Release_Date','Genre']

    movie_data = []
    movie_dict = dict(zip(headers, [title,
                                    distributor,
                                    domestic_total_gross,
                                    budget,
                                    runtime,
                                    rating, 
                                    release_date,
                                    res]))

    movie_data.append(movie_dict)
    return movie_data

In [138]:
def get_urls(years):
    '''Takes a 4 digit year input, or a list of them'''
    url = "https://www.boxofficemojo.com/year/{}/?sort=gross&grossesOption=calendarGrosses"
    dfs = []
    for year in years:
        file_url = url.format(year)
        response = requests.get(file_url)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        
        k = soup.find_all('a')
        k3 = []
        for item in k:
            if 'path' not in str(item):
                k3.append(item)
        moviepaths = pd.DataFrame(np.array(k3)[26:-11])
        #print(moviepaths)
        #for item in moviepaths:
        dfs.append(moviepaths)
          #  print(item)
        
        #print(moviepaths)
        #df = pd.DataFrame(np.array(list(k)[26:426]).reshape(-1,2))
        #moviepaths = df.loc[:,0]
        #print(pd.DataFrame(moviepaths))
        #for item in moviepaths:
        #    dfs.append(item)
        #dfs.append(k3)
        #print(dfs)
    return dfs

years = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010',
         '2009','2008','2007','2006','2005']#,'2004','2003','2002','2001','2000',]
url_list = get_urls(years)
#url_list

In [139]:
len(url_list)

15

In [169]:
url_list[1].iloc[0][0]

<a class="a-link-normal" href="/release/rl2992866817/?ref_=bo_yld_table_1">Black Panther</a>

In [137]:
#for i in range(len(url_list)):
 #   print(len(url_list[i]))

906
993
852
856
846
849
826
805
729
651


In [173]:
urls = []
for i in range(len(url_list)):
    for j in range(len(url_list[i])):
        for a_tag in url_list[i].iloc[j]:
            url = a_tag['href']
            if not url.startswith('http'):
                url = "https://www.boxofficemojo.com/"+url
                urls.append(url)
len(urls)

11879

In [175]:
t = movie_info(urls[2])
def getList(dict): 
      
    return [*dict] 
Header = getList(t[0])

In [177]:
dft = pd.DataFrame(columns=Header)

for i in range(len(urls)):
    try:
        t = movie_info(urls[i])
        dft = dft.append(t, ignore_index=True)
    except:
        pass
    
dft.head()

Unnamed: 0,Movie_Title,Distributor,Domestic_Total_Gross,Budget,Runtime(mins),Rating,Release_Date,Genre
0,Avengers: Endgame,Walt Disney Studios Motion Pictures,858373000,356000000.0,181,PG-13,2019-04-26,"Action, Adventure, Drama, Sci-Fi"
1,The Lion King,Walt Disney Studios Motion Pictures,543638043,260000000.0,118,PG,2019-07-19,"Adventure, Animation, Drama, Family, Musical"
2,Toy Story 4,Walt Disney Studios Motion Pictures,434038008,,100,G,2019-06-21,"Adventure, Animation, Comedy, Family, Fantasy"
3,Frozen II,Walt Disney Studios Motion Pictures,466850574,,103,PG,2019-11-22,"Adventure, Animation, Comedy, Family, Fantasy,..."
4,Captain Marvel,Walt Disney Studios Motion Pictures,426829839,,123,PG-13,2019-03-08,"Action, Adventure, Sci-Fi"


In [180]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11268 entries, 0 to 11267
Data columns (total 8 columns):
Movie_Title             11268 non-null object
Distributor             11268 non-null object
Domestic_Total_Gross    11268 non-null object
Budget                  2260 non-null object
Runtime(mins)           10864 non-null object
Rating                  6036 non-null object
Release_Date            11268 non-null datetime64[ns]
Genre                   11268 non-null object
dtypes: datetime64[ns](1), object(7)
memory usage: 704.4+ KB


In [178]:
#dft.to_pickle('Full15yr.pkl')

In [181]:
dft

Unnamed: 0,Movie_Title,Distributor,Domestic_Total_Gross,Budget,Runtime(mins),Rating,Release_Date,Genre
0,Avengers: Endgame,Walt Disney Studios Motion Pictures,858373000,356000000,181,PG-13,2019-04-26,"Action, Adventure, Drama, Sci-Fi"
1,The Lion King,Walt Disney Studios Motion Pictures,543638043,260000000,118,PG,2019-07-19,"Adventure, Animation, Drama, Family, Musical"
2,Toy Story 4,Walt Disney Studios Motion Pictures,434038008,,100,G,2019-06-21,"Adventure, Animation, Comedy, Family, Fantasy"
3,Frozen II,Walt Disney Studios Motion Pictures,466850574,,103,PG,2019-11-22,"Adventure, Animation, Comedy, Family, Fantasy,..."
4,Captain Marvel,Walt Disney Studios Motion Pictures,426829839,,123,PG-13,2019-03-08,"Action, Adventure, Sci-Fi"
...,...,...,...,...,...,...,...,...
11263,The Comedians of Comedy,Vitagraph Films,549,,103,,2005-11-11,"Comedy, Documentary"
11264,The Dark Hours,,423,,80,R,2005-10-13,"Horror, Thriller"
11265,The Brown Bunny,Wellspring Media,366301,,93,,2004-08-27,Drama
11266,Yes Nurse! No Nurse! 2004 Re,Regent Releasing,13325,,100,,2004-09-03,"Comedy, Musical"


In [195]:
 def get_urls_budgetL(years):   
    
    dfs = {}
    url = "https://www.the-numbers.com/movie/budgets/all/{}"
    asdf=[]
    Budget = []
    for year in years:
        file_url = url.format(year)
        response = requests.get(file_url)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        k = soup.find_all('a')
        df = (pd.DataFrame(np.array(list(k))[71:-62].reshape(-1,2)))
        movie_title = df.loc[:,1]
        asdf=[]
        Budget = []
        
        j = list(soup.find_all('td'))
        
        for i in range(len(j)):
            asdf.append(str(j[i]))
        budgets2 = pd.DataFrame(np.array(asdf).reshape(-1,6))[3]
        for i in range(len(budgets2)):
            Budget.append(budgets2[i][18:-5])
        dfs['Movie_Titles{}'.format(year)] = movie_title
        dfs['Budget{}'.format(year)] = Budget
    #dfs['Movie_Title'] = movie_title
    #dfs['Budget'] = Budget
    return dfs
jkLower = get_urls_budgetL(['001','101','201','301','401'])

In [193]:
def get_urls_budgetU(years):   
    
    dfs = {}
    url = "https://www.the-numbers.com/movie/budgets/all/{}"
    asdf=[]
    Budget = []
    for year in years:
        file_url = url.format(year)
        response = requests.get(file_url)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        k = soup.find_all('a')
        df = (pd.DataFrame(np.array(list(k))[71:-63].reshape(-1,2)))
        movie_title = df.loc[:,1]
        asdf=[]
        Budget = []
        
        j = list(soup.find_all('td'))
        
        for i in range(len(j)):
            asdf.append(str(j[i]))
        budgets2 = pd.DataFrame(np.array(asdf).reshape(-1,6))[3]
        for i in range(len(budgets2)):
            Budget.append(budgets2[i][18:-5])
        dfs['Movie_Titles{}'.format(year)] = movie_title
        dfs['Budget{}'.format(year)] = Budget
    #dfs['Movie_Title'] = movie_title
    #dfs['Budget'] = Budget
    return dfs
#jkUpper = get_urls_budgetU(list(str(range(501:5001:100))))
#jkl1 = get_urls_budgetU(['501','601','701','801','901',
 #                        '1001','1101','1201','1301','1401','1501','1601','1701','1801','1901'])

In [219]:
pagelist = list(range(501,5501,100))
plist = []
for i in range(len(pagelist)):
    plist.append(str(pagelist[i]))
jkUpper = get_urls_budgetU(plist)
#jkUpper

In [220]:
titles = []
budgets = []
for i in range(len(jkLower)):
    if i%2 == 0:
        titles.append(list(jkLower.values())[i])
    else:
        budgets.append(list(jkLower.values())[i])
        
for i in range(len(jkUpper)):
    if i%2 == 0:
        titles.append(list(jkUpper.values())[i])
    else:
        budgets.append(list(jkUpper.values())[i])

In [222]:
len(titles)

55

In [226]:
type(titles[1])

pandas.core.series.Series

In [230]:
x = []
y = []
for i in range(len(titles)):
    x = x+list(titles[i])
for i in range(len(budgets)):
    y = y+list(budgets[i])


In [250]:
dfBud = pd.DataFrame(columns=['Movie_Titles','Budget'])
dfBud['Movie_Titles'] = x
dfBud['Budget'] = y

In [255]:
#dfBud.to_pickle('Budgets5500.pkl')

RecursionError: maximum recursion depth exceeded while pickling an object

In [253]:
dfBud.to_csv('Budgets5500.csv')