In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import re
import dateutil.parser
from datetime import datetime as dt

In [2]:
def get_movie_value(soup, field_name):
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    # this works for most of the values
    next_element = obj.findNext()
    if next_element:
        return next_element.text 
    else:
        return None

In [3]:
def money_to_int(moneystring):
    try:
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
    except :
        pass
    
def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

In [4]:
def movie_info(url):
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    title_string = soup.find('title').text
    title_string.split('-')
    title = title_string.split('-')[0].strip()
    
    dtg = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='money')[0].text
    
    runtime = get_movie_value(soup,'Run')
    
    rating = get_movie_value(soup,'MPAA')
    
    release_date = get_movie_value(soup,'Release Date')
    release_date = release_date.split('\n')[0]
    
    distributor = get_movie_value(soup,'Distributor')
    distributor = distributor[:-30]
    
    budget = get_movie_value(soup,'Budget')
    budget = money_to_int(budget)
    
    raw_domestic_total_gross = dtg
    domestic_total_gross = money_to_int(raw_domestic_total_gross)

    raw_runtime = get_movie_value(soup,'Running')
    runtime = runtime_to_minutes(raw_runtime)

    raw_release_date = get_movie_value(soup,'Release Date').split('\n')[0]
    release_date = to_date(raw_release_date)
    
    
    
    genre = get_movie_value(soup,'Genres')
    res = [] 
    for sub in genre: 
        res.append(re.sub('\n', '', sub)) 
    res = ''.join(res)
    res = re.sub(' +', ', ', res)
    
    
    headers = ['Movie_Title','Distributor','Domestic_Total_Gross','Budget',
               'Runtime(mins)', 'Rating', 'Release_Date','Genre']

    movie_data = []
    movie_dict = dict(zip(headers, [title,
                                    distributor,
                                    domestic_total_gross,
                                    budget,
                                    runtime,
                                    rating, 
                                    release_date,
                                    res]))

    movie_data.append(movie_dict)
    return movie_data

In [5]:
def Action(item):
    if 'Action' in item:
        return 1
    else:
        return 0
    
def Adventure(item):
    if 'Adventure' in item:
        return 1
    else:
        return 0
    
def Animation(item):
    if 'Animation' in item:
        return 1
    else:
        return 0
    
def Biography(item):
    if 'Biography' in item:
        return 1
    else:
        return 0
    
def Comedy(item):
    if 'Comedy' in item:
        return 1
    else:
        return 0
    
def Crime(item):
    if 'Crime' in item:
        return 1
    else:
        return 0
    
def Documentary(item):
    if 'Documentary' in item:
        return 1
    else:
        return 0
    
def Drama(item):
    if 'Drama' in item:
        return 1
    else:
        return 0
    
def Family(item):
    if 'Family' in item:
        return 1
    else:
        return 0
    
def Fantasy(item):
    if 'Fantasy' in item:
        return 1
    else:
        return 0
    
def History(item):
    if 'History' in item:
        return 1
    else:
        return 0    
    
def Horror(item):
    if 'Horror' in item:
        return 1
    else:
        return 0
    
def Music(item):
    if 'Music' in item:
        return 1
    else:
        return 0
    
def Music(item):
    if 'Music' in item:
        return 1
    else:
        return 0
    
def Musical(item):
    if 'Musical' in item:
        return 1
    else:
        return 0
    
def Mystery(item):
    if 'Mystery' in item:
        return 1
    else:
        return 0
    
def Romance(item):
    if 'Romance' in item:
        return 1
    else:
        return 0
    
def SciFi(item):
    if 'Sci-Fi' in item:
        return 1
    else:
        return 0
    
def Sport(item):
    if 'Sport' in item:
        return 1
    else:
        return 0
    
def Thriller(item):
    if 'Thriller' in item:
        return 1
    else:
        return 0

def War(item):
    if 'War' in item:
        return 1
    else:
        return 0
    
def Western(item):
    if 'Western' in item:
        return 1
    else:
        return 0


In [38]:
def get_urls(years):
    '''Takes a 4 digit year input, or a list of them'''
    url = "https://www.boxofficemojo.com/weekly/by-year/{}/"
    dfs = []
    for year in years:
        file_url = url.format(year)
        response = requests.get(file_url)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        
        k = soup.find_all('a')

        df = pd.DataFrame(np.array(list(k)[27:-12]).reshape(-1,3))
        moviepaths = df.loc[:,2]
        #print(pd.DataFrame(moviepaths))
        for item in moviepaths:
            dfs.append(item)
    return dfs

years = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010',
         '2009','2008','2007','2006','2005','2004','2003','2002','2001','2000',]
url_list = get_urls(years)
#url_list

In [39]:
urls = []
for a_tag in url_list:
    url = a_tag['href']
    if not url.startswith('http'):
        url = "https://www.boxofficemojo.com/"+url
    urls.append(url)
len(urls)

1043

In [8]:
t = movie_info(urls[2])
def getList(dict): 
      
    return [*dict] 
Header = getList(t[0])
#Header

In [9]:
t

[{'Movie_Title': 'Jumanji: The Next Level',
  'Distributor': 'Sony Pictures Releasing',
  'Domestic_Total_Gross': 260912975,
  'Budget': 125000000,
  'Runtime(mins)': 123,
  'Rating': 'PG-13',
  'Release_Date': datetime.datetime(2019, 12, 13, 0, 0),
  'Genre': 'Action, Adventure, Comedy, Fantasy'}]

In [10]:
list(t[0].values())[1][:-30]

''

In [40]:
dft = pd.DataFrame(columns=Header)

for i in range(len(urls)):
    try:
        t = movie_info(urls[i])
        dft = dft.append(t, ignore_index=True)
    except:
        pass
    
dft.head()

Unnamed: 0,Movie_Title,Distributor,Domestic_Total_Gross,Budget,Runtime(mins),Rating,Release_Date,Genre
0,Star Wars: The Rise Of Skywalker,Walt Disney Studios Motion Pictures,483645801,,142,PG-13,2019-12-20,"Action, Adventure, Fantasy, Sci-Fi"
1,Star Wars: The Rise Of Skywalker,Walt Disney Studios Motion Pictures,483645801,,142,PG-13,2019-12-20,"Action, Adventure, Fantasy, Sci-Fi"
2,Jumanji: The Next Level,Sony Pictures Releasing,260912975,125000000.0,123,PG-13,2019-12-13,"Action, Adventure, Comedy, Fantasy"
3,Frozen II,Walt Disney Studios Motion Pictures,461151690,,103,PG,2019-11-22,"Adventure, Animation, Comedy, Family, Fantasy,..."
4,Frozen II,Walt Disney Studios Motion Pictures,461151690,,103,PG,2019-11-22,"Adventure, Animation, Comedy, Family, Fantasy,..."


In [44]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1018 entries, 0 to 1017
Data columns (total 8 columns):
Movie_Title             1018 non-null object
Distributor             1018 non-null object
Domestic_Total_Gross    1018 non-null object
Budget                  905 non-null object
Runtime(mins)           1003 non-null object
Rating                  995 non-null object
Release_Date            1018 non-null datetime64[ns]
Genre                   1018 non-null object
dtypes: datetime64[ns](1), object(7)
memory usage: 63.8+ KB


In [13]:
dft['strpdate'] = pd.to_datetime(dft.Release_Date, format = "%Y-%M-%D")
dft['year'] = pd.DatetimeIndex(dft['strpdate']).year

In [43]:
#dft.to_pickle('MovieData20.pkl')
#dft.to_csv('MovieData20.csv')

In [14]:
dfttest = dft.copy()
dfttest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 10 columns):
Movie_Title             916 non-null object
Distributor             916 non-null object
Domestic_Total_Gross    916 non-null object
Budget                  814 non-null object
Runtime(mins)           903 non-null object
Rating                  895 non-null object
Release_Date            916 non-null datetime64[ns]
Genre                   916 non-null object
strpdate                916 non-null datetime64[ns]
year                    916 non-null int64
dtypes: datetime64[ns](2), int64(1), object(7)
memory usage: 71.7+ KB


In [15]:
Glist = list(dft['Genre'].str.split(' ', expand=True).stack().unique())
Glist2 = []
for item in Glist:
    Glist2.append(item.strip(','))
Glistset = list(set(Glist2))
len(Glistset)

21

In [16]:
Glistset.sort()
Glistset

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western']

In [17]:
funclist = [Action, Adventure, Animation, Biography, Comedy, Crime, Documentary, Drama, Family, Fantasy, 
            History, Horror, Music, Musical, Mystery, Romance, SciFi, Sport, Thriller, War, Western]

In [18]:
for i in range(len(Glistset)):
        dfttest[Glistset[i]] = dfttest['Genre'].apply(funclist[i])
dfttest.info()
    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 31 columns):
Movie_Title             916 non-null object
Distributor             916 non-null object
Domestic_Total_Gross    916 non-null object
Budget                  814 non-null object
Runtime(mins)           903 non-null object
Rating                  895 non-null object
Release_Date            916 non-null datetime64[ns]
Genre                   916 non-null object
strpdate                916 non-null datetime64[ns]
year                    916 non-null int64
Action                  916 non-null int64
Adventure               916 non-null int64
Animation               916 non-null int64
Biography               916 non-null int64
Comedy                  916 non-null int64
Crime                   916 non-null int64
Documentary             916 non-null int64
Drama                   916 non-null int64
Family                  916 non-null int64
Fantasy                 916 non-null int64
History   

In [47]:
dfttest.groupby(['Distributor']).Genre.count()

Distributor
                                         2
Destination Films                        1
Dimension Films                          5
DreamWorks                              22
DreamWorks Distribution                 19
FilmDistrict                             1
Focus Features                           2
Lionsgate                               46
Metro-Goldwyn-Mayer (MGM)               12
Miramax                                  7
New Line Cinema                         29
Newmarket Films                          5
Open Road Films (II)                     3
Paramount Pictures                      68
Relativity Media                         3
Revolution Studios                      10
STX Entertainment                        1
Screen Gems                             24
Sony Pictures Releasing                 89
Summit Entertainment                     4
The Weinstein Company                    4
TriStar Pictures                         3
Twentieth Century Fox                   95

In [42]:
dfttest


Unnamed: 0,Movie_Title,Distributor,Domestic_Total_Gross,Budget,Runtime(mins),Rating,Release_Date,Genre,strpdate,year,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,Star Wars: The Rise Of Skywalker,Walt Disney Studios Motion Pictures,483645801,,142,PG-13,2019-12-20,"Action, Adventure, Fantasy, Sci-Fi",2019-12-20,2019,...,0,0,0,0,0,1,0,0,0,0
1,Star Wars: The Rise Of Skywalker,Walt Disney Studios Motion Pictures,483645801,,142,PG-13,2019-12-20,"Action, Adventure, Fantasy, Sci-Fi",2019-12-20,2019,...,0,0,0,0,0,1,0,0,0,0
2,Jumanji: The Next Level,Sony Pictures Releasing,260912975,125000000,123,PG-13,2019-12-13,"Action, Adventure, Comedy, Fantasy",2019-12-13,2019,...,0,0,0,0,0,0,0,0,0,0
3,Frozen II,Walt Disney Studios Motion Pictures,461151690,,103,PG,2019-11-22,"Adventure, Animation, Comedy, Family, Fantasy,...",2019-11-22,2019,...,0,1,1,0,0,0,0,0,0,0
4,Frozen II,Walt Disney Studios Motion Pictures,461151690,,103,PG,2019-11-22,"Adventure, Animation, Comedy, Family, Fantasy,...",2019-11-22,2019,...,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,Scream 3,Dimension Films,89143175,40000000,116,R,2000-02-04,"Horror, Mystery",2000-02-04,2000,...,1,0,0,1,0,0,0,0,0,0
912,Eye of the Beholder,Destination Films,16500786,35000000,109,R,2000-01-28,"Drama, Mystery, Thriller",2000-01-28,2000,...,0,0,0,1,0,0,0,1,0,0
913,Next Friday,New Line Cinema,57328603,11000000,98,R,2000-01-12,Comedy,2000-01-12,2000,...,0,0,0,0,0,0,0,0,0,0
914,Next Friday,New Line Cinema,57328603,11000000,98,R,2000-01-12,Comedy,2000-01-12,2000,...,0,0,0,0,0,0,0,0,0,0


In [46]:
pd.get_dummies(dfttest['Distributor'], drop_first=True)

Unnamed: 0,Destination Films,Dimension Films,DreamWorks,DreamWorks Distribution,FilmDistrict,Focus Features,Lionsgate,Metro-Goldwyn-Mayer (MGM),Miramax,New Line Cinema,...,Screen Gems,Sony Pictures Releasing,Summit Entertainment,The Weinstein Company,TriStar Pictures,Twentieth Century Fox,United Artists,Universal Pictures,Walt Disney Studios Motion Pictures,Warner Bros.
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
912,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
913,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
914,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
