# Libraries
Import the required libraries: we use `BeautifulSoup` for parsing web pages and `re` for extracting specific pieces of a string.

In [1]:
import numpy as np 
import requests
from bs4 import BeautifulSoup
import re 
import pandas as pd 

# Collecting Movie IDs
To collect information about a movie from its page on boxofficemojo.com, we first need to know its URL. Each movie page URL is identified by a unique ID. We collect the IDs from the boxofficemojo.com domestic box office page for each year using the BeautifulSoup and re libraries, and save the IDs in a separate txt file for each year.

In [40]:
def movie_id(year, timeout=30):
    """Scrape the Box Office Mojo release IDs of one year and save them to disk.

    Fetches ``https://www.boxofficemojo.com/year/<year>/``, takes the first
    link of every cell in the release column, and writes the ID part of each
    ``/release/rl<ID>/`` link to ``Box Office Mojo IDs/<year>.txt``, one ID
    per line. Cells without a release link are skipped.

    Parameters
    ----------
    year : int or str
        Year whose box-office page is scraped; also used as the file name.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up.

    Raises
    ------
    FileNotFoundError
        If the ``Box Office Mojo IDs`` directory does not exist.
    """
    out_path = 'Box Office Mojo IDs/%s.txt' % year
    # Create/empty the output file before any network traffic: this fails
    # fast when the directory is missing and leaves an empty file behind if
    # the request below raises.
    with open(out_path, 'w'):
        pass

    url = 'https://www.boxofficemojo.com/year/%s/' % (year)
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')
    cells = soup.find_all('td', attrs={'class': 'a-text-left mojo-field-type-release mojo-cell-wide'})

    release_ids = []
    for cell in cells:
        link = cell.find('a', href=True)
        if link is None:
            continue
        # Anchor on the full "/release/rl" prefix and stop at the first URL
        # delimiter, so a query string or fragment can never leak into the ID.
        match = re.search(r'/release/rl([^/?#]+)', link['href'])
        if match:
            release_ids.append(match.group(1))
    np.savetxt(out_path, release_ids, fmt='%s', delimiter='\n')


# Run the ID scraper for every year from 2010 through 2019, printing the
# year as a progress marker before each call.
for year in range(2010,2020):
    print(year,'**********')
    movie_id(year)

2010 **********
2011 **********
2012 **********
2013 **********
2014 **********
2015 **********
2016 **********
2017 **********
2018 **********
2019 **********


# Data Collection
Now we use these IDs to go to each movie's page on boxofficemojo.com and collect our data.

In [6]:
def _money_to_int(text):
    """Convert a displayed amount such as ``'$1,234,567'`` to ``1234567``."""
    return int(text.strip().replace('$', '').replace(',', ''))


def data(year, timeout=30):
    """Scrape Box Office Mojo and IMDb for every release ID of one year.

    Reads the IDs from ``Box Office Mojo IDs/<year>.txt`` and, for each one,
    fills a row of the module-level DataFrame ``df`` with values parsed from
    the Box Office Mojo release page, the IMDb title page and the IMDb full
    credits page.

    Rows are addressed by the ID's position in the file (0, 1, ...), so any
    rows already stored in ``df`` under those labels are overwritten cell by
    cell; cells that the current movie does not provide are left as they are.

    A failure while scraping one movie is reported with ``print`` and the
    loop continues with the next ID; values written before the failure stay
    in ``df``.

    Parameters
    ----------
    year : int or str
        Year whose ID file is read; also stored in the ``Year`` column.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.
    """
    global df
    # ndmin=1 keeps a one-line ID file iterable (loadtxt would otherwise
    # return a 0-d array, which cannot be enumerated).
    ids = np.loadtxt('Box Office Mojo IDs/%s.txt' % year, dtype=str, ndmin=1)
    for index, ID in enumerate(ids):
        print(index, ':', ID)
        df.loc[index, "Mojo ID"] = ID
        # Reset per movie so that a page without an IMDbPro row can never
        # reuse the IMDb ID of the previously scraped movie.
        imdb_id = None
        try:
            # ---------------- Box Office Mojo release page ----------------
            url = 'https://www.boxofficemojo.com/release/rl%s/' % ID
            page = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(page.text, 'html.parser')

            # The first amount in the performance summary is stored as the
            # domestic gross, the last one as the worldwide gross.
            moneys = soup.find('div', attrs={'class': 'a-section a-spacing-none mojo-performance-summary-table'}).find_all('span', attrs={'class': 'money'})
            df.loc[index, 'Domestic Gross ($)'] = _money_to_int(moneys[0].string)
            df.loc[index, 'Worldwide Gross ($)'] = _money_to_int(moneys[-1].string)

            # Distributor, opening, budget, release date, MPAA, genres,
            # days in release, widest release and the IMDb ID. Each summary
            # row holds its label in the first <span> and its value in the
            # second.
            summary = soup.find('div', attrs={'class': 'a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile'})
            for item in summary.find_all('div', attrs={'class': 'a-section a-spacing-none'}):
                spans = item.find_all('span')
                name = spans[0].string.strip()

                if name == 'Distributor':
                    # The distributor name is the text in front of the <br>.
                    dist = spans[1].find('br')
                    if dist:
                        df.loc[index, 'Distributor'] = "".join(dist.previous_siblings)

                elif name == 'Opening':
                    opening = spans[1].find('span')
                    if opening:
                        df.loc[index, 'Opening ($)'] = _money_to_int(opening.string)
                    # The theater count is the first number after the <br>.
                    opening = spans[1].find('br')
                    if opening:
                        df.loc[index, 'Opening Theaters'] = int(re.findall(r'\d+', "".join(opening.next_siblings).replace(' ', '').replace(',', ''))[0])

                elif name == 'Budget':
                    budget = spans[1].find('span')
                    if budget:
                        df.loc[index, 'Budget'] = _money_to_int(budget.string)

                elif name == 'Release Date':
                    release = spans[1].find('a')
                    if release:
                        df.loc[index, 'Release Date'] = release.string.strip()

                elif name == 'MPAA':
                    df.loc[index, 'MPAA'] = spans[1].string.strip()

                elif name == 'Genres':
                    df.loc[index, 'Genres'] = spans[1].string.strip()

                elif name == 'In Release':
                    df.loc[index, 'In Realease (Days)'] = int(re.search(r'(?<=)(.*)(?= days)', spans[1].string.strip()).group().replace(',', ''))

                elif name == 'Widest Release':
                    df.loc[index, 'Widest Release'] = int(re.findall(r'\d+', spans[1].string.strip().replace(',', ''))[0])

                elif name == 'IMDbPro':
                    # The IMDb title ID sits between the pro.imdb.com prefix
                    # and the "ref" query parameter of the link.
                    imdb_id = re.search(r'(?<=https://pro.imdb.com/title/)(.*)(?=ref)', spans[1].find('a')['href']).group().replace('?', '')
                    df.loc[index, 'IMDB ID'] = imdb_id

            df.loc[index, 'title'] = soup.find('h1', attrs={'class': 'a-size-extra-large'}).string
            df.loc[index, 'Plot Outline'] = soup.find('p', attrs={'class': 'a-size-medium'}).string
            df.loc[index, 'Year'] = year

            if imdb_id is None:
                # Without an IMDb ID none of the IMDb pages can be addressed.
                print('oops!', ID, 'no IMDb ID found on the Box Office Mojo page')
                continue

            # ---------------------- IMDb title page -----------------------
            url = 'https://www.imdb.com/title/%s/' % imdb_id
            page = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(page.text, 'html.parser')

            df.loc[index, 'IMDB score'] = float(soup.find('span', attrs={'itemprop': 'ratingValue'}).string)
            df.loc[index, 'IMDB votes'] = int(soup.find('span', attrs={'class': 'small', 'itemprop': 'ratingCount'}).string.replace(',', ''))

            # The Metascore <div> carries the verdict in its class name, so
            # each variant is looked up in turn until one is found.
            for verdict in ('favorable', 'mixed', 'unfavorable'):
                meta = soup.find('div', attrs={'class': 'metacriticScore score_%s titleReviewBarSubItem' % verdict})
                if meta:
                    df.loc[index, 'Metascore'] = int(meta.find('span').string)
                    break

            # Text of every link in the review bar item, one per line.
            users = soup.find('div', attrs={'class': 'titleReviewBarItem titleReviewbarItemBorder'})
            if users:
                users = users.find_all('a')
                if users:
                    df.loc[index, 'Meta Users'] = ''.join(user.string + '\n' for user in users)

            # Country, language and runtime from the "titleDetails" section.
            details = soup.find('div', attrs={'class': 'article', 'id': 'titleDetails'}).find_all('div', attrs={'class': 'txt-block'})
            found = 0
            for item in details:
                heading = item.find('h4')
                if not heading:
                    continue
                label = heading.string
                if label == 'Country:':
                    found += 1
                    countries = item.find_all('a')
                    df.loc[index, 'Country'] = '\n'.join(country.string.strip() for country in countries).strip()

                elif label == 'Language:':
                    found += 1
                    # Only the first listed language is stored.
                    if item.find('a'):
                        df.loc[index, 'Language'] = item.find('a').string.strip()

                elif label == 'Runtime:':
                    found += 1
                    if item.find('time'):
                        df.loc[index, 'Running time (min)'] = int(re.findall(r'\d+', item.find('time').string)[0])

                # Stop scanning once all three headings have been seen.
                if found == 3:
                    break

            # -------------------- IMDb full credits page ------------------
            url = 'https://www.imdb.com/title/%s/fullcredits?ref_=tt_ov_wr#writers/' % imdb_id
            page = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(page.text, 'html.parser')

            cast = soup.find('table', attrs={'class': 'cast_list'})
            if cast:
                rows = cast.find_all('tr', limit=6)
                if rows:
                    names = []
                    # The first row is skipped (presumably a header or spacer
                    # row -- TODO confirm), leaving at most five cast rows.
                    for row in rows[1:]:
                        cell = row.find('td')
                        anchor = cell.find('a') if cell else None
                        if anchor:
                            # The name is read from the title="..." attribute
                            # in the rendered markup of the row's first link.
                            match = re.search(r'(?<=title=")(.*)(?=" )', str(anchor))
                            if match:
                                names.append(match.group().strip())
                    df.loc[index, 'Stars'] = '\n'.join(names).strip()

            credits = soup.find('div', attrs={'id': 'fullcredits_content', 'class': 'header'})
            if credits:
                # Each credit list is the sibling element right after its
                # <h4> heading; the link texts are concatenated as they are.
                directors = credits.find('h4', id="director")
                if directors:
                    directors = directors.find_next_sibling().find_all('a')
                    df.loc[index, 'Director(s)'] = ''.join(director.string for director in directors).strip()

                writers = credits.find('h4', id="writer")
                if writers:
                    writers = writers.find_next_sibling().find_all('a')
                    df.loc[index, 'Writer(s)'] = ''.join(writer.string for writer in writers).strip()

                producers = credits.find('h4', id="producer")
                if producers:
                    producers = producers.find_next_sibling().find_all('a')
                    df.loc[index, 'Producer(s)'] = ''.join(producer.string for producer in producers).strip()

        except Exception as err:
            # Best effort: report the movie and its error, then move on.
            # Ctrl-C (KeyboardInterrupt) is deliberately not caught here.
            print('oops!', ID, repr(err))

# Column layout of every per-year table.
columns=['IMDB ID','Mojo ID','title','Genres','Year','Domestic Gross ($)',
        'Worldwide Gross ($)','Opening ($)','Budget','Opening Theaters',
        'Release Date','MPAA','In Realease (Days)','Widest Release','Stars','Director(s)',
        'Writer(s)','Producer(s)','Running time (min)','IMDB score','IMDB votes','Metascore',
        'Meta Users','Country','Language','Distributor','Plot Outline']

# Scrape every year and save one CSV per year.
for year in range(2011,2020):
    print('###### %s #######'%year)
    # Start each year from an empty table: data() writes its rows at labels
    # 0, 1, ... for every year, so reusing one DataFrame would leave rows and
    # cell values from earlier years inside the later years' CSV files.
    df=pd.DataFrame(columns=columns)
    data(year)
    df.to_csv('Mojo tables/%s.csv'%year)

###### 2011 #######
0 : 1265337857
1 : 2977400321
2 : 3292956161
3 : 408323585
4 : 4117267969
5 : 877889025
6 : 3091760641
7 : 3094644225
8 : 225216001
9 : 1900578305
10 : 626624001
11 : 2000848385
12 : 3579610625
13 : 1434093057
14 : 1214940673
15 : 292324865
16 : 292587009
17 : 2255652353
18 : 3614344705
19 : 3479864833
20 : 1365411329
21 : 3042739713
22 : 576226817
23 : 1280804353
24 : 391874049
25 : 2624161281
26 : 2456520193
27 : 3564602881
28 : 1095075329
29 : 2085455361
30 : 1515488769
31 : 1247249921
32 : 3713959425
33 : 1766032897
34 : 275613185
35 : 1515030017
36 : 3225781761
37 : 779650561
38 : 3445982721
39 : 880641537
40 : 139232769
41 : 1668777473
42 : 3595208193
43 : 459638273
44 : 927434241
45 : 810452481
46 : 2388297217
47 : 878020097
48 : 4234511873
49 : 2286388737
50 : 929203713
51 : 3631646209
52 : 1849722369
53 : 2086045185
54 : 2121827841
55 : 3831399937
56 : 2825029121
57 : 810911233
58 : 3428353537
59 : 4033906177
60 : 1213433345
61 : 4000810497
62 : 3428550145


In [6]:
df

Unnamed: 0,IMDB ID,Mojo ID,title,Genres,Year,Domestic Gross ($),Worldwide Gross ($),Opening ($),Budget,Opening Theaters,...,Producer(s),Running time (min),IMDB score,IMDB votes,Metascore,Meta Users,Country,Language,Distributor,Plot Outline
0,tt0499549,876971521,Avatar,Action\n \n Adventure\n \n ...,2010,749766139,2744336793,77025481,237000000,3452,...,Brooke Breton\n James Cameron\n Laeta Kalogrid...,162,7.8,1122652,83,"3,483 user\n536 critic\n",USA,English,Twentieth Century Fox,A paraplegic Marine dispatched to the moon Pan...
1,tt0435761,1383564801,Toy Story 3,Adventure\n \n Animation\n \n ...,2010,415004880,1066969703,110307189,200000000,4028,...,Darla K. Anderson\n John Lasseter\n Nicole Par...,103,8.2,760625,92,912 user\n479 critic\n,USA,English,Walt Disney Studios Motion Pictures,The toys are mistakenly delivered to a day-car...
2,tt1014759,3393226241,Alice in Wonderland,Adventure\n \n Family\n \n ...,2010,334191110,1025467110,116101023,200000000,3728,...,Katterli Frauenfelder\n Derek Frey\n Chris Leb...,108,6.4,385465,53,802 user\n459 critic\n,USA\nUK,English,Walt Disney Studios Motion Pictures,Nineteen-year-old Alice returns to the magical...
3,tt1228705,1515881985,Iron Man 2,Action\n \n Adventure\n \n ...,2010,312433331,623933331,128122480,200000000,4380,...,Victoria Alonso\n Louis D'Esposito\n Susan Dow...,124,7,726511,57,911 user\n498 critic\n,USA,English,Paramount Pictures,With the world now aware of his identity as Ir...
4,tt1325004,659654145,The Twilight Saga: Eclipse,Action\n \n Adventure\n \n ...,2010,300531751,698491347,64832191,68000000,4468,...,Bill Bannerman\n Marty Bowen\n Wyck Godfrey\n ...,124,5,229111,58,528 user\n293 critic\n,USA,English,Summit Entertainment,As a string of mysterious killings grips Seatt...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,tt0975684,2774304257,Kisses,Drama,2010,81680,96933,15485,,2,...,Lance Daly\n Donna Eperon\n Tomas Eskilsson\n ...,72,7.3,1790,68,16 user\n57 critic\n,Ireland\nSweden,English,Oscilloscope,"Two kids, Dylan and Kylie, run away from home ..."
400,tt1159961,4000941569,Waking Sleeping Beauty,Documentary,2010,80741,84918,33115,,5,...,Don Hahn\n Connie Nartonis Thompson\n Peter Sc...,86,7.6,2719,70,9 user\n54 critic\n,USA,English,Walt Disney Studios Motion Pictures,"The story of the Disney Renaissance, an incred..."
401,tt1247644,810124801,Breaking Upwards,Romance,2010,77389,77389,15467,15000,1,...,Peter Duchan\n Sheena Lister\n Zoe Lister-Jone...,,6.2,1320,56,9 user\n23 critic\n,USA,English,IFC Films,A young New York couple intricately strategize...
402,tt1431181,323126785,Another Year,Comedy\n \n Drama,2010,3205706,19722766,111869,8000000,6,...,Danielle Brandon\n Gail Egan\n Georgina Lowe\n...,129,7.3,28025,80,148 user\n255 critic\n,UK\nUSA,English,Sony Pictures Classics,A look at four seasons in the lives of a happi...
