# Using BeautifulSoup to scrape data from BoxOfficeMojo

In [305]:
from __future__ import print_function, division
import re
import time
import numpy as np
import pandas as pd
import requests
import sys
import random
from multiprocessing import Pool
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import dateutil.parser


## GET LINKS 1995-2017

In [357]:
# Collect the detail-page URL of every movie linked from the yearly charts.
# The list is created here so the cell works on a fresh kernel and re-running
# it never appends duplicate links to a previous run's results.
link_list = []

# Constructing UserAgent is loop-invariant: build it once and only draw a new
# random User-Agent string for each request.
ua = UserAgent()

# hrefs of individual movie pages. Raw string so that `\?` is a regex escape
# rather than an invalid Python string escape.
movie_href = re.compile(r'/movies/\?id=')

for y in range(1995, 2018):      # release years 1995..2017
    for x in range(1, 6):        # chart pages 1..5 of each year
        url = 'http://www.boxofficemojo.com/yearly/chart/?page={}&view=releasedate&view2=domestic&yr={}&p=.htm'.format(x, y)
        user_agent = {'User-agent': ua.random}
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=user_agent, timeout=30)
        page = response.text
        soup = BeautifulSoup(page, "lxml")

        # The chart is taken from the fifth <table> on the page.
        tables = soup.find_all('table')
        table = tables[4]

        for link in table.find_all(href=movie_href):
            link_list.append('http://www.boxofficemojo.com{}&adjust_yr=1&p=.htm'.format(link['href']))



In [359]:
# Display the collected movie-page URLs (a bare last expression is rendered as the cell output).
link_list

['http://www.boxofficemojo.com/movies/?id=toystory.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=batmanforever.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=apollo13.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=pocahontas.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=aceventura2.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=goldeneye.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=jumanji.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=casper.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=seven.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=diehardwithavengeance.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=crimsontide.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=waterworld.htm&adjust_yr=1&p=.htm',
 'http://www.boxofficemojo.com/movies/?id=dangerousminds.htm&adju

In [358]:
# Number of movie-page URLs collected by the crawl of the yearly charts.
len(link_list)

10325

### Creating functions to automate the scraping process: BeautifulSoup goes through every previously scraped link and extracts the same set of fields for each movie.

In [236]:
#Function to find information in the html of each movie page
def get_movie_value(soup, field):
    """Return the text of the element that follows the first string matching `field`.

    Parameters
    ----------
    soup : object with a BeautifulSoup-style ``find`` method
        Parsed page to search.
    field : str
        Regular-expression pattern (compiled with ``re.compile``) that is
        matched against the strings in the page.

    Returns
    -------
    str or None
        ``.text`` of the next sibling of the matched string, or None when
        either the string or its sibling cannot be found.
    """
    # `string=` is the current name of the argument formerly called `text=`.
    label = soup.find(string=re.compile(field))
    if not label:
        return None
    # find_next_sibling is the current spelling of the findNextSibling alias.
    value_tag = label.find_next_sibling()
    if not value_tag:
        return None
    return value_tag.text

In [264]:
# Spot-check the helper on the page currently held in `soup`: look up the 'Domestic Total' field.
tickets = get_movie_value(soup,'Domestic Total')
print(tickets)

42,036,600


In [265]:
# Spot-check: raw 'Runtime' text from the page in `soup` (not yet converted to minutes).
runtime = get_movie_value(soup,'Runtime')
print(runtime)

1 hrs. 49 min.


In [266]:
# Spot-check: 'MPAA Rating' text from the page in `soup`.
rating = get_movie_value(soup,'MPAA Rating')
print(rating)

PG


In [267]:
# Spot-check: raw 'Release Date' text from the page in `soup` (still a string here).
release_date = get_movie_value(soup,'Release Date')
print(release_date)

March 5, 2010


In [268]:
# Spot-check: 'Distributor' text from the page in `soup`.
distributor = get_movie_value(soup,'Distributor')
print(distributor)

Buena Vista


In [269]:
# Spot-check: 'Production Budget' text from the page in `soup`, kept as the raw string.
production_budget = get_movie_value(soup,'Production Budget')
print(production_budget)

$200 million


In [270]:
#similar to the above function, but searches by the 'td' tag
def get_movie_value_td(soup, field_name):
    """Return the text of the element that follows the first ``<td>`` matching `field_name`.

    Parameters
    ----------
    soup : object with a BeautifulSoup-style ``find`` method
        Parsed page to search.
    field_name : str
        Regular-expression pattern (compiled with ``re.compile``) that the
        ``<td>`` element's string must match.

    Returns
    -------
    str or None
        ``.text`` of the next sibling of the matched ``<td>``, or None when
        either the cell or its sibling cannot be found.
    """
    # `string=` is the current name of the argument formerly called `text=`.
    label_cell = soup.find('td', string=re.compile(field_name))
    if not label_cell:
        return None
    # find_next_sibling is the current spelling of the findNextSibling alias.
    value_cell = label_cell.find_next_sibling()
    if not value_cell:
        return None
    return value_cell.text

In [318]:
def get_title(soup):
    """Return the movie title found on a parsed movie page.

    The title is read from the first ``<font face="Verdana">`` element inside
    the first ``<td valign="top">`` cell of `soup`.

    Returns
    -------
    str or None
        The element's text, or None when either element is missing. This
        matches the other lookup helpers, which also return None on a miss
        instead of raising AttributeError.
    """
    cell = soup.find('td', valign='top')
    if cell is None:
        return None
    heading = cell.find('font', face='Verdana')
    if heading is None:
        return None
    return heading.text

In [317]:
# Spot-check: title extracted from the page currently held in `soup`.
get_title(soup)

'Alice in Wonderland (2010)'

In [273]:
# Spot-check: take the 'Widest' cell's value, drop thousands separators, keep the
# part before the first 't', and trim surrounding whitespace.
# Raises AttributeError if the lookup returns None (no guard in this ad-hoc cell).
widest_release = get_movie_value_td(soup, 'Widest').replace(',','').split ('t')[0].strip()
print (widest_release)

3739


In [294]:
# Spot-check: take the 'In Release' cell's value, keep the part before the first
# 'd', and trim surrounding whitespace.
# The result is bound to the same name that is printed; previously the cell
# assigned `in_release_raw` but printed `in_release`, which only worked when a
# stale `in_release` was left over in the kernel.
in_release = get_movie_value_td(soup, 'In Release').split('d')[0].strip()
print(in_release)

126


In [353]:
#defining some more functions to clean our inputs
def to_date(datestring):
    """Parse a date string with ``dateutil.parser.parse``.

    Parameters
    ----------
    datestring : str or None
        Text to parse, e.g. the value scraped for 'Release Date'.

    Returns
    -------
    The value returned by ``dateutil.parser.parse``, or None when `datestring`
    is None or empty. The lookup helpers return None for fields that are
    missing from a page, so a missing date must not abort the scrape.

    Raises
    ------
    Whatever ``dateutil.parser.parse`` raises for a non-empty string it
    cannot interpret; such input is not silently swallowed.
    """
    if not datestring:
        return None
    return dateutil.parser.parse(datestring)

def intify(string):
    """Convert a formatted number such as ``'$42,036,600'`` to an int.

    Dollar signs and thousands separators are removed before conversion.

    Parameters
    ----------
    string : str or None
        Text to convert.

    Returns
    -------
    int or None
        The parsed integer, or None when `string` is None or contains nothing
        but whitespace and stripped symbols (a field that is absent from the page).

    Raises
    ------
    ValueError
        If non-empty text remains that is not an integer (e.g. ``'N/A'``).
    """
    if string is None:
        return None
    digits = string.replace('$', '').replace(',', '').strip()
    if not digits:
        return None
    return int(digits)

def runtime_to_minutes(runtimestring):
    """Convert a runtime such as ``'1 hrs. 49 min.'`` to total minutes.

    The string is split on whitespace; the first token is read as hours and
    the third token as minutes.

    Parameters
    ----------
    runtimestring : str or None
        Text to convert.

    Returns
    -------
    int or None
        ``hours * 60 + minutes``, or None when the input is None/empty or does
        not have the expected shape (e.g. ``'N/A'``).
    """
    # A missing field arrives as None; handle it before calling .split().
    if not runtimestring:
        return None
    tokens = runtimestring.split()
    try:
        return int(tokens[0]) * 60 + int(tokens[2])
    except (IndexError, ValueError):
        # Too few tokens or non-numeric tokens. Only these expected parse
        # failures are caught, so unrelated errors are no longer hidden.
        return None

def widest_release_split(page_soup=None):
    """Return the leading number of the 'Widest' cell as a string of digits.

    Parameters
    ----------
    page_soup : parsed page, optional
        Page to search. When omitted, the module-level `soup` is used, which
        keeps existing zero-argument calls working; passing the page
        explicitly avoids depending on that global.

    Returns
    -------
    str or None
        The cell's value with commas removed, cut at the first ``'t'`` and
        stripped of whitespace; None when the cell is missing or its value
        contains no ``'t'``.
    """
    # The global `soup` is only looked up when no page was supplied.
    source = soup if page_soup is None else page_soup
    widest_release_raw = get_movie_value_td(source, 'Widest')
    if not widest_release_raw or 't' not in widest_release_raw:
        return None
    return widest_release_raw.replace(',', '').split('t')[0].strip()

def in_release_split(page_soup=None):
    """Return the leading number of the 'In Release' cell as a string.

    Parameters
    ----------
    page_soup : parsed page, optional
        Page to search. When omitted, the module-level `soup` is used, which
        keeps existing zero-argument calls working; passing the page
        explicitly avoids depending on that global.

    Returns
    -------
    str or None
        The cell's value cut at the first ``'d'`` and stripped of whitespace;
        None when the cell is missing or its value contains no ``'d'``.
    """
    # The global `soup` is only looked up when no page was supplied.
    source = soup if page_soup is None else page_soup
    in_release_raw = get_movie_value_td(source, 'In Release')
    if not in_release_raw or 'd' not in in_release_raw:
        return None
    return in_release_raw.split('d')[0].strip()

In [310]:
# End-to-end check on the page currently held in `soup`: run every lookup and
# its cleaning helper once, printing each result.
release_date_raw = get_movie_value(soup, 'Release Date')
release_date = to_date(release_date_raw)
# Print the value just computed; this line used to print `release`, a name
# that is not defined anywhere in the notebook.
print(release_date)

tickets_raw = get_movie_value(soup, 'Domestic Total')
tickets = intify(tickets_raw)
print(tickets)

runtime_raw = get_movie_value(soup, 'Runtime')
runtime = runtime_to_minutes(runtime_raw)
print(runtime)

rating = get_movie_value(soup, 'MPAA Rating')
print(rating)

distributor = get_movie_value(soup, 'Distributor')
print(distributor)

# Kept as the raw scraped string; no numeric conversion is applied.
production_budget = get_movie_value(soup, 'Production Budget')
print(production_budget)

title = get_title(soup)
print(title)


widest_release = widest_release_split()
print(widest_release)


in_release = in_release_split()
print(in_release)

2010-03-05 00:00:00
42036600
109
PG
Buena Vista
$200 million
Alice in Wonderland (2010)
3739
126


In [360]:
# Accumulator for the per-movie records appended by the scraping loop.
# It is created here so the notebook runs on a fresh kernel.
# Re-running this cell discards any records collected so far.
movie_data = []
movie_data

[]

In [425]:
    # Scrape each movie page and append one record per movie to `movie_data`.

    # Offset into `link_list` at which this run starts.
    # NOTE(review): presumably the resume point after an earlier interrupted
    # run; set to 0 to scrape every link.
    start_index = 10122

    # Loop-invariant objects are built once, not once per movie.
    ua = UserAgent()
    headers = ['movie title', 'total tickets',
               'release date', 'runtime (mins)', 'rating', 'distributor', 'budget', 'widest release', 'in_release']

    for link in link_list[start_index:]:
        url = link
        user_agent = {'User-agent': ua.random}
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=user_agent, timeout=30)
        page = response.text
        # Must stay bound to the name `soup`: widest_release_split() and
        # in_release_split() below read the page from that module-level name.
        soup = BeautifulSoup(page, "lxml")

        release_date_raw = get_movie_value(soup, 'Release Date')
        release_date = to_date(release_date_raw)
        print(release_date)

        tickets_raw = get_movie_value(soup, 'Domestic Total')
        tickets = intify(tickets_raw)
        print(tickets)

        runtime_raw = get_movie_value(soup, 'Runtime')
        runtime = runtime_to_minutes(runtime_raw)
        print(runtime)

        rating = get_movie_value(soup, 'MPAA Rating')
        print(rating)

        distributor = get_movie_value(soup, 'Distributor')
        print(distributor)

        # Stored as the raw scraped string (e.g. '$65 million' or 'N/A').
        production_budget = get_movie_value(soup, 'Production Budget')
        print(production_budget)

        title = get_title(soup)
        print(title)

        widest_release = widest_release_split()
        print(widest_release)

        in_release = in_release_split()
        print(in_release)

        # Values are listed in the same order as `headers`.
        movie_dict = dict(zip(headers, [title,
                                        tickets,
                                        release_date,
                                        runtime,
                                        rating, distributor, production_budget, widest_release, in_release]))
        movie_data.append(movie_dict)



2017-09-08 00:00:00
42400
106
PG-13
IFC
N/A
Rebel in the Rye
82
56
2017-04-19 00:00:00
40700
140
Unrated
Fathom
N/A
Boston:An American Running Story
481
14
2017-01-27 00:00:00
41000
108
Not Yet Rated
Well Go USA
$65 million
Kung Fu Yoga
27
28
2017-10-21 00:00:00
40300
60
Unrated
Fathom
N/A
Disney Junior HalloVeen Party
396
6
2017-06-02 00:00:00
40200
95
Unrated
Vitagraph
N/A
Letters from Baghdad
20
101
2017-01-13 00:00:00
40300
135
Unrated
FIP
N/A
Ok Jaanu
121
14
2017-05-25 00:00:00
39300
241
R
Abramorama
N/A
Long Strange Trip - The Untold Story of The Grateful Dead
56
8
2017-09-29 00:00:00
39100
123
Unrated
Abramorama
N/A
Pearl Jam - Let's Play Two
201
17
2017-10-13 00:00:00
37900
116
Unrated
Kino Lorber
N/A
Tom of Finland
12
108
2017-05-06 00:00:00
38400
None
Unrated
Fathom
N/A
Canelo vs. Chavez Jr.
273
2
2017-05-05 00:00:00
37700
105
Unrated
Well Go USA
N/A
This Is Not What I Expected
36
56
2017-12-01 00:00:00
35700
106
PG-13
Warner Bros.
N/A
Dunkirk (Re-Issue 12/1)
53
None
2017-10-

2017-07-21 00:00:00
11200
140
Unrated
Eros
N/A
Munna Michael
75
7
2017-10-20 00:00:00
10700
90
Not Yet Rated
Music Box Films
N/A
Aida's Secrets
9
101
2017-03-02 00:00:00
11000
94
PG-13
Purdie Distribution
N/A
Tim Timmerman, Hope of America
12
56
2017-09-08 00:00:00
10900
128
PG-13
Sony / Columbia
N/A
Poster Boys
50
14
2017-04-28 00:00:00
10800
126
Unrated
Well Go USA
N/A
The Mayor
29
28
2017-07-21 00:00:00
10700
99
Unrated
CFI Releasing
N/A
The Fencer
7
182
2017-10-31 00:00:00
10500
89
R
Sony / Screen Gems
N/A
Keep Watching
805
1
2017-09-08 00:00:00
10200
118
Unrated
Well Go USA
N/A
Memoir of a Murderer
8
42
2017-04-28 00:00:00
10200
114
Unrated
CJ Entertainment
N/A
The King's Case Note
7
49
2017-09-15 00:00:00
10200
107
PG-13
ArtAffects
N/A
Because of Gracia
32
28
2017-10-06 00:00:00
10100
102
Unrated
The Orchard
N/A
Dina
24
49
2017-10-13 00:00:00
10000
96
Unrated
Janus Films
N/A
Night of the Living Dead (2017 4k Restoration)
8
28
2017-06-16 00:00:00
9400
130
Unknown
FIP
N/A
Warriors 

## Putting all the data into a dataframe

In [426]:
# Tabulate the scraped records; the explicit column list fixes the column order.
pd.DataFrame(
    data=movie_data,
    columns=[
        'movie title',
        'total tickets',
        'release date',
        'runtime (mins)',
        'rating',
        'distributor',
        'budget',
        'widest release',
        'in_release',
    ],
)

Unnamed: 0,movie title,total tickets,release date,runtime (mins),rating,distributor,budget,widest release,in_release
0,Toy Story,43868300,1995-11-22,81.0,G,Buena Vista,,2574,
1,Batman Forever,42306000,1995-06-16,122.0,PG-13,Warner Bros.,$100 million,2893,147
2,Apollo 13,39556600,1995-06-30,140.0,PG,Universal,,2347,
3,Pocahontas,32547100,1995-06-16,81.0,G,Buena Vista,,2757,
4,Ace Ventura:When Nature Calls,24898500,1995-11-10,94.0,PG-13,Warner Bros.,,2705,
5,GoldenEye,24403900,1995-11-17,130.0,PG-13,MGM,,2667,
6,Jumanji,22876600,1995-12-15,104.0,PG,Sony / Columbia,$65 million,2530,
7,Casper,23064000,1995-05-26,101.0,PG,Universal,,2757,
8,Seven,22961100,1995-09-22,127.0,R,New Line,$33 million,2528,
9,Die Hard:With A Vengeance,22991400,1995-05-19,131.0,R,Fox,$90 million,2579,


In [427]:
# Build the same table from the scraped records and write it to movies1.csv.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; confirm downstream readers expect that before changing it.
pd.DataFrame(
    data=movie_data,
    columns=[
        'movie title',
        'total tickets',
        'release date',
        'runtime (mins)',
        'rating',
        'distributor',
        'budget',
        'widest release',
        'in_release',
    ],
).to_csv('movies1.csv')