# IMDB Crawler

In [1]:
from urllib.parse import quote
from urllib.request import urlopen

from bs4 import BeautifulSoup as bs

## Search

In [2]:
# Search URL template: endpoint plus fixed query parameters. The `{title}`
# placeholder is filled in later with str.format.
TITLE_QUERY = '?'.join([
    'http://www.imdb.com/find',
    '&'.join(['q={title}', 's=tt', 'ttype=ft', 'exact=true', 'ref_=fn_tt_ex']),
])

In [3]:
# Title to look up; every later cell derives its data from this value.
movie_name = "The Matrix"

In [4]:
def convert_title(title):
    """Return `title` lower-cased and percent-encoded for use in a URL query.

    Spaces become ``%20`` as before. Every other character that is not safe
    inside a query value (``&``, ``?``, ``#``, ``/``, non-ASCII letters, ...)
    is now escaped too, so such titles no longer corrupt the query string.

    Args:
        title: The movie title as plain text.

    Returns:
        The encoded title, e.g. ``'the%20matrix'`` for ``'The Matrix'``.
    """
    # Lower-case first so the hex digits of the escapes keep quote()'s casing.
    return quote(title.lower(), safe='')

In [5]:
convert_title(movie_name)

'the%20matrix'

In [6]:
# Fill the search template with the encoded title.
query = TITLE_QUERY.format(title=convert_title(movie_name))
# Parse inside a `with` block so the HTTP response is closed as soon as
# BeautifulSoup has consumed it, instead of leaving the connection open.
with urlopen(query) as response:
    search_res = bs(response, "html.parser")

In [7]:
# First table carrying the "findList" class on the search page.
res_table = search_res.find_all("table", class_="findList", limit=1)[0]

In [8]:
# Only the top row of the result table is needed.
first_row = res_table.find_all("tr", limit=1)[0]

### Extracting the movie code

In [9]:
import re

In [10]:
# Captures the code between "/title/" and the next "/" (lowercase letters and digits only).
MOVIE_CODE_REGEX = r'/title/([a-z0-9]+)/'

In [11]:
# Take the first title code found in the HTML of the top search-result row.
MOVIE_CODE = re.findall(MOVIE_CODE_REGEX, str(first_row))[0]

In [12]:
MOVIE_CODE

'tt0133093'

## Movie Profile

In [137]:
# Template for a title's main page; `{code}` is filled in with str.format.
# NOTE(review): "?region=us" looks like a disabled query-string suffix — confirm before re-enabling.
PROFILE_URL = 'http://www.imdb.com/title/{code}/' #?region=us

In [138]:
# Fill the profile template with the movie code extracted from the search result.
cur_profile_url = PROFILE_URL.format(code=MOVIE_CODE)

In [139]:
# Parse inside a `with` block so the HTTP response is closed as soon as
# BeautifulSoup has consumed it, instead of leaving the connection open.
with urlopen(cur_profile_url) as response:
    prof_page = bs(response, "html.parser")

### Rating

In [16]:
# The first itemprop="ratingValue" span holds the rating as text.
rating = float(
    prof_page.find_all("span", itemprop="ratingValue", limit=1)[0].contents[0]
)

In [64]:
# The vote count is rendered with thousands separators; strip them before
# converting to int.
rating_count = int(
    prof_page.find_all("span", itemprop="ratingCount", limit=1)[0]
    .contents[0]
    .replace(',', '')
)

### Genres

In [131]:
# Genre names scraped from the profile page; populated in the following cell.
genres = []

In [132]:
# Rebuild the list from scratch rather than appending, so re-running this
# cell does not add duplicate genres to a list left over from an earlier run.
genres = [
    span.contents[0]
    for span in prof_page.find_all("span", {"itemprop": "genre"})
]

In [133]:
genres

['Action', 'Sci-Fi']

### Review counts

In [72]:
# Captures a count (digits and commas) followed by a space and one alphabetic word.
REVIEW_COUNT_REGEX = r'([0-9,]+) ([a-zA-Z]+)'

In [78]:
# Both counters default to zero and keep that value if the page lists no
# matching review span.
user_review_count = critic_review_count = 0

In [87]:
# Each itemprop="reviewCount" span is parsed into (count, kind); the count is
# routed to the matching counter and any other kind is ignored.
for span in prof_page.find_all("span", itemprop="reviewCount"):
    span_str = span.contents[0]
    res = re.findall(REVIEW_COUNT_REGEX, span_str)[0]
    if res[1] not in ('user', 'critic'):
        continue
    if res[1] == 'user':
        user_review_count = int(res[0].replace(',', ''))
    else:
        critic_review_count = int(res[0].replace(',', ''))

In [88]:
user_review_count

3673

In [89]:
critic_review_count

315

### Metascore

In [17]:
# Match on the "metacriticScore" class alone. The previous selector also
# required "score_favorable", so any title whose score block did not carry
# that exact class string raised IndexError. BeautifulSoup matches a single
# class name against each of the tag's CSS classes.
metascore = int(
    prof_page.find_all("div", {"class": "metacriticScore"})[0]
    .contents[1]
    .contents[0]
)

### Year

In [18]:
# The year text sits inside the second child of the span with id "titleYear".
year = int(
    prof_page.find_all("span", id="titleYear", limit=1)[0]
    .contents[1]
    .contents[0]
)

In [19]:
year

1999

### Duration

In [44]:
# Captures the digits between "PT" and "M" in the duration string.
# NOTE(review): a value with an hour component (e.g. "PT2H16M") would not
# match this pattern — confirm the page only ever emits whole minutes.
MOVIE_DURATION_REGEX = r'PT([0-9]+)M'

In [45]:
# Read the `datetime` attribute of the first <time itemprop="duration"> tag.
duration_str = prof_page.find_all("time", itemprop="duration", limit=1)[0]['datetime']

In [50]:
# The first captured group is the number of minutes.
duration_in_minutes = int(re.findall(MOVIE_DURATION_REGEX, duration_str)[0])

In [51]:
duration_in_minutes

136

### Box office section

In [223]:
# Captures everything after the "Box Office" <h3> heading up to the next "<h3".
# A following <h3 is required, so nothing matches if none comes after it.
BOX_CONTENT_REGEX = r"<h3.*>Box Office</h3>([\s\S]+?)<h3"

In [224]:
# Run the pattern over the whole page's HTML and keep the first match.
box_contents = re.findall(BOX_CONTENT_REGEX, str(prof_page))[0]

In [225]:
box_contents

'\n<div class="txt-block">\n<h4 class="inline">Budget:</h4>        $63,000,000        \n\n      <span class="attribute">(estimated)</span>\n</div>\n<div class="txt-block">\n<h4 class="inline">Opening Weekend:</h4>         $27,788,331        \n\n      (USA)\n      <span class="attribute">(2 April 1999)</span>\n</div>\n<div class="txt-block">\n<h4 class="inline">Gross:</h4>        $171,383,253        \n\n      <span class="attribute">(USA)</span>\n<span class="attribute">(17 September 1999)</span>\n</div>\n<span class="see-more inline">\n<a href="business?ref_=tt_dt_bus" itemprop="url">See more</a>\xa0»\n  </span>\n<hr/>\n'

#### Budget

In [226]:
# Captures the dollar amount (digits and commas) that follows the "Budget:" <h4> label.
BUDGET_REGEX = r"<h4.*>Budget:</h4>\s*\$([0-9,]+)"

In [233]:
# Drop the thousands separators before converting to int.
budget = int(re.findall(BUDGET_REGEX, box_contents)[0].replace(',', ''))

In [234]:
budget

63000000

#### Opening Weekend

In [254]:
from datetime import datetime

In [251]:
# Captures the first parenthesised alphanumeric text (the date) after the
# "(USA)" marker that follows the "Opening Weekend:" <h4> label.
# The former trailing `[\s\S]*?<h4` is removed: it did not affect the captured
# text, but made the pattern fail whenever no further <h4> came after the date.
# The pattern now has the same shape as GROSS_DATE_REGEX.
OPEN_DATE_REGEX = r"<h4.*>Opening Weekend:</h4>[\s\S]*?\(USA\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)"

In [252]:
# First captured date text from the box-office section.
open_date_str = re.findall(OPEN_DATE_REGEX, box_contents)[0]

In [256]:
# Dates look like "2 April 1999": day, full month name, four-digit year.
open_date = datetime.strptime(open_date_str, "%d %B %Y").date()

In [257]:
open_date

datetime.date(1999, 4, 2)

In [240]:
# Captures the dollar amount after the "Opening Weekend:" label; a "(USA)"
# marker must appear somewhere after the amount for the pattern to match.
OPEN_PROF_REGEX = r"<h4.*>Opening Weekend:</h4>\s*\$([0-9,]+)[\s\S]*?\(USA\)"

In [241]:
# NOTE: despite the name, this is simply the dollar figure listed under
# "Opening Weekend:", with thousands separators removed.
opening_weekend_profit = int(re.findall(OPEN_PROF_REGEX, box_contents)[0].replace(',', ''))

In [242]:
opening_weekend_profit

27788331

#### Gross

In [262]:
# Captures the first parenthesised alphanumeric text (the date) after the
# "(USA)" marker that follows the "Gross:" <h4> label.
GROSS_DATE_REGEX = r"<h4.*>Gross:</h4>[\s\S]*?\(USA\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)"

In [263]:
# First captured date text for the gross figure.
gross_date_str = re.findall(GROSS_DATE_REGEX, box_contents)[0]

In [264]:
# Dates look like "17 September 1999": day, full month name, four-digit year.
gross_date = datetime.strptime(gross_date_str, "%d %B %Y").date()

In [265]:
gross_date

datetime.date(1999, 9, 17)

In [243]:
# Captures the dollar amount after the "Gross:" label; a "(USA)" marker must
# appear somewhere after the amount for the pattern to match.
GROSS_REGEX = r"<h4.*>Gross:</h4>\s*\$([0-9,]+)[\s\S]*?\(USA\)"

In [244]:
# Drop the thousands separators before converting to int.
gross = int(re.findall(GROSS_REGEX, box_contents)[0].replace(',', ''))

In [245]:
gross

171383253

In [140]:
# ## Business page
# BUSINESS_URL = 'http://www.imdb.com/title/{code}/business?ref_=tt_dt_bus'
# cur_business_url = BUSINESS_URL.format(code=MOVIE_CODE)
# busi_page = bs(urlopen(cur_business_url), "html.parser")
# busi_str = str(busi_page)
# #### Budget
# BUDGET_REGEX = r"<h5>Budget</h5>\n\s*\$([0-9,]+)"
# budget_dollar = int(re.findall(BUDGET_REGEX, busi_str)[0].replace(',', ''))
# ### Opening Weekend (USA)
# OPEN_WEEKEND_CONTENT_REGEX = r"<h5>Opening Weekend</h5>([\s\S]+?)<h5>"
# open_weekend_contents = re.findall(OPEN_WEEKEND_CONTENT_REGEX, busi_str)[0]
# US_OPEN_WEEKEND_REGEX = r"\$([0-9,]+)\s*\(USA\)"
# us_open_weekend = int(re.findall(US_OPEN_WEEKEND_REGEX, open_weekend_contents)[0].replace(',', ''))
# ### Gross Earnings
# GROSS_CONTENT_REGEX = r"<h5>Gross</h5>([\s\S]+?)<h5>"
# gross_contents = re.findall(GROSS_CONTENT_REGEX, busi_str)[0]
# GROSS_REGEX = r"<h5>Gross</h5>\n\s*\$([0-9,]+)\s*\(USA\)"
# gross_inc_dollar = int(re.findall(GROSS_REGEX, busi_str)[0].replace(',', ''))

## Ratings page

In [246]:
# Template for a title's ratings page; `{code}` is filled in with str.format.
RATINGS_URL = 'http://www.imdb.com/title/{code}/ratings'
cur_ratings_url = RATINGS_URL.format(code=MOVIE_CODE)
# Parse inside a `with` block so the HTTP response is closed as soon as
# BeautifulSoup has consumed it, instead of leaving the connection open.
with urlopen(cur_ratings_url) as response:
    ratings_page = bs(response, "html.parser")