In [1]:
import re
import requests
import lxml
import pandas as pd
import time
import numpy as np
from random import randint
from bs4 import BeautifulSoup as bs

In [2]:
class WeeklyCharts():
  def __init__(self, stop):
    self.prefix = 'https://www.boxofficemojo.com/weekly/'
    self.date = 2020.00
    self.stop = stop
    self.string_url = ''
    self.movie_dict = {}
    self.string_sfx = ''
    self.final_df = pd.DataFrame()

  def string_suff(self):
    # URL suffix format is a 2 digit year, W, 2 digit week; starting at 20W01
    # Weeks start on friday. Converted to a float for ease of calculations
    string_sfx = str('{:.2f}'.format(self.date))
    string_sfx = string_sfx.replace('.','W')
    self.string_sfx = string_sfx
    return (string_sfx + '/')

  def next_page(self):
    if self.date == self.stop:
      return (self.stop)
    elif round(self.date % 1, ndigits=2) == 0.52:
      self.date += 1
      self.date -= 0.51
    else:
      self.date += .01
    self.string_url = self.prefix + self.string_suff()
    return self.string_url

  def crawl(self):
    while self.date < self.stop:
      time.sleep(randint(1, 3))
      page = requests.get(self.next_page(), headers=header)
      soup = bs(page.text, 'lxml')

      table = soup.table.children
      for i in table:
        movie_name = i.find('a').text
        movie_url = i.find('a')['href']
        if movie_name not in self.movie_dict.keys():
          self.movie_dict[movie_name] = movie_url

      df = pd.read_html(page.text)
      df = df[0]
      df['YRWK'] = '{:2f}'.format(self.date)
      self.final_df = pd.concat([self.final_df, df])
      
  def save_final_df(self):
    self.final_df.to_csv(f'WeeklyChartsDataFrame.csv')


In [3]:
class MoviePage():
  def __init__(self, release, url_sfx):
    self.prefix = 'https://www.boxofficemojo.com/'
    self.name = release
    self.url_sfx = self.url_clean(url_sfx)
    self.gross_pattern  = re.compile(r'.+(mojo-performance-summary-table$)')
    self.budget_pattern  = re.compile(r'Budget')
    self.mpaa_pattern  = re.compile(r'MPAA')
    self.genre_pattern  = re.compile(r'Genres')
    self.ser = pd.Series(dtype=object)

  def __repr__(self):
    return(f'{self.name} has a url of {self.url_sfx}')

  def url_clean(self, sfx):
    clean_url = sfx.split('?')
    return clean_url[0]

  def crawl(self):
    gross_list = []
    time.sleep(randint(2, 4))
    page = requests.get(self.prefix + self.url_sfx,
                        headers={'User-Agent': 'Mozilla/5.0'})
    soup = bs(page.text, 'lxml')
    self.ser['Budget'] = self.get_budget(soup)
    self.ser['MPAA'] = self.get_mpaa(soup)
    self.ser['Genre'] = self.get_genre(soup)
    gross_list = self.get_gross(soup)
    self.ser['D.Gross'] = gross_list[0]
    self.ser['I.Gross'] = gross_list[1]
    self.ser['Total Gross'] = gross_list[2]

  def get_budget(self, soup):
    try:
      budget = soup.find(text=self.budget_pattern).next_element
      budget_amount = int(re.sub('[$,]', '', budget.text))
    except AttributeError:
      budget_amount = np.nan
    return(budget_amount)
  
  def get_mpaa(self, soup):
    try:
      mpaa = soup.find(text=self.mpaa_pattern).next_element.text
    except AttributeError:
      mpaa = self.crawl_deeper(soup)
    return mpaa.strip()
  
  def get_genre(self, soup):
    try:
      genre = soup.find(text=self.genre_pattern).next_element.text.split()
      return(genre)
    except AttributeError:
      return('attrerror')

  def get_gross(self, soup):
    try:
      # grab gross class
      grosses =soup.find(class_=movie.gross_pattern)
      money_pattern = re.compile('\$[\d+,]+|–')
      # find text matching money pattern
      grosses = grosses.find_all('span', text=money_pattern)
      # iterate through text, replacing - and removing $,
      for idx, gross in enumerate(grosses):
        if gross.text == '–':
          grosses[idx] = np.nan
          continue
        else:
          grosses[idx] = int(re.sub('[$,]', '', gross.text))
    except AttributeError:
      grosses = [np.nan, np.nan, np.nan]
    return(grosses)

  def crawl_deeper(self, soup):
    title_pattern = re.compile('.*(mojo-title-link).*')
    title_link = soup.find(class_=title_pattern)
    # href section we want is before ?ref : /title/tt0000000/?ref
    try:
      title_sfx, _ = title_link.get('href').split('?')
    except AttributeError:
      return('Not Rated')
    page = requests.get('https://pro.imdb.com/' + title_sfx, headers={'User-Agent': 'Mozilla/5.0'})
    soup = bs(page.text, 'lxml')
    rating = soup.find(id='certificate')
    if self.ser['Budget'] == np.nan:
      deeper = IMDB_crawl(title_sfx)
      self.ser['Budget'] = deeper.get_budget()
    try:
      return(rating.text)
    except AttributeError:
      return('Not Rated')


In [4]:
class IMDB_crawl():
  def __init__(self, url_sfx):
    self.url_prefix = 'https://www.imdb.com/'
    self.url_sfx = url_sfx
    self.url = self.url_prefix + self.url_sfx
  def get_budget(self):
    page = requests.get(self.url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = bs(page.text, 'lxml')
    try:
      budget_element = soup.find(attrs={'data-testid':'title-boxoffice-section'})
      budget = budget_element.find(text=re.compile(r'Budget')).children.span.text
      budget = self.clean_money(budget)
      return budget
    except AttributeError:
      return(np.nan)
  def clean_money(self, money_string):
    money_string = re.sub('[$,a-z]', '', money_string)
    return(money_string)

###MoviePage.url_clean() alternative
splitting the string at a ? should be fine because of the way ?'s are treated in html. Still, it seems a bit arbitrary. We could instead use a REGEX to identify the part we want, instead of looking for the part we do not.

In [None]:
#insert REGEX here
# /rl\d+/$

#Aggregate Weekly Performances

In [None]:
header = {'User-Agent': 'Mozilla/5.0'}
WeeklyPage = WeeklyCharts(2022.15)
WeeklyPage.crawl()
WeeklyPage.save_final_df()

breaktime!


In [None]:
url_df = pd.DataFrame.from_dict(WeeklyPage.movie_dict, orient='index')

In [None]:
url_df.to_csv('Movie_url_list.csv')

#Movie Page
Aggregate each movie's attributes and performance.

In [5]:
#load url_list
df = pd.read_csv('Movie_url_list.csv')
# Transpose
movie_dict = df.set_index('Rank').T.to_dict('records')
movie_dict = movie_dict[0]
movie_df = pd.DataFrame()

In [6]:
# loop through movie dict, creating Series objects and appending to DataFrame
series_dict = {}
for idx, movie in enumerate(movie_dict):
  movie = MoviePage(movie, movie_dict[movie])
  movie.crawl()
  series_dict[movie.name] = movie.ser
# I think a better solution to series_dict would be a numpy array/records
  print(movie.name)



Star Wars: Episode IX - The Rise of Skywalker
Jumanji: The Next Level
Little Women
Frozen II
The Grudge
Spies in Disguise
Knives Out
Uncut Gems
Bombshell
Cats
Richard Jewell
Ford v Ferrari
A Beautiful Day in the Neighborhood
Queen & Slim
Parasite
1917
Ip Man 4: The Finale
21 Bridges
Jojo Rabbit
A Hidden Life
Mystify: Michael Hutchence
Dark Waters
Merci pour tout
Midway
Black Christmas
Doctor Who Live Q&A And Screening
Harriet
Playing with Fire
Joker
Maleficent: Mistress of Evil
Just Mercy
Fantastic Fungi
No Safe Spaces
Pain and Glory
The Good Liar
The Song of Names
Zombieland: Double Tap
63 Up
Cunningham
Ashfall
Clemency
Invisible Life
Honey Boy
The Lighthouse
Charlie's Angels
Waves
Doctor Sleep
Once Upon a Time... In Hollywood
Countdown
Advocate
Judy
The White Sheik
Branagh Theatre Live: The Winter's Tale
Playmobil: The Movie
American Dharma
Synonyms
Les Misérables: The Staged Concert
Midnight Family
The Kingmaker
Linda Ronstadt: The Sound of My Voice
Varda by Agnès
Recorder: The Mari

In [7]:
movie_df = pd.concat([movie_df, pd.DataFrame.from_dict(series_dict)])

In [8]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, Budget to Total Gross
Columns: 906 entries, Star Wars: Episode IX - The Rise of Skywalker to Stanleyville
dtypes: object(906)
memory usage: 42.5+ KB


In [9]:
movie_df = movie_df.T

In [10]:
movie_df = movie_df.astype({'Budget':'float64', 'D.Gross':'float64', 'I.Gross':'float64', 'Total Gross':'float64'})

In [11]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 906 entries, Star Wars: Episode IX - The Rise of Skywalker to Stanleyville
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Budget       80 non-null     float64
 1   MPAA         906 non-null    object 
 2   Genre        906 non-null    object 
 3   D.Gross      887 non-null    float64
 4   I.Gross      617 non-null    float64
 5   Total Gross  906 non-null    float64
dtypes: float64(4), object(2)
memory usage: 81.8+ KB


In [12]:
movie_df.head()

Unnamed: 0,Budget,MPAA,Genre,D.Gross,I.Gross,Total Gross
Star Wars: Episode IX - The Rise of Skywalker,275000000.0,PG-13,"[Action, Adventure, Fantasy, Sci-Fi]",515202542.0,558941706.0,1074144000.0
Jumanji: The Next Level,125000000.0,PG-13,"[Action, Adventure, Comedy, Fantasy]",320314960.0,479744747.0,800059700.0
Little Women,40000000.0,PG,"[Drama, Romance]",108101214.0,108500000.0,216601200.0
Frozen II,150000000.0,PG,"[Adventure, Animation, Comedy, Family, Fantasy...",477373578.0,972653355.0,1450027000.0
The Grudge,10000000.0,R,"[Fantasy, Horror]",21221803.0,28289516.0,49511320.0


In [13]:
new_frame = movie_df.copy()

In [14]:
nf_transpose = new_frame.T

In [15]:
nf_transpose.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, Budget to Total Gross
Columns: 906 entries, Star Wars: Episode IX - The Rise of Skywalker to Stanleyville
dtypes: object(906)
memory usage: 42.7+ KB


Instantly, we see an issue. Our budget column only reported 80 non-null counts. Budget is one of the most important data instances we could have sequestered. We will need to investigate this.

In [16]:
nf_transpose.head(10)

Unnamed: 0,Star Wars: Episode IX - The Rise of Skywalker,Jumanji: The Next Level,Little Women,Frozen II,The Grudge,Spies in Disguise,Knives Out,Uncut Gems,Bombshell,Cats,...,The Unbearable Weight of Massive Talent,Y cómo es él?,Vivo,Petite Maman,The Duke,Unplugging,Charlotte,Hit the Road,Take Me to the River: New Orleans,Stanleyville
Budget,275000000.0,125000000.0,40000000.0,150000000.0,10000000.0,100000000.0,40000000.0,19000000.0,32000000.0,95000000.0,...,,,,,,,,,,
MPAA,PG-13,PG-13,PG,PG,R,PG,PG-13,R,R,PG,...,R,PG-13,Not Rated,PG,R,R,Not Rated,Not Rated,Not Rated,Not Rated
Genre,"[Action, Adventure, Fantasy, Sci-Fi]","[Action, Adventure, Comedy, Fantasy]","[Drama, Romance]","[Adventure, Animation, Comedy, Family, Fantasy...","[Fantasy, Horror]","[Action, Adventure, Animation, Comedy, Family,...","[Comedy, Crime, Drama, Mystery, Thriller]","[Crime, Drama, Thriller]","[Biography, Drama]","[Comedy, Drama, Family, Fantasy, Musical]",...,"[Action, Comedy, Crime, Thriller]",[Comedy],[Documentary],"[Drama, Fantasy]","[Biography, Comedy, Drama]","[Comedy, Romance]",[Animation],[Drama],[Documentary],[Comedy]
D.Gross,515202542.0,320314960.0,108101214.0,477373578.0,21221803.0,66757013.0,165363234.0,50023780.0,31762808.0,27166770.0,...,18229696.0,1341918.0,351494.0,580776.0,1001474.0,20500.0,18520.0,91753.0,14631.0,2137.0
I.Gross,558941706.0,479744747.0,108500000.0,972653355.0,28289516.0,104859751.0,146242347.0,,29641586.0,46666578.0,...,5715546.0,3125233.0,315458.0,1097421.0,11097404.0,13795.0,,303877.0,,
Total Gross,1074144248.0,800059707.0,216601214.0,1450026933.0,49511319.0,171616764.0,311605581.0,50023780.0,61404394.0,73833348.0,...,23945242.0,4467151.0,666952.0,1678197.0,12098878.0,34295.0,18520.0,395630.0,14631.0,2137.0


In [17]:
nf_transpose.to_csv('movie_performance.csv')