In [2]:
import re
import time
from io import StringIO
from random import randint

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [None]:
class WeeklyCharts():
  """Crawl Box Office Mojo's weekly chart pages, one week per request.

  The position in the crawl is kept in ``self.date`` as a float of the form
  ``YYYY.WW`` (``2020.01`` is week 1 of 2020). ``next_page`` advances it by
  one week and builds the matching URL; ``crawl`` repeats that until
  ``self.stop`` has been fetched.

  Attributes:
    prefix: base URL of the weekly chart pages.
    date: last week visited as ``YYYY.WW``; starts at ``2020.00``.
    stop: last week to fetch as ``YYYY.WW``.
    headers: HTTP headers sent with every request.
    string_url: URL built by the most recent ``next_page`` call.
    string_sfx: suffix of that URL without the trailing slash (``2020W01``).
    movie_dict: link text -> href of the first link found in each table row.
    final_df: every weekly table concatenated, plus a ``YRWK`` column.
  """

  def __init__(self, stop, headers=None):
    """
    Args:
      stop: last week to crawl, as a ``YYYY.WW`` float (e.g. ``2022.15``).
      headers: optional dict of HTTP headers; defaults to a browser-like
        User-Agent so the class does not depend on a notebook-level global.
    """
    self.prefix = 'https://www.boxofficemojo.com/weekly/'
    self.date = 2020.00
    self.stop = stop
    if headers is None:
      headers = {'User-Agent': 'Mozilla/5.0'}
    self.headers = dict(headers)
    self.string_url = ''
    self.movie_dict = {}
    self.string_sfx = ''
    self.final_df = pd.DataFrame()

  def string_suff(self):
    """Return the URL suffix for ``self.date`` and cache it in ``string_sfx``.

    The suffix is the 4-digit year, ``W``, and the 2-digit week, followed by
    a slash: ``2020.01`` -> ``'2020W01/'``. ``string_sfx`` stores the same
    text without the trailing slash.
    """
    string_sfx = '{:.2f}'.format(self.date).replace('.', 'W')
    self.string_sfx = string_sfx
    return string_sfx + '/'

  def next_page(self):
    """Advance ``self.date`` by one week and return the URL for that week.

    Returns:
      The URL string, or ``self.stop`` unchanged when ``self.date`` already
      equals ``self.stop`` (no advance happens in that case).
    """
    if self.date == self.stop:
      return self.stop
    # Re-round after every step: repeatedly adding 0.01 to a float drifts,
    # which would make the `date < stop` loop test in crawl() unreliable.
    # NOTE(review): assumes every year has exactly 52 chart weeks - confirm.
    if round(self.date % 1, 2) == 0.52:
      self.date = round(int(self.date) + 1.01, 2)
    else:
      self.date = round(self.date + 0.01, 2)
    self.string_url = self.prefix + self.string_suff()
    return self.string_url

  def crawl(self):
    """Fetch every weekly page after ``self.date`` up to ``self.stop``.

    For each page, records the first link of every table row in
    ``movie_dict`` (first occurrence of a link text wins) and appends the
    page's first HTML table, tagged with a ``YRWK`` column, to ``final_df``.
    Sleeps 1-3 seconds before each request.

    Raises:
      requests.HTTPError: if a page answers with an error status.
      requests.Timeout: if a page does not answer within 30 seconds.
    """
    weekly_frames = []
    try:
      while self.date < self.stop:
        time.sleep(randint(1, 3))
        page = requests.get(self.next_page(), headers=self.headers, timeout=30)
        page.raise_for_status()
        soup = bs(page.text, 'lxml')

        table = soup.table
        rows = table.find_all('tr') if table is not None else []
        for row in rows:
          link = row.find('a')
          # Rows without a usable link carry nothing to record.
          if link is None or not link.get('href'):
            continue
          self.movie_dict.setdefault(link.text, link['href'])

        # read_html expects a file-like object; literal HTML strings are
        # deprecated in recent pandas releases.
        week_df = pd.read_html(StringIO(page.text))[0]
        week_df['YRWK'] = '{:.2f}'.format(self.date)
        weekly_frames.append(week_df)
    finally:
      # Concatenate once (not once per page) and do it even when a request
      # fails, so the weeks fetched so far are kept.
      if weekly_frames:
        previous = [] if self.final_df.empty else [self.final_df]
        self.final_df = pd.concat(previous + weekly_frames)

  def save_final_df(self):
    """Write ``final_df`` to ``WeeklyChartsDataFrame.csv``."""
    self.final_df.to_csv('WeeklyChartsDataFrame.csv')


In [3]:
class MoviePage():
  """Scrape budget, MPAA rating, genres and grosses from one movie page.

  Attributes:
    prefix: site root the page path is appended to.
    name: title the page is filed under (as passed in ``release``).
    url_sfx: page path with any ``?query`` part removed.
    gross_pattern: regex matching the CSS class of the gross summary block.
    budget_pattern, mpaa_pattern, genre_pattern: regexes locating the label
      text that precedes each value on the page.
    ser: Series filled by ``crawl`` with the keys ``Budget``, ``MPAA``,
      ``Genre``, ``D.Gross``, ``I.Gross`` and ``Total Gross``.
  """

  def __init__(self, release, url_sfx):
    """
    Args:
      release: movie title, used as ``name``.
      url_sfx: href of the movie page, optionally with a query string.
    """
    self.prefix = 'https://www.boxofficemojo.com/'
    self.name = release
    self.url_sfx = self.url_clean(url_sfx)
    self.gross_pattern = re.compile(r'.+(mojo-performance-summary-table$)')
    self.budget_pattern = re.compile(r'Budget')
    self.mpaa_pattern = re.compile(r'MPAA')
    self.genre_pattern = re.compile(r'Genres')
    self.ser = pd.Series(dtype=object)

  def __repr__(self):
    return f'{self.name} has a url of {self.url_sfx}'

  def url_clean(self, sfx):
    """Return ``sfx`` truncated at its first ``?`` (drops the query string)."""
    return sfx.split('?')[0]

  @staticmethod
  def _parse_money(text):
    """Convert text such as ``'$1,234'`` to ``int``; ``nan`` if not a number."""
    try:
      return int(re.sub('[$,]', '', text))
    except ValueError:
      return np.nan

  def crawl(self):
    """Download the page and fill ``self.ser`` with the scraped values.

    Sleeps 2-4 seconds before the request. Values that cannot be found are
    stored as ``nan`` (numbers) or ``'attrerror'`` (rating, genres).

    Raises:
      requests.Timeout: if the page does not answer within 30 seconds.
    """
    time.sleep(randint(2, 4))
    # prefix already ends in '/', so drop the path's leading slash instead of
    # requesting '...com//release/...'.
    page = requests.get(self.prefix + self.url_sfx.lstrip('/'),
                        headers={'User-Agent': 'Mozilla/5.0'},
                        timeout=30)
    soup = bs(page.text, 'lxml')
    self.ser['Budget'] = self.get_budget(soup)
    self.ser['MPAA'] = self.get_mpaa(soup)
    self.ser['Genre'] = self.get_genre(soup)
    # Pad to three entries so a page listing fewer amounts yields nan
    # instead of an IndexError.
    # NOTE(review): assumes amounts appear in domestic, international,
    # worldwide order - confirm against the page layout.
    gross_list = (list(self.get_gross(soup)) + [np.nan] * 3)[:3]
    self.ser['D.Gross'] = gross_list[0]
    self.ser['I.Gross'] = gross_list[1]
    self.ser['Total Gross'] = gross_list[2]

  def get_budget(self, soup):
    """Return the budget as ``int``, or ``nan`` when absent or unparsable."""
    try:
      budget = soup.find(string=self.budget_pattern).next_element
      return self._parse_money(budget.text)
    except AttributeError:
      # No 'Budget' label on the page (find returned None).
      return np.nan

  def get_mpaa(self, soup):
    """Return the MPAA rating text, or ``'attrerror'`` when not listed."""
    try:
      return soup.find(string=self.mpaa_pattern).next_element.text
    except AttributeError:
      return 'attrerror'

  def get_genre(self, soup):
    """Return the genres as a list of words, or ``'attrerror'`` if missing."""
    try:
      return soup.find(string=self.genre_pattern).next_element.text.split()
    except AttributeError:
      return 'attrerror'

  def get_gross(self, soup):
    """Return the gross amounts from the summary block as a list.

    Each ``<span>`` holding a dollar amount or a dash becomes one entry:
    an ``int`` for an amount, ``nan`` for a dash or anything unparsable.
    Returns three ``nan`` values when the summary block is missing.
    """
    money_pattern = re.compile(r'\$[\d+,]+|–')
    try:
      # Look the block up through self: the pattern belongs to this
      # instance, not to whatever a caller happens to name its variable.
      summary = soup.find(class_=self.gross_pattern)
      spans = summary.find_all('span', string=money_pattern)
    except AttributeError:
      return [np.nan, np.nan, np.nan]
    return [self._parse_money(span.text) for span in spans]

### MoviePage.url_clean() alternative
Splitting the string at a `?` should be fine, because `?` marks the start of the query string in a URL. Still, it seems a bit arbitrary. We could instead use a regex to match the part we want, rather than looking for the part we do not.

In [None]:
# TODO: insert the regex here
# candidate pattern: /rl\d*/$

# Aggregate Weekly Performances

In [None]:
# Request headers: a browser-like User-Agent string.
header = {'User-Agent': 'Mozilla/5.0'}
# The stop value uses the class's YYYY.WW float form: 2022.15 is week 15 of 2022.
WeeklyPage = WeeklyCharts(2022.15)
# Long-running: the crawl sleeps 1-3 seconds before every page request.
WeeklyPage.crawl()
# Writes the accumulated weekly tables to WeeklyChartsDataFrame.csv.
WeeklyPage.save_final_df()
# Signals that the cell has finished.
print('Ding!')

breaktime!


In [None]:
# One row per link text collected by the weekly crawl (the dict keys become
# the index), with a single column holding the matching href.
url_df = pd.DataFrame.from_dict(WeeklyPage.movie_dict, orient='index')

In [None]:
# Save the title -> href table; the Movie Page section reads this file back.
url_df.to_csv('Movie_url_list.csv')

# Movie Page
Aggregate each movie's attributes and performance.

In [37]:
# Load the saved title -> href table.
# The first CSV column holds the link text (movie title) and the second its
# href. Columns are addressed by position rather than by header label, so the
# cell does not depend on what the file's header row happens to contain.
df = pd.read_csv('Movie_url_list.csv', index_col=0)
url_by_title = df.iloc[:, 0].dropna()
# Drop entries whose href is nothing but a query string: MoviePage.url_clean
# would reduce them to an empty path, i.e. not a movie page.
# NOTE(review): presumably this is the table-header sort link - confirm.
movie_dict = {
    title: url
    for title, url in url_by_title.items()
    if str(url).split('?')[0]
}
movie_df = pd.DataFrame()

In [38]:
# Crawl every movie page, keeping one Series of attributes per title, then
# assemble the DataFrame once at the end (one column per movie) instead of
# inserting columns one at a time.
movie_series = {}
for title, url_sfx in movie_dict.items():
  # The page object is bound to the name `movie` on purpose: code outside
  # this cell may refer to the page currently being crawled by that name.
  movie = MoviePage(title, url_sfx)
  movie.crawl()
  movie_series[movie.name] = movie.ser

movie_df = pd.DataFrame(movie_series)
movie_df.head(5)

  """


Unnamed: 0,Star Wars: Episode IX - The Rise of Skywalker,Jumanji: The Next Level,Little Women,Frozen II,The Grudge,Spies in Disguise,Knives Out,Uncut Gems,Bombshell,Cats,...,The Unbearable Weight of Massive Talent,Y cómo es él?,Vivo,Petite Maman,The Duke,Unplugging,Charlotte,Hit the Road,Take Me to the River: New Orleans,Stanleyville
Budget,275000000,125000000,40000000,150000000,10000000,100000000,40000000,19000000,32000000,95000000,...,,,,,,,,,,
MPAA,PG-13,PG-13,PG,PG,R,PG,PG-13,R,R,PG,...,R,PG-13,attrerror,PG,R,R,attrerror,attrerror,attrerror,attrerror
Genre,"[Action, Adventure, Fantasy, Sci-Fi]","[Action, Adventure, Comedy, Fantasy]","[Drama, Romance]","[Adventure, Animation, Comedy, Family, Fantasy...","[Fantasy, Horror]","[Action, Adventure, Animation, Comedy, Family,...","[Comedy, Crime, Drama, Mystery, Thriller]","[Crime, Drama, Thriller]","[Biography, Drama]","[Comedy, Drama, Family, Fantasy, Musical]",...,"[Action, Comedy, Crime, Thriller]",[Comedy],[Documentary],"[Drama, Fantasy]","[Biography, Comedy, Drama]","[Comedy, Romance]",[Animation],[Drama],[Documentary],[Comedy]
D.Gross,515202542,320314960,108101214,477373578,21221803,66757013,165363234,50023780,31762808,27166770,...,16831956,1284711,351494,362934,587819,20500,17419,74609,13619,2137
I.Gross,558941706,479744747,108500000,972653355,28289516,104859751,146242347,,29641586,46666578,...,5312159,3125233,315458,1097421,11165215,13795,,303877,,


In [36]:
# Transposed preview: one row per movie, one column per scraped attribute.
movie_df.T.head()

Unnamed: 0,Budget,MPAA,Genre,D.Gross,I.Gross,Total Gross
Star Wars: Episode IX - The Rise of Skywalker,275000000,PG-13,"[Action, Adventure, Fantasy, Sci-Fi]",515202542,558941706,1074144248
Jumanji: The Next Level,125000000,PG-13,"[Action, Adventure, Comedy, Fantasy]",320314960,479744747,800059707
Little Women,40000000,PG,"[Drama, Romance]",108101214,108500000,216601214
Frozen II,150000000,PG,"[Adventure, Animation, Comedy, Family, Fantasy...",477373578,972653355,1450026933
The Grudge,10000000,R,"[Fantasy, Horror]",21221803,28289516,49511319


In [39]:
# Independent copy of movie_df for the steps that follow.
new_frame = movie_df.copy()

In [40]:
# Transpose so each movie is a row and each scraped attribute a column.
nf_transpose = new_frame.T

In [41]:
# Dtypes and non-null counts per attribute column.
nf_transpose.info()

<class 'pandas.core.frame.DataFrame'>
Index: 906 entries, Star Wars: Episode IX - The Rise of Skywalker to Stanleyville
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Budget       80 non-null     object
 1   MPAA         906 non-null    object
 2   Genre        906 non-null    object
 3   D.Gross      887 non-null    object
 4   I.Gross      617 non-null    object
 5   Total Gross  906 non-null    object
dtypes: object(6)
memory usage: 81.8+ KB


In [43]:
# Preview the first ten movies.
nf_transpose.head(10)

Unnamed: 0,Budget,MPAA,Genre,D.Gross,I.Gross,Total Gross
Star Wars: Episode IX - The Rise of Skywalker,275000000,PG-13,"[Action, Adventure, Fantasy, Sci-Fi]",515202542,558941706.0,1074144248
Jumanji: The Next Level,125000000,PG-13,"[Action, Adventure, Comedy, Fantasy]",320314960,479744747.0,800059707
Little Women,40000000,PG,"[Drama, Romance]",108101214,108500000.0,216601214
Frozen II,150000000,PG,"[Adventure, Animation, Comedy, Family, Fantasy...",477373578,972653355.0,1450026933
The Grudge,10000000,R,"[Fantasy, Horror]",21221803,28289516.0,49511319
Spies in Disguise,100000000,PG,"[Action, Adventure, Animation, Comedy, Family,...",66757013,104859751.0,171616764
Knives Out,40000000,PG-13,"[Comedy, Crime, Drama, Mystery, Thriller]",165363234,146242347.0,311605581
Uncut Gems,19000000,R,"[Crime, Drama, Thriller]",50023780,,50023780
Bombshell,32000000,R,"[Biography, Drama]",31762808,29641586.0,61404394
Cats,95000000,PG,"[Comedy, Drama, Family, Fantasy, Musical]",27166770,46666578.0,73833348


In [47]:
# Save the per-movie table (one row per movie) to movie_performance.csv.
nf_transpose.to_csv('movie_performance.csv')