In [1]:
import requests
from bs4 import BeautifulSoup
import re 
import pandas as pd 
import json
import time
import ast

In [17]:
def get_data_frames(years=range(2009, 2020), delay=2):
    '''
    Scrape the worldwide cumulative box-office pages of www.the-numbers.com,
    one page per release year.

    Parameters
    ----------
    years : iterable of int, optional
        Release years to fetch. Defaults to 2009 through 2019 inclusive.
    delay : float, optional
        Seconds to sleep before each request. Defaults to 2.

    Returns
    -------
    list of pandas.DataFrame
        One frame per year, built from the second table on the page, with
        rows containing any missing value dropped and a 'year' column added.

    Raises
    ------
    requests.RequestException
        If a request times out, fails, or the site answers with an HTTP
        error status.
    '''
    # Local import so this cell does not depend on an edit to the import cell.
    from io import StringIO

    # A browser-like User-Agent avoids the 403 the site returns otherwise.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    dfs = []
    for year in years:
        url = f'https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/released-in-{year}'
        time.sleep(delay)
        response = requests.get(url, headers=headers, timeout=30)
        # Fail here with the real HTTP error rather than with a confusing
        # "No tables found" from the parser below.
        response.raise_for_status()
        # Newer pandas deprecates passing literal HTML text to read_html;
        # a file-like wrapper works on old and new versions alike.
        tables = pd.read_html(StringIO(response.text))
        # NOTE(review): index 1 is presumably the records table - confirm.
        yearly_df = tables[1].dropna().assign(year=year)
        dfs.append(yearly_df)
    return dfs

# Stack the per-year tables into one frame. ignore_index=True gives every row
# a unique label instead of repeating each yearly frame's own row numbers.
frames = get_data_frames()
overall_df = pd.concat(frames, axis=0, ignore_index=True)

In [18]:
def clean_movie_names(dataframe):
    '''
    Add a 'clean_movie' column in which the stray non-ASCII character found
    in scraped movie names is replaced by an apostrophe.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string 'Movie' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in, with 'clean_movie' added.
        Missing movie names stay missing.
    '''
    # NOTE(review): only the single character "â" is replaced. If the names
    # are mis-decoded UTF-8, further stray characters may remain - confirm
    # against the scraped data.
    dataframe['clean_movie'] = dataframe['Movie'].str.replace("â", "'", regex=False)
    return dataframe

In [None]:
# Attach the cleaned-up movie titles to the combined box-office frame.
overall_df = clean_movie_names(dataframe=overall_df)

In [None]:
def get_ratings(dataframe, api_key=None):
    '''
    Query omdbapi.com once per row and collect the JSON answers.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with 'clean_movie' (title) and 'year' columns.
    api_key : str, optional
        OMDb API key. When omitted, the OMDB_API_KEY environment variable is
        used, falling back to the key originally embedded in this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per successfully decoded API response. Rows whose request
        failed or whose body was not valid JSON are skipped with a message.
    '''
    import os  # local import keeps this cell independent of the import cell

    if api_key is None:
        # NOTE(review): a key committed to a notebook is effectively public.
        # The literal remains only as a backward-compatible fallback; prefer
        # setting OMDB_API_KEY and rotating this key.
        api_key = os.environ.get('OMDB_API_KEY', 'b213361e')

    movie_rating_list = []
    for title, year in zip(dataframe.clean_movie, dataframe.year):
        # Passing params lets requests URL-encode titles containing
        # characters such as '&', '#' or '+', which would corrupt a
        # hand-built query string.
        query = {'t': title, 'y': year, 'apikey': api_key}
        try:
            response = requests.get('http://www.omdbapi.com/', params=query, timeout=10)
            movie_rating_list.append(response.json())
        except ValueError:
            # The body was not valid JSON.
            print('Name did not work')
        except requests.RequestException as err:
            # One network failure should not discard everything fetched so far.
            print(f'Request failed for {title!r}: {err}')
    return pd.DataFrame(movie_rating_list)


In [None]:
# Fetch the OMDb record for every movie in the combined box-office frame.
ratings_df = get_ratings(dataframe=overall_df)

In [None]:
def get_tomato_score(row):
    '''
    Return the 'Value' of the first entry in `row` whose 'Source' is
    'Rotten Tomatoes'.

    Parameters
    ----------
    row : iterable of dict
        Entries exposing 'Source' and 'Value' keys.

    Returns
    -------
    object or None
        The matching 'Value', or None when there is no such entry or `row`
        is not an iterable of dict-like objects (for example NaN or None).
    '''
    try:
        return next(
            (entry.get('Value') for entry in row if entry.get('Source') == 'Rotten Tomatoes'),
            None,
        )
    except (TypeError, AttributeError):
        # TypeError: row is not iterable; AttributeError: an entry has no .get().
        return None

In [None]:
def imdb_clean(row):
    '''
    Convert a rating to a float and multiply it by 10.

    Parameters
    ----------
    row : object
        Anything accepted by float(), e.g. the string '7.5'.

    Returns
    -------
    float or None
        float(row) * 10, or None when the value cannot be converted
        (for example 'N/A' or None).
    '''
    try:
        return float(row) * 10
    except (TypeError, ValueError, OverflowError):
        return None

In [None]:
def get_meta_score(row):
    '''
    Convert a score to an int.

    Parameters
    ----------
    row : object
        Anything accepted by int(), e.g. the string '74'.

    Returns
    -------
    int or None
        int(row), or None when the value cannot be converted
        (for example 'N/A', None, NaN or infinity).
    '''
    try:
        return int(row)
    except (TypeError, ValueError, OverflowError):
        return None

In [None]:
# Derive one cleaned score column per rating source; the helpers return None
# for values they cannot parse.
ratings_df['Rotten_Score'] = ratings_df['Ratings'].map(get_tomato_score)
ratings_df['Metacritic_Score'] = ratings_df['Metascore'].map(get_meta_score)
ratings_df['Imdb_Score'] = ratings_df['imdbRating'].map(imdb_clean)

In [None]:
# Index the ratings by title and keep only the three cleaned score columns.
joining_df = ratings_df.set_index('Title').loc[:, ['Rotten_Score', 'Metacritic_Score', 'Imdb_Score']]

In [None]:
# Keep only the box-office columns and index the frame by the cleaned title.
# (The original referenced the undefined name `overall_df_df`.)
overall_df = overall_df[['Worldwide Box Office','Domestic Box Office','International Box Office',
                         'DomesticShare','year','clean_movie']].set_index('clean_movie')

In [None]:
# DataFrame.join returns a new frame, so the result must be bound to a name;
# otherwise the scores never reach the cells that read overall_df afterwards.
overall_df = overall_df.join(joining_df, on='clean_movie', how='inner')

In [None]:
# Discard every row that has a missing value in any column.
final_df = overall_df.dropna(axis=0, how='any')