In [1]:
!pip install tmdbsimple

Collecting tmdbsimple
  Downloading tmdbsimple-2.9.1-py3-none-any.whl (38 kB)
Installing collected packages: tmdbsimple
Successfully installed tmdbsimple-2.9.1


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os, json, math, time
import tmdbsimple as tmdb
# Directory that holds every file this notebook reads or writes; created on
# first run so later cells can write into it unconditionally.
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)
from tqdm.notebook import tqdm_notebook
# Show floats with thousands separators and two decimals in DataFrame output.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:
# The TMDB API key is a credential, so it is read from the environment rather
# than being committed in the notebook. Export TMDB_API_KEY before starting the
# kernel. Any key that was previously hard-coded here should be treated as
# leaked and rotated.
tmdb.API_KEY = os.environ.get("TMDB_API_KEY", "")
if not tmdb.API_KEY:
    raise RuntimeError(
        "Set the TMDB_API_KEY environment variable to your TMDB API key."
    )

In [4]:
def get_movie_with_rating(movie_id):
    """Return the TMDB info dict for ``movie_id`` with its US certification added.

    The movie's ``info()`` dict is fetched first, then its ``releases()``.
    Every release entry whose ``iso_3166_1`` is ``'US'`` contributes its
    ``certification``; when several US entries exist the last one wins. If
    there is no US entry the returned dict has no ``'certification'`` key.
    """
    movie = tmdb.Movies(movie_id)
    details = movie.info()
    release_entries = movie.releases()['countries']
    us_certifications = [
        entry['certification']
        for entry in release_entries
        if entry['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        # Mirror sequential overwriting: the final US entry takes precedence.
        details['certification'] = us_certifications[-1]
    return details

In [5]:
# Quick check of the helper with TMDB id 601 (the recorded output identifies it
# as 'E.T. the Extra-Terrestrial').
get_movie_with_rating(601)

{'adult': False,
 'backdrop_path': '/9Kg322bGsEbmp94LjCCVGz3cpNw.jpg',
 'belongs_to_collection': None,
 'budget': 10500000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 12, 'name': 'Adventure'},
  {'id': 10751, 'name': 'Family'},
  {'id': 14, 'name': 'Fantasy'}],
 'homepage': 'http://www.et20.com/',
 'id': 601,
 'imdb_id': 'tt0083866',
 'original_language': 'en',
 'original_title': 'E.T. the Extra-Terrestrial',
 'overview': 'An alien is left behind on Earth and saved by the 10-year-old Elliot who decides to keep him hidden in his home. While a task force hunts for the extra-terrestrial, Elliot, his brother, and his little sister Gertie form an emotional bond with their new friend, and try to help him find his way home.',
 'popularity': 82.026,
 'poster_path': '/an0nD6uq6byfxXCfk6lQBzdL2J1.jpg',
 'production_companies': [{'id': 33,
   'logo_path': '/8lvHyhjr8oUKOOy2dKXoALWKdp0.png',
   'name': 'Universal Pictures',
   'origin_country': 'US'},
  {'id': 56,
   'logo_path':

In [6]:
def write_json(new_data, filename):
    """Add ``new_data`` to the JSON list stored in ``filename`` and save it back.

    Parameters
    ----------
    new_data : list or any JSON-serialisable object
        If both ``new_data`` and the file's content are lists, the file's list
        is extended with the items of ``new_data``; otherwise ``new_data`` is
        appended as a single element.
    filename : str
        Path of an existing JSON file (opened in ``'r+'`` mode, so it must
        already exist and contain valid JSON).

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    AttributeError
        If the stored JSON is not a list (it has no ``append``).
    """
    with open(filename, 'r+') as file:
        # Load what is already stored so the new data can be merged into it.
        file_data = json.load(file)
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewrite from the start of the file...
        file.seek(0)
        json.dump(file_data, file)
        # ...and drop anything left over from the previous content. Without
        # this, a file that was previously pretty-printed (and therefore longer
        # than the compact dump) keeps trailing bytes and becomes invalid JSON.
        file.truncate()

In [7]:
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so
# "refined_basics.csv" was not in the working directory. Confirm where the file
# actually lives (possibly under FOLDER) before re-running; the extraction loop
# further down reads `basics`.
basics = pd.read_csv("refined_basics.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'refined_basics.csv'

In [None]:
# Start years whose movies will be requested from the TMDB API.
YEARS_TO_GET = [2001, 2002]
# Collects [movie_id, exception] pairs for requests that fail during extraction.
errors = []

In [None]:
for YEAR in tqdm_notebook(YEARS_TO_GET, desc="YEARS", position=0):
    # JSON file that accumulates the raw API results for this year.
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    # Create the file on first use, seeded with a placeholder record so that
    # the frame read back below always has an 'imdb_id' column to filter on.
    file_exists = os.path.isfile(JSON_FILE)
    if not file_exists:
        print(f"Creating {JSON_FILE} for API results for {YEAR}.")
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Everything below runs whether or not the file already existed, so an
    # interrupted extraction resumes instead of the whole year being skipped.
    df = basics.loc[basics["startYear"] == YEAR].copy()
    movie_ids = df["tconst"].copy()

    # Results saved by earlier runs; ids already present are not requested again.
    previous_df = pd.read_json(JSON_FILE)
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  f"Movies from {YEAR}"):
        try:
            # Fetch the movie (with its US certification) and persist it at
            # once, so progress survives a crash part-way through the year.
            temp = get_movie_with_rating(movie_id)
            write_json(temp, JSON_FILE)
            # Short 20 ms pause to avoid overwhelming the server.
            time.sleep(.02)
        except Exception as e:
            # Best effort: record the failure and move on to the next id.
            errors.append([movie_id, e])

    # Export once per year, after all of its movies have been processed,
    # rather than re-reading and re-writing the whole file for every movie.
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",
                         compression="gzip", index=False)

In [None]:
# Report how many movie ids failed during the API extraction.
error_total = len(errors)
print(f' - Total Errors occurred: {error_total}')

In [None]:
import glob
# Build the pattern from FOLDER so this cell looks in the same directory the
# extraction loop wrote the per-year exports to.
tmdb_files = sorted(glob.glob(f"{FOLDER}final_tmdb_data*.csv.gz"))
tmdb_files

In [None]:
# Stack the per-year exports into one frame. ignore_index=True gives the result
# a fresh 0..n-1 index; otherwise every file contributes its own 0..k labels
# and the combined index contains duplicates.
df = pd.concat([pd.read_csv(f) for f in tmdb_files], ignore_index=True)
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Column dtypes and non-null counts for the combined frame.
df.info()

In [None]:
# Persist the combined results as a gzip-compressed CSV, without the index column.
df.to_csv(f"{FOLDER}tmdb_results_combined.csv.gz", compression="gzip",index=False)

## How many movies are there in each of the certification categories?

In [None]:
# Sort the bars from the most to the least frequent certification.
order = df['certification'].value_counts().index
ax = sns.countplot(data=df, x='certification', order=order)
# Rotate the category labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=90);

In [None]:
# Print one line per certification with the number of movies carrying it.
certification_count = df['certification'].value_counts()
for certification, n_movies in certification_count.items():
    print(f"{certification} certifications has {n_movies} movies.")

### How many movies had at least some valid financial information (values > 0 for budget OR revenue)?

In [None]:
# A movie counts as having financial data when either figure is above zero.
has_revenue = df['revenue'] > 0
has_budget = df['budget'] > 0
movies_with_financials = has_revenue | has_budget
movies_with_financials.sum()

There are 727 movies that had valid financial information for either revenue or budget.

### What is the average revenue per certification category?

In [None]:
# Bar height is the mean revenue of the movies in each certification category.
ax = sns.barplot(data=df, x='certification', y='revenue')
ax.set_title("Average Revenue Per Certification")
ax.tick_params(axis='x', labelrotation=90);

In [None]:
# Group the combined frame by certification once; later cells reuse this object
# for per-category aggregates.
certification_group = df.groupby('certification')

In [None]:
# Mean revenue for each certification category.
certification_group['revenue'].mean()

In [None]:
# Bar height is the mean budget of the movies in each certification category.
ax = sns.barplot(data=df, x='certification', y='budget')
# Rotate the category labels so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90);
# Title the figure so it stands alone, matching the revenue plot.
ax.set(title="Average Budget Per Certification");

In [None]:
# NOTE(review): `budget_group` groups by each distinct budget value and is not
# used in this cell. It looks like a leftover; confirm nothing later relies on
# it before removing.
budget_group = df.groupby('budget')
# Mean budget for each certification category (reuses `certification_group`
# from an earlier cell).
certification_group['budget'].mean()