In [1]:
import os, time,json
import tmdbsimple as tmdb
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm_notebook


FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['title_basics_filtered.csv.gz',
 '.tmdb_api_results_2001.0.json.icloud',
 '.title.basics.tsv.gz.icloud',
 '.ipynb_checkpoints',
 '.title-akas-us-only.csv.icloud',
 'title_ratings_filtered.csv.gz',
 '.title.ratings.tsv.gz.icloud',
 '.title-akas-us-only_filtered.csv.icloud']

In [2]:
with open('/Users/ashik/.secret/tmdb.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key'])

In [3]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key']

# Functions

## Movie Rating

In [5]:
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # save the .info .releases dictionaries
    info = movie.info()
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
            ## save a "certification" key in the info dict with the certification
           info['certification'] = c['certification']
        
    return info

## Append list of records to JSON File

In [6]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

## Load in the Title Basics data


In [7]:
# Load in the dataframe from project part 1 as basics:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics_filtered.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,titleId,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
2,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
3,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,,90,Drama
4,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama


In [8]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91374 entries, 0 to 91373
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   titleId         91374 non-null  object 
 1   titleType       91374 non-null  object 
 2   primaryTitle    91374 non-null  object 
 3   originalTitle   91374 non-null  object 
 4   isAdult         91374 non-null  int64  
 5   startYear       91374 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  91374 non-null  int64  
 8   genres          91374 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.3+ MB


## Create Required Lists for the Loop


In [9]:
#Define a list of the Years to Extract from the API
YEARS_TO_GET = [2000.0,2001.0]

#Define an errors list
errors = [ ]

# Outer Loop

In [10]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    #Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)
    
    # If it does not exist: create it
    if file_exists == False:
        # save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
            
    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids to list
    movie_ids = df['titleId'].copy()
    movie_ids
    
    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)
    previous_df
    
    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    movie_ids_to_get
    
    # Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve then data for the movie id
            temp = get_movie_with_rating(movie_id)  
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])
            
        final_year_df = pd.read_json(JSON_FILE)
        final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000.0:   0%|          | 0/1773 [00:00<?, ?it/s]

Movies from 2001.0:   0%|          | 0/1937 [00:00<?, ?it/s]

In [11]:
print(f"- Total errors: {len(errors)}")

- Total errors: 456


In [12]:
import json


with open("Data/tmdb_api_results_2000.0.json") as f:
    jsondata= json.load(f)
type(jsondata)

list

In [13]:
# what does the first entry of business look like?
jsondata[1].keys()

dict_keys(['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'certification'])

In [14]:
import json


with open("Data/tmdb_api_results_2001.0.json") as f:
    jsondata1= json.load(f)
type(jsondata1)

list

In [15]:
two_thousand = pd.DataFrame(jsondata)
two_thousand.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,False,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,False,5.5,22.0,
2,tt0113092,False,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,False,5.45,10.0,
3,tt0113092,False,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,False,5.45,10.0,
4,tt0116391,False,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,False,4.0,1.0,


In [16]:
two_thousand_one = pd.DataFrame(jsondata1)
two_thousand_one.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0035423,False,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,False,6.331,1225.0,PG-13
2,tt0035423,False,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,False,6.331,1225.0,PG-13
3,tt0114447,False,,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007.0,en,The Silent Force,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,False,5.0,3.0,
4,tt0116916,False,/rFpHBidSlhjflmnLu7BZilyKeQR.jpg,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,73549.0,en,The Dark Mist,...,0.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Dark Mist,False,3.5,2.0,PG


## EDA

In [19]:
tmdb_results_combined = pd.concat([two_thousand, two_thousand_one])
tmdb_results_combined.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,False,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,False,5.5,22.0,
2,tt0113092,False,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,False,5.45,10.0,
3,tt0113092,False,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,False,5.45,10.0,
4,tt0116391,False,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,False,4.0,1.0,


### How many movies had at least some valid financial information (values > 0 for budget OR revenue)?
* Please exclude any movies with 0's for budget AND revenue from the remaining visualizations.

In [20]:
tmdb_results_combined.to_csv(f"{FOLDER}tmdb_results_combined.csv.gz", compression="gzip", index=False)

In [25]:
finances = (tmdb_results_combined['budget'] > 0) | (tmdb_results_combined['revenue'] > 0)   
finances.sum()

958

958 movies had at least some valid financial information (values > 0 for budget OR revenue)

In [30]:
tmdb_eda = tmdb_results_combined[(tmdb_results_combined['budget'] > 0) | (tmdb_results_combined['revenue'] > 0)]

### How many movies are there in each of the certification categories (G/PG/PG-13/R)?


In [32]:
tmdb_eda['certification'].value_counts()

R        343
PG-13    220
         169
PG        67
G         40
NR        29
Name: certification, dtype: int64

### What is the average revenue per certification category?


In [33]:
average_revenue_per_certification = tmdb_eda.groupby('certification')['revenue'].mean()
average_revenue_per_certification.head()

certification
         3.208560e+07
G        1.361394e+08
NR       2.209491e+07
PG       1.222921e+08
PG-13    1.161215e+08
Name: revenue, dtype: float64

### What is the budget revenue per certification category?


In [34]:
average_budget_per_certification = tmdb_eda.groupby('certification')['budget'].mean()
average_budget_per_certification.head()

certification
         1.125990e+07
G        3.815000e+07
NR       1.816328e+07
PG       5.413802e+07
PG-13    4.858480e+07
Name: budget, dtype: float64