In [1]:
#Imports
import pandas as pd
import numpy as np
import os, time, json
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

# Loading the Data from Local Files

In [2]:
# Reload the three gzipped IMDB tables that were written to disk at the end of Part 1.
# The reads happen in the same order as before: akas, basics, ratings.
akas, basics, ratings = (
    pd.read_csv("Data/title_akas.csv.gz", low_memory=False),
    pd.read_csv("Data/title_basics.csv.gz", low_memory=False),
    pd.read_csv("Data/title_ratings.csv.gz", low_memory=False),
)

# Part 2

## Set Up the TMDB API

In [3]:
# Load the TMDB credentials from the current user's home directory.
# expanduser('~') resolves to the home folder of whoever runs the notebook,
# so the cell no longer depends on one specific machine's absolute path.
with open(os.path.expanduser('~/.secret/tmdb_api.json'), 'r') as f:
    login = json.load(f)
# Display only the key names of the loaded dict (never the secret values)
login.keys()

dict_keys(['api-key'])

In [4]:
# Store the 'api-key' value loaded from the credentials file on the tmdbsimple
# module (imported as `tmdb`) so it is available to the API calls made below.
tmdb.API_KEY =  login['api-key']

In [5]:
# Folder that holds the Part 1 CSV files and receives the per-year
# JSON files written by the API-call loop further down.
FOLDER = "Data/"
# exist_ok=True: do not raise if the folder is already present
os.makedirs(FOLDER, exist_ok=True)
# Show what is currently stored in the folder
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz']

In [6]:
# Function to get the .info results of a movie with the US certification added
def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB info dict and attach its US certification.

    Parameters
    ----------
    movie_id : str or int
        Identifier passed straight to ``tmdb.Movies`` (the test cells in this
        notebook pass IMDB ids such as ``'tt0848228'``).

    Returns
    -------
    dict
        The dictionary returned by ``.info()``. A ``'certification'`` key is
        added only when the releases data contains at least one entry whose
        ``'iso_3166_1'`` is ``'US'``; otherwise the dict is returned untouched.

    Notes
    -----
    The releases list can hold several US entries, and some of them may carry
    an empty certification. The last non-empty US certification wins, so a
    trailing blank entry can no longer erase a real rating. When every US
    entry is blank, the last (blank) value is stored, as before.
    """
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # Save the .info and .releases dictionaries
    movie_info = movie.info()
    releases = movie.releases()

    # Collect the certification of every US release, in the order returned
    us_certifications = [
        country['certification']
        for country in releases['countries']
        if country['iso_3166_1'] == 'US'
    ]

    if us_certifications:
        # Prefer the last non-empty value; fall back to the last entry as-is
        movie_info['certification'] = next(
            (cert for cert in reversed(us_certifications) if cert),
            us_certifications[-1],
        )
    return movie_info

### Testing the Function

In [7]:
# Smoke test: look up a single IMDB id and display the returned info dict
test1 = get_movie_with_rating('tt0848228')
test1

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 225.297,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [8]:
# Second smoke test with a different IMDB id; display the returned info dict
test2 = get_movie_with_rating('tt0332280')
test2

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 73.78,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/iaYpEp3LQmb8AfAtmTvpqd4149c.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 260

In [9]:
# Function to append new results to existing JSON file
def write_json(new_data, filename):
    """Append ``new_data`` to the JSON list stored in ``filename``.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Parameters
    ----------
    new_data : list or any JSON-serialisable object
        If it is a list (and the file holds a list) its items are added
        individually; anything else is appended as a single element.
    filename : str
        Path of an existing JSON file whose top-level value is a list.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist (it is opened in ``'r+'`` mode).
    json.JSONDecodeError
        If the file does not contain valid JSON.
    AttributeError
        If the stored JSON value has no ``append`` method (e.g. it is a dict).
    """
    with open(filename, 'r+') as file:
        # Load the existing contents of the file
        file_data = json.load(file)
        # Extend when both sides are lists, otherwise append a single item
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind and overwrite the file from the beginning
        file.seek(0)
        json.dump(file_data, file)
        # Drop any bytes left over from the previous contents; without this a
        # shorter re-serialisation (e.g. the file was pretty-printed) leaves
        # trailing garbage and the file stops being valid JSON.
        file.truncate()

In [10]:
# Years of releases to request from the API (2000 through 2022, inclusive)
YEARS_TO_GET = list(range(2000, 2023))

# Load the dataframe from project part 1 once; it is the same for every year,
# so re-reading the CSV inside the loop only wasted time.
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)

# Every failed lookup is recorded here as [movie_id, exception] so that
# failures stay visible instead of being discarded silently.
errors = []

# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):

    # Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)

    # If it does not exist: create it
    if not file_exists:
        # Seed the new file with a one-item list holding a placeholder
        # "imdb_id", so the file can be read back and has that column.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)

    # Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # Saving movie ids for the year
    movie_ids = df['tconst'].copy()

    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)

    # Filter out any ids that are already in the JSON_FILE, which makes the
    # cell resumable after an interruption.
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve the data for the movie id
        try:
            temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)

        # If it fails, remember the id and the error, then move on to the next id
        except Exception as e:
            errors.append([movie_id, e])

        # Short 20 ms sleep to prevent overwhelming server; it runs after
        # failed requests too, because those also hit the API.
        time.sleep(0.02)

# Summarise how many lookups failed across all years
print(f"- Total errors: {len(errors)}")

YEARS:   0%|          | 0/23 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1408 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/1528 [00:00<?, ?it/s]

Movies from 2002:   0%|          | 0/1517 [00:00<?, ?it/s]

Movies from 2003:   0%|          | 0/1637 [00:00<?, ?it/s]

Movies from 2004:   0%|          | 0/1842 [00:00<?, ?it/s]

Movies from 2005:   0%|          | 0/2136 [00:00<?, ?it/s]

Movies from 2006:   0%|          | 0/2355 [00:00<?, ?it/s]

Movies from 2007:   0%|          | 0/2487 [00:00<?, ?it/s]

Movies from 2008:   0%|          | 0/2837 [00:00<?, ?it/s]

Movies from 2009:   0%|          | 0/3461 [00:00<?, ?it/s]

Movies from 2010:   0%|          | 0/3766 [00:00<?, ?it/s]

Movies from 2011:   0%|          | 0/4139 [00:00<?, ?it/s]

Movies from 2012:   0%|          | 0/4434 [00:00<?, ?it/s]

Movies from 2013:   0%|          | 0/4640 [00:00<?, ?it/s]

Movies from 2014:   0%|          | 0/4783 [00:00<?, ?it/s]

Movies from 2015:   0%|          | 0/4930 [00:00<?, ?it/s]

Movies from 2016:   0%|          | 0/5142 [00:00<?, ?it/s]

Movies from 2017:   0%|          | 0/5505 [00:00<?, ?it/s]

Movies from 2018:   0%|          | 0/5631 [00:00<?, ?it/s]

Movies from 2019:   0%|          | 0/5726 [00:00<?, ?it/s]

Movies from 2020:   0%|          | 0/4869 [00:00<?, ?it/s]

Movies from 2021:   0%|          | 0/4830 [00:00<?, ?it/s]

Movies from 2022:   0%|          | 0/2647 [00:00<?, ?it/s]