# Movie Analysis Part 2 API
Cameron Peace

<mark> ***Use an API to extract box office revenue and profit data to add to our IMDB data and perform exploratory data analysis.***

### Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm_notebook
import tmdbsimple as tmdb
import json
import time
import os

### Loading in csvs

In [2]:
basics = pd.read_csv('Data/title_basics.csv.gz')
basics.head(2)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama


In [3]:
# changing year to int because the float may be causing an error in later code
basics['startYear'] = basics['startYear'].astype(int)

In [4]:
# confirming
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


## Setting up the API

In [5]:
# loading api-key
with open('/Users/cameron/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
# confirming
login.keys()

dict_keys(['api-key'])

In [6]:
# setting key in tmdb module
tmdb.API_KEY =  login['api-key']

### Setting up folder

In [7]:
folder = "Data/"
os.makedirs(folder, exist_ok=True)
os.listdir(folder);

## Setting up functions

In [8]:
def get_movie_with_rating(movie_id):
    movie = tmdb.Movies(movie_id)
    info = movie.info()
    releases = movie.releases()

    for c in releases['countries']:
        if c['iso_3166_1' ] == 'US':
           info['certification'] = c['certification']
    
    return info

In [9]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename, 'r+') as file:
        file_data = json.load(file)
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        file.seek(0)
        json.dump(file_data, file)

### Testing Functions

In order to ensure your function for extracting movie data from TMDB is working, test your function on these 2 movie ids: tt0848228 ("The Avengers") and tt0332280 ("The Notebook"). Make sure that your function runs without error and that it returns the correct movie's data for both test ids.

In [10]:
# using function
avengers = get_movie_with_rating('tt0848228')
notebook = get_movie_with_rating('tt0332280')

In [11]:
# confirming results
for i in [avengers, notebook]:
    print(i['title'], i['release_date'], i['certification'])

The Avengers 2012-04-25 PG-13
The Notebook 2004-06-25 PG-13


<mark><u>**Comment:**</u>

<font color='dodgerblue' size=4><i>
Looks like everything worked here
</i></font>

### Loop to Gather Data

In [12]:
# years to get from api call
years_to_get = [2003]

In [13]:
# list to catch errors
errors = []

In [14]:
for year in tqdm_notebook(years_to_get, desc='YEARS', position=0):
    json_file = f'{folder}tmdb_api_results_{year}.json'
    if os.path.isfile(json_file) == False:
        with open(json_file, 'w') as f:
            json.dump([{'imdb_id':0}], f)

    df = basics.loc[basics['startYear'] == year].copy()
    movie_ids = df['tconst'].copy()
    previous_df = pd.read_json(json_file)
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {year}',
                                  position=1,
                                  leave=True):
        try:
            temp = get_movie_with_rating(movie_id)  
            write_json(temp, json_file)
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])
        
    final_year_df = pd.read_json(json_file)
    final_year_df.to_csv(f"{folder}final_tmdb_data_{year}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/1 [00:00<?, ?it/s]

Movies from 2003:   0%|          | 0/1671 [00:00<?, ?it/s]

### Checking Results

In [15]:
# checking the number of errors
print(f"- Total errors: {len(errors)}")

# demonstrating that the errors are related to imdb ids not found in the tmdb
count = 0
not_404 = []
for i in errors:
    if '404 Client Error' in str(i[1]):
        count += 1
    else:
        not_404.append(i)
print(f'- 404 Client Errors: {count}')

# printing any errors that are not 404 if they exist
if not_404: 
    print(not_404)

- Total errors: 354
- 404 Client Errors: 354


In [16]:
# taking a look at errors
errors[:3]

[['tt0153516',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0153516?api_key=4d4c9815bcf18b420f748fabcf653225')],
 ['tt0208412',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0208412?api_key=4d4c9815bcf18b420f748fabcf653225')],
 ['tt0222710',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0222710?api_key=4d4c9815bcf18b420f748fabcf653225')]]

In [26]:
# confirming data
df_2003 = pd.read_json('Data/tmdb_api_results_2003.json')
df_2004 = pd.read_json('Data/tmdb_api_results_2004.json')

In [27]:
# confirming
display(df_2003.shape, df_2004.shape)

(1318, 26)

(1470, 26)

In [24]:
# confirming columns
df_2003.columns

Index(['imdb_id', 'adult', 'backdrop_path', 'belongs_to_collection', 'budget',
       'genres', 'homepage', 'id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'certification'],
      dtype='object')

In [25]:
# inspecting movies missing rating
df_2000[df_2000['certification'].isna()].sample(5).T

Unnamed: 0,1197,1161,601,735,545
imdb_id,tt2131677,tt0479692,tt0223856,tt0239955,tt0218616
adult,0.0,0.0,0.0,0.0,0.0
backdrop_path,,/xIdrWvHAQkibTs6MnytvqhYNXBK.jpg,,,/oK0EwQzU3SqG3PfZwTnkQC4bAjO.jpg
belongs_to_collection,,,,,
budget,0.0,0.0,0.0,0.0,0.0
genres,"[{'id': 18, 'name': 'Drama'}]","[{'id': 18, 'name': 'Drama'}, {'id': 10752, 'n...","[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",[],"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
homepage,,,,,
id,626058.0,696367.0,112878.0,187911.0,83581.0
original_language,ja,zh,en,es,en
original_title,月,葵花劫,The Opponent,Sé quién eres,Some Voices


<mark><u>**Comment:**</u>

<font color='dodgerblue' size=4><i>
Looks like everything worked here.  We will be taking a closer look at the data in a separate notebook.
</i></font>