The Movie Database API – 
- Objective: Use this API to analyze the top 5 movies, by genre, starring any of the top 10 paid actresses or the top 10 paid actors in the US.​

# Import Dependencies

In [101]:
#import Dependencies
import json 
import requests
import pandas as pd
import numpy as np
from pprint import pprint
import os
import collections
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv


# THBD API Connection

In [102]:
# load api key variable
load_dotenv()

True

In [103]:
# Set Authentication

API_KEY = os.getenv("client_api_key")
base_url = "https://api.themoviedb.org/"
auth_url = "https://api.themoviedb.org/3/authentication"

headers = {	"accept":          "application/json", 
	         "Authorization": f'Bearer {API_KEY}'}
params = {'': ''}

In [104]:
# Endpoints

#Movie Lists Folder
top_rated_movie = '3/movie/top_rated'
page = 1
top_rated_movie_params = {'language': 'en-US', 'page': {page}, 'region': 'US'}

# Genre List
genre_list = '3/genre/movie/list'
genre_list_param = {'language' : 'en'}

# Actors
movie_id = 569094
actor_info = f'/3/movie/{movie_id}/credits'
actor_info_param = {'movie_id' : {movie_id}}


In [105]:
# Check Authentication & Response check
def response_check():
    try:
        response = requests.get(auth_url,headers=headers)
        response.raise_for_status()
        print(auth_url + ' {}'.format(response) )
    except requests.exceptions.HTTPError as err:
        raise SystemExit(err)
response_check()

https://api.themoviedb.org/3/authentication <Response [200]>


In [106]:
# Check Endpoint & Response check
def response_endpoint_check(end_point, headers, params):
    try:
        response = requests.get(base_url + end_point,headers=headers, params=params)
        response.raise_for_status()
        print(base_url + end_point + ' {}'.format(response) )
    except requests.exceptions.HTTPError as err:
        print("Http Error:",err)
    
response_endpoint_check(end_point=top_rated_movie,headers=headers, params=top_rated_movie_params)
response_endpoint_check(end_point=genre_list,headers=headers, params=genre_list_param)
response_endpoint_check(end_point=actor_info,headers=headers, params=actor_info_param)

https://api.themoviedb.org/3/movie/top_rated <Response [200]>
https://api.themoviedb.org/3/genre/movie/list <Response [200]>
https://api.themoviedb.org//3/movie/569094/credits <Response [200]>


# End point Parameters update functions

In [107]:
# Update movie id parameter inside the Actor endpoint within a function
def get_actor_endpoint(movie_id_info):
    return f'/3/movie/{movie_id_info}/credits'

In [108]:
#Update movie id parameter within a function
def get_actor_param(movie_id_info):
    return {'movie_id' : {movie_id_info}}

In [109]:
#Update page id parameter within a function
def get_movie_endpoint_param(page_info):
    return {'language': 'en-US', 'page': {page_info}, 'region': 'US'}

#  Import CSV: Top 10 Paid US Actors

In [110]:
# Import the CSV file & Load to Dataframe

#Create relative path to the .csv file. 
file_path = os.path.join("Resources","top_paid_us_actors.csv") 
file = "top_paid_us_actors.csv"

# Read data into dataframe(df) and show top 10 All time paid US actors.  
top_paid_actor = pd.read_csv(file_path,encoding="utf8", sep=',')
top_ten_paid_actors = pd.DataFrame(top_paid_actor)
top_ten_paid_actors

Unnamed: 0,full_name
0,Samuel L. Jackson
1,Robert Downey Jr.
2,Tom Hanks
3,Tom Cruise
4,Zoe Saldana
5,Chris Pratt
6,Chris Hemsworth
7,Bradley Cooper
8,Chris Evans
9,Harrison Ford


### Convert Json to Dataframe

In [111]:
#Get Json Endpoint and convert to dataframe
def convert_json_dataframe(endpoint , headers ,params=params):
    
    response = requests.get(base_url + endpoint, headers=headers, params=params)

    data = json.loads(response.text)

    format_data = json.dumps(data, indent=4)

    df = pd.read_json(format_data)

    return df


### JSON Nested Columns Extraction

In [112]:
#Get Nested Json Endpoint and convert to dataframe for overview of 1 page of records
def convert_json_nested_dataframe(endpoint , headers , record_paths, params=params, **argv):
    
    response = requests.get(base_url + endpoint, headers=headers, params=params)

    data = json.loads(response.text)
    
    for s in argv.values():
        f = pd.json_normalize(data, record_path=[record_paths],
                            meta=
                                        [s],
                                errors='ignore', record_prefix=' ')
    return f


### Find Duplicate Records

In [113]:
def find_duplicate_records(dataframe,col):
    return dataframe[dataframe.duplicated([col])]
    

## Table: Top Rated US Movies

In [114]:
# First look at Top Rated Movies Table
top_rated_movie_df = convert_json_dataframe(endpoint = top_rated_movie , headers=headers , params=top_rated_movie_params)
top_rated_movie_df.head(1)

Unnamed: 0,page,results,total_pages,total_results
0,1,"{'adult': False, 'backdrop_path': '/nGxUxi3PfX...",511,10217


### Top Rated Movies: Nested Columns Extracted

In [115]:
# The results column has been extracted to showcase nested results of the Top Rated Movies
nested_columns = ['id']
record_paths = 'results'
top_rated_movie_details_df = convert_json_nested_dataframe(endpoint = top_rated_movie , headers=headers , record_paths = record_paths, argv = nested_columns, params=top_rated_movie_params )
top_rated_movie_details_df.head(5)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,id.1
0,False,/nGxUxi3PfXDRm7Vg95VBNgNM8yc.jpg,"[28, 12, 16, 878]",569094,en,Spider-Man: Across the Spider-Verse,"After reuniting with Gwen Stacy, Brooklyn’s fu...",2860.755,/8Vt6mWEReuy4Of61Lnj5Xj704m8.jpg,2023-06-02,Spider-Man: Across the Spider-Verse,False,8.8,925,
1,False,/tmU7GeKVybMWFButWEGl2M4GeiP.jpg,"[18, 80]",238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",98.495,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,1972-03-14,The Godfather,False,8.7,18048,
2,False,/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg,"[18, 80]",278,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,74.768,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,1994-09-23,The Shawshank Redemption,False,8.7,23906,
3,False,/oo4PVK6AyLZN49BokxDFGyclN86.jpg,"[18, 80]",240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,46.786,/bMadFzhjy9T7R8J48QGq1ngWNAK.jpg,1974-12-20,The Godfather Part II,False,8.6,10899,
4,False,/vI3aUGTuRRdM7J78KIdW98LdxE5.jpg,"[35, 18, 10749]",19404,hi,दिलवाले दुल्हनिया ले जायेंगे,"Raj is a rich, carefree, happy-go-lucky second...",25.066,/ktejodbcdCPXbMMdnpI9BUxW6O8.jpg,1995-10-20,Dilwale Dulhania Le Jayenge,False,8.6,4142,


### Extract 30 Pages of Top Rated US Movie Records

In [116]:
movie_rows = []
nested_columnss = ['title']
count = 1
pages = 30

while count < pages:
    movie_info_df = convert_json_nested_dataframe(endpoint = top_rated_movie , headers=headers , record_paths = 'results', argv = nested_columnss, params=get_movie_endpoint_param(count))
    df = pd.DataFrame(movie_info_df)
    df.columns = df.columns.str.replace(' ','')
    movie_rows.append(df)
    count += 1



### Transform Top Rated US Movies Table

In [117]:
#Choose needed Columns        
extracted_movie_records_df = pd.concat(movie_rows, axis=0)
extracted_movie_records_df = extracted_movie_records_df.iloc[:, [2,3,9,10,12]]

# Extract single genre datapoint from list as main genre
single_genre = df['genre_ids'].apply(pd.Series)
genre_datapoint = single_genre[0]

#Add genre extracted from list as a new column to df
extracted_movie_records_df['genre_id'] = genre_datapoint
extracted_movie_records_df = extracted_movie_records_df.astype({'genre_id':'int'})
movies_info_df = extracted_movie_records_df.drop('genre_ids', axis=1)

the_movie_info_df = pd.DataFrame(movies_info_df)

the_movie_info_df

Unnamed: 0,id,release_date,title,vote_average,genre_id
0,569094,2023-06-02,Spider-Man: Across the Spider-Verse,8.8,18
1,238,1972-03-14,The Godfather,8.7,18
2,278,1994-09-23,The Shawshank Redemption,8.7,18
3,240,1974-12-20,The Godfather Part II,8.6,18
4,19404,1995-10-20,Dilwale Dulhania Le Jayenge,8.6,18
...,...,...,...,...,...
15,10191,2010-03-26,How to Train Your Dragon,7.8,14
16,142,2005-12-09,Brokeback Mountain,7.8,18
17,393559,2017-02-24,My Life as a Zucchini,7.8,16
18,107,2001-01-19,Snatch,7.8,80


#### Check for duplicated records in Top Rated US Movies


In [118]:
# Check for duplicates
find_duplicate_records(dataframe = the_movie_info_df,col= "id")


Unnamed: 0,id,release_date,title,vote_average,genre_id


### Movie ID and Genre ID Dataframe

In [119]:
#Create new data frame that has only genre and movie Id's to join with another table
genre_movie_id_df = pd.DataFrame(the_movie_info_df)
genre_movie_id_df = genre_movie_id_df.drop(['release_date','title','vote_average'], axis=1)
genre_movie_id_df


Unnamed: 0,id,genre_id
0,569094,18
1,238,18
2,278,18
3,240,18
4,19404,18
...,...,...
15,10191,14
16,142,18
17,393559,16
18,107,80


## Table: Genre

In [120]:
# First look at Genre Table
genre_list_nested_df = convert_json_dataframe(endpoint = genre_list , headers=headers , params=genre_list_param)
genre_list_nested_df.head(1)

Unnamed: 0,genres
0,"{'id': 28, 'name': 'Action'}"


### Genres: Nested Columns Extracted

In [121]:
nested_columns = ['id']
record_paths = 'genres'

genre_list_df = convert_json_nested_dataframe(endpoint = genre_list , headers=headers , record_paths = record_paths, argv = nested_columns, params=genre_list_param )
genre_list_df = genre_list_df.drop('id', axis=1)
genre_list_df.columns = genre_list_df.columns.str.replace(' ','')
genre_list_df.rename(columns={'id': 'genres_id','name': 'genre_name'}, inplace=True)
genre_list_df

Unnamed: 0,genres_id,genre_name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime
5,99,Documentary
6,18,Drama
7,10751,Family
8,14,Fantasy
9,36,History


#### Check for duplicated records in Genre Table

In [122]:
# Check for duplicates in genre table
find_duplicate_records(dataframe = genre_list_df,col= "genres_id")

Unnamed: 0,genres_id,genre_name


## Table: Actor Details

In [123]:
# Extract json nested list to dataframe
actor_rows = []
nested_columnss = ['genre']

for i in genre_movie_id_df['id']:
    actor_info_df = convert_json_nested_dataframe(endpoint = get_actor_endpoint(i) , headers=headers , record_paths = 'cast', argv = nested_columnss, params=get_actor_param(i) )
    actor_df = pd.DataFrame(actor_info_df).reset_index()
    actor_df.columns = actor_df.columns.str.replace(' ','')
    actor_df['movie_id'] = i
    actor_df = actor_df.head(1)
    actor_rows.append(actor_df)
       
extract_df = pd.concat(actor_rows, axis=0)
extract_df = extract_df.astype({'id':'int'})
extract_df = extract_df.astype({'gender':'int'})
extract_df.rename(columns={'id': 'actor_id'}, inplace=True) 

# Drop unneeded columns
actors_details_df = extract_df.drop(['index','adult','original_name', 'popularity', 'profile_path', 'cast_id', 'credit_id', 'order','genre'], axis=1)

actors_details_df

Unnamed: 0,gender,actor_id,known_for_department,name,character,movie_id
0,2,587506,Acting,Shameik Moore,Miles Morales / Spider-Man (voice),569094
0,2,1158,Acting,Al Pacino,Michael Corleone,238
0,2,504,Acting,Tim Robbins,Andy Dufresne,278
0,2,1158,Acting,Al Pacino,Don Michael Corleone,240
0,2,35742,Acting,Shah Rukh Khan,Raj Malhotra,19404
...,...,...,...,...,...,...
0,2,449,Acting,Jay Baruchel,Hiccup Horrendous Haddock III (voice),10191
0,2,1810,Acting,Heath Ledger,Ennis Del Mar,142
0,0,1615534,Acting,Gaspard Schlatter,Courgette (voice),393559
0,2,976,Acting,Jason Statham,Turkish,107


#### Check for duplicated records in Actor Details Table

In [124]:
find_duplicate_records(dataframe = actors_details_df,col= "movie_id")

Unnamed: 0,gender,actor_id,known_for_department,name,character,movie_id


In [125]:
join_df=pd.merge(genre_movie_id_df,actors_details_df, left_on=['id'],right_on=['movie_id'], how='left')
join_df

Unnamed: 0,id,genre_id,gender,actor_id,known_for_department,name,character,movie_id
0,569094,18,2.0,587506.0,Acting,Shameik Moore,Miles Morales / Spider-Man (voice),569094.0
1,238,18,2.0,1158.0,Acting,Al Pacino,Michael Corleone,238.0
2,278,18,2.0,504.0,Acting,Tim Robbins,Andy Dufresne,278.0
3,240,18,2.0,1158.0,Acting,Al Pacino,Don Michael Corleone,240.0
4,19404,18,2.0,35742.0,Acting,Shah Rukh Khan,Raj Malhotra,19404.0
...,...,...,...,...,...,...,...,...
575,10191,14,2.0,449.0,Acting,Jay Baruchel,Hiccup Horrendous Haddock III (voice),10191.0
576,142,18,2.0,1810.0,Acting,Heath Ledger,Ennis Del Mar,142.0
577,393559,16,0.0,1615534.0,Acting,Gaspard Schlatter,Courgette (voice),393559.0
578,107,80,2.0,976.0,Acting,Jason Statham,Turkish,107.0


##### These particular records did not have known_for_department: Acting information when further researched

In [126]:
join_df[join_df.isna().any(axis=1)]

Unnamed: 0,id,genre_id,gender,actor_id,known_for_department,name,character,movie_id
151,399106,16,,,,,,
185,831827,16,,,,,,
254,574074,37,,,,,,
499,779047,18,,,,,,


### Actor Table

In [127]:
df2=join_df.dropna().reset_index(drop=True)
df2 = df2.astype({'gender':'int'})
df2 = df2.astype({'actor_id':'int'})
df2 = df2.astype({'movie_id':'int'})
actor_known_details_df = df2
actor_known_details_df

Unnamed: 0,id,genre_id,gender,actor_id,known_for_department,name,character,movie_id
0,569094,18,2,587506,Acting,Shameik Moore,Miles Morales / Spider-Man (voice),569094
1,238,18,2,1158,Acting,Al Pacino,Michael Corleone,238
2,278,18,2,504,Acting,Tim Robbins,Andy Dufresne,278
3,240,18,2,1158,Acting,Al Pacino,Don Michael Corleone,240
4,19404,18,2,35742,Acting,Shah Rukh Khan,Raj Malhotra,19404
...,...,...,...,...,...,...,...,...
571,10191,14,2,449,Acting,Jay Baruchel,Hiccup Horrendous Haddock III (voice),10191
572,142,18,2,1810,Acting,Heath Ledger,Ennis Del Mar,142
573,393559,16,0,1615534,Acting,Gaspard Schlatter,Courgette (voice),393559
574,107,80,2,976,Acting,Jason Statham,Turkish,107


## Merge Actor Details and Movie Details Tables

In [129]:
movie_genre_actor_join_df=pd.merge(the_movie_info_df,actor_known_details_df, on=['id'], how='inner')
movie_genre_actor_join_df = movie_genre_actor_join_df.drop(['movie_id','genre_id_x'], axis=1)
movie_genre_actor_join_df.rename(columns={'genre_id_y': 'genre_id'}, inplace=True) 
movie_genre_actor_join_df

Unnamed: 0,id,release_date,title,vote_average,genre_id,gender,actor_id,known_for_department,name,character
0,569094,2023-06-02,Spider-Man: Across the Spider-Verse,8.8,18,2,587506,Acting,Shameik Moore,Miles Morales / Spider-Man (voice)
1,238,1972-03-14,The Godfather,8.7,18,2,1158,Acting,Al Pacino,Michael Corleone
2,278,1994-09-23,The Shawshank Redemption,8.7,18,2,504,Acting,Tim Robbins,Andy Dufresne
3,240,1974-12-20,The Godfather Part II,8.6,18,2,1158,Acting,Al Pacino,Don Michael Corleone
4,19404,1995-10-20,Dilwale Dulhania Le Jayenge,8.6,18,2,35742,Acting,Shah Rukh Khan,Raj Malhotra
...,...,...,...,...,...,...,...,...,...,...
571,10191,2010-03-26,How to Train Your Dragon,7.8,14,2,449,Acting,Jay Baruchel,Hiccup Horrendous Haddock III (voice)
572,142,2005-12-09,Brokeback Mountain,7.8,18,2,1810,Acting,Heath Ledger,Ennis Del Mar
573,393559,2017-02-24,My Life as a Zucchini,7.8,16,0,1615534,Acting,Gaspard Schlatter,Courgette (voice)
574,107,2001-01-19,Snatch,7.8,80,2,976,Acting,Jason Statham,Turkish
