In [1]:
# Dependencies.
# API calls.
import requests
import json
import pprint
from api_keys import ACCESS_TOKEN

# Data cleaning/engineering.
import pandas as pd
import numpy as np

# Other things.
import time # Respect the API creators... do not burden their site.
from itertools import islice

# Requesting And Storing Movie Data

<hr>

In this section of the notebook, I will be using The Movie Database ( TMDB | https://www.themoviedb.org/ ) to create a list of movies.  

TMDB provides users with a file containing a list of JSON objects that represent each valid movie on the database at that time.  
For this notebook, I will be using the version of this file provided on 9/23/2024.  

From the above list, I want to get the following information from each movie:  
- id
- title
- release_date
- certificate
- runtime
- genre
- rating
- number_of_votes
- overview
- director
- cast (top 4)
- gross (revenue - budget)
- poster_link

Results from these requests will be put into a list and then made into a DataFrame.  
I will ensure that the DataFrame has no issues and then write it out as a CSV.

### Functions

Following are a few functions to ease the use of the script.

- requestMovie
  - Takes in a movie_id, makes a request to the TMDB API, and returns the needed information formatted the way I want it.


In [2]:
# TMDB Request.
def _extractMovieFields(data):
    '''
    Pick the fields this notebook keeps out of a decoded TMDB movie payload.

    A key that is present but null is treated like a missing key for the
    list-valued fields (genres, crew, cast) and for revenue/budget, so a
    sparse record yields empty lists / 0 instead of raising.
    '''

    credits = data.get('credits') or {}
    crew = credits.get('crew') or []
    cast = credits.get('cast') or []
    genres = data.get('genres') or []

    # Key order is kept as-is because it becomes the DataFrame column order.
    return {
        'movie_id': data.get('id'),
        'title': data.get('title'),
        'genre': [entry.get('name', '') for entry in genres],
        'release_date': data.get('release_date'),
        'runtime': data.get('runtime'),
        'rating': data.get('vote_average'),
        'vote_count': data.get('vote_count'),
        # First crew member whose job is exactly 'Director', or None if there is none.
        'director': next((member.get('name') for member in crew if member.get('job', '') == 'Director'), None),
        # Only the first four cast entries are kept.
        'cast': list(islice((member.get('name') for member in cast), 4)),
        'overview': data.get('overview'),
        # `or 0` also covers an explicit null, which .get(key, 0) would pass through.
        'gross': (data.get('revenue') or 0) - (data.get('budget') or 0),
        'poster_link': data.get('poster_path', ''),
    }


def requestMovie(movie_id, timeout = 30):
    '''
    Make a request to The Movie Database API with a given movie_id number.
    Requires that the user has obtained a TMDB access token through their website.
    https://www.themoviedb.org

    Parameters:
        movie_id: TMDB id of the movie to look up.
        timeout: seconds to wait for the server before giving up on the request.

    Returns:
        A dict with the keys movie_id, title, genre, release_date, runtime,
        rating, vote_count, director, cast, overview, gross and poster_link,
        or None when the request fails (connection error, non-200 status code,
        or a body that is not valid JSON).
    '''

    # Set up variables for the request.
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?append_to_response=credits&language=en-US"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {ACCESS_TOKEN}"
    }

    # Make the request.
    time.sleep(.25) # Make sure that we don't overload the site with requests with this function.
    try:
        response = requests.get(url, headers = headers, timeout = timeout)

    # Connection problems and timeouts are reported the same way as a bad status
    # code, so a single network hiccup does not abort a long run of requests.
    except requests.RequestException as error:
        print(f"Error: request for {movie_id} failed.")
        print(f"Reason: {error}.")
        time.sleep(10) # Give the site more time before the next request.
        return None

    # Check to see if request was successful.
    if response.status_code != 200:
        print(f"Error: request for {movie_id} failed.")
        print(f"Code: {response.status_code}.")
        time.sleep(10) # Give the site more time if we get a timeout.
        return None

    # Successful status code; the body still has to decode as JSON.
    try:
        data = response.json()

    except ValueError:
        print(f"Error: request for {movie_id} failed.")
        print("Reason: response body is not valid JSON.")
        return None

    # Get desired information from the request.
    return _extractMovieFields(data)

# Requests

All we need to do is run through the list of movies and make a request for each movie's ID.

In [3]:
# Path to list and storage.
filepath = "clean/tmdb_movie_list.txt"
movies = list()

# Each line of the file is parsed as one JSON object; only its 'id' is used.
with open(filepath, 'r', encoding = "utf8") as file:
    for line in file:
        # A blank line is not valid JSON and would abort the whole run
        # (losing every movie collected so far), so skip it.
        if not line.strip():
            continue

        movie_id = json.loads(line)['id']
        result = requestMovie(movie_id)

        # requestMovie hands back None when the request did not succeed.
        if result is not None:
            movies.append(result)

        else:
            print(f"Failed to append for ID: {movie_id}")

In [25]:
# Make a DataFrame and check out the data.
df = pd.DataFrame(movies)

display(df.shape)
df.info() # Prints its own report and returns None, so wrapping it in display() would only add a stray "None".
display(df.head())

(2462, 12)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2462 entries, 0 to 2461
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movie_id      2462 non-null   int64  
 1   title         2462 non-null   object 
 2   genre         2462 non-null   object 
 3   release_date  2462 non-null   object 
 4   runtime       2462 non-null   int64  
 5   rating        2462 non-null   float64
 6   vote_count    2462 non-null   int64  
 7   director      2448 non-null   object 
 8   cast          2462 non-null   object 
 9   overview      2462 non-null   object 
 10  gross         2462 non-null   int64  
 11  poster_link   2454 non-null   object 
dtypes: float64(1), int64(4), object(7)
memory usage: 230.9+ KB


None

Unnamed: 0,movie_id,title,genre,release_date,runtime,rating,vote_count,director,cast,overview,gross,poster_link
0,12,Finding Nemo,"[Animation, Family]",2003-05-30,100,7.82,19059,Andrew Stanton,"[Albert Brooks, Ellen DeGeneres, Alexander Gou...","Nemo, an adventurous young clownfish, is unexp...",846335536,/eHuGQ10FUzK1mdOY69wF5pGgEf5.jpg
1,14,American Beauty,[Drama],1999-09-15,122,8.019,11903,Sam Mendes,"[Kevin Spacey, Annette Bening, Thora Birch, We...","Lester Burnham, a depressed suburban father in...",341296601,/wby9315QzVKdW9BonAefg8jGTTb.jpg
2,16,Dancer in the Dark,"[Drama, Crime]",2000-09-01,140,7.9,1760,Lars von Trier,"[Björk, Catherine Deneuve, David Morse, Peter ...","Selma, a Czech immigrant on the verge of blind...",27561153,/8Wdd3fQfbbQeoSfWpHrDfaFNhBU.jpg
3,18,The Fifth Element,"[Adventure, Fantasy, Action, Thriller, Science...",1997-05-02,126,7.552,10591,Luc Besson,"[Bruce Willis, Milla Jovovich, Gary Oldman, Ia...","In 2257, a taxi driver is unintentionally give...",173920180,/fPtlCO1yQtnoLHOwKtWz7db6RGU.jpg
4,22,Pirates of the Caribbean: The Curse of the Bla...,"[Adventure, Fantasy, Action]",2003-07-09,143,7.805,20390,Gore Verbinski,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",After Port Royal is attacked and pillaged by a...,515011224,/poHwCZeWzJCShH7tOjg8RIoyjcw.jpg


In [26]:
# A few poster links are null. Show the movies they belong to.
poster_missing = pd.isnull(df['poster_link'])
df[poster_missing]

Unnamed: 0,movie_id,title,genre,release_date,runtime,rating,vote_count,director,cast,overview,gross,poster_link
1542,374338,The First Hope,[],2013-02-20,13,5.2,4,Jeremy David White,"[Davis Desmond, Lili Reinhart, Noland Ammon, V...",After seeing Leia kiss Luke on his VHS copy of...,0,
2007,840069,Attraction 3,[],2024-01-01,0,0.0,0,Fyodor Bondarchuk,"[Irina Starshenbaum, Rinal Mukhametov, Oleg Me...",,0,
2087,926391,Red Notice 3,"[Crime, Comedy, Thriller, Action]",,0,0.0,0,Rawson Marshall Thurber,"[Dwayne Johnson, Ryan Reynolds, Gal Gadot]",The second of two planned sequels to Red Notic...,0,
2426,1316786,No People Found,[Drama],2024-09-23,20,0.0,0,Jakub Gomółka,"[Maja Michnacka, Adam Szustak, Urszula Gryczew...","When Ada returns home at dawn, she realizes th...",0,
2438,1340034,The Plumber,[Music],2024-09-23,30,0.0,0,Sean Nicholas Savage,"[Σtella, Noda Pappa, Johnny Labelle, Dimitri R...","When her brother returns from ages abroad, an ...",0,
2446,1353767,Do Bangladroids Dream of Electric Tagore?,[],2024-09-22,3,0.0,0,Aleem Hossain,[],"In this desi-futuristic sci-fi, a documentaria...",0,
2447,1353773,"Huntsville, July 1981",[Animation],2024-09-22,13,0.0,0,Sol Friedman,[],In this captivating dark comedy by Fantastic F...,0,
2449,1353778,The Night Boots,[Animation],2024-09-22,12,0.0,0,Pierre-Luc Granjon,[],"The forest can be full of scary creatures, Eli...",0,


In [27]:
# Same check for the director column: show the movies with no director.
director_missing = pd.isnull(df['director'])
df[director_missing]

Unnamed: 0,movie_id,title,genre,release_date,runtime,rating,vote_count,director,cast,overview,gross,poster_link
1402,274181,Countdown to Eternity,[Documentary],1999-01-01,74,10.0,1,,"[Bill Gallatin, Dave Hunt, Chuck Missler, Roge...",Bible expert Bill Gallatin explores biblical p...,0,/mSozEOhPuYKa2HYqL4OhpmXchMn.jpg
1551,385281,Gravity Falls: Six Strange Tales,[],2013-10-15,129,8.219,32,,"[Jason Ritter, Kristen Schaal]","Gravity Falls, where there's always something ...",0,/oIGVrLzhtO47kLgf3TN0zswvdut.jpg
1677,492606,Game of Thrones - Conquest & Rebellion: An Ani...,"[Animation, Fantasy, War, TV Movie]",2017-12-12,45,7.75,94,,"[Nikolaj Coster-Waldau, Aidan Gillen, Sophie T...",HBO's animated history of Westeros brings to l...,0,/b3HXxFnhy0pamuDY9rqJ4mk7L1t.jpg
1821,619022,John Wick: Assassin's Code (Extra),[],2015-02-03,5,7.67,47,,[Keanu Reeves],John Wick Movie Extra,0,/fJbw16AwM59dEhSiCIAfFGgIgOP.jpg
2081,915557,The Porns,"[Comedy, Music]",,17,0.0,0,,"[Jee Heng-Liao, David Dotterer, Wataru Nishida...","In the near future, North Korea has establishe...",35592,/KVVIDBG8SH6dCldZO0GNJzaWVk.jpg
2116,939345,Transformers: Rise of the Beasts 2,"[Action, Adventure, Science Fiction]",,0,0.0,0,,[Peter Cullen],The first of two planned sequels to the 2023 f...,0,/f4PFiwOHVcNUXRcOmxX2hUYdAx7.jpg
2117,939347,Transformers: Rise of the Beasts 3,"[Action, Adventure, Science Fiction, Fantasy]",,0,0.0,0,,[],The second of two planned sequels to the 2023 ...,0,/zjDGpjRj9M9pLqVVZPpaFhG6BLx.jpg
2139,961651,Spider-Man: All Roads Lead to No Way Home,[Documentary],2022-05-03,30,6.8,73,,"[Tom Holland, Andrew Garfield, Tobey Maguire, ...",JB Smoove and Martin Starr host a celebration ...,0,/e2LLmI5wKIrWGAy9Of3yyWu7Szn.jpg
2196,1013296,Transformers Prime: One Shall Stand,"[Animation, Science Fiction]",2012-07-31,310,2.0,1,,[],Since acquiring Dark Energon - the very lifebl...,0,/pqt7KbU3UwwPswTjtB0P6VVyoFa.jpg
2382,1228246,Five Nights at Freddy's 2,"[Horror, Mystery]",2025-12-03,0,0.0,0,,"[Josh Hutcherson, Matthew Lillard, Piper Rubio...","Anyone can survive five nights. This time, the...",0,/ppZW3rMyrjkFitpDog6BRl2hIw5.jpg


In [28]:
# Seem like good candidates to get rid of.
# Movies without poster links are riddled with other missing data.
# Movies without director are extras or behind the scenes things.
# dropna() with no arguments removes every row that has a null in any column.
df = df.dropna()

df.info() # Prints its own report and returns None, so it is not wrapped in display().

<class 'pandas.core.frame.DataFrame'>
Index: 2440 entries, 0 to 2461
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movie_id      2440 non-null   int64  
 1   title         2440 non-null   object 
 2   genre         2440 non-null   object 
 3   release_date  2440 non-null   object 
 4   runtime       2440 non-null   int64  
 5   rating        2440 non-null   float64
 6   vote_count    2440 non-null   int64  
 7   director      2440 non-null   object 
 8   cast          2440 non-null   object 
 9   overview      2440 non-null   object 
 10  gross         2440 non-null   int64  
 11  poster_link   2440 non-null   object 
dtypes: float64(1), int64(4), object(7)
memory usage: 247.8+ KB


None

In [29]:
# Only the release year is wanted, not the exact release date.
# Some release dates are empty strings; show those rows.
release_date_blank = df['release_date'] == ''
df[release_date_blank]

Unnamed: 0,movie_id,title,genre,release_date,runtime,rating,vote_count,director,cast,overview,gross,poster_link
1776,574475,Final Destination: Bloodlines,"[Horror, Mystery]",,0,0.0,0,Adam B. Stein,"[Kaitlyn Santa Juana, Teo Briones, Andrew Tinp...",Sixth installment in the Final Destination fra...,0,/wkTzcp7qt8oDPDbqlnNZrOjBY1l.jpg
2028,848890,The Tomorrow War 2,"[Action, Science Fiction]",,0,0.0,0,Chris McKay,"[Chris Pratt, Yvonne Strahovski, J.K. Simmons,...",Sequel to The Tomorrow War (2021).,0,/5feOpRpuBL3aDuSdgmWMxDXe0Wz.jpg
2089,926670,Henry Danger: The Movie,"[Adventure, Action, Comedy, Family, Science Fi...",,0,0.0,0,Joe Menendez,"[Jace Norman, Sean Ryan Fox, Ella Anderson, Fr...",A film adaptation of the popular Nickelodeon s...,0,/iR3yZ63H3McDEB7Ze8rwD3W7uPL.jpg


In [30]:
# Remove the missing data.
# Filter on the condition instead of hard-coded index labels: the labels change
# whenever the movie list changes, and dropping labels that are already gone
# raises a KeyError if this cell is run a second time.
# .copy() makes the result an independent frame for the column assignments that follow.
df = df.loc[df['release_date'] != ''].copy()
df.loc[df['release_date'] == '', :]

Unnamed: 0,movie_id,title,genre,release_date,runtime,rating,vote_count,director,cast,overview,gross,poster_link


In [32]:
# Keep only the first four characters of each release date, i.e. the year.
year_text = df['release_date'].str[:4]
df['release_date'] = year_text
df.head()

Unnamed: 0,movie_id,title,genre,release_date,runtime,rating,vote_count,director,cast,overview,gross,poster_link
0,12,Finding Nemo,"[Animation, Family]",2003,100,7.82,19059,Andrew Stanton,"[Albert Brooks, Ellen DeGeneres, Alexander Gou...","Nemo, an adventurous young clownfish, is unexp...",846335536,/eHuGQ10FUzK1mdOY69wF5pGgEf5.jpg
1,14,American Beauty,[Drama],1999,122,8.019,11903,Sam Mendes,"[Kevin Spacey, Annette Bening, Thora Birch, We...","Lester Burnham, a depressed suburban father in...",341296601,/wby9315QzVKdW9BonAefg8jGTTb.jpg
2,16,Dancer in the Dark,"[Drama, Crime]",2000,140,7.9,1760,Lars von Trier,"[Björk, Catherine Deneuve, David Morse, Peter ...","Selma, a Czech immigrant on the verge of blind...",27561153,/8Wdd3fQfbbQeoSfWpHrDfaFNhBU.jpg
3,18,The Fifth Element,"[Adventure, Fantasy, Action, Thriller, Science...",1997,126,7.552,10591,Luc Besson,"[Bruce Willis, Milla Jovovich, Gary Oldman, Ia...","In 2257, a taxi driver is unintentionally give...",173920180,/fPtlCO1yQtnoLHOwKtWz7db6RGU.jpg
4,22,Pirates of the Caribbean: The Curse of the Bla...,"[Adventure, Fantasy, Action]",2003,143,7.805,20390,Gore Verbinski,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",After Port Royal is attacked and pillaged by a...,515011224,/poHwCZeWzJCShH7tOjg8RIoyjcw.jpg


In [33]:
# Cast the year strings to 64-bit integers.
year_as_int = df['release_date'].astype(np.int64)
df['release_date'] = year_as_int
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2437 entries, 0 to 2461
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movie_id      2437 non-null   int64  
 1   title         2437 non-null   object 
 2   genre         2437 non-null   object 
 3   release_date  2437 non-null   int64  
 4   runtime       2437 non-null   int64  
 5   rating        2437 non-null   float64
 6   vote_count    2437 non-null   int64  
 7   director      2437 non-null   object 
 8   cast          2437 non-null   object 
 9   overview      2437 non-null   object 
 10  gross         2437 non-null   int64  
 11  poster_link   2437 non-null   object 
dtypes: float64(1), int64(5), object(6)
memory usage: 247.5+ KB


In [34]:
# Rename the column.
# Reassign the renamed frame instead of using inplace=True.
df = df.rename(columns={'release_date': 'release_year'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2437 entries, 0 to 2461
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movie_id      2437 non-null   int64  
 1   title         2437 non-null   object 
 2   genre         2437 non-null   object 
 3   release_year  2437 non-null   int64  
 4   runtime       2437 non-null   int64  
 5   rating        2437 non-null   float64
 6   vote_count    2437 non-null   int64  
 7   director      2437 non-null   object 
 8   cast          2437 non-null   object 
 9   overview      2437 non-null   object 
 10  gross         2437 non-null   int64  
 11  poster_link   2437 non-null   object 
dtypes: float64(1), int64(5), object(6)
memory usage: 247.5+ KB


In [35]:
# Save the cleaned table. to_csv is called with its defaults, so the row index
# is written out as the first column of the file.
output_path = "raw/tmdb_movies.csv"
df.to_csv(output_path)