# Using the TMDB API

In [10]:
import requests
import json
import gzip
import pandas as pd
from urllib.request import urlretrieve
from datetime import date

## Movie IDs DataFrame

We need to be able to look up the TMDB ID for each movie, so we first download TMDB's movie ID export file, unzip it, and store its contents in a data frame.

In [16]:
def fetch_tmdb_movie_ids(day, month, year, directory="./data/tmdb_id_file.gz",
                         base_url="http://files.tmdb.org/p/exports"):
    """
    Download the TMDB movie ID export for the given date and load it into a data frame.

    The export is a gzip-compressed text file containing one JSON object per line.
    Required arguments:
    - day: int, day of the month of the export to download
    - month: int, month of the export to download
    - year: int, year of the export to download
    Optional arguments:
    - directory: string, path of the local FILE (not a folder) the zipped export is saved to,
      defaults to ./data/tmdb_id_file.gz. If None or empty, urlretrieve chooses the local file.
    - base_url: string, URL prefix the export file name is appended to,
      defaults to http://files.tmdb.org/p/exports
    Returns:
    - movie_id_df: pandas dataframe with one row per JSON object found in the export.
    """
    # TMDB names its exports movie_ids_MM_DD_YYYY.json.gz, so single-digit days and
    # months must be zero-padded or the request targets a file that does not exist.
    file_name = f"movie_ids_{int(month):02d}_{int(day):02d}_{int(year):04d}.json.gz"
    url = f"{base_url.rstrip('/')}/{file_name}"

    # urlretrieve returns the path it actually wrote to, which also covers the case
    # where no explicit download location was given.
    local_path, _ = urlretrieve(url, directory or None)

    # Read the archive line by line; blank lines are skipped instead of blindly
    # dropping the last line, so a file without a trailing newline loses no record.
    with gzip.open(local_path, "rt", encoding="utf-8") as fin:
        records = [json.loads(line) for line in fin if line.strip()]

    # Convert the list of dictionaries to a dataframe
    movie_id_df = pd.DataFrame(records)

    return movie_id_df
    

In [18]:
# Load the TMDB ID export dated 10 October 2021 and take a first look at it
movie_id_df = fetch_tmdb_movie_ids(year=2021, month=10, day=10)
print(movie_id_df.shape)
movie_id_df.head()

(644416, 5)


Unnamed: 0,adult,id,original_title,popularity,video
0,False,3924,Blondie,1.659,False
1,False,6124,Der Mann ohne Namen,0.6,False
2,False,8773,L'amour à vingt ans,2.741,False
3,False,25449,New World Disorder 9: Never Enough,1.545,False
4,False,31975,Sesame Street: Elmo Loves You!,0.6,True


## Getting Info for individual movies

### IDs of Movies currently running

In [38]:
def get_zurich_movie_ids(date_string, tmdb_ids_dataframe, data_dir="./data"):
    """
    Match the Zurich movie program for a given date against the TMDB ID data frame.

    The program is read from the file <data_dir>/<date_string>_showtimes_zurich.csv and
    matched on exact equality between its 'movie' column and the TMDB 'original_title' column.
    When several TMDB entries share a title, only the one with the highest popularity is kept;
    a title whose highest popularity is shared by more than one entry is left out, because
    there is no unambiguous pick.
    Required arguments:
    - date_string: string of the date (format YYYY-MM-DD) for which the movie program will be loaded, e.g. 2021-09-24
    - tmdb_ids_dataframe: pandas dataframe with at least the columns 'original_title' and 'popularity'.
    Optional arguments:
    - data_dir: string, folder containing the showtimes csv file, defaults to ./data
    Returns:
    - newest_films_df: pandas dataframe with one row of tmdb_ids_dataframe per matched title, in the
      order the titles first appear in the program. Empty (with the same columns) if nothing matches.
    """
    # Load the movie program for the requested date
    cineman_df = pd.read_csv(f"{data_dir}/{date_string}_showtimes_zurich.csv", index_col=0)

    # Distinct titles that are currently running (missing titles can never match)
    current_movies = cineman_df["movie"].dropna().unique()

    # Narrow the TMDB data down to rows whose title appears in the program
    candidates = tmdb_ids_dataframe[tmdb_ids_dataframe["original_title"].isin(current_movies)]

    newest_films = []
    for movie in current_movies:
        same_title = candidates[candidates["original_title"] == movie]
        if same_title.empty:
            continue
        most_popular = same_title[same_title["popularity"] == same_title["popularity"].max()]
        # Keep the title only when the most popular entry is unambiguous
        if len(most_popular) == 1:
            newest_films.append(most_popular)

    # pd.concat raises on an empty list, so return an empty frame with the same columns instead
    if not newest_films:
        return tmdb_ids_dataframe.iloc[:0].reset_index(drop=True)

    newest_films_df = pd.concat(newest_films).reset_index(drop=True)

    return newest_films_df
    

In [42]:
# Match the Zurich program of 24 September 2021 against the TMDB IDs
newest_films_df = get_zurich_movie_ids(
    date_string="2021-09-24",
    tmdb_ids_dataframe=movie_id_df,
)
print(newest_films_df.shape)
newest_films_df.head()

(62, 5)


Unnamed: 0,adult,id,original_title,popularity,video
0,False,600354,The Father,35.44,False
1,False,649137,Billie,2.336,False
2,False,698369,Helmut Newton: The Bad and the Beautiful,2.426,False
3,False,438631,Dune,416.541,False
4,False,527774,Raya and the Last Dragon,402.802,False


### Get the Description for these Movies

In [47]:
def get_movie_overviews(credentials_path, currently_running_movies_df, timeout=10):
    """
    Request the overview text of each movie from the TMDB API and add it as a column.

    Required arguments:
    - credentials_path: string, path to the file with the TMDB API credentials. The file is parsed
      as JSON and must contain the key 'api_key'.
    - currently_running_movies_df: pandas dataframe containing a column named 'id' with the movie ids
    Optional arguments:
    - timeout: number of seconds to wait for each API response, defaults to 10
    Returns:
    - movies_with_overviews_df: copy of currently_running_movies_df with one additional column
      'overview'. The input dataframe itself is left unchanged.
    Raises:
    - requests.HTTPError if the API answers a request with an error status.
    """
    # Load the API credentials; the with-block closes the file again
    with open(credentials_path) as credentials_file:
        tmdb_api_key = json.load(credentials_file)["api_key"]

    # Request info for each movie and collect the overviews in row order
    overviews = []
    for movie_id in currently_running_movies_df["id"]:
        response = requests.get(
            f"https://api.themoviedb.org/3/movie/{movie_id}",
            params={"api_key": tmdb_api_key},
            timeout=timeout,
        )
        # Fail with the HTTP error instead of a KeyError on the missing 'overview' field
        response.raise_for_status()
        overviews.append(response.json()["overview"])

    # Work on a copy so the caller's dataframe does not silently gain a column
    movies_with_overviews_df = currently_running_movies_df.copy()
    movies_with_overviews_df["overview"] = overviews

    return movies_with_overviews_df

In [49]:
# Location of the file holding the TMDB API credentials
credentials_path = "./tmdb_credentials.yml"

# Fetch an overview for every movie matched in the Zurich program
movies_with_overviews_df = get_movie_overviews(
    credentials_path=credentials_path,
    currently_running_movies_df=newest_films_df,
)
print(movies_with_overviews_df.shape)
movies_with_overviews_df.head()

(62, 6)


Unnamed: 0,adult,id,original_title,popularity,video,overview
0,False,600354,The Father,35.44,False,A man refuses all assistance from his daughter...
1,False,649137,Billie,2.336,False,‘Lady Day’ was one of the greatest jazz vocali...
2,False,698369,Helmut Newton: The Bad and the Beautiful,2.426,False,Women were clearly at the core of legendary ph...
3,False,438631,Dune,416.541,False,"Paul Atreides, a brilliant and gifted young ma..."
4,False,527774,Raya and the Last Dragon,402.802,False,"Long ago, in the fantasy world of Kumandra, hu..."


In [48]:
# Save the overviews to a csv file named after today's date
overviews_csv_path = f"data/{date.today()}_zurich_movie_overviews.csv"
movies_with_overviews_df.to_csv(overviews_csv_path)