In [2]:
import requests
import os
from dotenv import load_dotenv   
import pandas as pd
from tqdm import tqdm
from pathlib import Path

load_dotenv()

TMDB_TOKEN = os.getenv("TMDB_TOKEN")

HEADERS = {
    "accept": "application/json",
    "Authorization": "Bearer " + str(TMDB_TOKEN)
}


def check_authentication(headers):
    url = "https://api.themoviedb.org/3/authentication"
    response = requests.get(url, headers=headers)
    return response.status_code == 200


def get_movies_on_page(headers, language, getNumPages = False, page=1):
    """
        Used to get a specific page of records from the movies API in TMDB. To avoid overload when calling the API, the 
        list of movies/shows are splitted in pages you have to iterate over. 
        This function is used to iterate over all the pages in the method "get_top_rated_movies".
    """
    
    errors = []
    url = f"https://api.themoviedb.org/3/movie/top_rated?language={language}&page={page}"
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        results = data.get("results", [])
        
        if getNumPages:
            num_pages = data.get("total_pages", [])
            return pd.DataFrame(results), num_pages, errors
        else:
            return pd.DataFrame(results), errors
    
    else: 
        errors.append(f"TMDB - Movies - Error {response.status_code} on page {page}")
        return pd.DataFrame(), errors


def get_top_rated_movies(headers, language='es-ES'):
    total_result, num_pages, errors = get_movies_on_page(headers, language, getNumPages = True, page=1)
    
    for i in tqdm(range(2, num_pages), desc=f"TMDB - Retrieving movies [{language}] "):
        page_result, errors = get_movies_on_page(headers, language, page=i)
        
        total_result = pd.concat([total_result, page_result], ignore_index=True)
    
    total_result["type"] = 'movie'
    
    return total_result, errors


def get_shows_on_page(headers, language, getNumPages = False, page=1):
    """
        Used to get a specific page of records from the shows API in TMDB. To avoid overload when calling the API, the 
        list of movies/shows are splitted in pages you have to iterate over. 
        This function is used to iterate over all the pages in the method "get_top_rated_shows".
    """
    
    errors = []
    url = f"https://api.themoviedb.org/3/tv/top_rated?language={language}&page={page}"

    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        results = data.get("results", [])
        
        if getNumPages:
            num_pages = data.get("total_pages", [])
            return pd.DataFrame(results), num_pages, errors
        else:
            return pd.DataFrame(results), errors
    
    else: 
        errors.append(f"TMDB - Shows - Error {response.status_code} on page {page}")
        return pd.DataFrame(), errors


def get_top_rated_shows(headers, language='es-ES'):
    total_result, num_pages, errors = get_shows_on_page(headers, language, getNumPages = True, page=1)
    
    for i in tqdm(range(2, num_pages), desc=f"TMDB - Retrieving shows  [{language}] "):
        page_result, errors = get_shows_on_page(headers, language, page=i)
        
        total_result = pd.concat([total_result, page_result], ignore_index=True)
    
    total_result["type"] = 'show'
    
    return total_result, errors

def get_movie_genres(headers, language='es-ES'):
    url = f"https://api.themoviedb.org/3/genre/movie/list?language={language}"
    
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        return pd.DataFrame(data.get('genres'))
    
    else:
        return f"Error getting MOVIE genres - Status Code: {response.status_code}"
    
def get_shows_genres(headers, language='es-ES'):
    url = f"https://api.themoviedb.org/3/genre/tv/list?language={language}"
    
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        return pd.DataFrame(data.get('genres'))
    
    else:
        return f"Error getting SHOWS genres - Status Code: {response.status_code}"

def get_watch_providers(headers, movie_id, type, selected_language = 'ES'):
    """
     - Type can be 'movie' or 'tv' (for shows)
    """
    url = f"https://api.themoviedb.org/3/{type}/{movie_id}/watch/providers"

    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        filter_language = data.get("results").get(selected_language)
        if filter_language: 
            flatrate_options = filter_language.get("flatrate") # Only interested in the ones that are included in subscriptions (flatrate)
            if flatrate_options: 
                print(flatrate_options)
                providers_list = [provider.get('provider_name') for provider in flatrate_options]
                return {id: providers_list}
            else: 
                print(f"Flatrate option for {type}: {movie_id} not found")
                return []
        else: 
            print(f"Language '{selected_language}' for the {type}: {movie_id} not found")
            return []
    else:
        print("Error getting watch provider")
        return []


In [4]:

if __name__ == '__main__':
    
    if check_authentication(HEADERS):
        
        # Extraction in es-ES
        movies_ES, movies_errors = get_top_rated_movies(HEADERS)
        shows_ES, shows_errors = get_top_rated_shows(HEADERS)
        
        # Extraction in en-US (later needed to search in OMDB)
        movies_EN, movies_errors = get_top_rated_movies(HEADERS, language='en-US')
        shows_EN, shows_errors = get_top_rated_shows(HEADERS, language='en-US')
        
        # Merge all of es-ES with titles in en-US
        movies_titles_EN = movies_EN[["id", "title"]].rename(columns={"title": "title_EN"})
        result_movies = movies_ES.merge(movies_titles_EN, on='id', how='left').rename(columns={"title": "title_ES"}).sort_values(by=['vote_average', 'vote_count'], ascending=False)
        
        shows_titles_EN = shows_EN[["id", "name"]].rename(columns={"name": "title_EN"})
        result_shows = shows_ES.merge(shows_titles_EN, on='id', how='left').rename(columns={"name": "title_ES"}).sort_values(by=['vote_average', 'vote_count'], ascending=False)
        
        # Show errors during process
        print("Errors during the extraction: ", movies_errors + shows_errors)
        
        # Extraction of movie-shows genres
        movie_genres = get_movie_genres(HEADERS)
        shows_genres = get_shows_genres(HEADERS)
        
        
        # Save files
        BASE_DIR = Path(__file__).resolve().parent.parent.parent
        RAW_DIR = BASE_DIR / "data" / "1_bronze"

        movies_absolute_path = f"{RAW_DIR}\\TMDB_top_rated_movies.csv"
        shows_absolute_path = f"{RAW_DIR}\\TMDB_top_rated_shows.csv"
        
        movies_genres_path = f"{RAW_DIR}\\TMDB_movies_genres.csv"
        shows_genres_path = f"{RAW_DIR}\\TMDB_shows_genres.csv"
        
        watch_providers_movies_path = f"{RAW_DIR}\\TMDB_watch_providers_movies.csv"
        watch_providers_shows_path = f"{RAW_DIR}\\TMDB_watch_providers_shows.csv"
        
        # Movies and shows have different columns, need to be merged later
        result_movies.to_csv(movies_absolute_path, sep=';')
        result_shows.to_csv(shows_absolute_path, sep=';')
        
        movie_genres.to_csv(movies_genres_path, sep=';')
        shows_genres.to_csv(shows_genres_path, sep=';')
        
        
        
        print(f"TMDB - Movies file saved on: {movies_absolute_path}")
        print(f"TMDB - Shows file saved on: {shows_absolute_path}")
         
    else: 
        print('ERROR: Connection to API failed')
    
   

TMDB - Retrieving movies [es-ES] : 100%|██████████| 505/505 [01:58<00:00,  4.26it/s]
TMDB - Retrieving shows  [es-ES] : 100%|██████████| 108/108 [00:20<00:00,  5.36it/s]
TMDB - Retrieving movies [en-US] : 100%|██████████| 505/505 [01:34<00:00,  5.34it/s]
TMDB - Retrieving shows  [en-US] : 100%|██████████| 108/108 [00:19<00:00,  5.55it/s]


Errors during the extraction:  ['TMDB - Movies - Error 400 on page 506']


NameError: name '__file__' is not defined

In [5]:

# Extraction of watch providers for the data we have
watch_providers_movies = movies_ES["id"].apply(lambda movie_id: get_watch_providers(HEADERS, movie_id, 'movie'))
watch_providers_shows = movies_ES["id"].apply(lambda movie_id: get_watch_providers(HEADERS, movie_id, 'tv'))

# watch_providers_shows = shows_ES["id"].apply(lambda movie_id: get_watch_providers(HEADERS, movie_id, 'movie'))

[{'logo_path': '/pbpMk2JmcoNnQwx5JGpXngfoWtp.jpg', 'provider_id': 8, 'provider_name': 'Netflix', 'display_priority': 0}, {'logo_path': '/jse4MOi92Jgetym7nbXFZZBI6LK.jpg', 'provider_id': 2241, 'provider_name': 'Movistar Plus+', 'display_priority': 5}, {'logo_path': '/f6TRLB3H4jDpFEZ0z2KWSSvu1SB.jpg', 'provider_id': 149, 'provider_name': 'Movistar Plus+ Ficción Total ', 'display_priority': 6}, {'logo_path': '/kO2SWXvDCHAquaUuTJBuZkTBAuU.jpg', 'provider_id': 63, 'provider_name': 'Filmin', 'display_priority': 7}, {'logo_path': '/dpR8r13zWDeUR0QkzWidrdMxa56.jpg', 'provider_id': 1796, 'provider_name': 'Netflix Standard with Ads', 'display_priority': 61}]
[{'logo_path': '/f6TRLB3H4jDpFEZ0z2KWSSvu1SB.jpg', 'provider_id': 149, 'provider_name': 'Movistar Plus+ Ficción Total ', 'display_priority': 6}, {'logo_path': '/h0ZYcYHicKQ4Ixm5nOjqvwni5NG.jpg', 'provider_id': 1773, 'provider_name': 'SkyShowtime', 'display_priority': 10}, {'logo_path': '/170ZfHTLT6ZlG38iLLpNYcBGUkG.jpg', 'provider_id': 1899,

KeyboardInterrupt: 

In [None]:
watch_providers_movies