## Get Additional Data

Descriptions:
- Run a query to get all movies
- Run a function to get the TMDB match and store imdb_id and overview
- Save the results to a CSV for `update_db.ipynb` to write to the DB later

Trailers:
- Run a query to get all movies
- Run a function to get the TMDB match and store imdb_id and tmdb_id
- Run a function to get videos from TMDB using tmdb_id
- Save the results to a CSV for `update_db.ipynb` to write to the DB later

Missing Posters:
- Run a query to get movies with missing posters
- Run a function to get the TMDB match and store imdb_id and poster_path
- Save the results to a CSV for `update_db.ipynb` to write to the DB later

In [None]:
# for colab
%%capture
!pip install tmdbv3api

In [1]:
# imports 
from tmdbv3api import Movie, TMDb
import requests 
import json
import psycopg2
from multiprocessing import Pool
from tqdm._tqdm_notebook import tqdm_notebook
import time
import pandas as pd 
import os
from dotenv import load_dotenv

In [2]:
# load .env file for keys
# the cells below read tmdb_key and the DB_* / HOST / PORT settings via os.getenv
load_dotenv()

True

In [3]:
# TMDB setup: read the API key from the environment once and hand it to
# both tmdbv3api objects used by this notebook
tmdb_api_key = os.getenv('tmdb_key')

# client used later to fetch videos for a TMDB id
movie_client = Movie()
Movie.API_KEY = tmdb_api_key

# library-level configuration object
tmdb = TMDb()
tmdb.api_key = tmdb_api_key

In [4]:
# open the connection to the prod DB; every setting is taken from the
# environment so that no credentials live in the notebook itself
connection = psycopg2.connect(
    user=os.environ.get('DB_USER'),
    password=os.environ.get('DB_PASSWORD'),
    host=os.environ.get('HOST'),
    port=os.environ.get('PORT'),
    database=os.environ.get('DB_NAME'),
)

In [13]:
# get all movies with missing posters
# NOTE(review): in SQL, `NOT LIKE` does not match NULL, so rows whose
# poster_url is NULL are not returned by this query -- confirm that is intended.
postgreSQL_select_Query = "SELECT movie_id, primary_title FROM movies where poster_url not like '/%';"
# the with-block closes the cursor as soon as the rows have been fetched
with connection.cursor() as cursor:
    cursor.execute(postgreSQL_select_Query)
    # list of (movie_id, primary_title) tuples
    missing_posters = cursor.fetchall()

In [5]:
# get all movies
postgreSQL_select_Query = "SELECT movie_id, primary_title FROM movies;"
# the with-block closes the cursor as soon as the rows have been fetched
with connection.cursor() as cursor:
    cursor.execute(postgreSQL_select_Query)
    # list of (movie_id, primary_title) tuples
    all_movies = cursor.fetchall()

In [15]:
# how many do we have of each?
# displayed as (number of all movies, number of movies with a missing poster)
len(all_movies), len(missing_posters)

(301967, 60255)

## Find missing posters using TMDB

In [6]:
# get tmdb match to be able to check for posters
def get_poster(movie):
    """Look up one movie on TMDB by its IMDb id and return poster information.

    Parameters
    ----------
    movie : tuple
        ``(movie_id, title)`` pair as returned by the movie queries.
        ``movie_id`` is appended to ``tt`` to form the IMDb id sent to TMDB;
        ``title`` is not used.

    Returns
    -------
    dict or None
        ``{"movie_id", "tmdb_id", "poster_path"}`` taken from the first entry
        of ``movie_results``, or ``None`` when TMDB returns no movie match.
        ``movie_id`` is converted to ``str``; ``poster_path`` is passed through
        unchanged and may be ``None``.

    Raises
    ------
    requests.HTTPError
        When TMDB answers with an HTTP error status.
    """
    movie_id, _title = movie
    url = f'https://api.themoviedb.org/3/find/tt{movie_id}?api_key={tmdb_api_key}&external_source=imdb_id'
    # without a timeout one stalled connection would block its worker forever
    response = requests.get(url, timeout=30)
    # fail with the real HTTP status instead of a KeyError on an error payload
    response.raise_for_status()
    movie_results = response.json().get('movie_results') or []
    if not movie_results:
        return None
    result = movie_results[0]
    return {
        "movie_id": str(movie_id),
        "tmdb_id": result['id'],
        "poster_path": result['poster_path']
    }


# fan the TMDB lookups out over 5 worker processes and show a progress bar
# NOTE(review): this iterates all_movies; missing_posters is fetched above but
# only its length is used -- confirm which list was meant here.
with Pool(5) as pool:
    poster_lookups = pool.imap(get_poster, all_movies)
    posters = list(tqdm_notebook(poster_lookups, total=len(all_movies)))

HBox(children=(IntProgress(value=0, max=301967), HTML(value='')))




In [7]:
# check success rate of get_poster
# keep only the lookups for which TMDB returned a match
matched_posters = [lookup for lookup in posters if lookup is not None]
matches = len(matched_posters)

# one list per future dataframe column
movie_ids = [lookup['movie_id'] for lookup in matched_posters]
tmdb_ids = [lookup['tmdb_id'] for lookup in matched_posters]
poster_paths = [lookup['poster_path'] for lookup in matched_posters]

# matches that actually carry a poster
poster_counter = sum(path is not None for path in poster_paths)

# total lookups, matches, misses
print(len(posters), matches, len(posters)-matches)
poster_counter

301967 201482 100485


123960

In [8]:
# assemble the poster lookups into a dataframe, one column per collected list
poster_df = pd.DataFrame(
    dict(
        movie_id=movie_ids,
        tmdb_id=tmdb_ids,
        poster_path=poster_paths,
    )
)
print(poster_df.shape)
poster_df.head()

(201482, 3)


Unnamed: 0,movie_id,tmdb_id,poster_path
0,1051259,87397,/4c27iElQGoLSlrg2uZQUANHA4aa.jpg
1,1051834,533781,
2,10520386,637881,/iASQXlZqZfERdwTnBxIRpfgKrsO.jpg
3,1052347,201899,
4,10519798,676343,


In [9]:
# save poster df to csv
# written to the current working directory; index=False leaves the row index out of the file
poster_df.to_csv('poster_data.csv', index=False)

In [11]:
# ensuring save worked correctly
# NOTE(review): this rebinds poster_df to the copy read back from disk, so the
# column dtypes are whatever read_csv infers -- confirm movie_id keeps the
# form update_db.ipynb expects.
poster_df = pd.read_csv('poster_data.csv')
poster_df.head()

Unnamed: 0,movie_id,tmdb_id,poster_path
0,1051231,31223,
1,1051704,636806,
2,1051245,120528,
3,1051226,41255,/85PnTI5NknwbOabekq30kqisMPX.jpg
4,1051834,533781,


## Get all descriptions

In [7]:
# get tmdb match to retrieve descriptions
def get_description(movie):
    """Look up one movie on TMDB by its IMDb id and return its overview text.

    Parameters
    ----------
    movie : tuple
        ``(movie_id, title)`` pair as returned by the movie queries.
        ``movie_id`` is appended to ``tt`` to form the IMDb id sent to TMDB;
        ``title`` is not used.

    Returns
    -------
    dict or None
        ``{"movie_id", "tmdb_id", "description"}`` taken from the first entry
        of ``movie_results`` (``description`` is that entry's ``overview``),
        or ``None`` when TMDB returns no movie match.

    Raises
    ------
    requests.HTTPError
        When TMDB answers with an HTTP error status.
    """
    movie_id, _title = movie
    url = f'https://api.themoviedb.org/3/find/tt{movie_id}?api_key={tmdb_api_key}&external_source=imdb_id'
    # without a timeout one stalled connection would block its worker forever
    response = requests.get(url, timeout=30)
    # fail with the real HTTP status instead of a KeyError on an error payload
    response.raise_for_status()
    movie_results = response.json().get('movie_results') or []
    if not movie_results:
        return None
    result = movie_results[0]
    return {
        "movie_id": movie_id,
        "tmdb_id": result['id'],
        "description": result['overview']
    }


# run the description lookup for every movie on 10 worker processes,
# with a progress bar sized to the full movie list
with Pool(10) as pool:
    description_lookups = pool.imap(get_description, all_movies)
    descs = list(tqdm_notebook(description_lookups, total=len(all_movies)))

HBox(children=(IntProgress(value=0, max=301967), HTML(value='')))




In [11]:
# create df from results
# drop the lookups that found no TMDB match
matched_descs = [lookup for lookup in descs if lookup is not None]

# one list per dataframe column
movie_ids = [lookup['movie_id'] for lookup in matched_descs]
tmdb_ids = [lookup['tmdb_id'] for lookup in matched_descs]
descs_list = [lookup['description'] for lookup in matched_descs]

description_result = pd.DataFrame(
    dict(
        movie_id=movie_ids,
        tmdb_id=tmdb_ids,
        description=descs_list,
    )
)
print(description_result.shape)
description_result.head()

Unnamed: 0,movie_id,tmdb_id,description
0,1051231,31223,In the Hands of the Gods is the true story of ...
1,1051244,573815,A group of talented youth exploited by the hea...
2,10513474,639651,A unique chance to explore Pier Paolo Pasolini...
3,10515086,599672,A meeting with a new inmate in the psychiatric...
4,10515340,531678,A strange disease is plaguing the city. Hoping...


In [12]:
# save results to csv
# written to the current working directory; index=False leaves the row index out of the file
description_result.to_csv('description_results.csv', index=False)

In [15]:
# read the saved descriptions back; this rebinds description_result to the copy on disk
# NOTE(review): in the recorded outputs tmdb_id comes back as float and the
# null count below reports 74 missing tmdb_id values -- this suggests the file
# does not round-trip cleanly; verify before relying on the re-read frame.
description_result = pd.read_csv('description_results.csv', engine='python')
description_result.head()

Unnamed: 0,movie_id,tmdb_id,description
0,1051231,31223.0,In the Hands of the Gods is the true story of ...
1,1051244,573815.0,A group of talented youth exploited by the hea...
2,10513474,639651.0,A unique chance to explore Pier Paolo Pasolini...
3,10515086,599672.0,A meeting with a new inmate in the psychiatric...
4,10515340,531678.0,A strange disease is plaguing the city. Hoping...


## Get trailers

In [21]:
# prep desc results for get_trailers
# both_ids = []
# for elem in descs:
#   curr = (elem['movie_id'], elem['tmdb_id'])
#   both_ids.append(curr)

# len(descs), len(both_ids)
def get_id_tuple(row):
    """Return ``(movie_id, tmdb_id)`` for one row, with ``tmdb_id`` cast to int.

    ``row`` only needs to support lookup by the keys ``'movie_id'`` and
    ``'tmdb_id'``; ``movie_id`` is returned unchanged.
    """
    imdb_id = row['movie_id']
    tmdb_id = int(row['tmdb_id'])
    return imdb_id, tmdb_id

# prep description_result for trailers:
# rows without a tmdb_id cannot be looked up, so drop them first
description_result = description_result.dropna(subset=['tmdb_id'])

# one (movie_id, tmdb_id) tuple per remaining row
id_pairs = description_result.apply(get_id_tuple, axis=1)
both_ids = id_pairs.tolist()

# row count, pair count and a sample pair
len(description_result), len(both_ids), both_ids[0]

(201278, 201278, ('1051231', 31223))

In [19]:
# count missing values per column of description_result
# NOTE(review): the saved output comes from execution In [19], i.e. before the
# dropna cell above (In [21]) ran; in top-to-bottom order the tmdb_id count
# would be 0.
description_result.isnull().sum()

movie_id           0
tmdb_id           74
description    27476
dtype: int64

In [56]:
# get trailers from tmdb match
def get_trailer(movie):
    """Fetch the first video TMDB lists for a movie.

    Parameters
    ----------
    movie : tuple
        ``(imdb_id, tmdb_id)`` pair; ``tmdb_id`` is passed to
        ``movie_client.videos`` and ``imdb_id`` is echoed back in the result.

    Returns
    -------
    dict or None
        ``{"movie_id", "video_key", "video_site", "more_than_one"}`` built from
        the first video returned, where ``more_than_one`` tells whether TMDB
        returned further videos. ``None`` when there are no videos or when the
        lookup fails with an ordinary exception (best effort, so one bad id
        does not abort the whole run).
    """
    imdb_id, tmdb_id = movie
    # check for videos
    try:
        video_results = movie_client.videos(tmdb_id)
        if len(video_results) == 0:
            return None
        # NOTE(review): the first video is taken whatever its kind -- confirm
        # whether the results should be filtered to actual trailers.
        first_video = video_results[0]
        return {
            "movie_id": imdb_id,
            "video_key": first_video.key,
            "video_site": first_video.site,
            "more_than_one": len(video_results) > 1
        }
    except Exception:
        # a bare `except:` would also swallow KeyboardInterrupt / SystemExit,
        # which makes a long pool run impossible to stop cleanly
        return None

In [57]:
# try get_trailer on a single (movie_id, tmdb_id) pair before running it on all ids
print(get_trailer(both_ids[5]))

{'movie_id': '10515460', 'video_key': 'HQksgesFrFY', 'video_site': 'YouTube', 'more_than_one': False}


In [58]:
# fetch trailers for every (movie_id, tmdb_id) pair on 5 worker processes,
# with a progress bar sized to the number of pairs
with Pool(5) as pool:
    trailer_lookups = pool.imap(get_trailer, both_ids)
    trailers = list(tqdm_notebook(trailer_lookups, total=len(both_ids)))

HBox(children=(IntProgress(value=0, max=201278), HTML(value='')))




In [60]:
# check success rate of get_trailers and prep data for DF

# keep only the lookups that returned a video
found_trailers = [lookup for lookup in trailers if lookup is not None]
trailer_count = len(found_trailers)

# one list per future dataframe column
movie_ids = [lookup['movie_id'] for lookup in found_trailers]
video_keys = [lookup['video_key'] for lookup in found_trailers]
video_sites = [lookup['video_site'] for lookup in found_trailers]
more_than_ones = [lookup['more_than_one'] for lookup in found_trailers]

# lookups attempted vs. lookups that found a video
len(trailers), trailer_count

(201278, 50793)

In [61]:
# collect the trailer lookups in a dataframe, one column per collected list
trailer_df = pd.DataFrame(
    dict(
        movie_id=movie_ids,
        video_key=video_keys,
        video_site=video_sites,
        more_than_one=more_than_ones,
    )
)
trailer_df.head()

Unnamed: 0,movie_id,video_key,video_site,more_than_one
0,1051244,ztSS7hnEviY,YouTube,False
1,10515086,WA2NvFSHchk,YouTube,False
2,10515460,HQksgesFrFY,YouTube,False
3,10515480,QBNKpcUOWgI,YouTube,False
4,1051232,k9SdzYiyG14,YouTube,False


In [62]:
# check if any have more than one
# counts the movies per more_than_one flag (True = TMDB returned several videos)
trailer_df['more_than_one'].value_counts()

False    43481
True      7312
Name: more_than_one, dtype: int64

In [63]:
# check what sites are used
# counts the movies per value of video_site
trailer_df['video_site'].value_counts()

YouTube    50205
Vimeo        588
Name: video_site, dtype: int64

In [64]:
# save df as csv
# written to the current working directory; index=False leaves the row index out of the file
trailer_df.to_csv('trailer_data.csv', index=False)

In [65]:
# ensure save was successful
# reads the file back with the default parser and rebinds trailer_df to that copy
trailer_df = pd.read_csv('trailer_data.csv') #, engine='python'
trailer_df.head()

Unnamed: 0,movie_id,video_key,video_site,more_than_one
0,1051244,ztSS7hnEviY,YouTube,False
1,10515086,WA2NvFSHchk,YouTube,False
2,10515460,HQksgesFrFY,YouTube,False
3,10515480,QBNKpcUOWgI,YouTube,False
4,1051232,k9SdzYiyG14,YouTube,False
