# Get Service Providers

This notebook is used to grab service provider information from JustWatch. The only way to search using this API is by title, so the results are imperfect. A fuzzywuzzy ratio is currently calculated to try to determine if the search result is a good match. 

In [1]:
# imports 
from justwatch import JustWatch # https://github.com/dawoudt/JustWatchAPI
import psycopg2
from multiprocessing import Pool
from tqdm._tqdm_notebook import tqdm_notebook
import time
from fuzzywuzzy import fuzz # https://github.com/seatgeek/fuzzywuzzy
import pandas as pd
import os
from dotenv import load_dotenv

In [2]:
# load .env file for keys
load_dotenv()

True

In [3]:
# make justwatch instance
just_watch = JustWatch(country='US')

In [4]:
# make providers list to narrow search
provider_details = just_watch.get_providers()
my_providers = []

for provider in provider_details:
    if len(provider['data']['deeplink_data']) > 0:
        my_providers.append((
            provider['id'], 
            provider['short_name'], 
            provider['clear_name'], 
            provider['data']['deeplink_data'][0]['path_template']
        ))

my_providers[0], len(my_providers)

((8, 'nfx', 'Netflix', 'www.netflix.com/watch/%DEEPLINK%'), 49)

In [5]:
# test API
result = just_watch.search_for_item(
    query='the other guys', 
    release_year_from=2010, 
    release_year_until=2010)
if result['total_results'] > 0:
    test_item = result['items'][0]
test_item.keys()

dict_keys(['jw_entity_id', 'id', 'title', 'full_path', 'full_paths', 'poster', 'original_release_year', 'tmdb_popularity', 'object_type', 'offers', 'scoring'])

In [6]:
# test get by id
the_matrix = just_watch.get_title(title_id=test_item['id'])
the_matrix['title']

'The Other Guys'

In [7]:
test_item['id'], test_item['title'], test_item['offers'][:2]

(102740,
 'The Other Guys',
 [{'monetization_type': 'buy',
   'provider_id': 7,
   'retail_price': 13.99,
   'last_change_retail_price': 4.99,
   'last_change_difference': 9,
   'last_change_percent': 180.36072,
   'last_change_date': '2020-04-22',
   'last_change_date_provider_id': '2020-04-22_7',
   'currency': 'USD',
   'urls': {'standard_web': 'https://www.vudu.com/content/movies/details/The-Other-Guys/176855',
    'deeplink_android_tv': 'intent://play?contentId=176855#Intent;launchFlags=0x00800000;scheme=vuduapp;component=air.com.vudu.air.DownloaderTablet/.TvMainActivity;end',
    'deeplink_tvos': 'vuduapp://play?contentId=176855'},
   'presentation_type': 'sd',
   'date_provider_id': '2000-01-01_7',
   'date_created': '2000-01-01'},
  {'monetization_type': 'rent',
   'provider_id': 7,
   'retail_price': 2.99,
   'currency': 'USD',
   'urls': {'standard_web': 'https://www.vudu.com/content/movies/details/The-Other-Guys/176855',
    'deeplink_android_tv': 'intent://play?contentId=17

In [8]:
# create connection to prod DB
connection = psycopg2.connect(
                            user=os.getenv('db_user'),
                            password=os.getenv('db_password'),
                            host=os.getenv('db_host'),
                            port="5432",
                            database=os.getenv('db_user')
                            )

In [9]:
# get all movies
cursor = connection.cursor()
postgreSQL_select_Query = "SELECT movie_id, primary_title, start_year FROM movies;"
cursor.execute(postgreSQL_select_Query)
all_movies = cursor.fetchall()
len(all_movies)

301967

In [10]:
# function to search for movie
"""
Now saving a ratio score from fuzzywuzzy 
to later determine if the result is the correct movie
Could improve by calculating ratio after collecting JustWatch info
because currently the collection process is slow (~1.5hr per 50k)

I'm finding it may be better to broaden the providers in the 
search to more effectively get the correct movie, then can 
filter down to the offer data from the providers we want.
"""
def get_movie(movie_data, providers=['nfx', 'amp', 'hlu', 'amz', 'dnp', 'yot']):
    movie_id, title, start_year = movie_data
    # only other way I see to improve this query is with:
    # release_year_from and release_year_until
    try:
        # could also save original_release_year for verifying match 
        # and poster may be another source for posters
        # because they also provide tmdb_popularity I'm guessing id may be linked to tmdb
        result = just_watch.search_for_item(
            query=title, 
            providers=providers,
            release_year_from=start_year, 
            release_year_until=start_year
        )
        result = result['items'][0]
        return {
            'movie_id': movie_id,
            'movie_title': title,
            'id': result['id'],
            'title': result['title'], 
            'offers': result['offers'],
            'ratio': fuzz.ratio(result['title'].lower(),title.lower())
        }
    except: 
        return None

In [11]:
# get movie by id
def get_by_id(title_id):
    return just_watch.get_title(title_id=title_id)

In [12]:
get_movie(all_movies[100000]), all_movies[100000][1]

({'movie_id': '0118901',
  'movie_title': 'Critical Care',
  'id': 191426,
  'title': 'Critical Care',
  'offers': [{'monetization_type': 'buy',
    'provider_id': 3,
    'retail_price': 5.99,
    'currency': 'USD',
    'urls': {'standard_web': 'https://play.google.com/store/movies/details/Critical_Care?gl=US&hl=en&id=qry4JbtFE6k',
     'deeplink_android_tv': 'qry4JbtFE6k'},
    'subtitle_languages': ['en'],
    'audio_languages': ['en'],
    'presentation_type': 'hd',
    'date_provider_id': '2016-03-06_3',
    'date_created': '2016-03-06'},
   {'monetization_type': 'rent',
    'provider_id': 3,
    'retail_price': 3.99,
    'currency': 'USD',
    'urls': {'standard_web': 'https://play.google.com/store/movies/details/Critical_Care?gl=US&hl=en&id=qry4JbtFE6k',
     'deeplink_android_tv': 'qry4JbtFE6k'},
    'subtitle_languages': ['en'],
    'audio_languages': ['en'],
    'presentation_type': 'hd',
    'date_provider_id': '2016-03-06_3',
    'date_created': '2016-03-06'},
   {'monetizat

In [13]:
# test func on sample
movie_sample = all_movies[:50000]
start = time.process_time()
with Pool(5) as p:
    provider_data = list(tqdm_notebook(
                p.imap(get_movie, movie_sample),
                total=len(movie_sample)
                ))
time.process_time() - start

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))




70.260204

In [14]:
# check results of collection
provider_count = 0
movie_ids = []
jw_ids = []
titles = []
jw_titles = []
offer_ids = []
offer_urls = []
offer_pres_type = []
offer_mon_type = []
ratios = []

for provider in provider_data:
    if provider is not None:
        provider_count += 1
        movie_ids.append(provider['movie_id'])
        jw_ids.append(provider['id'])
        titles.append(provider['movie_title'])
        jw_titles.append(provider['title'])
        # other offer info: presentation_type (sd, hd), 
        # date_created, monetization_type (flat_rate, buy, rent)
        offer_ids.append(",".join([str(prov['provider_id']) for prov in provider['offers']]))
        offer_urls.append(",".join([prov['urls']['standard_web'] for prov in provider['offers'] if 'standard_web' in prov['urls']]))
        offer_pres_type.append(",".join([prov['presentation_type'] for prov in provider['offers']]))
        offer_mon_type.append(",".join([prov['monetization_type'] for prov in provider['offers']]))
        ratios.append(provider['ratio'])
        
str(round(provider_count / len(provider_data), 2)*100)+"%"

'72.0%'

In [15]:
# make df with results
prov_df = pd.DataFrame({
    'movie_id': movie_ids,
    'title': titles,
    'jw_id': jw_ids,
    'jw_title': jw_titles,
    'offer_provider_id': offer_ids,
    'offer_urls': offer_urls,
    'presentation_types': offer_pres_type,
    'monetization_type': offer_mon_type,
    'ratio': ratios
})
print(prov_df.shape)
prov_df.head()

(36244, 9)


Unnamed: 0,movie_id,title,jw_id,jw_title,offer_provider_id,offer_urls,presentation_types,monetization_type,ratio
0,1051262,The Stalker Within,149482,Within,1010101099,https://www.amazon.com/gp/product/B005C2UHTA?c...,"sd,sd,hd,hd,hd,sd","buy,rent,rent,buy,flatrate,flatrate",50
1,1051263,Stolen Life,82951,Reckless Behavior: Caught on Tape,10101010,https://www.amazon.com/gp/product/B07578QZRK?c...,"hd,sd,hd,sd","buy,buy,rent,rent",27
2,1051320,La cantatrice chauve,81030,The Golden Compass,"7,7,7,7,3,3,192,192,10,10,10,10,358,358,2,2,2,...",https://www.vudu.com/content/movies/details/Th...,"sd,sd,hd,hd,hd,hd,hd,hd,hd,sd,hd,sd,hd,hd,hd,s...","rent,buy,buy,rent,rent,buy,buy,rent,rent,rent,...",26
3,10513286,Historia de mi nombre,403709,Marriage Story,888,"http://www.netflix.com/title/80223779,http://w...","hd,sd,4k","flatrate,flatrate,flatrate",40
4,10513474,In un futuro aprile,413776,A Beautiful Day in the Neighborhood,"68,68,68,7,7,7,10,10,3,192,2,2,2,10,68,68,68,3...",https://www.microsoft.com/en-us/p/a-beautiful-...,"hd,sd,4k,sd,4k,hd,hd,sd,4k,4k,4k,sd,hd,4k,4k,h...","buy,buy,buy,buy,buy,buy,buy,buy,buy,buy,buy,bu...",30


In [17]:
# save prov_df
prov_df.to_csv('provider_data_0_50k.csv', index=False)

In [29]:
# concat all provider_data_*.csv files
df_1 = pd.read_csv('provider_data_0_50k.csv')
df_2 = pd.read_csv('provider_data_50k_100k.csv')
df_3 = pd.read_csv('provider_data_100k_150k.csv')
df_4 = pd.read_csv('provider_data_150k_200k.csv')
df_5 = pd.read_csv('provider_data_250k_on.csv')

prov_df = pd.concat([df_1, df_2, df_3, df_4, df_5])
print(prov_df.shape)
prov_df.head()

(224382, 7)


Unnamed: 0,movie_id,title,jw_id,jw_title,offer_provider_id,offer_urls,ratio
0,1051262,The Stalker Within,309961,The Evil Within,"7,7,7,7,192,3,3,192,3,192,9,9,10,10,10,10,3,19...",https://www.vudu.com/content/movies/details/Th...,73
1,1051263,Stolen Life,167298,Stolen Life,2510109,"https://www.fandor.com/films/stolen_life,https...",100
2,10513072,Don't Date Your Sister,434938,Don't Open Your Eyes,1010101022221299,https://www.amazon.com/gp/product/B07H4T4NQ2?c...,67
3,1051320,La cantatrice chauve,23259,La Bamba,"7,7,7,7,3,3,192,192,279,279,279,279,10,10,10,1...",https://www.vudu.com/content/movies/details/La...,36
4,10513286,Historia de mi nombre,403709,Marriage Story,888,"http://www.netflix.com/title/80223779,http://w...",40


In [30]:
# save concatenated df
prov_df.to_csv('provider_data.csv', index=False)

In [31]:
# checking stats of ratios to get a guess on how many matches were found
prov_df['ratio'].describe()

count    224382.000000
mean         51.437958
std          25.091076
min           0.000000
25%          32.000000
50%          47.000000
75%          67.000000
max         100.000000
Name: ratio, dtype: float64

In [16]:
# filter down to rows with ratio > 90
filtered_df = prov_df[prov_df['ratio'] > 90]
print(filtered_df.shape)
filtered_df.head()

(5941, 9)


Unnamed: 0,movie_id,title,jw_id,jw_title,offer_provider_id,offer_urls,presentation_types,monetization_type,ratio
21,1051245,Moving Midway,78930,Moving Midway,2219199,https://itunes.apple.com/us/movie/moving-midwa...,"sd,sd,sd,hd,sd","buy,rent,flatrate,flatrate,flatrate",100
24,1051253,The Portal,93922,The Portal,99101073,https://www.amazon.com/gp/product/B01MXEV6MJ?c...,"sd,hd,sd,sd,sd","flatrate,flatrate,buy,rent,ads",100
27,1051715,Lost Everything,332461,Lost Everything,91010191,https://www.amazon.com/gp/product/B010SHCSF4?c...,"sd,sd,sd,sd","flatrate,buy,rent,flatrate",100
29,1051226,Frankenhood,149223,Frankenhood,"3,3,7,7,7,7,192,192,10,10,10,10,2,2,2,2,3,192,...",https://play.google.com/store/movies/details/F...,"hd,hd,sd,sd,hd,hd,hd,hd,sd,hd,sd,hd,hd,sd,hd,s...","buy,rent,buy,rent,rent,buy,rent,buy,buy,rent,r...",100
66,1051904,Goosebumps,141571,Goosebumps,"7,2,2,7,2,2,10,10,10,10,3,3,192,192,68,68,7,68...",https://www.vudu.com/content/movies/details/Go...,"sd,hd,sd,hd,hd,sd,sd,hd,hd,sd,4k,4k,4k,4k,hd,s...","buy,buy,buy,buy,rent,rent,rent,rent,buy,buy,re...",100


In [33]:
# filtered_df looks much better, lets check the worst of them
filtered_df.sort_values(by='ratio')[:10]

Unnamed: 0,movie_id,title,jw_id,jw_title,offer_provider_id,offer_urls,ratio
34760,247931,Antham,419091,Antam,21821899,https://erosnow.com/movie/watch/1065669/Antham...,91
38962,85900,Le marginal,42022,El marginal,88,"http://www.netflix.com/title/80115297,http://w...",91
22463,4344742,Women and Sometimes Men,418028,Women & Sometimes Men,"191,2,2,2,2,10,10,10,10,3,192,3,3,192,192,212,...",https://www.kanopy.com/product/women-and-somet...,91
31613,1954796,Sahar,270987,Sahara,88,"http://www.netflix.com/title/80161029,http://w...",91
20379,4150196,U.F.O,60061,U.F.O.,1921921010101099105105105105,"https://www.youtube.com/watch?v=BZ7VuxArTY4,ht...",91
31560,1954381,Dhada,371764,Dhadak,9,https://www.amazon.com/gp/product/B07HKC1PSZ?c...,91
33348,4793322,Defenders of the Faith,9543,Defenders of the Earth,9910241300,https://www.amazon.com/gp/product/B06X9P2R5V?c...,91
39208,86193,Richard III,249063,Richard III,99,https://www.amazon.com/gp/product/B07TYJBC15?c...,91
13416,376296,Tramp,244134,Tramps,88,"http://www.netflix.com/title/80146759,http://w...",91
24724,6459680,Danga,245671,Dangal,88,"http://www.netflix.com/title/80166185,http://w...",91


## Retrospective:

`filtered_df` is looking good, but it may be cool to incorperate a user-feedback feature with these links where they vote if they are correct or not. (more advanced version would be to save a couple from each search and have the user pick which one is correct if any)