In [1]:
import json
import gzip
import pandas as pd
import numpy as np

In [2]:
# title download from The Movie DB
file_name = 'tv_series_ids_01_02_2021.json.gz'

In [3]:
# use gzip to open json.gz file
# result has multiple records and json.loads() is not able to decode more than one. Need to do it record by record.
# Otherwise JSONDecodeError: Extra data: line 2 column 1

with gzip.open(file_name, 'rt', encoding='utf-8') as zipfile:
    json_data = [json.loads(line) for line in zipfile]

In [4]:
# first entry example - all are dictionaries
json_data[0]

{'id': 1, 'original_name': 'プライド', 'popularity': 2.384}

In [5]:
# convert list of dictionaries to a dataframe
df = pd.DataFrame(json_data)

In [6]:
df

Unnamed: 0,id,original_name,popularity
0,1,プライド,2.384
1,2,Clerks: The Animated Series,11.962
2,3,The Message,0.745
3,4,The Amazing Mrs Pritchard,1.740
4,5,La Job,6.901
...,...,...,...
101935,115748,Scholastic Storybook Treasures - 20 Holiday Ad...,0.000
101936,115749,Color Rush,0.000
101937,115750,Split Ends,0.000
101938,115752,仮面ライダーゼロワン,0.000


In [7]:
# test find specific TV series
df.loc[df['original_name']=='Game of Thrones']

Unnamed: 0,id,original_name,popularity
1348,1399,Game of Thrones,297.83


In [8]:
# search for partial text
df[df['original_name'].str.contains('Sopran')]

Unnamed: 0,id,original_name,popularity
1347,1398,The Sopranos,74.516


In [9]:
# separating the list into English titles and other languages using special characters
df['special'] = df['original_name'].str.contains(r'[^\x00-\x7F]+')

In [10]:
# export to excel
df.to_excel('Tv_list.xlsx')

### Making a list of TV titles to review

In [11]:
# There are over 100k titles in the download. Shorten this list based on English and popular TV shows

In [12]:
# create a list of conditions. If no special character and popularity greater than 1. 
# If special character and popularity greater than 20.
conditions = [
    (df['special'] == False) & (df['popularity'] >= 1),
    (df['special'] == True) & (df['popularity'] >= 20),
    (df['special'] == True) & (df['popularity'] < 20),
    (df['popularity'] < 1)
    ]

In [13]:
# create a list of values to assign to the conditions. keep or not
values = ['yes', 'yes', 'no', 'no']

In [14]:
# create a new column and use np.select to assign values to it, using conditions and values
df['keep'] = np.select(conditions, values)

In [15]:
df

Unnamed: 0,id,original_name,popularity,special,keep
0,1,プライド,2.384,True,no
1,2,Clerks: The Animated Series,11.962,False,yes
2,3,The Message,0.745,False,no
3,4,The Amazing Mrs Pritchard,1.740,False,yes
4,5,La Job,6.901,False,yes
...,...,...,...,...,...
101935,115748,Scholastic Storybook Treasures - 20 Holiday Ad...,0.000,False,no
101936,115749,Color Rush,0.000,False,no
101937,115750,Split Ends,0.000,False,no
101938,115752,仮面ライダーゼロワン,0.000,True,no


In [16]:
# create a subset of the dataframe with TV titles to keep
df_keep = df[df['keep'] == 'yes']

In [17]:
# sort dataframe by popularity
df_keep = df_keep.sort_values(by=['popularity'], ascending=False)

In [20]:
# reset the index and drop the old index column
df_keep.reset_index(inplace = True)
df_keep.drop(columns = 'index', inplace = True)

In [21]:
df_keep

Unnamed: 0,level_0,id,original_name,popularity,special,keep
0,0,82856,The Mandalorian,1183.816,False,yes
1,1,44217,Vikings,893.852,False,yes
2,2,75006,The Umbrella Academy,736.408,False,yes
3,3,71712,The Good Doctor,711.145,False,yes
4,4,77169,Cobra Kai,648.353,False,yes
...,...,...,...,...,...,...
23344,23344,64682,Born This Way,1.000,False,yes
23345,23345,67120,Gay Skit Happens,1.000,False,yes
23346,23346,66985,Big Easy Motors,1.000,False,yes
23347,23347,66733,Bad Internet,1.000,False,yes


In [22]:
# create a list of titles to review
id_list = df_keep['id'].to_list()

In [23]:
id_list[0:5]

[82856, 44217, 75006, 71712, 77169]

## Set up API

In [37]:
import requests
import json

In [103]:
baseurl = 'https://api.themoviedb.org/3/tv/'
tv_id = 1399
baseurlTV = baseurl + str(tv_id)
api_ky = '######'
param_dict = {'api_key': api_ky, 'language': 'en-US', 'append_to_response': 'watch/providers'}

In [32]:
baseurlTV

'https://api.themoviedb.org/3/tv/1399'

In [33]:
# This function accepts a URL path and a params diction as inputs.
import requests
def requestURL(baseurl, params = {}):
    req = requests.Request(method = 'GET', url = baseurl, params = params)
    prepped = req.prepare()
    return prepped.url

In [102]:
# check URL before running request
print(requestURL(baseurlTV, param_dict))

https://api.themoviedb.org/3/tv/1399?api_key=%23%23%23&language=en-US&append_to_response=watch%2Fproviders


In [42]:
# send request to The Movie DB
response  = requests.get(url = baseurlTV, params = param_dict)

In [44]:
# read json file
text = json.loads(response.text)

In [47]:
text.keys()

dict_keys(['backdrop_path', 'created_by', 'episode_run_time', 'first_air_date', 'genres', 'homepage', 'id', 'in_production', 'languages', 'last_air_date', 'last_episode_to_air', 'name', 'next_episode_to_air', 'networks', 'number_of_episodes', 'number_of_seasons', 'origin_country', 'original_language', 'original_name', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'seasons', 'spoken_languages', 'status', 'tagline', 'type', 'vote_average', 'vote_count', 'watch/providers'])

In [97]:
print(text['name'])
print(text['episode_run_time'])
print(text['number_of_episodes'])
print(text['number_of_seasons'])

Game of Thrones
[60]
73
8


In [63]:
text['watch/providers']['results']['US']['flatrate'][0]['provider_name']

'HBO Max'

### API Mini Test

In [68]:
mini_list = id_list[0:5]
url_list =[]

In [69]:
for id in mini_list:
    url_ind = baseurl + str(id)
    url_list.append(url_ind)

In [70]:
url_list

['https://api.themoviedb.org/3/tv/82856',
 'https://api.themoviedb.org/3/tv/44217',
 'https://api.themoviedb.org/3/tv/75006',
 'https://api.themoviedb.org/3/tv/71712',
 'https://api.themoviedb.org/3/tv/77169']

In [71]:
# send request to The Movie DB
tv_data = []
for urls in url_list:
    respon  = requests.get(url = urls, params = param_dict)
    txt = json.loads(respon.text)
    tv_data.append(txt)

In [96]:
tv_data[4]['name']

'Cobra Kai'

### API Full Download

In [98]:
url_full_list =[]
for id in id_list:
    url_ind = baseurl + str(id)
    url_full_list.append(url_ind)

In [104]:
# send request to The Movie DB
tv_data_all = []
for urls in url_full_list:
    respons  = requests.get(url = urls, params = param_dict)
    txt = json.loads(respons.text)
    tv_data_all.append(txt)

ConnectionError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))

In [105]:
len(tv_data_all)

12801

In [127]:
tv_data_all[0]['networks'][0]['name']

'Disney+'

In [113]:
tv_data_all[12799]['watch/providers']['results']['US']['flatrate'][0]['provider_name']

{}

In [114]:
import pickle

In [115]:
# saving pickle file
with open('tv_data_all.pickle', "wb") as f:
    pickle.dump(tv_data_all, f)

#### TV Id List

In [148]:
tv_ids=[]

In [149]:
for i in tv_data_all:
    for key in i:
        if key == 'id':
            id = i['id']  
    tv_ids.append(id)

In [151]:
len(tv_ids)

12801

#### Finder Function

In [152]:
def finder_fn (col, list):
    for i in tv_data_all:
        for key in i:
            if key == col:
                item = i[col]  
        list.append(item)
    return list

In [154]:
# Name
name_list =[]
finder_fn('name', name_list)
len(name_list)

12801

In [156]:
# Run Time
run_time =[]
finder_fn('episode_run_time', run_time)
len(run_time)

12801

In [157]:
# Number of Episodes
no_episodes =[]
finder_fn('number_of_episodes', no_episodes)
len(no_episodes)

12801

In [158]:
# Number of Seasons
no_ssn =[]
finder_fn('number_of_seasons', no_ssn)
len(no_ssn)

12801

#### Networks

In [290]:
# Latest Network (one only)
network =[]
for i in tv_data_all:
    for key in i:
        if i['networks']:
            net = i['networks'][0]['name'] 
    network.append(net)


In [291]:
len(network)

12801

In [265]:
tv_data_all[2]['networks']

[{'name': 'Netflix',
  'id': 213,
  'logo_path': '/wwemzKWzjKYJFfCeiB57q3r4Bcm.png',
  'origin_country': ''}]

#### Providers

In [203]:
# provider example
tv_data_all[1]['watch/providers']['results']['US']['flatrate']

[{'display_priority': 1,
  'logo_path': '/68MNrwlkpF7WnmNPXLah69CR5cb.jpg',
  'provider_id': 119,
  'provider_name': 'Amazon Prime Video'},
 {'display_priority': 1,
  'logo_path': '/68MNrwlkpF7WnmNPXLah69CR5cb.jpg',
  'provider_id': 9,
  'provider_name': 'Amazon Prime Video'},
 {'display_priority': 6,
  'logo_path': '/giwM8XX4V2AQb9vsoN7yti82tKK.jpg',
  'provider_id': 15,
  'provider_name': 'Hulu'},
 {'display_priority': 53,
  'logo_path': '/er05UBXvaKlu93aAQTS1iegjGVK.jpg',
  'provider_id': 155,
  'provider_name': 'History'}]

In [204]:
# provider loop
provider = []
prv = ''
for item in tv_data_all:
    prov = []
    try:
        if item['watch/providers']['results']['US']:
            for p in item['watch/providers']['results']['US']['flatrate']:
                for key in p:
                    if key == 'provider_name':
                        prv = p['provider_name']
                        prov.append(prv)
    except:
        prov = ['None']

    provider.append(prov)

In [217]:
provider[16]

['fuboTV',
 'Hulu',
 'Disney Plus',
 'FXNow',
 'DIRECTV',
 'Fox',
 'Spectrum On Demand']

### Dataframe

In [309]:
# Creating a dataframe from the lists
TV_df = pd.DataFrame({'id': tv_ids})

In [313]:
TV_df['name'] = name_list
TV_df['runtime'] = run_time
TV_df['episode_count'] = no_episodes
TV_df['ssn_count'] = no_ssn
TV_df['network'] = network
TV_df['provider'] = provider


In [314]:
TV_df

Unnamed: 0,id,name,runtime,ssn_count,network,provider,episode_count
0,82856,The Mandalorian,"[35, 48]",2,Disney+,[Disney Plus],16.0
1,44217,Vikings,[44],6,Amazon,"[Amazon Prime Video, Amazon Prime Video, Hulu,...",89.0
2,75006,The Umbrella Academy,[55],2,Netflix,[Netflix],20.0
3,71712,The Good Doctor,[42],4,ABC,"[fuboTV, Hulu, DIRECTV, Spectrum On Demand]",66.0
4,77169,Cobra Kai,[30],3,Netflix,[Netflix],30.0
...,...,...,...,...,...,...,...
12796,73623,Unser Pauker,[30],1,ZDF,[None],20.0
12797,4762,Clarence,[30],1,BBC One,[None],6.0
12798,76486,Ferrari,[],1,BBC One,[None],2.0
12799,90762,The Office,[23],2,Disney+ Hotstar,[None],28.0


In [320]:
# Creating an average for the multiple values listed in runtime
TV_df['run_ave'] = pd.DataFrame(TV_df['runtime'].values.tolist()).mean(1)

In [321]:
TV_df

Unnamed: 0,id,name,runtime,ssn_count,network,provider,episode_count,run_ave
0,82856,The Mandalorian,"[35, 48]",2,Disney+,[Disney Plus],16.0,41.5
1,44217,Vikings,[44],6,Amazon,"[Amazon Prime Video, Amazon Prime Video, Hulu,...",89.0,44.0
2,75006,The Umbrella Academy,[55],2,Netflix,[Netflix],20.0,55.0
3,71712,The Good Doctor,[42],4,ABC,"[fuboTV, Hulu, DIRECTV, Spectrum On Demand]",66.0,42.0
4,77169,Cobra Kai,[30],3,Netflix,[Netflix],30.0,30.0
...,...,...,...,...,...,...,...,...
12796,73623,Unser Pauker,[30],1,ZDF,[None],20.0,30.0
12797,4762,Clarence,[30],1,BBC One,[None],6.0,30.0
12798,76486,Ferrari,[],1,BBC One,[None],2.0,
12799,90762,The Office,[23],2,Disney+ Hotstar,[None],28.0,23.0


### To fix

In [160]:
# Latest Network (one only) - this overwrites the network with the last listed


network =[]
for i in tv_data_all:
    for key in i:
        if key == 'networks':
            for j in i['networks']:
                for key2 in j:
                    if key2 == 'name':
                        net = j['name'] 
    network.append(net)

In [235]:
tv_test = tv_data_all[0:6]

In [307]:
# Don't use Network loop (multiple each level)
network2 = []
net = []
for item in tv_test:
    nt = []
    for key in item:
        if item['networks']:
            for n in item:
                for key in n:
                    if key == 'name':
                        nt = n['name']
                        net.append(nt)

    network2.append(net)


"    \nfor item in tv_data_all:\n    prov = []\n    try:\n        if item['watch/providers']['results']['US']:\n            for p in item['watch/providers']['results']['US']['flatrate']:\n                for key in p:\n                    if key == 'provider_name':\n                        prv = p['provider_name']\n                        prov.append(prv)\n    except:\n        prov = ['None'] "

In [180]:
# provider loop level 1 only (only lists first provider)
provider = []
prov2 = []
prov = ''
for item in tv_data_all:
    try:
        if item['watch/providers']['results']['US']:
            prov = item['watch/providers']['results']['US']['flatrate'][0]['provider_name']
    except:
        prov = 'None'

    provider.append(prov)