In [1]:
import commendaroo.data_transformer.model_data as mdt

from scipy import sparse
import implicit
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import json
import os
import boto3
import tensorflow_hub as hub

from aws_tools.cloudwatch_logging import logger
from aws_tools.project_config import get_aws_config

In [60]:
def create_model(data_implicit):
    '''
    '''
    
    logger.info('Training the model...')
    
    sparse_content_person = sparse.csr_matrix(
        (data_implicit['eventStrength'].astype(float), (data_implicit['id_editorial_simple'], data_implicit['id_user_simple']))
    )
    
    model = implicit.als.AlternatingLeastSquares(factors=30, regularization=0.1, iterations=50, use_gpu = False)
    alpha = 15
    data_tofit = (sparse_content_person * alpha).astype('double')
    model.fit(data_tofit)
    
    return model

def downloadDirectoryFroms3(bucket, remoteDirectoryName, project_folder):
    s3_resource = boto3.resource('s3')
    bucket = s3_resource.Bucket(bucket) 
    for object in bucket.objects.filter(Prefix = remoteDirectoryName):
        if not os.path.exists(os.path.dirname(object.key.split(project_folder)[-1])):
            os.makedirs(os.path.dirname(object.key.split(project_folder)[-1]))
        if object.key.split(project_folder)[-1][-1] != '/': #avoids trying to copy files from an empty folder
            bucket.download_file(object.key,object.key.split(project_folder)[-1])

def load_use():
    logger.info('Loading Universal Sentence Encoder')
    
    bucket = 'bt-data-science-playground'
    project_folder = 'nps-score-verbatim-text-analysis/'
    model_filename = 'model_objects_UniversalSentenceEncoders/4'
    
    # Download the relevant model objects
    downloadDirectoryFroms3(bucket, project_folder+model_filename, project_folder)
    
    return hub.load(model_filename)

def more_like_this(content_id, content_vecs, content_norms, syn_vecs, syn_norms, othermd_vecs, othermd_norms, availability, id_editorial_list, id_editorial_simple_list, type_production_list, rating, genre, scheduler_channel):
    
    ## historical data
    scores_cont = content_vecs.dot(content_vecs[content_id,:])  / (content_norms * content_vecs[content_id,:].sum()) # i.e. calculating cosine similarity, (A.B) / (|A| x |B|) --> |B| just a constant so effectively won't need it
    scores_cont =  MinMaxScaler().fit_transform(scores_cont.reshape(-1,1))[:,0]
    
    ## synopsis
    scores_syn = syn_vecs.dot(syn_vecs[content_id,:])  / (syn_norms * syn_vecs[content_id,:].sum()) # i.e. calculating cosine similarity, (A.B) / (|A| x |B|) --> |B| just a constant so effectively won't need it
    scores_syn =  MinMaxScaler().fit_transform(scores_syn.reshape(-1,1))[:,0]
    
    ## other metadata
    scores_othermd = othermd_vecs.dot(othermd_vecs[content_id,:])  / (othermd_norms * othermd_vecs[content_id,:].sum()) # i.e. calculating cosine similarity, (A.B) / (|A| x |B|) --> |B| just a constant so effectively won't need it
    scores_othermd =  MinMaxScaler().fit_transform(scores_syn.reshape(-1,1))[:,0]
    
    ## sum scores
    a_cont = get_aws_config('model_parameters.json')['MLT_strength_historical']
    a_syn = get_aws_config('model_parameters.json')['MLT_strength_synopsis']
    a_othermd = get_aws_config('model_parameters.json')['MLT_strength_othermetadata']
    #
    scores = ((a_cont * scores_cont) + (a_syn * scores_syn) + (a_othermd * scores_othermd)) / (a_cont + a_syn + a_othermd)
    
    # make zero the content no longer avilable
    scores = scores * availability
    
    # make zero the content not matching the same scheduler_channel
    if scheduler_channel[content_id] in ['Kids', 'Film', 'TV', 'TV Replay', 'Sport', 'Music']:
        scheduler_channel_logical = scheduler_channel == scheduler_channel[content_id]
        scheduler_channel_logical = np.array(scheduler_channel_logical.astype(int))
        #
        scores = scores * scheduler_channel_logical
    
    # make zero the content not matching the same genre
    genre_logical = genre == genre[content_id]
    genre_logical = np.array(genre_logical.astype(int))
    #
    scores = scores * genre_logical
    
    # make zero the content with 2 or more degrees higher rating except for kids (same rating only)
    if scheduler_channel[content_id] == 'Kids':
        rating_logical = rating == rating[content_id]
        rating_logical = np.array(rating_logical.astype(int))
        #
        scores = scores * rating_logical
    else:
        rating_logical = rating <= (rating[content_id] + 1)
        rating_logical = np.array(rating_logical.astype(int))
        #
        scores = scores * rating_logical
    
    similar = sorted(zip(id_editorial_list, scores[id_editorial_simple_list], type_production_list), key=lambda x: -x[1])
    
    #con_id = similar[0][0]
    
    return similar[:21] # return 21, so later can drop itself and will have at least 20
    
def for_you(person_id, sparse_person_content, content_vecs_T, person_vecs, availability, id_editorial_list, id_editorial_simple_list, type_production_list, id_user_list, id_user_simple_list):

    # Get the interactions scores from the sparse person content matrix
    person_interactions = sparse_person_content[person_id,:].toarray()

    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1

    # Make articles already interacted zero
    person_interactions[person_interactions > 1] = 0

    # Get dot product of person vector and all content vectors
    rec_vector = person_vecs[person_id,:].dot(content_vecs_T)

    rec_vector = MinMaxScaler().fit_transform(rec_vector.reshape(-1,1))[:,0]

    # Multiply by zero the scores of items already interacted with
    recommend_vector = person_interactions * rec_vector
    
    # make zero the content no longer avilable
    recommend_vector = recommend_vector * availability
    
    recs = sorted(zip(id_editorial_list, recommend_vector[id_editorial_simple_list], type_production_list), key=lambda x: -x[1])
    
    user_id = id_user_list[id_user_simple_list.index(person_id)]
    
    return recs[:50], user_id


def get_recommendations(data_implicit, model):
    '''
    '''
    
    logger.info('Creating recommendations...')
    
    # (sorting first alphabetically by status so ACTIVE versions of same items appears first over INACTIVE versions)
    id_editorial_legend = data_implicit[['id_editorial', 'id_editorial_simple', 'type_production', 'end_date', 'start_date', 'status', 'rating_n', 'genre', 'scheduler_channel', 'synopsis', 'sub_genres']].sort_values(by=['status','sub_genres']).drop_duplicates(subset=['id_editorial', 'id_editorial_simple']).sort_values(by='id_editorial_simple')

    id_editorial_simple_list = id_editorial_legend['id_editorial_simple'].tolist()
    id_editorial_list = id_editorial_legend['id_editorial'].tolist()
    type_production_list = id_editorial_legend['type_production'].tolist()

    # get array with 1 or 0 based on availability of content today + status
    availability = (id_editorial_legend['status'] == 'ACTIVE') & (id_editorial_legend['end_date'] > datetime.today().strftime('%Y-%m-%d')) & (id_editorial_legend['start_date'] <= datetime.today().strftime('%Y-%m-%d'))
    availability = np.array(availability.astype(int))
    
    scheduler_channel = np.array(id_editorial_legend['scheduler_channel'].tolist())
    
    genre = np.array(id_editorial_legend['genre'].tolist())
    
    sub_genres = [str(x).lower().replace(' ', '').replace(',', ' ') for x in id_editorial_legend['sub_genres'].tolist()]
    
    rating = np.array(id_editorial_legend['rating_n'].tolist())
    
    synopsis = id_editorial_legend['synopsis'].tolist()


    id_user_legend = data_implicit[['id_user', 'id_user_simple']].drop_duplicates(subset=['id_user', 'id_user_simple']).sort_values(by='id_user_simple')
    id_user_simple_list = id_user_legend['id_user_simple'].tolist()
    id_user_list = id_user_legend['id_user'].tolist()

    date_today = datetime.today().strftime('%Y-%m-%d')
    code_version = get_aws_config('model_parameters.json')['code_version']
    
    data_update_date =  {
        'view' : str(data_implicit[data_implicit['type_entitlement'] == 'VIEW']['event_date'].max()),
        'purchase' : str(data_implicit[data_implicit['type_entitlement'] == 'EVOD']['event_date'].max()),
        'rental' : str(data_implicit[data_implicit['type_entitlement'] == 'TVOD']['event_date'].max())
    }
    
    sparse_content_person = sparse.csr_matrix(
        (data_implicit['eventStrength'].astype(float), (data_implicit['id_editorial_simple'], data_implicit['id_user_simple']))
    )
    sparse_person_content = sparse.csr_matrix(
        (data_implicit['eventStrength'].astype(float), (data_implicit['id_user_simple'], data_implicit['id_editorial_simple']))
    )
    
    ### More Like This   
    logger.info('More Like This')
    
    content_vecs = model.item_factors
    content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1)) # i.e. calculating abs. value of the vector of each item -->  |A|
    
    embed = load_use()
    syn_vecs = np.array(embed(synopsis))
    syn_norms = np.sqrt((syn_vecs * syn_vecs).sum(axis=1)) # i.e. calculating abs. value of the vector of each item -->  |A|
    
    vectorizer = CountVectorizer()
    othermd_vecs = vectorizer.fit_transform(sub_genres).toarray()
    othermd_norms = np.sqrt((othermd_vecs * othermd_vecs).sum(axis=1)) # i.e. calculating abs. value of the vector of each item -->  |A|

    # create the output for dynamoDB table
    output = []

    for index, i in enumerate(id_editorial_simple_list): 

        if availability[index] == 1: # check if content is available otherwise those recommendations will be wrong due to having multiplied scores by zero for unavailable content
            
            con_id = id_editorial_list[index]
            recs = more_like_this(i, content_vecs, content_norms, syn_vecs, syn_norms, othermd_vecs, othermd_norms, availability, id_editorial_list, id_editorial_simple_list, type_production_list, rating, genre, scheduler_channel)

            rec_guids = []
            for r in recs:
                
                if r[0] != con_id:

                    rec_guids.append({
                        'guid' : r[0],
                        'score' : round(float(r[1]), 5),
                        'type' : r[2]
                    })

            output_rec = {'score_date' : date_today, 'data_update_date' : data_update_date, 'code_version' : code_version, 'recommendations' : rec_guids[:20]}

            #output.append([con_id, output_rec])
            
            #also append same recommendations with <type>|<guid> lookup
            con_id_type = type_production_list[index]
            output.append([con_id_type + '|' + con_id, output_rec])

    output_morelikethis = pd.DataFrame(output,columns=['content','recommendations'])

    output_morelikethis['recommendations'] = output_morelikethis['recommendations'].apply(lambda x: json.dumps(x)) # if already a dict

#     # remove any duplicates; these can still occur if some content has exactly the same watch history as other and gets most similar item itself and other stuff too
#     output_morelikethis = output_morelikethis.drop_duplicates(subset=['content']).reset_index(drop=True)

    logger.info('More Like This: done')
    
    
    ### For You
    logger.info('For You')
    content_vecs_T = model.item_factors.T
    person_vecs = model.user_factors

    # create the output for dynamoDB table
    output = []
    
    # Do For You for all users loaded in, unless testing
    users_list_foryou = id_user_simple_list.copy()
    #
    if "ENVIRON_PROD" in os.environ:
        if os.environ['ENVIRON_PROD'] == 'TESTING':
            users_tech_trial = [
                'V3006126692',
                'V3007158774',
                'V3009295704',
                'V3000606445',
                'V2283739102',
                'V2200004307',
                'V3009378451',
                'V3009436446',
                'V1000019727',
                'V3008138884',
                'V3003052251',
                'V3008613163',
                'V3578624855',
                'V3000143118',
                'V2200007285',
                'V3000163582',
                'V3000537931'
            ]
            # turn the VSID of the tech trail list to simple user ids
            users_list_foryou = [int(data_implicit[data_implicit['id_user'] == x]['id_user_simple'].iloc[0]) for x in users_tech_trial if x in id_user_list]
        
        
    for i in users_list_foryou: 

        recs, user_id = for_you(i, sparse_person_content, content_vecs_T, person_vecs, availability, id_editorial_list, id_editorial_simple_list, type_production_list, id_user_list, id_user_simple_list)

        rec_guids = []
        for r in recs:

            rec_guids.append({
                'guid' : r[0],
                'score' : round(float(r[1]), 5),
                'type' : r[2]
            })

        output_rec = {'score_date' : date_today, 'data_update_date' : data_update_date, 'code_version' : code_version, 'recommendations' : rec_guids}

        output.append([user_id, output_rec])
        
        if int(i+1) % 50_000 == 0:
            logger.info('For You calculated for {} users'.format(int(i)+1))
    logger.info('For You calculated for all users ({}).'.format(int(i)+1)) 
    
    ##### add 'anonymous' entry for most popular content
    logger.info('Adding anonymous in For You table')
    n_days = 9
    popular = data_implicit[(data_implicit['event_date'] + pd.DateOffset(n_days) >= date_today)]['id_editorial_simple'].value_counts()
    
    rec_guids = []
    scores_fake = sorted(list(range(len(popular.index.tolist()[:20]))), reverse=True)
    for count, i in enumerate(popular.index.tolist()[:20]):
        
        rec_guids.append({
                'guid' : data_implicit[data_implicit['id_editorial_simple'] == i]['id_editorial'].iloc[0],
                'score' : scores_fake[count],
                'type' : data_implicit[data_implicit['id_editorial_simple'] == i]['type_production'].iloc[0]
            })
        
    output_rec = {'score_date' : date_today, 'data_update_date' : data_update_date, 'code_version' : code_version, 'recommendations' : rec_guids}

    output.append(['anonymous', output_rec])
    #####

    output_foryou = pd.DataFrame(output,columns=['user','recommendations'])

    output_foryou['recommendations'] = output_foryou['recommendations'].apply(lambda x: json.dumps(x)) # if already a dict

    logger.info('For You: done')
    
    return output_foryou, output_morelikethis

In [3]:
data_implicit = mdt.get_data()
logger.info('Data is ready for model training')

2021-03-02 08:55:36,437 [INFO ]  Getting the data
2021-03-02 08:55:36,438 [INFO ]  Creating views tables
2021-03-02 08:57:01,238 [INFO ]  Creating purchases/rentals/PPVs tables
2021-03-02 08:57:28,670 [INFO ]  Loading views data
2021-03-02 08:57:28,671 [INFO ]  starting Athena query ...
2021-03-02 09:05:46,840 [INFO ]  loading 77f4b678-686f-468a-b0e1-cae500e13d7a.csv
2021-03-02 09:18:04,148 [INFO ]  Athena query complete: returning dataframe
2021-03-02 09:18:04,281 [INFO ]  Loading purchases/rentals/PPVs data
2021-03-02 09:18:04,282 [INFO ]  starting Athena query ...
2021-03-02 09:19:10,706 [INFO ]  loading 060c4616-4ee3-474e-af5e-a329757d9094.csv
2021-03-02 09:19:25,324 [INFO ]  Athena query complete: returning dataframe
2021-03-02 09:19:43,531 [INFO ]  Data implicit loaded
2021-03-02 09:19:43,532 [INFO ]  Aggregate seasons into brands
2021-03-02 09:20:18,701 [INFO ]  Season-to-brand update query failed - rolling up with existing season-to-brand map
2021-03-02 09:21:22,117 [INFO ]  Pr

In [22]:
data_implicit.drop_duplicates(subset=['id_editorial'])['scheduler_channel'].value_counts()

Film         7458
Music        7319
Sport         842
TV Replay     783
TV            489
Kids          478
Name: scheduler_channel, dtype: int64

In [48]:
id_editorial_legend = data_implicit[['id_editorial', 'id_editorial_simple', 'type_production', 'end_date', 'start_date', 'status', 'rating_n', 'genre', 'scheduler_channel', 'synopsis', 'sub_genres']].sort_values(by=['status','sub_genres']).drop_duplicates(subset=['id_editorial', 'id_editorial_simple']).sort_values(by='id_editorial_simple')

id_editorial_simple_list = id_editorial_legend['id_editorial_simple'].tolist()
id_editorial_list = id_editorial_legend['id_editorial'].tolist()
type_production_list = id_editorial_legend['type_production'].tolist()

# get array with 1 or 0 based on availability of content today + status
availability = (id_editorial_legend['status'] == 'ACTIVE') & (id_editorial_legend['end_date'] > datetime.today().strftime('%Y-%m-%d')) & (id_editorial_legend['start_date'] <= datetime.today().strftime('%Y-%m-%d'))
availability = np.array(availability.astype(int))

In [55]:
missing = []
for c, i in enumerate(id_editorial_list):
    if (type_production_list[c]+'|'+i not in recs_morelikethis['content'].tolist()) and (availability[c] == 1):
        missing.append(i)

In [57]:
missing # e.g. BBJ1003714HVOD

['BBJ1003714HVOD',
 'BBJ1009544HVOD',
 'BBJ1009547HVOD',
 'BBJ1009553HVOD',
 'BBJ1009563HVOD',
 'BBJ1009582HVOD',
 'BBJ1009600HVOD',
 'BBJ1009612HVOD',
 'BBJ1009633HVOD',
 'BBJ1009687HVOD',
 'BBJ1009717HVOD',
 'BBJ1009722HVOD',
 'BBJ1009727HVOD',
 'BBJ1009732HVOD',
 'BBJ1009785HVOD',
 'BBJ1012398HVOD',
 'BBJ1020717HVOD',
 'BBJ1020753HVOD',
 'BBJ1020774HVOD',
 'BBJ1020781HVOD',
 'BBJ1020800IOC',
 'BBJ1020845HVOD',
 'BBJ1021618HVOD',
 'BBJ1028097HVOD',
 'BBJ1028103HVOD',
 'BBJ1028106HVOD',
 'BBJ1028117HVOD',
 'BBJ1028122HVOD',
 'BBJ1051966HVOD',
 'BBJ1051979HVOD',
 'BBJ1051989HVOD',
 'BBJ1051997HVOD',
 'BBJ1059070HVOD',
 'BBJ1059956HVOD',
 'BBJ1059972HVOD',
 'BBJ1059978HVOD',
 'BBJ1059982HVOD',
 'BBJ1060001HVOD',
 'BBJ1060007HVOD',
 'BBJ1060026HVOD',
 'BBJ1060028HVOD',
 'BBJ1060048HVOD',
 'BBJ1060066HVOD',
 'BBJ1060072HVOD',
 'BBJ1060081HVOD',
 'BBJ1060085HVOD',
 'BBJ1060093HVOD',
 'BBJ1060095HVOD',
 'BBJ1060119HVOD',
 'BBJ1060121HVOD',
 'BBJ1060127HVOD',
 'BBJ1060893HVOD',
 'BBJ1065038H

In [58]:
data_implicit[data_implicit['id_editorial'] == 'BBJ1003714HVOD'].head()

Unnamed: 0,id_user,type_asset,event_date,id_editorial,title,type,end_date,start_date,genre,sub_genres,rating,scheduler_channel,status,type_entitlement,eventStrength,rating_n,type_production,synopsis,id_user_simple,id_editorial_simple
1977992,V3005411811,film,2018-12-22 19:59:33,BBJ1003714HVOD,Pretty Woman,film,2025-12-31,2017-12-01,Romance,"Romance,Comedy",15,Film,ACTIVE,VIEW,1,3,PROGRAM,A man in a legal but hurtful business needs an...,367762,18
1977994,V3002288854,film,2018-12-22 20:33:25,BBJ1003714HVOD,Pretty Woman,film,2025-12-31,2017-12-01,Romance,"Romance,Comedy",15,Film,ACTIVE,VIEW,1,3,PROGRAM,A man in a legal but hurtful business needs an...,172950,18
1977997,V3003806785,film,2018-12-22 21:27:38,BBJ1003714HVOD,Pretty Woman,film,2025-12-31,2017-12-01,Romance,"Romance,Comedy",15,Film,ACTIVE,VIEW,1,3,PROGRAM,A man in a legal but hurtful business needs an...,252483,18
1978027,V3006061174,film,2019-02-25 12:29:40,BBJ1003714HVOD,Pretty Woman,film,2025-12-31,2017-12-01,Romance,"Romance,Comedy",15,Film,ACTIVE,VIEW,1,3,PROGRAM,A man in a legal but hurtful business needs an...,419480,18
1978048,V3003275783,film,2019-09-11 14:16:20,BBJ1003714HVOD,Pretty Woman,film,2025-12-31,2017-12-01,Romance,"Romance,Comedy",15,Film,ACTIVE,VIEW,1,3,PROGRAM,A man in a legal but hurtful business needs an...,224261,18


In [50]:
sum(availability)

5348

In [35]:
data_implicit[(data_implicit['scheduler_channel']=='Music') & (data_implicit['status'] == 'ACTIVE') & (data_implicit['end_date'] > datetime.today().strftime('%Y-%m-%d')) & (data_implicit['start_date'] <= datetime.today().strftime('%Y-%m-%d'))].head()

Unnamed: 0,id_user,type_asset,event_date,id_editorial,title,type,end_date,start_date,genre,sub_genres,rating,scheduler_channel,status,type_entitlement,eventStrength,rating_n,type_production,synopsis,id_user_simple,id_editorial_simple
872211,V1000291764,music,2020-06-16 16:21:23,movida_10036206,Time To Relax,collection,2021-03-31,2015-03-11,,"Audio,US,Jazz,Female vocalist",u,Music,ACTIVE,VIEW,1,0,COLLECTION,An audio playlist of great tracks to help you ...,29041,14653
988173,V3002458076,music,2020-06-21 18:53:29,movida_10036206,Time To Relax,collection,2021-03-31,2015-03-11,,"Audio,60's,US,Soul,Funk,Guitar",u,Music,ACTIVE,VIEW,1,0,COLLECTION,An audio playlist of great tracks to help you ...,180724,14653
1259295,V3000477130,music,2020-04-01 09:20:18,movida_10036206,Time To Relax,collection,2021-03-31,2015-03-11,,"Audio,60's,Blue Eyed Soul,Female vocalist,R&B",u,Music,ACTIVE,VIEW,1,0,COLLECTION,An audio playlist of great tracks to help you ...,71775,14653
1259296,V3009171656,music,2020-04-01 23:33:38,movida_10036206,Time To Relax,collection,2021-03-31,2015-03-11,,"Audio,60's,Blue Eyed Soul,Female vocalist,R&B",u,Music,ACTIVE,VIEW,1,0,COLLECTION,An audio playlist of great tracks to help you ...,856928,14653
1259297,V3001638654,music,2020-04-01 08:52:25,movida_10036206,Time To Relax,collection,2021-03-31,2015-03-11,,"Audio,60's,Blue Eyed Soul,Female vocalist,R&B",u,Music,ACTIVE,VIEW,1,0,COLLECTION,An audio playlist of great tracks to help you ...,140013,14653


In [4]:
model = create_model(data_implicit)
logger.info('Model trained')

2021-03-02 09:31:28,002 [INFO ]  Training the model...
2021-03-02 09:31:40,315 [WARNI]  OpenBLAS detected. Its highly recommend to set the environment variable 'export OPENBLAS_NUM_THREADS=1' to disable its internal multithreading


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

2021-03-02 09:32:30,895 [INFO ]  Model trained





In [61]:
import os
os.environ['ENVIRON_PROD'] = 'TESTING'

recs_foryou, recs_morelikethis = get_recommendations(data_implicit, model)

2021-03-02 10:36:43,092 [INFO ]  Creating recommendations...
2021-03-02 10:38:31,122 [INFO ]  More Like This
2021-03-02 10:38:31,123 [INFO ]  Loading Universal Sentence Encoder
2021-03-02 10:38:51,433 [WARNI]  7 out of the last 7 calls to <function recreate_function.<locals>.restored_function_body at 0x7f6cd03c9d90> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
2021-03-02 10:38:53,947 [WARNI]  8 out of the last 8 calls to <function recre

In [62]:
recs_morelikethis.head()

Unnamed: 0,content,recommendations
0,PROGRAM|BBJ1003714HVOD,"{""score_date"": ""2021-03-02"", ""data_update_date..."
1,PROGRAM|BBJ1009533HVOD,"{""score_date"": ""2021-03-02"", ""data_update_date..."
2,PROGRAM|BBJ1009538HVOD,"{""score_date"": ""2021-03-02"", ""data_update_date..."
3,PROGRAM|BBJ1009541HVOD,"{""score_date"": ""2021-03-02"", ""data_update_date..."
4,PROGRAM|BBJ1009544HVOD,"{""score_date"": ""2021-03-02"", ""data_update_date..."


In [81]:
len(recs_morelikethis)

5348

In [64]:
import ast

In [79]:
title = ''

bbb = [x['type']+'|'+x['guid'] for x in ast.literal_eval(recs_morelikethis['recommendations'][recs_morelikethis['content'] == data_implicit[data_implicit['title'] == title]['type_production'].iloc[0]+'|'+data_implicit[data_implicit['title'] == title]['id_editorial'].iloc[0]].tolist()[0])['recommendations']]

In [80]:
for i in bbb: print(data_implicit[data_implicit['id_editorial'] == i.split('|')[-1]]['title'].iloc[0])      

Shallow Grave
Midnight Express
Butch Cassidy and the Sundance Kid
Zodiac
Pulp Fiction
Heat
Jackie Brown
The Sweeney
Michael Clayton
Seven
The Big Sleep
Dial M For Murder
Sexy Beast
This Is England
L.A. Confidential
Mystic River
No Country for Old Men
Agatha Christie's Crooked House
American Gangster
The Hatton Garden Job


In [None]:
movida_10027664

In [28]:
recs_morelikethis[recs_morelikethis['content'] == 'movida_63946']['recommendations']

7762    {"score_date": "2021-02-26", "data_update_date...
Name: recommendations, dtype: object

In [1]:
from aws_tools.cloudwatch_logging import logger
from aws_tools.project_config import get_aws_config

import pandas as pd
import boto3

In [2]:
sts_client = boto3.client('sts')

assumed_role_object=sts_client.assume_role(
    RoleArn="arn:aws:iam::881289283440:role/pipeline/bttv-dar-prod-role-tvrecommend-prd-consrecommend",
    RoleSessionName="recommendAssume"
)

credentials = assumed_role_object['Credentials']

dynamo_client = boto3.client(
    'dynamodb',
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken'],
    region_name='eu-west-1'
)

response = dynamo_client.list_tables(Limit=10)

# table_b = dynamo_client.scan(TableName = 'tvrecommend-brand-prd')
# table_c = dynamo_client.scan(TableName = 'tvrecommend-collection-prd')
# table_p = dynamo_client.scan(TableName = 'tvrecommend-programme-prd')


# data_unique = data_implicit[['id_editorial', 'type_production', 'title', 'status']].drop_duplicates()

In [3]:
def get_full_table(TableName):
    table = dynamo_client.scan(TableName = TableName)
    data = table['Items']
    while 'LastEvaluatedKey' in table:
        table = dynamo_client.scan(TableName = TableName, ExclusiveStartKey=table['LastEvaluatedKey'])
        data.extend(table['Items'])
    
    return data

In [4]:
table_b = get_full_table('tvrecommend-brand-prd')
table_c = get_full_table('tvrecommend-collection-prd')
table_p = get_full_table('tvrecommend-programme-prd')

In [73]:
table_b = dynamo_client.scan(TableName = 'tvrecommend-brand-prd')
data = table_b['Items']
while 'LastEvaluatedKey' in table_b:
    table_b = dynamo_client.scan(TableName = 'tvrecommend-brand-prd', ExclusiveStartKey=table_b['LastEvaluatedKey'])
    data.extend(table_b['Items'])

In [58]:
def return_description(table, ed_id, title):
    for d in table:
        if d['guid']['S'] == ed_id:
            try:
                return d['longDescription']['S']
            except:
                try:
                    return d['shortDescription']['S']
                except:
                    return title
    return title
    

def pull_synopsis(content, table_b, table_c, table_p):
    
    if content[1] == 'BRAND':
        return return_description(table_b, content[0], content[2])
        
    elif content[1] == 'COLLECTION':
        return return_description(table_c, content[0], content[2])
        
    elif content[1] == 'PROGRAM':
        return return_description(table_p, content[0], content[2])
        
    else:
        return ''

In [59]:
data_unique['synopsis'] = data_unique.apply(pull_synopsis, args=(table_b, table_c, table_p), axis = 1)

In [60]:
data_unique[(data_unique['title'] == data_unique['synopsis']) & (data_unique['status'] == 'ACTIVE')].head(10)

Unnamed: 0,id_editorial,type_production,title,status,synopsis
2491,movida_16578,BRAND,Nasa's Unexplained Files Series 4,ACTIVE,Nasa's Unexplained Files Series 4
2603,movida_14139,BRAND,Blindspot Series 3,ACTIVE,Blindspot Series 3
3186,movida_10021027,COLLECTION,Sofia the First - Volume 5,ACTIVE,Sofia the First - Volume 5
3200,movida_48224,BRAND,Hamilton's Pharmacopeia Series 1,ACTIVE,Hamilton's Pharmacopeia Series 1
3288,movida_1851,BRAND,Modern Family Series 10,ACTIVE,Modern Family Series 10
4316,movida_10016421,COLLECTION,UCL 2017/18,ACTIVE,UCL 2017/18
4404,movida_10002359,COLLECTION,The Hobbit Trilogy,ACTIVE,The Hobbit Trilogy
4405,movida_26336,BRAND,Outback Truckers Series 5,ACTIVE,Outback Truckers Series 5
4530,BBJ963235HVOD,PROGRAM,Love is All you Need,ACTIVE,Love is All you Need
4533,BBJ314317HVOD,PROGRAM,The Hunt for Red October,ACTIVE,The Hunt for Red October


In [61]:
for d in table_b['Items']:
        if d['guid']['S'] == 'movida_16578':
            print(d)

In [14]:
l = []
for d in table_b:
    l.append(d['guid']['S'])

In [17]:
len(l)  == len(set(l))

True

In [13]:
list(set(val for dic in table_b for val in dic.values())) 

TypeError: unhashable type: 'dict'

In [9]:
table_b

[{'updatedAtSource': {'N': '1575289171000'},
  'offerStartDate': {'N': '1491951600000'},
  'availableStartDate': {'N': '1491951600000'},
  'availableEndDate': {'N': '1767225540000'},
  'pricingStartDate': {'N': '1491951600000'},
  'anchors': {'L': [{'M': {'seasonGuid': {'S': 'movida_10011328'},
      'programGuid': {'S': 'BBJ1331795A'},
      'seasonNumber': {'N': '1'},
      'episodeNumber': {'N': '1'},
      'anchorType': {'S': 'default'}}},
    {'M': {'seasonGuid': {'S': 'movida_10011328'},
      'programGuid': {'S': 'BBJ1331795A'},
      'seasonNumber': {'N': '1'},
      'anchorId': {'S': 'XXH'},
      'episodeNumber': {'N': '1'},
      'anchorType': {'S': 'cpRelated'}}}]},
  'pricingEndDate': {'N': '1767225540000'},
  'longDescription': {'S': 'Pete Holmes created and stars in this comedy series about a stand-up comic on the New York City comedy scene'},
  'contentProviderIDList': {'L': []},
  'offerTypes': {'L': [{'S': 'EST'}]},
  'shortDescription': {'S': 'Comedy series about a s