In [1]:
import commendaroo.data_transformer.model_data as mdt

from scipy import sparse
import implicit
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import json

from aws_tools.cloudwatch_logging import logger
from aws_tools.project_config import get_aws_config

In [2]:
# Load the prepared interaction data through the project's data-transformer module.
# The saved log below shows this step taking roughly 19 minutes (Athena queries included).
data = mdt.get_data()

2021-01-15 10:44:48,607 [INFO ]  Getting the data
2021-01-15 10:44:48,608 [INFO ]  Creating views tables
2021-01-15 10:46:00,237 [INFO ]  Creating purchases/rentals/PPVs tables
2021-01-15 10:46:30,746 [INFO ]  Loading views data
2021-01-15 10:46:30,747 [INFO ]  starting Athena query ...
2021-01-15 10:53:40,150 [INFO ]  loading ee41db3d-5989-4f34-af81-f2d100d83a71.csv
2021-01-15 11:00:29,237 [INFO ]  Athena query complete: returning dataframe
2021-01-15 11:00:29,239 [INFO ]  Loading purchases/rentals/PPVs data
2021-01-15 11:00:29,240 [INFO ]  starting Athena query ...
2021-01-15 11:00:32,627 [INFO ]  loading a22f4060-8a65-4c05-a213-44be95636a08.csv
2021-01-15 11:00:33,515 [INFO ]  Athena query complete: returning dataframe
2021-01-15 11:00:46,557 [INFO ]  Data implicit loaded
2021-01-15 11:01:12,051 [INFO ]  Aggregate seasons into brands
2021-01-15 11:03:50,482 [INFO ]  Prepping data for implicit model


In [4]:
def create_model(data_implicit, factors=30, regularization=0.1, iterations=50, alpha=15, use_gpu=False):
    '''
    Train an implicit-feedback ALS model on the interaction data.

    Parameters
    ----------
    data_implicit : DataFrame
        Must provide the columns 'eventStrength', 'id_editorial_simple' and
        'id_user_simple'. The two id columns are used directly as row / column
        indices of the sparse matrix.
    factors, regularization, iterations, use_gpu :
        Passed straight through to implicit.als.AlternatingLeastSquares.
    alpha : number
        Multiplier applied to every event strength before fitting.

    Returns
    -------
    The fitted AlternatingLeastSquares model.
    '''

    logger.info('Training the model...')

    strengths = data_implicit['eventStrength'].astype(float)
    content_idx = data_implicit['id_editorial_simple']
    person_idx = data_implicit['id_user_simple']

    # Rows are content items, columns are users; repeated (row, col) pairs are summed by scipy.
    # NOTE(review): this orientation matches the item-user input of older implicit releases;
    # newer releases expect user-item in fit() -- confirm against the installed version.
    sparse_content_person = sparse.csr_matrix((strengths, (content_idx, person_idx)))

    model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        use_gpu=use_gpu,
    )
    data_tofit = (sparse_content_person * alpha).astype('double')
    model.fit(data_tofit)

    return model

# Fit the ALS model on the full interaction frame loaded earlier in the notebook.
model = create_model(data)

2021-01-15 11:05:24,902 [INFO ]  Training the model...
2021-01-15 11:05:32,142 [WARNI]  OpenBLAS detected. Its highly recommend to set the environment variable 'export OPENBLAS_NUM_THREADS=1' to disable its internal multithreading


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [5]:
def more_like_this(content_id, content_vecs, content_norms, availability, id_editorial_list, id_editorial_simple_list, type_production_list):
    '''
    Rank the catalogue by cosine similarity to one content item.

    Parameters
    ----------
    content_id : int
        Row of `content_vecs` (the "simple" editorial id) to find neighbours for.
    content_vecs : 2-D array
        One latent vector per content item, indexed by simple editorial id.
    content_norms : 1-D array
        Euclidean norm of every row of `content_vecs`.
    availability : 1-D array of 0/1
        Non-zero where the item may be recommended; indexed like `content_vecs`.
    id_editorial_list, id_editorial_simple_list, type_production_list : list
        Parallel lists mapping each simple id to its external guid and production type.

    Returns
    -------
    (recommendations, con_id)
        `recommendations` holds up to 20 `(guid, score, type_production)` tuples in
        descending score order. It never contains the source item or an unavailable item.
        `con_id` is the external guid of `content_id`.

    Raises
    ------
    ValueError
        If `content_id` is not present in `id_editorial_simple_list`.
    '''
    target = content_vecs[content_id, :]

    # Cosine similarity (A.B) / (|A| x |B|). |B| must be the norm of the target vector:
    # dividing by the *sum* of its components flips the sign of every score whenever
    # that sum is negative, which inverts the ranking.
    dot_products = content_vecs.dot(target)
    denominator = content_norms * content_norms[content_id]
    scores = np.divide(
        dot_products,
        denominator,
        out=np.zeros(dot_products.shape, dtype=float),
        where=denominator > 0,  # all-zero vectors get similarity 0 instead of NaN
    )

    scores = MinMaxScaler().fit_transform(scores.reshape(-1, 1))[:, 0]

    # Drop unavailable content and the item itself rather than zeroing their scores,
    # so neither can leak into the result and the source item is never misidentified on ties.
    eligible = np.asarray(availability).astype(bool)
    eligible[content_id] = False

    candidates = [
        (guid, scores[simple_id], production_type)
        for guid, simple_id, production_type in zip(id_editorial_list, id_editorial_simple_list, type_production_list)
        if eligible[simple_id]
    ]
    candidates.sort(key=lambda rec: -rec[1])

    # Look the source guid up directly instead of assuming it sorted to the top.
    con_id = id_editorial_list[id_editorial_simple_list.index(content_id)]

    return candidates[:20], con_id
    
def for_you(person_id, sparse_person_content, content_vecs_T, person_vecs, availability, id_editorial_list, id_editorial_simple_list, type_production_list, id_user_list, id_user_simple_list):
    '''
    Build personalised recommendations for one user.

    Parameters
    ----------
    person_id : int
        Row of `sparse_person_content` / `person_vecs` (the "simple" user id).
    sparse_person_content : scipy sparse matrix
        User x content interaction strengths.
    content_vecs_T : 2-D array
        Content latent vectors, transposed (factors x content).
    person_vecs : 2-D array
        User latent vectors (users x factors).
    availability : 1-D array of 0/1
        Non-zero where the item may be recommended; indexed by simple editorial id.
    id_editorial_list, id_editorial_simple_list, type_production_list : list
        Parallel lists mapping each simple editorial id to its guid and production type.
    id_user_list, id_user_simple_list : list
        Parallel lists mapping each simple user id to its external user id.

    Returns
    -------
    (recommendations, user_id)
        `recommendations` holds up to 50 `(guid, score, type_production)` tuples in
        descending score order. Items the user already interacted with and items that
        are no longer available are excluded. `user_id` is the external id of `person_id`.

    Raises
    ------
    ValueError
        If `person_id` is not present in `id_user_simple_list`.
    '''
    # Interaction strengths of this user with every content item, as a flat vector
    interactions = sparse_person_content[person_id, :].toarray().reshape(-1)

    # Predicted affinity: dot product of the user vector with every content vector, scaled to [0, 1]
    raw_scores = person_vecs[person_id, :].dot(content_vecs_T)
    scores = MinMaxScaler().fit_transform(raw_scores.reshape(-1, 1))[:, 0]

    # Keep only content that is still available and has not been interacted with.
    # Filtering (rather than multiplying the score by zero) stops excluded items from
    # being returned and from being confused with the lowest-scored item, which also scales to 0.
    already_seen = interactions > 0
    eligible = np.asarray(availability).astype(bool) & ~already_seen

    candidates = [
        (guid, scores[simple_id], production_type)
        for guid, simple_id, production_type in zip(id_editorial_list, id_editorial_simple_list, type_production_list)
        if eligible[simple_id]
    ]
    candidates.sort(key=lambda rec: -rec[1])

    user_id = id_user_list[id_user_simple_list.index(person_id)]

    return candidates[:50], user_id

In [7]:
logger.info('Creating recommendations...')

# Resolve "today" exactly once: it is both the availability cut-off and the score date
# stamped on the output, and the two must not disagree if the run crosses midnight.
date_today = datetime.today().strftime('%Y-%m-%d')

# One row per content item, ordered by its integer ("simple") id.
id_editorial_legend = (
    data[['id_editorial', 'id_editorial_simple', 'type_production', 'end_date']]
    .drop_duplicates(subset=['id_editorial', 'id_editorial_simple'])
    .sort_values(by='id_editorial_simple')
)

id_editorial_simple_list = id_editorial_legend['id_editorial_simple'].tolist()
id_editorial_list = id_editorial_legend['id_editorial'].tolist()
type_production_list = id_editorial_legend['type_production'].tolist()

# 1 where the content is still available today, 0 otherwise (one entry per legend row).
# NOTE(review): downstream code multiplies / indexes this array by simple id, which assumes
# the simple ids are contiguous 0..N-1 with one legend row each -- confirm.
# NOTE(review): a missing end_date presumably compares False, i.e. is treated as unavailable -- confirm that is intended.
availability = np.array((id_editorial_legend['end_date'] > date_today).astype(int))

# One row per user, ordered by the integer ("simple") user id.
id_user_legend = (
    data[['id_user', 'id_user_simple']]
    .drop_duplicates(subset=['id_user', 'id_user_simple'])
    .sort_values(by='id_user_simple')
)
id_user_simple_list = id_user_legend['id_user_simple'].tolist()
id_user_list = id_user_legend['id_user'].tolist()

code_version = get_aws_config('model_parameters.json')['code_version']

# Interaction strengths in both orientations: content x user and user x content.
event_strength = data['eventStrength'].astype(float)
sparse_content_person = sparse.csr_matrix(
    (event_strength, (data['id_editorial_simple'], data['id_user_simple']))
)
sparse_person_content = sparse.csr_matrix(
    (event_strength, (data['id_user_simple'], data['id_editorial_simple']))
)

2021-01-15 11:07:47,755 [INFO ]  Creating recommendations...


In [17]:
# Most recent event timestamp present in the loaded data (quick freshness check).
data['event_date'].max()

Timestamp('2021-01-13 23:59:58')

In [14]:
### More Like This
logger.info('More Like This')
content_vecs = model.item_factors

# Euclidean length |A| of every item vector, used as the cosine-similarity denominator
content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))


def _latest_event_date(entitlement):
    '''Newest event_date for one entitlement type as a JSON-safe string, or None if there are no rows.'''
    latest = data.loc[data['type_entitlement'] == entitlement, 'event_date'].max()
    # event_date values are pandas Timestamps, which json.dumps cannot serialise; store them as text.
    return None if pd.isna(latest) else str(latest)


data_update_date = {
    'view': _latest_event_date('VIEW'),
    'purchase': _latest_event_date('EVOD'),
    'rental': _latest_event_date('TVOD'),
}

# create the output for dynamoDB table
output = []

# NOTE(review): only the first 1000 content items are processed -- presumably a development cap; confirm before a full run.
for index, i in enumerate(id_editorial_simple_list[:1000]):

    # Skip unavailable content: its own recommendations would be unreliable because
    # unavailable items are masked out of the scores.
    if availability[index] != 1:
        continue

    recs, con_id = more_like_this(i, content_vecs, content_norms, availability, id_editorial_list, id_editorial_simple_list, type_production_list)

    rec_guids = [
        {'guid': r[0], 'score': round(float(r[1]), 5), 'type': r[2]}
        for r in recs
    ]

    output_rec = {
        'score_date': date_today,
        'data_update_date': data_update_date,
        'code_version': code_version,
        'recommendations': rec_guids,
    }

    output.append([con_id, output_rec])

output_morelikethis = pd.DataFrame(output, columns=['content', 'recommendations'])

# Serialise each recommendation dict to a JSON string
output_morelikethis['recommendations'] = output_morelikethis['recommendations'].apply(json.dumps)

# Keep one row per content id; duplicates can occur when two items share exactly the same watch history
output_morelikethis = output_morelikethis.drop_duplicates(subset=['content']).reset_index(drop=True)

2021-01-15 11:10:08,191 [INFO ]  More Like This


In [16]:
# Inspect the serialised recommendations of the first content row.
# NOTE(review): the saved output below appears to come from an earlier revision of the
# More Like This cell -- it shows 's_dt' / 'c_v' keys, whereas that cell now writes
# 'score_date', 'data_update_date' and 'code_version'. Re-run to refresh.
output_morelikethis.iloc[0,1]

'{"s_dt": "2021-01-15", "c_v": "0.3", "recommendations": [{"guid": "BBJ836608HVOD", "score": 0.96913, "type": "PROGRAM"}, {"guid": "BBJ1228941A", "score": 0.96774, "type": "PROGRAM"}, {"guid": "BBJ1143404HVOD", "score": 0.96256, "type": "PROGRAM"}, {"guid": "BBJ889806HVOD", "score": 0.95924, "type": "PROGRAM"}, {"guid": "BBJ836611HVOD", "score": 0.95896, "type": "PROGRAM"}, {"guid": "BBJ2280844A", "score": 0.95827, "type": "PROGRAM"}, {"guid": "BBJ1288045A", "score": 0.95013, "type": "PROGRAM"}, {"guid": "BBJ336327HVOD", "score": 0.95004, "type": "PROGRAM"}, {"guid": "BBJ398491HVOD", "score": 0.94787, "type": "PROGRAM"}, {"guid": "BBJ332222HVOD", "score": 0.94764, "type": "PROGRAM"}, {"guid": "BBJ1083223HVOD", "score": 0.94759, "type": "PROGRAM"}, {"guid": "BBJ314806HVOD", "score": 0.94368, "type": "PROGRAM"}, {"guid": "BBJ374917HVOD", "score": 0.94306, "type": "PROGRAM"}, {"guid": "BBJ889811HVOD", "score": 0.94183, "type": "PROGRAM"}, {"guid": "BBJ319479HVOD", "score": 0.94053, "type"

In [15]:
# Preview the first rows of the More Like This output table.
output_morelikethis.head()

Unnamed: 0,content,recommendations
0,BBJ1003714HVOD,"{""s_dt"": ""2021-01-15"", ""c_v"": ""0.3"", ""recommen..."
1,BBJ1009533HVOD,"{""s_dt"": ""2021-01-15"", ""c_v"": ""0.3"", ""recommen..."
2,BBJ1009538HVOD,"{""s_dt"": ""2021-01-15"", ""c_v"": ""0.3"", ""recommen..."
3,BBJ1009541HVOD,"{""s_dt"": ""2021-01-15"", ""c_v"": ""0.3"", ""recommen..."
4,BBJ1009544HVOD,"{""s_dt"": ""2021-01-15"", ""c_v"": ""0.3"", ""recommen..."


In [None]:
### For You
logger.info('For You')
content_vecs_T = model.item_factors.T
person_vecs = model.user_factors

# create the output for dynamoDB table
output = []

# NOTE(review): subscriber identifiers are hard-coded here; consider loading them from configuration instead.
users_tech_trial = [
    'V3006126692',
    'V3007158774',
    'V3009295704',
    'V3000606445',
    'V2283739102',
    'V2200004307',
    'V3009378451',
    'V3009436446',
    'V1000019727',
    'V3008138884',
    'V3003052251',
    'V3008613163',
    'V3578624855',
    'V3000143118',
    'V2200007285',
    'V3000163582'
]

# Turn the VSIDs of the tech trial list into simple user ids, skipping users absent from the data.
# A single dict built from the legend lists replaces one full-DataFrame scan per user.
# (Assumes each id_user maps to exactly one id_user_simple -- verify against the data prep.)
simple_id_by_user = dict(zip(id_user_list, id_user_simple_list))
users_tech_trial_simple = [
    int(simple_id_by_user[vsid]) for vsid in users_tech_trial if vsid in simple_id_by_user
]

for i in users_tech_trial_simple:

    recs, user_id = for_you(i, sparse_person_content, content_vecs_T, person_vecs, availability, id_editorial_list, id_editorial_simple_list, type_production_list, id_user_list, id_user_simple_list)

    # One entry per recommended guid, each carrying its own score date and code version
    output_rec = {
        r[0]: {
            'score': round(float(r[1]), 5),
            'type': r[2],
            's_dt': date_today,
            'c_v': code_version,
        }
        for r in recs
    }

    output.append([user_id, output_rec])

output_foryou = pd.DataFrame(output, columns=['user', 'recommendations'])

# Serialise each recommendation dict to a JSON string
output_foryou['recommendations'] = output_foryou['recommendations'].apply(json.dumps)