In [1]:
import commendaroo.data_transformer.model_data as mdt

from scipy import sparse
import implicit
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import json
import os
import boto3
import tensorflow_hub as hub

from aws_tools.cloudwatch_logging import logger
from aws_tools.project_config import get_aws_config

np.seterr(divide='ignore', invalid='ignore') # to avoid divide by nan warning (genre in music is nan)

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
def create_model(data_implicit):
    '''
    '''
    
    logger.info('Training the model...')
    
    sparse_content_person = sparse.csr_matrix(
        (data_implicit['eventStrength'].astype(float), (data_implicit['id_editorial_simple'], data_implicit['id_user_simple']))
    )
    
    model = implicit.als.AlternatingLeastSquares(factors=30, regularization=0.1, iterations=50, use_gpu = False)
    alpha = 15
    data_tofit = (sparse_content_person * alpha).astype('double')
    model.fit(data_tofit)
    
    return model

In [3]:
data_implicit = mdt.get_data()

2021-03-23 09:50:10,461 [INFO ]  Getting the data
2021-03-23 09:50:10,461 [INFO ]  Creating views tables
2021-03-23 09:51:38,181 [INFO ]  Creating purchases/rentals/PPVs tables
2021-03-23 09:52:08,621 [INFO ]  Loading views data
2021-03-23 09:52:08,621 [INFO ]  starting Athena query ...
2021-03-23 09:59:38,796 [INFO ]  loading 3f5bd93d-179c-41f7-9461-af54f5c7fec5.csv
2021-03-23 10:09:30,343 [INFO ]  Athena query complete: returning dataframe
2021-03-23 10:09:30,466 [INFO ]  Loading purchases/rentals/PPVs data
2021-03-23 10:09:30,466 [INFO ]  starting Athena query ...
2021-03-23 10:10:18,665 [INFO ]  loading 6d86b36e-3166-4118-9919-a597a651dc7b.csv
2021-03-23 10:10:33,365 [INFO ]  Athena query complete: returning dataframe
2021-03-23 10:10:52,324 [INFO ]  Data implicit loaded
2021-03-23 10:10:52,326 [INFO ]  Aggregate seasons into brands
2021-03-23 10:11:19,256 [INFO ]  Season-to-brand update query failed - rolling up with existing season-to-brand map
2021-03-23 10:12:24,880 [INFO ]  Pr

In [4]:
data_nohist = mdt.get_data_nohist(data_implicit)

2021-03-23 11:00:57,739 [INFO ]  Loading all metadata including for items with no history


In [5]:
model = create_model(data_implicit)

2021-03-23 11:11:03,679 [INFO ]  Training the model...


  0%|          | 0/50 [00:00<?, ?it/s]

In [6]:
# (sorting first alphabetically by status so ACTIVE versions of same items appears first over INACTIVE versions)
id_editorial_legend = data_implicit[['id_editorial', 'id_editorial_simple', 'type_production', 'end_date', 'start_date', 'status', 'rating_n', 'genre', 'scheduler_channel', 'synopsis', 'sub_genres']].sort_values(by=['status','sub_genres']).drop_duplicates(subset=['id_editorial', 'id_editorial_simple']).sort_values(by='id_editorial_simple')

id_editorial_simple_list = id_editorial_legend['id_editorial_simple'].tolist()
id_editorial_list = id_editorial_legend['id_editorial'].tolist()
type_production_list = id_editorial_legend['type_production'].tolist()

# get array with 1 or 0 based on availability of content today + status
availability = (id_editorial_legend['status'] == 'ACTIVE') & (id_editorial_legend['end_date'] > datetime.today().strftime('%Y-%m-%d')) & (id_editorial_legend['start_date'] <= datetime.today().strftime('%Y-%m-%d'))
availability = np.array(availability.astype(int))

scheduler_channel = np.array(id_editorial_legend['scheduler_channel'].tolist())

genre = np.array(id_editorial_legend['genre'].tolist())

sub_genres = [str(x).lower().replace(' ', '').replace(',', ' ') for x in id_editorial_legend['sub_genres'].tolist()]

rating = np.array(id_editorial_legend['rating_n'].tolist())

synopsis = id_editorial_legend['synopsis'].tolist()


id_user_legend = data_implicit[['id_user', 'id_user_simple']].drop_duplicates(subset=['id_user', 'id_user_simple']).sort_values(by='id_user_simple')
id_user_simple_list = id_user_legend['id_user_simple'].tolist()
id_user_list = id_user_legend['id_user'].tolist()

date_today = datetime.today().strftime('%Y-%m-%d')
code_version = get_aws_config('model_parameters.json')['code_version']

data_update_date =  {
    'view' : str(data_implicit[data_implicit['type_entitlement'] == 'VIEW']['event_date'].max()),
    'purchase' : str(data_implicit[data_implicit['type_entitlement'] == 'EVOD']['event_date'].max()),
    'rental' : str(data_implicit[data_implicit['type_entitlement'] == 'TVOD']['event_date'].max())
}

sparse_content_person = sparse.csr_matrix(
    (data_implicit['eventStrength'].astype(float), (data_implicit['id_editorial_simple'], data_implicit['id_user_simple']))
)
#     sparse_person_content = sparse.csr_matrix(
#         (data_implicit['eventStrength'].astype(float), (data_implicit['id_user_simple'], data_implicit['id_editorial_simple']))
#     )
sparse_person_content_hidelogic = sparse.csr_matrix(
    (data_implicit['FY_logic'].astype(float), (data_implicit['id_user_simple'], data_implicit['id_editorial_simple']))
)

In [7]:
content_vecs_T = model.item_factors.T
person_vecs = model.user_factors

In [11]:
person_id = 0

In [12]:
from timeit import default_timer as timer

In [33]:
import timeit

In [108]:
%%timeit -n 1 -r 1000

### NO IMPROVEMENTS

person_interactions = sparse_person_content_hidelogic[person_id,:].toarray()

person_interactions = person_interactions.reshape(-1) + 1

person_interactions[person_interactions > 1] = 0

rec_vector = person_vecs[person_id,:].dot(content_vecs_T)

rec_vector = MinMaxScaler().fit_transform(rec_vector.reshape(-1,1))[:,0]

recommend_vector = person_interactions * rec_vector

recommend_vector = recommend_vector * availability

recs = sorted(zip(id_editorial_list, recommend_vector[id_editorial_simple_list], type_production_list), key=lambda x: -x[1])[:50]

user_id = id_user_list[id_user_simple_list.index(person_id)]

26.2 ms ± 2.69 ms per loop (mean ± std. dev. of 1000 runs, 1 loop each)


In [109]:
%%timeit -n 1 -r 1000

### BETTER SORTING & SKIPPING UNNECESSARY INDEX LOADING

person_interactions = sparse_person_content_hidelogic[person_id,:].toarray()

person_interactions = person_interactions.reshape(-1) + 1

person_interactions[person_interactions > 1] = 0

rec_vector = person_vecs[person_id,:].dot(content_vecs_T)

rec_vector = MinMaxScaler().fit_transform(rec_vector.reshape(-1,1))[:,0]

recommend_vector = person_interactions * rec_vector

recommend_vector = recommend_vector * availability

order = np.argsort(-recommend_vector)[:50]
b = recommend_vector[id_editorial_simple_list][order]
a = np.array(id_editorial_list)[order]
c = np.array(type_production_list)[order]
recs = list(zip(list(a),list(b),list(c)))

user_id = id_user_list[id_user_simple_list.index(person_id)]

13.5 ms ± 816 µs per loop (mean ± std. dev. of 1000 runs, 1 loop each)


In [111]:
13.5/26.2*100

51.526717557251914