In [1]:
! pip install pandas
! pip install boto3
! pip install watchtower
! pip install s3fs==0.4.2
! pip install pyathena
! pip install matplotlib
! pip install scipy
! pip install ipywidgets
! pip install scikit-learn

! conda install -c conda-forge --yes implicit 

! pip install --upgrade jupyter_client # useful to make ipywidgets work properly when fitting data with implicit

# pip install git+https://gitlab.com/cloena/cloena-aws-tools.git

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.8.3
  latest version: 4.8.4

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Requirement already up-to-date: jupyter_client in /home/ec2-user/anaconda3/envs/pg/lib/python3.8/site-packages (6.1.6)


In [2]:
import pandas as pd

import s3fs

import pickle

import implicit

from aws_tools import athena_tools, s3_tools

import matplotlib

import re

import scipy

from ipywidgets import FloatProgress

import numpy as np

from sklearn.preprocessing import MinMaxScaler

from datetime import datetime



In [3]:
_v = '_v0.2'

In [4]:
# Load the historical views and purchases/rentals exports.
# The unrestricted read ran out of memory, so only the columns the rest of the
# notebook touches are parsed. A callable `usecols` is used (rather than a list)
# so that a column absent from one of the two files is simply ignored instead of
# raising.
_MODEL_OBJECTS = 's3://bt-data-science-playground/bt-tv-recommendation-system/model_objects/'
_NEEDED_COLUMNS = {
    'ID_USER', 'ID_EDITORIAL', 'TITLE', 'GENRE', 'RATING', 'TYPE', 'TYPE_ASSET',
    'END_DATE', 'EVENT_DATE', 'VIEW_DATE', 'PURCHASE_DATE', 'TYPE_ENTITLEMENT',
}

data_views = pd.read_csv(
    _MODEL_OBJECTS + 'historicalviews' + _v + '.csv',
    usecols=lambda name: name in _NEEDED_COLUMNS,
)
data_purren = pd.read_csv(
    _MODEL_OBJECTS + 'historicalpurchasesrentals' + _v + '.csv',
    usecols=lambda name: name in _NEEDED_COLUMNS,
)

MemoryError: Unable to allocate 433. MiB for an array with shape (56764816,) and data type object

In [None]:
data_views.head()

In [None]:
data_purren.head()

In [None]:
# Give the date column of each frame the same name (EVENT_DATE) so the two
# frames share a schema.
data_views = data_views.rename({'VIEW_DATE': 'EVENT_DATE'}, axis='columns')
data_purren = data_purren.rename({'PURCHASE_DATE': 'EVENT_DATE'}, axis='columns')

# Every row of the views frame is tagged with the entitlement type 'VIEW'.
data_views['TYPE_ENTITLEMENT'] = 'VIEW'

In [None]:
# Columns kept from both sources, in the order used for the combined frame.
cols = ['ID_USER', 'ID_EDITORIAL', 'TITLE', 'GENRE', 'RATING', 'TYPE', 'TYPE_ASSET', 'END_DATE', 'EVENT_DATE', 'TYPE_ENTITLEMENT']

# Stack views and purchases/rentals into one interaction table.
# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4 and
# removed in pandas 2.0.
data_implicit = pd.concat([data_views[cols], data_purren[cols]], ignore_index=True)

# Release the source frames; only the combined table is used from here on.
del data_views
del data_purren

In [None]:
# Toggle: when True, rows whose TYPE_ASSET is 'Music' are removed from the interactions.
DROP_MUSIC = True
if DROP_MUSIC:
    data_implicit = data_implicit.loc[data_implicit['TYPE_ASSET'].ne('Music')]

In [None]:
# Report how full the user x item interaction matrix is.
# The previous version printed the fill ratio (density) under the label
# "Sparsity" and counted every row, so repeated interactions of the same user
# with the same item inflated the figure. Here distinct (user, item) pairs are
# counted, and both density and sparsity (= 100% - density) are printed.
n_users = data_implicit['ID_USER'].nunique(dropna=False)
n_items = data_implicit['ID_EDITORIAL'].nunique(dropna=False)
n_interactions = data_implicit[['ID_USER', 'ID_EDITORIAL']].drop_duplicates().shape[0]

density = float(n_interactions) / float(n_users * n_items) * 100

print('Number of users: {}'.format(n_users))
print('Number of items: {}'.format(n_items))
print('Density: {:4.3f}%'.format(density))
print('Sparsity: {:4.3f}%'.format(100 - density))

In [None]:
import matplotlib.pyplot as plt

# Cumulative share of all activity covered by the N most active users.
# value_counts() returns users ordered from most to least active, so a running
# sum over it gives the curve directly (computed once, without a Python loop).
activity_per_user = data_implicit['ID_USER'].value_counts()
total_activity = activity_per_user.sum()

user_count = np.arange(1, len(activity_per_user) + 1)
percentage_activity = activity_per_user.cumsum().to_numpy() / total_activity * 100

fig, ax = plt.subplots()
ax.plot(user_count, percentage_activity)
ax.set_xlabel('User count')
ax.set_ylabel('Total # of activity [%]')
ax.grid()
ax.set_yticks(np.arange(0, 105, 5))
# Tick positions are fixed (0 .. 1,000,000 in steps of 150,000), independent of the data.
ax.set_xticks(np.arange(0, 1000000, 150000))
plt.show()

In [None]:
data_implicit['TYPE_ENTITLEMENT'].value_counts()

In [None]:
# based on type of entitlement (purchase/rental/ppv/view) give different strength [EVOD=purchase;TVOD=rental]
def assign_eventStrength(x):
    """Translate an entitlement type into an interaction weight.

    'EVOD' (purchase) and 'PPV' weigh 3, 'TVOD' (rental) weighs 2, and 'VIEW'
    as well as any other value weighs 1.
    """
    if x in ('EVOD', 'PPV'):
        return 3
    if x == 'TVOD':
        return 2
    # 'VIEW' and every unrecognised value share the lowest weight.
    return 1

# Weight every interaction row according to its entitlement type.
data_implicit['eventStrength'] = data_implicit['TYPE_ENTITLEMENT'].map(assign_eventStrength)

In [None]:
# Turn duplicate the RATING column in a form that is numeric so that they can be compared
def rating_toNumeric(x):
    """Map a rating label to a comparable integer.

    'u' -> 0, 'pg' -> 1, '12' -> 2, '15' -> 3, '18' -> 4; any other value -> 5.
    The match is exact (case-sensitive strings).
    """
    ordered_labels = ('u', 'pg', '12', '15', '18')
    for level, label in enumerate(ordered_labels):
        if x == label:
            return level
    # Anything not listed above ranks above every known label.
    return 5
    
# Numeric copy of RATING so ratings can be compared with <=.
data_implicit['RATING_n'] = data_implicit['RATING'].map(rating_toNumeric)

data_implicit.head()

In [None]:
# Convert the identifier/title columns to categoricals ...
for _column in ('TITLE', 'ID_USER', 'ID_EDITORIAL'):
    data_implicit[_column] = data_implicit[_column].astype("category")

# ... and expose the integer category codes of users and items as the "*_simple"
# columns, which are used as matrix row/column indices further down.
data_implicit['ID_USER_simple'] = data_implicit['ID_USER'].cat.codes
data_implicit['ID_EDITORIAL_simple'] = data_implicit['ID_EDITORIAL'].cat.codes

data_implicit.head()

In [None]:
# Build the two orientations of the interaction matrix from the same triplets:
# values are the event strengths, indices are the integer user/item codes.
_strength = data_implicit['eventStrength'].astype(float)
_item_codes = data_implicit['ID_EDITORIAL_simple']
_user_codes = data_implicit['ID_USER_simple']

# rows = content, columns = persons
sparse_content_person = scipy.sparse.csr_matrix((_strength, (_item_codes, _user_codes)))
# rows = persons, columns = content
sparse_person_content = scipy.sparse.csr_matrix((_strength, (_user_codes, _item_codes)))

In [None]:
%%time

model = implicit.als.AlternatingLeastSquares(factors=30, regularization=0.1, iterations=50)
alpha = 15
data_tofit = (sparse_content_person * alpha).astype('double')
model.fit(data_tofit)

In [None]:
# one hot encode the genre tag to add as features for the More Like This

# One row per item (first occurrence kept), ordered by the integer item code so
# that row position equals ID_EDITORIAL_simple.
genres = (
    data_implicit[['ID_EDITORIAL_simple', 'TITLE', 'GENRE']]
    .drop_duplicates(subset='ID_EDITORIAL_simple')
    .sort_values(by='ID_EDITORIAL_simple')
    .reset_index(drop=True)
)
genres.head()

In [None]:
# One-hot encode the genre of every item; columns are named 'GENRE_<value>'.
genres_dummies = pd.get_dummies(data=genres['GENRE'], prefix='GENRE')

genres_dummies.head()

In [None]:
def more_like_this(content_id, n_similar=10):
    """Print the titles of the items most similar to ``content_id``.

    Similarity is the cosine similarity between item feature vectors built from
    the ALS item factors (``model.item_factors``) stacked with the one-hot genre
    features (``genres_dummies``). Candidates are visited from most to least
    similar and printed only if their END_DATE is still in the future and their
    numeric rating is not higher than that of the selected item.

    Parameters
    ----------
    content_id : int
        Integer item code (``ID_EDITORIAL_simple``) of the reference item; also
        the row position of the item in the feature matrix.
    n_similar : int, default 10
        Maximum number of titles to print.

    Returns
    -------
    None
        Titles are printed, one per line.

    Raises
    ------
    ValueError
        If the item factors and the genre table do not have the same number of rows.
    """
    # Relative weight of each feature set. Repeating a block k times in the
    # stacked vector (as was done before) multiplies its contribution to every
    # dot product by k; scaling the block by sqrt(k) is mathematically identical
    # and avoids materialising k copies.
    importance_syntheticFeatures = 6
    importance_onehotencodedFeatures = 1

    latent = np.asarray(model.item_factors, dtype=float)
    genre = genres_dummies.to_numpy(dtype=float)
    if latent.shape[0] != genre.shape[0]:
        raise ValueError(
            'item factors ({} rows) and genre features ({} rows) are not aligned'.format(
                latent.shape[0], genre.shape[0]
            )
        )

    # Plain numpy stacking: no reliance on pandas label alignment across
    # duplicate / mixed-type column labels.
    features = np.hstack([
        latent * np.sqrt(importance_syntheticFeatures),
        genre * np.sqrt(importance_onehotencodedFeatures),
    ])

    # Cosine similarity (A.B) / (|A| x |B|), with B the selected item.
    # The denominator previously used the *sum* of B's components instead of
    # |B|; that sum can be negative (which reverses the ranking) or zero.
    norms = np.linalg.norm(features, axis=1)
    target = features[content_id]
    denominator = norms * norms[content_id]
    has_direction = denominator > 0
    # Items whose similarity is undefined (zero vector) are ranked last.
    scores = np.full(len(norms), -np.inf)
    scores[has_direction] = features[has_direction].dot(target) / denominator[has_direction]

    ranked = np.argsort(-scores, kind='stable')

    # Per-item metadata, taken from the first interaction row of each item
    # (same row the per-candidate lookups used to pick), computed once instead
    # of rescanning the whole interaction table for every candidate.
    item_meta = (
        data_implicit[['ID_EDITORIAL_simple', 'TITLE', 'END_DATE', 'RATING_n']]
        .drop_duplicates(subset='ID_EDITORIAL_simple')
        .set_index('ID_EDITORIAL_simple')
    )
    selected_rating = item_meta.at[content_id, 'RATING_n']
    now = datetime.now()

    shown = 0
    for idx in ranked:
        if shown >= n_similar:
            break

        # show only if still available (a missing END_DATE counts as unavailable)
        if not (pd.to_datetime(item_meta.at[idx, 'END_DATE']) > now):
            continue

        # show only if rating is as selected item or lower
        if item_meta.at[idx, 'RATING_n'] > selected_rating:
            continue

        print(item_meta.at[idx, 'TITLE'])
        shown += 1

In [None]:
# Print titles similar to the item whose integer code (ID_EDITORIAL_simple) is 2439.
more_like_this(2439)

In [None]:
# Show one interaction row for an exact title match (presumably to read off its
# ID_EDITORIAL_simple code); the commented line is a case-insensitive substring variant.
#data_implicit[data_implicit['TITLE'].str.lower().str.contains("frozen")].head(10)
data_implicit[data_implicit['TITLE']=='Wonder Woman'].head(1)

In [None]:
def for_you(person_id, num_contents=10, dithering=True, eps=1.5, random_state=None):
    """Print personalised recommendations for one user.

    Scores every item with the dot product of the user's and the item's ALS
    factors, optionally dithers the ranking, drops items the user already
    interacted with, and prints up to ``num_contents`` items that are still
    available (END_DATE in the future) and whose numeric rating does not exceed
    the highest rating the user has interacted with.

    Parameters
    ----------
    person_id : int
        Integer user code (``ID_USER_simple``); row index into
        ``sparse_person_content``.
    num_contents : int, default 10
        Maximum number of recommendations to print.
    dithering : bool, default True
        If True, rank by ``log10(rank) + N(0, sqrt(log10(eps)))`` instead of the
        raw score, so results vary between calls.
    eps : float, default 1.5
        Dithering strength (generally between 1.5 and 3.0); must be >= 1.
    random_state : int or None, default None
        Seed for the dithering noise; pass an int for reproducible output.

    Returns
    -------
    None
        A DataFrame with columns ``title``, ``score`` and ``ids`` is printed.

    Raises
    ------
    ValueError
        If ``eps`` is below 1 or the user has no rows in ``data_implicit``.
    """
    if dithering and eps < 1:
        raise ValueError('eps must be >= 1, got {}'.format(eps))

    # Row of the person x content matrix; non-zero entries are items the person
    # already interacted with.
    person_interactions = sparse_person_content[person_id, :].toarray().reshape(-1)
    not_seen = person_interactions == 0

    # Dot product of the person vector with all content vectors, read straight
    # from the fitted model. The previous code used `person_vecs` / `content_vecs`,
    # which are not defined at notebook scope.
    # NOTE(review): assumes the model was fitted on the content x person matrix so
    # that item_factors rows are item codes and user_factors rows are user codes -
    # confirm for the installed implicit version.
    person_vector = np.asarray(model.user_factors)[person_id, :]
    rec_vector = np.asarray(model.item_factors).dot(person_vector)

    if dithering:
        # dithScore = log10(rank) + N(0, sqrt(log10(eps))); lower is better.
        rng = np.random.default_rng(random_state)
        rank = np.argsort(np.argsort(-rec_vector))  # 0 = highest raw score
        dith_score = np.log10(rank + 1) + rng.normal(0.0, np.sqrt(np.log10(eps)), len(rank))
        # Flip so that the best item has the highest value again.
        recommend_vector = dith_score.max() - dith_score
    else:
        # Scale the raw scores to [0, 1].
        recommend_vector = MinMaxScaler().fit_transform(rec_vector.reshape(-1, 1))[:, 0]

    # Already-interacted content gets a score of zero (and is skipped below).
    recommend_vector = recommend_vector * not_seen

    # Item indices from best to worst recommendation.
    content_idx = np.argsort(recommend_vector)[::-1]

    user_rows = data_implicit['ID_USER_simple'] == person_id
    if not user_rows.any():
        raise ValueError('no interactions found for person_id {}'.format(person_id))
    max_user_rating = data_implicit.loc[user_rows, 'RATING_n'].max()

    # Per-item metadata from the first interaction row of each item, computed
    # once instead of rescanning the interaction table for every candidate.
    item_meta = (
        data_implicit[['ID_EDITORIAL_simple', 'TITLE', 'END_DATE', 'RATING_n']]
        .drop_duplicates(subset='ID_EDITORIAL_simple')
        .set_index('ID_EDITORIAL_simple')
    )
    now = datetime.now()

    titles = []
    scores = []
    content_ids = []

    # Iterating over content_idx (instead of an unbounded while loop) ends
    # cleanly when fewer than num_contents items qualify.
    for candidate in content_idx:
        if len(titles) >= num_contents:
            break

        # never recommend something the person already interacted with
        if not not_seen[candidate]:
            continue

        # show only if still available (a missing END_DATE counts as unavailable)
        if not (pd.to_datetime(item_meta.at[candidate, 'END_DATE']) > now):
            continue

        # check that item has rating as or lower of highest watched by user
        if item_meta.at[candidate, 'RATING_n'] > max_user_rating:
            continue

        titles.append(item_meta.at[candidate, 'TITLE'])
        scores.append(recommend_vector[candidate])
        content_ids.append(candidate)

    recommendations = pd.DataFrame({'title': titles, 'score': scores, 'ids' : content_ids})
    print(recommendations)

In [None]:
# Print recommendations for the user whose integer code (ID_USER_simple) is 21.
for_you(21)