In [1]:
! pip install pandas
! pip install boto3
! pip install watchtower
! pip install s3fs==0.4.2
! pip install pyathena
! pip install matplotlib
! pip install scipy
! pip install ipywidgets
! pip install scikit-learn

! conda install -c conda-forge --yes implicit 

! pip install --upgrade jupyter_client # useful to make ipywidgets work properly when fitting data with implicit

# pip install git+https://gitlab.com/cloena/cloena-aws-tools.git

[31mERROR: Error checking for conflicts.
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pip/_vendor/pkg_resources/__init__.py", line 3021, in _dep_map
    return self.__dep_map
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pip/_vendor/pkg_resources/__init__.py", line 2815, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pip/_vendor/pkg_resources/__init__.py", line 3012, in _parsed_pkg_info
    return self._pkg_info
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pip/_vendor/pkg_resources/__init__.py", line 2815, in __getattr__
    raise AttributeError(attr)
AttributeError: _pkg_info

During handling of the above exception, another exception occurred:


In [13]:
import pandas as pd

import implicit

from aws_tools import athena_tools, s3_tools

import matplotlib

import re

import scipy

from ipywidgets import FloatProgress

import numpy as np

from sklearn.preprocessing import MinMaxScaler

In [14]:
# Version suffix appended to the file name of the CSV export (see the `to_csv` call further down).
_v = '_v0.2'

In [15]:
# Purchase/rental history: select the columns of interest from
# bt_home_datamart.l_edw_vod_purchases and alias them to the names used in the rest of the
# notebook. There is no WHERE clause, so every row of the table is requested.
query = """
SELECT
    LINE_ACCOUNT_ID AS ID_USER -- account ID
    ,PRODUCT_ID AS ID_PRODUCT -- item ID
    ,CI_TITLE AS TITLE -- human readable title name
    ,CI_ASSET_TYPE AS TYPE_ASSET -- e.g. Film, Music, etc.
    ,ENTITLEMENT_TYPE AS TYPE_ENTITLEMENT -- TVOD / EVOD
    ,CI_PRICE AS PRICE -- £ price
    ,PURCHASE_TIME_ST AS PURCHASE_DATE -- time of purchase
    ,EDITORIAL_VERSION_ID AS ID_EDITORIAL -- identifies multiple instances of same film/season/episode, e.g. rent/purchase & SD/HD
    ,PARENT_GUID AS ID_PARENT -- if present, can connect episode to season by matching with season PRODUCT_ID
FROM
    bt_home_datamart.l_edw_vod_purchases;
"""

# Optional filter that is deliberately NOT part of the query above:
# WHERE (ENTITLEMENT_TYPE LIKE 'EVOD' OR ENTITLEMENT_TYPE LIKE 'TVOD') -- can be used to exclude PPV, but better to include it and then remove anything no longer available from the catalogue

# Run the query through the project's Athena helper; the result is kept in `data`
# and used as a pandas-style frame (`.head()`, `.shape`, `.merge`) in the cells below.
data = athena_tools.AthenaQuerier().execute_query(query=query)

2020-06-23 14:59:00,052 [INFO ]  starting Athena query ...
2020-06-23 14:59:18,762 [INFO ]  loading 219d6c4b-24bc-4db0-aff6-0b34b2ce8bb6.csv
2020-06-23 14:59:27,452 [INFO ]  Athena query complete: returning dataframe


In [16]:
# Product catalogue: select the columns of interest from bt_home_datamart.l_edw_vod_products
# and alias them so that ID_PRODUCT / TITLE / ID_EDITORIAL / ID_PARENT carry the same names as
# in the purchases frame. Note that `query` is rebound here, replacing the purchases query.
query = """
SELECT
    PRODUCT_GUID AS ID_PRODUCT -- item ID
    ,CI_TITLE AS TITLE --  human readable title
    ,CI_TYPE AS TYPE -- type, like film/music/episode/season/collection
    ,CI_AVAILABLE_END_DT AS END_DATE -- date until availability of item
    ,EDITORIAL_VERSION_ID AS ID_EDITORIAL -- identifies multiple instances of same film/season/episode, e.g. rent/purchase & SD/HD
    ,CI_PARENTGUID AS ID_PARENT -- if present, can connect episode to season by matching with season PRODUCT_ID
    ,GENRE 
    ,RATING
FROM
    bt_home_datamart.l_edw_vod_products;
"""

# Run the query through the project's Athena helper; the result is kept in `cat`
# and later merged onto the purchases frame via its ID_PRODUCT column.
cat = athena_tools.AthenaQuerier().execute_query(query=query)

2020-06-23 14:59:27,535 [INFO ]  starting Athena query ...
2020-06-23 14:59:38,951 [INFO ]  loading 183b87af-022e-4db1-9d75-038434068f6e.csv
2020-06-23 14:59:41,019 [INFO ]  Athena query complete: returning dataframe


In [17]:
# Quick look at the first rows of the purchases frame, followed by the catalogue frame.
print(data.head(), cat.head(), sep='\n')

        ID_USER ID_PRODUCT                   TITLE TYPE_ASSET  \
0  BBEU05449846    9976024       Bad Boys for Life       Film   
1  BBEU10559541   10225401    The Call of the Wild       Film   
2  BBEU26855376    9357150        Official Secrets       Film   
3  BBEU22011432   10242613  No Time To Die (Audio)      Music   
4  BBEU22140449    9829824                    1917       Film   

  TYPE_ENTITLEMENT  PRICE       PURCHASE_DATE  ID_EDITORIAL  \
0             TVOD   4.49 2020-05-27 00:03:22   BBJ2550316A   
1             TVOD   3.49 2020-05-27 00:03:10   BBJ2526197A   
2             EVOD   7.99 2020-05-27 01:50:21   BBJ2371180A   
3             TVOD   0.30 2020-05-27 06:03:50  MOD35959769A   
4             TVOD   4.49 2020-05-27 06:47:20   BBJ2526048A   

            ID_PARENT  
0                 NaN  
1                 NaN  
2                 NaN  
3  movida_10074968_SD  
4                 NaN  
            ID_PRODUCT                                    TITLE     TYPE  \
0         

In [18]:
# (rows, columns) of the purchases frame, then of the catalogue frame.
print(data.shape, cat.shape, sep='\n')

(2553192, 9)
(447552, 8)


In [19]:
# Attach catalogue information to every purchase/rental by matching the purchase's parent id
# with the catalogue's product id. Joining an item that has a parent with that parent effectively
# connects episodes to their seasons (once we have brand info, we will perform that extra join step).
# NOTE: this is an INNER join (the pandas default, spelled out below): purchases whose parent id
# has no match in the catalogue are dropped, and a purchase appears more than once if the
# catalogue holds several rows with the same product id.

# First fill any empty PARENT_GUID with PRODUCT_ID so that items without a parent are joined on
# their own product id.
# The filled column is assigned back instead of calling `fillna(..., inplace=True)` on the selected
# column: that chained in-place form updates a temporary object under pandas Copy-on-Write and
# would leave `data` unchanged.
data['ID_PARENT'] = data['ID_PARENT'].fillna(data['ID_PRODUCT'])

# Columns present in both frames keep their plain name for the catalogue side and get the
# `_data` suffix for the purchases side.
data_m = data.merge(cat, how='inner', left_on='ID_PARENT', right_on='ID_PRODUCT', suffixes=('_data', ''))

# From here on we use the catalogue version of the editorial id, as it will be the editorial ID of the season.
# Drop duplicate or obsolete columns (the purchases-side copies and the join keys).
data_m = data_m.drop(columns=['ID_PRODUCT_data', 'TITLE_data', 'ID_EDITORIAL_data', 'ID_PARENT_data', 'ID_PRODUCT', 'ID_PARENT'])

# We don't drop items that have an END_DATE in the past because we want them when creating the model,
# but we will need to set those to zero to make sure we do not recommend anything that is no longer available.

In [20]:
# Size of the merged frame, then its five most recent purchases (latest PURCHASE_DATE first).
print(data_m.shape)
data_m.sort_values(by=['PURCHASE_DATE'], ascending=[False]).iloc[:5]

(2553193, 11)


Unnamed: 0,ID_USER,TYPE_ASSET,TYPE_ENTITLEMENT,PRICE,PURCHASE_DATE,TITLE,TYPE,END_DATE,ID_EDITORIAL,GENRE,RATING
1287290,BBEU05051752,Film,EVOD,6.99,2020-06-21 23:57:23,Ratatouille,film,2025-12-31 23:59:00,BBJ316821HVOD,Animation,pg
723594,BBEU05727214,TV,EVOD,4.99,2020-06-21 23:47:06,Downton Abbey S04: Christmas Special PT 1 & 2,episode,2025-12-31 23:59:00,BBJ2335948A,Drama,12
2492500,BBEU37205900,Film,EVOD,5.99,2020-06-21 23:40:45,Little Fockers,film,2025-12-31 23:59:00,BBJ1143395HVOD,Comedy,12
1764406,BBEU05727214,TV,EVOD,10.99,2020-06-21 23:40:30,Downton Abbey Series 5,season,2025-12-31 23:59:00,movida_36757,Drama,15
2436003,BBEU36132330,Film,TVOD,3.5,2020-06-21 23:39:53,The Jungle Bunch,film,2020-08-16 22:59:00,BBJ1529781A,Animation,u


In [21]:
# Number of purchases per RATING value, most frequent first.
data_m.loc[:, 'RATING'].value_counts()

15    798713
12    755652
pg    408336
u     366610
18    223882
Name: RATING, dtype: int64

In [22]:
# Keep only rows that have both a user id and an editorial id.
data_m = data_m.dropna(subset=['ID_USER', 'ID_EDITORIAL'])

In [23]:
# Write the cleaned purchase/rental history to S3 without the index column;
# the version suffix held in `_v` is inserted just before the `.csv` extension.
data_m.to_csv(
    path_or_buf=f's3://bt-data-science-playground/bt-tv-recommendation-system/model_objects/historicalpurchasesrentals{_v}.csv',
    index=False,
)