# [Optional] Module 4: Advanced Example using HRNN-Metadata Recipe with MovieLens Data

`
Rev Date           By       Description
PA1 2020-02-16     akirmak  Modified and extended version of one of the Amazon Personalize samples at github AWS-samples
`

This notebook demonstrates how to use item metadata with the HRNN-Metadata recipe.

If you would like to try another exercise, this time using the HRNN-Metadata recipe, this module is a modified version of an advanced example from the Amazon Personalize samples in the aws-samples GitHub organization: https://github.com/aws-samples/amazon-personalize-samples/blob/master/advanced_examples/personalize_temporal_holdout.ipynb

It also demonstrates holding out the most recent 1% of the dataset's time range as "future" data and using item metadata. Finally, it shows how to query a campaign (the inference endpoint) for recommendations and evaluate them externally against the held-out data.

Note on Costs & Duration of the Lab Module: The Movielens dataset file size used is around 600+MB, and has approximately 20M lines. The module takes longer to train than the Last.FM dataset.

In [None]:
import tempfile, subprocess, urllib.request, zipfile
import pandas as pd, numpy as np

In [None]:
import io
import scipy.sparse as ss
import json
import time
import os

In [None]:
import sagemaker.amazon.common as smac

In [None]:
import boto3

# Download and process a dataset

In [None]:
# Download MovieLens 20M into a throwaway directory and load the two tables
# this module needs. The directory is removed when the outer `with` exits, so
# everything used later must be read into memory here.
with tempfile.TemporaryDirectory() as tmpdir:
    archive_path = tmpdir + '/ml-20m.zip'
    urllib.request.urlretrieve(
        'http://files.grouplens.org/datasets/movielens/ml-20m.zip',
        archive_path)
    # Use a context manager so the archive handle is closed before the
    # temporary directory is cleaned up.
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(tmpdir)
    df = pd.read_csv(tmpdir + '/ml-20m/ratings.csv')
    movies = pd.read_csv(tmpdir + '/ml-20m/movies.csv', index_col='movieId')
    # One more than the largest movieId found in the ratings table.
    vocab_size = df.movieId.max() + 1

In [None]:
# Show the temporary directory path used for the download; the directory
# itself was already deleted when the `with` block exited.
tmpdir

In [None]:
# Largest movieId in the ratings table plus one.
vocab_size

In [None]:
# Fraction of the overall timestamp range (the most recent end) that is held
# out of training and used for external evaluation.
test_time_ratio = 0.01


## hold out the last bit of data in time

In [None]:
# Keep an untouched copy of all ratings so the held-out slice can be rebuilt
# later for evaluation.
dfo = df.copy()
# Timestamp that separates training data from the final `test_time_ratio`
# share of the overall time range.
train_cutoff = df.timestamp.max() * (1-test_time_ratio) + df.timestamp.min() * test_time_ratio
# Take an explicit copy: the filtered frame gets a new column assigned to it
# afterwards, which would otherwise trigger pandas' SettingWithCopyWarning.
df = df[df.timestamp < train_cutoff].copy()

## convert into Personalize format

In [None]:
# Rename the four rating columns, in their existing order, to the field names
# declared in the interactions schema.
df.columns = ['USER_ID','ITEM_ID','EVENT_VALUE','TIMESTAMP']
# Every row is a rating, so all interactions share one event type.
df['EVENT_TYPE']='RATING'

In [None]:
# Preview the first rows of the training interactions.
df.head()

In [None]:
#for demo we may want to upload a small dataset
#df=df.loc[:10000]

Write the DataFrame to a file. The file is around 600+ MB and has approximately 20M lines, so this may take a few minutes.

In [None]:
# Write the training interactions to a local CSV (without the index column)
# so the file can be uploaded to S3.
df.to_csv('movielens_interactions.csv',index=False)

## process item metadata into Personalize formats

In [None]:
# movies.csv was loaded with movieId as the index; turn it back into a column.
movies = movies.reset_index()

# The title is not part of the item metadata schema used in this module.
del movies['title']

# The two remaining columns become the item id and its genre string.
movies.columns=['ITEM_ID','GENRE']

In [None]:
# Preview the first rows of the item metadata.
movies.head()

In [None]:
# Write the item metadata to a local CSV (without the index column) so the
# file can be uploaded to S3.
movies.to_csv('movielens_item_metadata.csv',index=False)

# upload data to s3

 Note: S3 bucket names must be globally unique, so add your initials to the bucket name below.

In [None]:
# Pin the AWS region for this process; it is set before the CLI call below.
os.environ['AWS_DEFAULT_REGION']="us-east-1"
# Optional: uncomment the next two code lines to build a bucket name with a random suffix.
#suffix = str(np.random.uniform())[4:9]

#bucket = "ai-personalizepoc"+suffix     # replace with the name of your S3 bucket
# Otherwise set your own bucket name on the line below; if you enabled the random name above, comment that line out.

bucket = "hba-ai-personalizepoc"
# Create the bucket with the AWS CLI (IPython shell escape).
!aws s3 mb s3://{bucket}

In [None]:
# Client for managing Personalize resources (schemas, datasets, solutions, campaigns).
personalize = boto3.client(
    service_name='personalize',
    endpoint_url='https://personalize.us-east-1.amazonaws.com',
)
# Client for requesting recommendations from a deployed campaign.
personalize_runtime = boto3.client(
    service_name='personalize-runtime',
    endpoint_url='https://personalize-runtime.us-east-1.amazonaws.com',
)

In [None]:
# Upload the interactions CSV to the bucket, using the file name as the object key.
interactions_filename = 'movielens_interactions.csv'
(
    boto3.Session()
    .resource('s3')
    .Bucket(bucket)
    .Object(interactions_filename)
    .upload_file(interactions_filename)
)

In [None]:
# Upload the item metadata CSV to the bucket, using the file name as the object key.
item_metadata_file = 'movielens_item_metadata.csv'
(
    boto3.Session()
    .resource('s3')
    .Bucket(bucket)
    .Object(item_metadata_file)
    .upload_file(item_metadata_file)
)

## create schemas for our two types of data 

In [None]:
# Name under which the interactions schema is registered in Personalize.
schema_name="ai-personalize-movielens-interactions-metadata-schema"

In [None]:
# Avro record describing the columns of the interactions CSV; every field
# carries just a name and a type.
schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": field_name, "type": field_type}
        for field_name, field_type in (
            ("USER_ID", "string"),
            ("ITEM_ID", "string"),
            ("EVENT_VALUE", "float"),
            ("TIMESTAMP", "long"),
            ("EVENT_TYPE", "string"),
        )
    ],
    "version": "1.0",
}

# Register the schema and keep its ARN for creating the interactions dataset.
create_schema_response = personalize.create_schema(
    name=schema_name,
    schema=json.dumps(schema),
)
schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

In [None]:
# Name under which the item metadata schema is registered in Personalize.
metadata_schema_name="ai-personalize-movielens-item-metadata-schema"

In [None]:
# Avro record describing the item metadata CSV; GENRE is flagged as a
# categorical field.
metadata_schema = {
    "type": "record",
    "name": "Items",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": "ITEM_ID", "type": "string"},
        {"name": "GENRE", "type": "string", "categorical": True},
    ],
    "version": "1.0",
}

# Register the schema and keep its ARN for creating the items dataset.
create_metadata_schema_response = personalize.create_schema(
    name=metadata_schema_name,
    schema=json.dumps(metadata_schema),
)
metadata_schema_arn = create_metadata_schema_response['schemaArn']
print(json.dumps(create_metadata_schema_response, indent=2))


## create a dataset group where we are going to add the data

In [None]:
# Create the dataset group that will hold both datasets and remember its ARN.
dataset_group_name = "ai-personalize-movielens-metadata-dataset-group-"
create_dataset_group_response = personalize.create_dataset_group(name=dataset_group_name)
movielens_dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

In [None]:
# Poll the dataset group every 20 seconds, for at most three hours, until it
# becomes ACTIVE or its creation fails.
status = None
max_time = time.time() + 3 * 60 * 60
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn=movielens_dataset_group_arn
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(20)

## add our two datasets into the dataset group

In [None]:
# Create the interactions dataset inside the dataset group, bound to the
# interactions schema.
dataset_type = "INTERACTIONS"
create_dataset_response = personalize.create_dataset(
    name="ai-personalize-movielens-metadata-dataset-interactions",
    datasetType=dataset_type,
    datasetGroupArn=movielens_dataset_group_arn,
    schemaArn=schema_arn,
)
movielens_interactions_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

In [None]:
# Create the items dataset inside the dataset group, bound to the item
# metadata schema.
dataset_type = "ITEMS"
create_metadata_dataset_response = personalize.create_dataset(
    name="ai-personalize-movielens-metadata-dataset-items",
    datasetType=dataset_type,
    datasetGroupArn=movielens_dataset_group_arn,
    schemaArn=metadata_schema_arn,
)
metadata_dataset_arn = create_metadata_dataset_response['datasetArn']
print(json.dumps(create_metadata_dataset_response, indent=2))

In [None]:
# Restore variables saved with %store in an earlier notebook; this module
# needs role_arn from them.
# NOTE(review): the restored names are presumed not to collide with variables
# defined in this notebook — confirm.

%store -r

In [None]:
# Confirm that the IAM role ARN passed to the import jobs is available.
print(role_arn)

# import data into the created dataset group

In [None]:
# Start importing the interactions CSV from S3 into the interactions dataset.
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName="ai-personalize-movielens-dataset-import-job-",
    datasetArn=movielens_interactions_dataset_arn,
    dataSource={"dataLocation": "s3://{}/{}".format(bucket, 'movielens_interactions.csv')},
    roleArn=role_arn,
)
dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

In [None]:
# Start importing the item metadata CSV from S3 into the items dataset.
create_metadata_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName="ai-personalize-movielens-metadata-dataset-import-job",
    datasetArn=metadata_dataset_arn,
    dataSource={"dataLocation": "s3://{}/{}".format(bucket, 'movielens_item_metadata.csv')},
    roleArn=role_arn,
)
metadata_dataset_import_job_arn = create_metadata_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_metadata_dataset_import_job_response, indent=2))

In [None]:
# Poll the interactions import job once a minute, for at most three hours,
# until it becomes ACTIVE or fails.
status = None
max_time = time.time() + 3 * 60 * 60
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn=dataset_import_job_arn
    )
    dataset_import_job = describe_dataset_import_job_response["datasetImportJob"]

    # Prefer the status of the latest run when the response reports one.
    if "latestDatasetImportJobRun" in dataset_import_job:
        status = dataset_import_job["latestDatasetImportJobRun"]["status"]
        print("LatestDatasetImportJobRun: {}".format(status))
    else:
        status = dataset_import_job["status"]
        print("DatasetImportJob: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

In [None]:
# Poll the item metadata import job once a minute, for at most three hours,
# until it becomes ACTIVE or fails.
status = None
max_time = time.time() + 3 * 60 * 60
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn=metadata_dataset_import_job_arn
    )
    dataset_import_job = describe_dataset_import_job_response["datasetImportJob"]

    # Prefer the status of the latest run when the response reports one.
    if "latestDatasetImportJobRun" in dataset_import_job:
        status = dataset_import_job["latestDatasetImportJobRun"]["status"]
        print("LatestDatasetImportJobRun: {}".format(status))
    else:
        status = dataset_import_job["status"]
        print("DatasetImportJob: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

## create a solution which uses meta-data

In [None]:
# Print the ARN of every recipe returned by the service, one per line.
recipe_list = personalize.list_recipes()
for recipe in recipe_list['recipes']:
    recipe_arn_listed = recipe['recipeArn']
    print(recipe_arn_listed)

In [None]:
# ARN of the HRNN-Metadata recipe used to build this module's solution.
recipe_arn = "arn:aws:personalize:::recipe/aws-hrnn-metadata"

In [None]:
# Create a solution for the dataset group using the selected recipe.
create_solution_response = personalize.create_solution(
    name="ai-personalize-movielens-metadata-solution",
    recipeArn=recipe_arn,
    datasetGroupArn=movielens_dataset_group_arn,
)
solution_arn = create_solution_response['solutionArn']
print(json.dumps(create_solution_response, indent=2))

In [None]:
# Create a version of the solution and remember its ARN.
create_solution_version_response = personalize.create_solution_version(solutionArn=solution_arn)
solution_version_arn = create_solution_version_response['solutionVersionArn']
print(json.dumps(create_solution_version_response, indent=2))

In [None]:
# Poll the solution version once a minute, for at most three hours, until it
# becomes ACTIVE or its creation fails.
status = None
max_time = time.time() + 3 * 60 * 60
while time.time() < max_time:
    describe_solution_version_response = personalize.describe_solution_version(
        solutionVersionArn=solution_version_arn
    )
    status = describe_solution_version_response["solutionVersion"]["status"]
    print("SolutionVersion: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

## get metrics for the solution

In [None]:
# Fetch and pretty-print the metrics Personalize reports for the solution version.
get_solution_metrics_response = personalize.get_solution_metrics(solutionVersionArn=solution_version_arn)
print(json.dumps(get_solution_metrics_response, indent=2))


# Create a campaign from the solution

In [None]:
# Deploy the solution version as a campaign with the minimum provisioned
# throughput of 1 TPS.
create_campaign_response = personalize.create_campaign(
    name="ai-personalize-movielens-metadata-campaign",
    minProvisionedTPS=1,
    solutionVersionArn=solution_version_arn,
)
campaign_arn = create_campaign_response['campaignArn']
print(json.dumps(create_campaign_response, indent=2))

In [None]:
# Poll the campaign once a minute, for at most three hours, until it becomes
# ACTIVE or its creation fails.
status = None
max_time = time.time() + 3 * 60 * 60
while time.time() < max_time:
    describe_campaign_response = personalize.describe_campaign(campaignArn=campaign_arn)
    status = describe_campaign_response["campaign"]["status"]
    print("Campaign: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

# Get the held out data, to compute metrics externally from the system as well

In [None]:
# Rebuild the held-out evaluation set: the interactions that fall in the final
# `test_time_ratio` share of the overall time range, i.e. exactly the rows
# excluded from training.
holdout_start = dfo.timestamp.max() * (1-test_time_ratio) + dfo.timestamp.min() * test_time_ratio
# Filter the full ratings table directly and copy only the small result. This
# avoids duplicating all ratings first, and the explicit copy keeps the column
# assignment below from raising pandas' SettingWithCopyWarning.
df = dfo[dfo.timestamp >= holdout_start].copy()
df.columns = ['USER_ID','ITEM_ID','EVENT_VALUE','TIMESTAMP']
df['EVENT_TYPE']='RATING'
# Users that have at least one held-out interaction.
test_users = df['USER_ID'].unique()
df.head()

In [None]:
# Install tqdm, which provides the progress bar used by the evaluation loop.
!pip install tqdm

If you get an error while executing the statement below, make sure the ranking_metrics_utils.py file is on the Python path:

In [None]:
import os
import sys

# Python does not expand shell variables such as $HOME inside string literals,
# so resolve the home directory explicitly before extending the import path.
# The directory is expected to contain ranking_metrics_utils.py.
sys.path.append(os.path.join(os.path.expanduser("~"), "PersonalizePOC", "advanced"))


In [None]:
from tqdm import tqdm_notebook
import numpy as np
from ranking_metrics_utils import mean_reciprocal_rank, ndcg_at_k, precision_at_k

In [None]:
# Group the held-out items per user once up front, instead of re-filtering the
# whole held-out frame for every user inside the loop.
true_items_by_user = df.groupby('USER_ID')['ITEM_ID'].apply(set).to_dict()

# For each test user, request recommendations from the campaign and record, in
# ranked order, whether each recommended item appears in that user's held-out
# interactions (1) or not (0).
relevance = []
for user_id in tqdm_notebook(test_users):
    true_items = true_items_by_user[user_id]
    rec_response = personalize_runtime.get_recommendations(
        campaignArn = campaign_arn,
        userId = str(user_id)
    )
    # Item ids come back as strings; convert them to match the integer movie ids.
    rec_items = [int(x['itemId']) for x in rec_response['itemList']]
    relevance.append([int(x in true_items) for x in rec_items])

In [None]:
# Report each ranking metric averaged over all test users: MRR first, then
# precision and NDCG at cutoffs 5, 10 and 25.
print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))
for metric_label, metric_fn in (
    ('precision', precision_at_k),
    ('normalized_discounted_cumulative_gain', ndcg_at_k),
):
    for cutoff in (5, 10, 25):
        print(
            '{}_at_{}'.format(metric_label, cutoff),
            np.mean([metric_fn(r, cutoff) for r in relevance]),
        )

# Appendix  - A Data Preparation Utility (Diagnosis of your Dataset)
This utility was published in the AWS samples on GitHub. You can use it to explore the temporal characteristics of your dataset.

In [None]:
import tempfile, subprocess, urllib.request, zipfile
import pandas as pd, numpy as np
import datetime
%matplotlib inline
from diagnose_personalize_data_utils import diagnose

### load data and some formatting


In [None]:
# Download MovieLens 100K into a throwaway directory and load its three
# tables. The directory is removed when the outer `with` exits, so the tables
# must be read into memory here.
with tempfile.TemporaryDirectory() as tmpdir:
    archive_path = tmpdir + '/ml-100k.zip'
    urllib.request.urlretrieve(
        'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
        archive_path)
    # Use a context manager so the archive handle is closed before the
    # temporary directory is cleaned up.
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(tmpdir)
    # List the extracted files for reference.
    print(subprocess.check_output(['ls', tmpdir+'/ml-100k']).decode('utf-8'))

    # Tab-separated ratings without a header row.
    interactions = pd.read_csv(
        tmpdir + '/ml-100k/u.data',
        sep='\t',
        names=['USER_ID','ITEM_ID','RATING', 'TIMESTAMP'])

    # Pipe-separated user attributes without a header row.
    users = pd.read_csv(
        tmpdir + '/ml-100k/u.user',
        sep='|',
        names=['USER_ID','AGE','GENDER','OCCUPATION','ZIPCODE'],
    )

    # Pipe-separated item attributes: five descriptive columns followed by
    # 19 genre columns named GENRE.0 .. GENRE.18.
    items = pd.read_csv(
        tmpdir + '/ml-100k/u.item',
        sep='|', encoding='latin1',
        names=['ITEM_ID', '_TITLE', 'CREATION_TIMESTAMP', '_', '_IMDb_URL'] + ['GENRE.%s'%i for i in range(19)],
    )

In [None]:
# CREATION_TIMESTAMP may become a reserved keyword and its behavior may change without further notice.
items.loc[items['CREATION_TIMESTAMP'].notnull(), 'CREATION_TIMESTAMP'] = items['CREATION_TIMESTAMP'].dropna().apply(
    lambda x:datetime.datetime.strptime(str(x), '%d-%b-%Y').timestamp())
items.fillna({'CREATION_TIMESTAMP': items['CREATION_TIMESTAMP'].min()}, inplace=True)

### show data template

In [None]:
# Preview the first rows of the interactions table.
interactions.head()

In [None]:
# Preview the first rows of the users table.
users.head()

In [None]:
# Preview the first rows of the items table.
items.head()

### run diagnostics

In [None]:
# Run the imported diagnose utility on the three tables loaded above.
diagnose(interactions, users, items)

## Clean up: Delete Your Campaigns, Solutions and Datasets

The code below deletes the campaigns, solutions and the datasets for this module. 

In [None]:
# Tear resources down in dependency order, waiting after each step until the
# service no longer lists the deleted resource.

# 1. Campaign: wait until the solution has no campaigns left.
personalize.delete_campaign(campaignArn=campaign_arn)
while personalize.list_campaigns(solutionArn=solution_arn)['campaigns']:
    time.sleep(5)

# 2. Solution: wait until the dataset group has no solutions left.
personalize.delete_solution(solutionArn=solution_arn)
while personalize.list_solutions(datasetGroupArn=movielens_dataset_group_arn)['solutions']:
    time.sleep(5)

# 3. Datasets: delete every dataset in the group, then wait until none remain.
for dataset in personalize.list_datasets(datasetGroupArn=movielens_dataset_group_arn)['datasets']:
    personalize.delete_dataset(datasetArn=dataset['datasetArn'])
while personalize.list_datasets(datasetGroupArn=movielens_dataset_group_arn)['datasets']:
    time.sleep(5)

# 4. Finally remove the now-empty dataset group.
personalize.delete_dataset_group(datasetGroupArn=movielens_dataset_group_arn)

Congratulations. You have gone through an advanced Example using HRNN-Metadata Recipe with MovieLens Data.