# Building an Expedia recommender engine with Amazon Personalize

### Imports

In [1]:
import pandas as pd
from functools import reduce
import random
import boto3
import time
import json

bucket = 'pochetti-personalize'

### Helper functions

In [2]:
def extract_feature(d, idx, col):
    """Collapse the distinct `col` values seen for each `idx` into one string.

    For every value of `idx`, the distinct non-null values of `col` are
    sorted and joined with '|' (for example ``'1|3|4'``).

    Args:
        d: DataFrame containing the columns `idx` and `col`.
        idx: Name of the column identifying the entity (one output row each).
        col: Name of the column whose values are concatenated.

    Returns:
        DataFrame with the two columns `idx` and `col`, one row per `idx`
        value that has at least one non-null `col` value, ordered by `idx`.
    """
    def _fmt(value):
        # Whole-number floats are written without the trailing '.0' so that
        # IDs stored in a float column still come out as '2', not '2.0'.
        # Formatting value by value (instead of stripping '.0' / 'nan' from
        # the joined string) leaves values such as 1.05 or 'nanny' intact.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    # One row per distinct (idx, col) pair; rows with a missing value in
    # either column are dropped. Sorting fixes the order of the joined values.
    pairs = d[[idx, col]].dropna().drop_duplicates().sort_values([idx, col])
    joined = pairs.groupby(idx)[col].agg(
        lambda values: '|'.join(_fmt(v) for v in values)
    )
    return joined.reset_index()

def aggregate_features(d, idx, l, new):
    """Build one feature table keyed by `idx` from several columns of `d`.

    Each column named in `l` is summarised per `idx` with `extract_feature`;
    the per-column tables are then merged on `idx`, the key column is renamed
    to `new`, and all column names are upper-cased.

    Args:
        d: Source DataFrame.
        idx: Name of the key column in `d`.
        l: Names of the feature columns to aggregate.
        new: Name given to the key column in the result (before upper-casing).

    Returns:
        The merged DataFrame with upper-case column names.
    """
    per_feature = []
    for feature in l:
        # Progress trace: prints the key and the feature being processed.
        print(idx, feature)
        per_feature.append(extract_feature(d, idx, feature))

    # Fold the per-feature tables together pairwise, joining on the key column.
    merged = reduce(
        lambda left, right: pd.merge(left, right, on=idx),
        per_feature,
    )
    merged = merged.rename(columns={idx: new})
    merged.columns = [name.upper() for name in merged.columns]
    return merged
    
def upload_to_s3(b, f):
    """Upload the local file `f` to bucket `b`, using `f` as the object key."""
    s3_resource = boto3.Session().resource('s3')
    target = s3_resource.Bucket(b).Object(f)
    target.upload_file(f)

### Loading a data sample and selecting only transactions that ended in a booking

In [3]:
# Fraction of data rows to keep (about 3% of the file).
p = 0.03


def _skip_row(i):
    """Tell read_csv whether to skip row `i`.

    Row 0 (the header) is never skipped; any other row is skipped when a
    fresh random draw from [0, 1) is greater than `p`.
    """
    return i > 0 and random.random() > p


df = pd.read_csv(
    '/home/ec2-user/SageMaker/data/train.csv',
    header=0,
    skiprows=_skip_row,
)

In [4]:
df.shape

(1130695, 24)

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
date_time,2014-08-11 08:24:33,2014-08-17 15:06:49,2014-12-26 18:24:16,2014-12-29 00:02:47,2014-12-30 06:04:13
site_name,2,24,24,24,24
posa_continent,3,2,2,2,2
user_location_country,66,3,3,3,3
user_location_region,348,64,64,64,64
user_location_city,48862,4777,9448,9448,9448
orig_destination_distance,2234.26,,,,
user_id,12,2451,2451,2451,2451
is_mobile,0,0,0,0,0
is_package,0,0,0,0,0


In [6]:
df.user_id.unique().shape

(522714,)

In [7]:
df.is_booking.sum()

90436

In [8]:
df.is_booking.sum()/len(df)

0.07998266552872349

In [9]:
# Keep only the rows flagged as bookings.
df = df[df.is_booking == 1]

In [10]:
df.shape

(90436, 24)

### Getting user features

In [11]:
user_feats = ['site_name',
             'posa_continent',
             'user_location_country',
             'user_location_region']

In [12]:
users = aggregate_features(df, 'user_id', user_feats, 'USER_ID')

user_id site_name
user_id posa_continent
user_id user_location_country
user_id user_location_region


In [13]:
users.shape

(78844, 5)

In [14]:
users.head()

Unnamed: 0,USER_ID,SITE_NAME,POSA_CONTINENT,USER_LOCATION_COUNTRY,USER_LOCATION_REGION
0,5,2,3,66,174
1,52,2,3,66,392
2,94,2,3,66,294
3,115,2,3,154,196
4,127,34,3,205,155


In [15]:
users.to_csv('users.csv', index=False)

### Getting items (hotels) features

In [16]:
item_feats = ['srch_destination_type_id',
             'hotel_continent',
             'hotel_country']

In [17]:
items = aggregate_features(df, 'hotel_cluster', item_feats, 'ITEM_ID')

hotel_cluster srch_destination_type_id
hotel_cluster hotel_continent
hotel_cluster hotel_country


In [18]:
items.shape

(100, 4)

In [19]:
items.head()

Unnamed: 0,ITEM_ID,SRCH_DESTINATION_TYPE_ID,HOTEL_CONTINENT,HOTEL_COUNTRY
0,0,1|3|4|5|6|8,2|3|4,8|32|50|51|52|88|96|151|198
1,1,1|3|4|5|6|8,2,50
2,2,1|3|4|5|6|8,0|2|3|4|5|6,3|5|7|8|11|12|13|15|17|21|25|31|34|46|48|50|58...
3,3,1|3|4|5|6,0|2|3|4|5|6,5|8|11|13|15|17|22|25|29|31|34|38|45|46|47|48|...
4,4,1|3|4|5|6|7|8,0|2|3|4|5|6,8|13|21|32|46|50|51|63|70|77|102|105|106|126|1...


In [20]:
items.to_csv('items.csv', index=False)

### Getting user-items interactions

In [26]:
# Select the three interaction columns and give them the upper-case names
# used by the interactions schema further down.
inter_columns = {'user_id': 'USER_ID', 'hotel_cluster': 'ITEM_ID', 'date_time': 'TIMESTAMP'}
inter = df[list(inter_columns)].rename(columns=inter_columns)
inter.shape

(90436, 3)

**Turning TIMESTAMP column into Unix time to be properly handled by Personalize**

In [29]:
# Parse the date strings, then convert them to whole seconds since the Unix
# epoch. The subtraction is vectorised and treats the naive timestamps as UTC,
# so the result does not depend on the time zone of the machine running the
# notebook (time.mktime would interpret them in the local zone).
inter.TIMESTAMP = pd.to_datetime(inter.TIMESTAMP)
inter.TIMESTAMP = (inter.TIMESTAMP - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')

In [30]:
inter.head()

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP
16,4539,20,1371038294
19,4539,36,1395763732
22,6304,6,1412796293
23,6304,83,1413106407
70,9681,81,1418241164


In [31]:
inter.to_csv('inter.csv', index=False)

## AWS Personalize

### Moving CSV files to S3

In [32]:
# Push the interactions, users and items files to the bucket, in that order.
for csv_file in ('inter.csv', 'users.csv', 'items.csv'):
    upload_to_s3(bucket, csv_file)

### Creating schemas for Personalize to properly read data

In [33]:
personalize = boto3.client('personalize')

**INTERACTIONS schema**

In [15]:
# Fields of the interactions file: who, what, and when (epoch seconds).
interaction_fields = [
    {"name": "USER_ID", "type": "string"},
    {"name": "ITEM_ID", "type": "string"},
    {"name": "TIMESTAMP", "type": "long"},
]

schema_inter = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": interaction_fields,
    "version": "1.0",
}

# Register the schema (serialised as JSON) and keep the ARN it is assigned.
create_schema_inter = personalize.create_schema(
    name="interact-schema",
    schema=json.dumps(schema_inter),
)
schema_arn_inter = create_schema_inter['schemaArn']
f'schema_arn_inter: {schema_arn_inter}'

'schema_arn_inter: arn:aws:personalize:eu-west-1:257446244580:schema/interact-schema'

**USERS schema**

In [17]:
# Every user attribute besides the ID is declared as a categorical string.
user_categoricals = [
    "SITE_NAME",
    "POSA_CONTINENT",
    "USER_LOCATION_COUNTRY",
    "USER_LOCATION_REGION",
]
user_fields = [{"name": "USER_ID", "type": "string"}] + [
    {"name": field_name, "type": "string", "categorical": True}
    for field_name in user_categoricals
]

schema_users = {
    "type": "record",
    "name": "Users",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": user_fields,
    "version": "1.0",
}

# Register the schema (serialised as JSON) and keep the ARN it is assigned.
create_schema_users = personalize.create_schema(
    name="user-schema",
    schema=json.dumps(schema_users),
)
schema_arn_users = create_schema_users['schemaArn']
f'schema_arn_users: {schema_arn_users}'

'schema_arn_users: arn:aws:personalize:eu-west-1:257446244580:schema/user-schema'

**ITEMS schema**

In [36]:
# Every item attribute besides the ID is declared as a categorical string.
item_categoricals = [
    "SRCH_DESTINATION_TYPE_ID",
    "HOTEL_CONTINENT",
    "HOTEL_COUNTRY",
]
item_fields = [{"name": "ITEM_ID", "type": "string"}] + [
    {"name": field_name, "type": "string", "categorical": True}
    for field_name in item_categoricals
]

schema_items = {
    "type": "record",
    "name": "Items",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": item_fields,
    "version": "1.0",
}

# Register the schema (serialised as JSON) and keep the ARN it is assigned.
create_schema_items = personalize.create_schema(
    name="item-schema-nomarket",
    schema=json.dumps(schema_items),
)
schema_arn_items = create_schema_items['schemaArn']
f'schema_arn_items: {schema_arn_items}'

'schema_arn_items: arn:aws:personalize:eu-west-1:257446244580:schema/item-schema-nomarket'

### Creating a dataset group

In [37]:
# Create the dataset group named "expedia" and keep its ARN; later cells pass
# this ARN when creating the datasets and the solutions.
# NOTE(review): creation is presumably asynchronous -- confirm whether the
# group has to reach ACTIVE before datasets are added to it.
create_dataset_group_response = personalize.create_dataset_group(name = "expedia")
dataset_group_arn = create_dataset_group_response['datasetGroupArn']
# Bare f-string as the last expression so the notebook displays the ARN.
f'dataset_group_arn: {dataset_group_arn}'

'dataset_group_arn: arn:aws:personalize:eu-west-1:257446244580:dataset-group/expedia'

### Creating datasets within dataset group

**INTERACTIONS dataset**

In [39]:
# Add the Interactions dataset to the dataset group. The schema ARN is taken
# from the create_schema response (schema_arn_inter) rather than a hard-coded
# string tied to one AWS account and region.
inter_ds = personalize.create_dataset(name = 'interactions-ds',
                                      schemaArn = schema_arn_inter,
                                      datasetGroupArn = dataset_group_arn,
                                      datasetType = 'Interactions')

# Bare f-string as the last expression so the notebook displays the ARN.
f"inter_ds_arn: {inter_ds['datasetArn']}"

'inter_ds_arn: arn:aws:personalize:eu-west-1:257446244580:dataset/expedia/INTERACTIONS'

**USERS dataset**

In [40]:
# Add the Users dataset to the dataset group. The schema ARN is taken from
# the create_schema response (schema_arn_users) rather than a hard-coded
# string tied to one AWS account and region.
users_ds = personalize.create_dataset(name = 'users-ds',
                                      schemaArn = schema_arn_users,
                                      datasetGroupArn = dataset_group_arn,
                                      datasetType = 'Users')

# Bare f-string as the last expression so the notebook displays the ARN.
f"user_ds_arn: {users_ds['datasetArn']}"

'user_ds_arn: arn:aws:personalize:eu-west-1:257446244580:dataset/expedia/USERS'

**ITEMS dataset**

In [41]:
# Add the Items dataset to the dataset group. The schema ARN is taken from
# the create_schema response (schema_arn_items) rather than a hard-coded
# string tied to one AWS account and region.
items_ds = personalize.create_dataset(name = 'items-ds',
                                      schemaArn = schema_arn_items,
                                      datasetGroupArn = dataset_group_arn,
                                      datasetType = 'Items')

# Bare f-string as the last expression so the notebook displays the ARN.
f"item_ds_arn: {items_ds['datasetArn']}"

'item_ds_arn: arn:aws:personalize:eu-west-1:257446244580:dataset/expedia/ITEMS'

### Creating Import Jobs to load data from S3 into Personalize datasets

In [11]:
s3 = boto3.client("s3")

# Bucket policy allowing the Personalize service principal to list the bucket
# and read the objects stored in it.
bucket_arn = f"arn:aws:s3:::{bucket}"
personalize_statement = {
    "Sid": "PersonalizeS3BucketAccessPolicy",
    "Effect": "Allow",
    "Principal": {"Service": "personalize.amazonaws.com"},
    "Action": ["s3:GetObject", "s3:ListBucket"],
    "Resource": [bucket_arn, f"{bucket_arn}/*"],
}
policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [personalize_statement],
}

# Last expression of the cell: the notebook displays the S3 response.
s3.put_bucket_policy(Bucket=bucket, Policy=json.dumps(policy))

{'ResponseMetadata': {'RequestId': '129466020D1FB060',
  'HostId': 'nEN+qc0Caimz80HaFZlkSr4mj3E5pxcJZvlKu8SNidMfYQwav3YQXpLhnwx69EQDdvVA2vS8/Ps=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'nEN+qc0Caimz80HaFZlkSr4mj3E5pxcJZvlKu8SNidMfYQwav3YQXpLhnwx69EQDdvVA2vS8/Ps=',
   'x-amz-request-id': '129466020D1FB060',
   'date': 'Sat, 07 Dec 2019 00:04:35 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

**INTERACTIONS import job**

In [42]:
# Submit the job that loads inter.csv from the bucket into the Interactions
# dataset.
# NOTE(review): roleArn is presumably the IAM role Personalize assumes to read
# the bucket -- it is hard-coded to one account; confirm before reuse.
inter_ij = personalize.create_dataset_import_job(
    jobName = 'interactions-ij',
    datasetArn = inter_ds['datasetArn'],
    dataSource = {'dataLocation':f's3://{bucket}/inter.csv'},
    roleArn = 'arn:aws:iam::257446244580:role/expedia')

inter_ij_arn = inter_ij['datasetImportJobArn']
print ('Interactions dataset Import Job arn: ' + inter_ij_arn)

# Read the job back once to show its name, ARN and current status.
description = personalize.describe_dataset_import_job(datasetImportJobArn = inter_ij_arn)['datasetImportJob']

print('Name: ' + description['jobName'])
print('ARN: ' + description['datasetImportJobArn'])
print('Status: ' + description['status'])

Interactons dataset Import Job arn: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/interactions-ij
Name: interactions-ij
ARN: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/interactions-ij
Status: CREATE PENDING


**USERS import job**

In [43]:
# Submit the job that loads users.csv from the bucket into the Users dataset.
users_ij = personalize.create_dataset_import_job(
    jobName='users-ij',
    datasetArn=users_ds['datasetArn'],
    dataSource={'dataLocation': f's3://{bucket}/users.csv'},
    roleArn='arn:aws:iam::257446244580:role/expedia',
)

users_ij_arn = users_ij['datasetImportJobArn']
print(f'Users dataset Import Job arn: {users_ij_arn}')

# Read the job back once to show its name, ARN and current status.
users_ij_response = personalize.describe_dataset_import_job(datasetImportJobArn=users_ij_arn)
description = users_ij_response['datasetImportJob']

print(f"Name: {description['jobName']}")
print(f"ARN: {description['datasetImportJobArn']}")
print(f"Status: {description['status']}")

Users dataset Import Job arn: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/users-ij
Name: users-ij
ARN: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/users-ij
Status: CREATE PENDING


**ITEMS import job**

In [44]:
# Submit the job that loads items.csv from the bucket into the Items dataset.
items_ij = personalize.create_dataset_import_job(
    jobName='items-ij',
    datasetArn=items_ds['datasetArn'],
    dataSource={'dataLocation': f's3://{bucket}/items.csv'},
    roleArn='arn:aws:iam::257446244580:role/expedia',
)

items_ij_arn = items_ij['datasetImportJobArn']
print(f'Items dataset Import Job arn: {items_ij_arn}')

# Read the job back once to show its name, ARN and current status.
items_ij_response = personalize.describe_dataset_import_job(datasetImportJobArn=items_ij_arn)
description = items_ij_response['datasetImportJob']

print(f"Name: {description['jobName']}")
print(f"ARN: {description['datasetImportJobArn']}")
print(f"Status: {description['status']}")

Items dataset Import Job arn: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/items-ij
Name: items-ij
ARN: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/items-ij
Status: CREATE PENDING


In [45]:
# The three import jobs are asynchronous: right after submission they report
# "CREATE PENDING". Wait until each one has finished before creating a
# solution on the dataset group, and stop early if an import failed.
for import_job_arn in (inter_ij_arn, users_ij_arn, items_ij_arn):
    while True:
        import_status = personalize.describe_dataset_import_job(
            datasetImportJobArn=import_job_arn
        )['datasetImportJob']['status']
        if import_status == 'ACTIVE':
            break
        if import_status == 'CREATE FAILED':
            raise RuntimeError(f'Dataset import job failed: {import_job_arn}')
        # Poll once a minute.
        time.sleep(60)

# Create a solution on the dataset group with performAutoML enabled instead
# of naming a specific recipe.
auto_recommender = personalize.create_solution(
    name = "expedia-recommender",
    datasetGroupArn = dataset_group_arn,
    performAutoML = True
)

solution_arn = auto_recommender['solutionArn']

In [46]:
solution_arn

'arn:aws:personalize:eu-west-1:257446244580:solution/expedia-recommender'

In [47]:
# Create a version of the solution whose ARN is currently held in
# solution_arn, and keep the ARN of that solution version.
auto_recommender_model = personalize.create_solution_version(solutionArn = solution_arn)
solution_version_arn = auto_recommender_model['solutionVersionArn']
# Last expression of the cell: the notebook displays the ARN.
solution_version_arn

'arn:aws:personalize:eu-west-1:257446244580:solution/expedia-recommender/827f86d6'

In [48]:
# Second solution on the same dataset group, this time with an explicit
# recipe (aws-hrnn-metadata) rather than performAutoML.
meta_recommender = personalize.create_solution(
    name = "expedia-recommender-metadata",
    datasetGroupArn = dataset_group_arn,
    recipeArn = "arn:aws:personalize:::recipe/aws-hrnn-metadata"
)

# NOTE: this rebinds solution_arn, which until here held the ARN of the
# "expedia-recommender" solution.
solution_arn = meta_recommender['solutionArn']
solution_arn

'arn:aws:personalize:eu-west-1:257446244580:solution/expedia-recommender-metadata'

In [49]:
# Create a version of the metadata solution. The solution ARN is read from
# the create_solution response instead of a hard-coded string tied to one AWS
# account and region.
meta_recommender_model = personalize.create_solution_version(solutionArn = meta_recommender['solutionArn'])
solution_version_arn = meta_recommender_model['solutionVersionArn']
# Last expression of the cell: the notebook displays the ARN.
solution_version_arn

'arn:aws:personalize:eu-west-1:257446244580:solution/expedia-recommender-metadata/146fb017'

In [52]:
big = pd.read_csv('/home/ec2-user/SageMaker/data/train.csv', usecols=['date_time', 'user_id', 'hotel_cluster']) 

In [53]:
big.shape

(37670293, 3)

In [55]:
big.head()

Unnamed: 0,date_time,user_id,hotel_cluster
0,2014-08-11 07:46:59,12,1
1,2014-08-11 08:22:12,12,1
2,2014-08-11 08:24:33,12,1
3,2014-08-09 18:05:16,93,80
4,2014-08-09 18:08:18,93,21


In [56]:
# Put the columns in USER_ID / ITEM_ID / TIMESTAMP order and give them the
# upper-case names used by the interactions schema.
big_columns = {'user_id': 'USER_ID', 'hotel_cluster': 'ITEM_ID', 'date_time': 'TIMESTAMP'}
big = big[list(big_columns)].rename(columns=big_columns)
big.shape

(37670293, 3)

In [57]:
# Parse the date strings, then convert them to whole seconds since the Unix
# epoch. The subtraction is vectorised (no Python-level call per row, which
# matters for a frame of this size) and treats the naive timestamps as UTC,
# so the result does not depend on the time zone of the machine running the
# notebook (time.mktime would interpret them in the local zone).
big.TIMESTAMP = pd.to_datetime(big.TIMESTAMP)
big.TIMESTAMP = (big.TIMESTAMP - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')

In [58]:
big.head()

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP
0,12,1,1407743219
1,12,1,1407745332
2,12,1,1407745473
3,93,80,1407607516
4,93,21,1407607698


In [59]:
big.to_csv('inter-big.csv', index=False)

In [60]:
# Upload the full interactions file to the bucket with the upload_to_s3
# helper (the original call used a mistyped, undefined function name).
upload_to_s3(bucket, 'inter-big.csv')