# Bulding an Expedia recommender engine with Amazon Personalize

### Imports

In [1]:
import pandas as pd
from functools import reduce
import random
import boto3
import time
import json

bucket = 'pochetti-personalize'

### Helper functions

In [2]:
def extract_feature(d, idx, col):
    a = d.groupby([idx, col])[idx].nunique().to_frame('col')
    a = a.reset_index().drop('col', axis=1).pivot(index=idx, columns=col,values=col)
    a = a.astype('str')
    a['new'] = a.apply('|'.join, axis=1)
    a[col] = a.new.apply(lambda x: x.replace('|nan', '').replace('nan|', '').replace('.0', ''))
    return a.loc[:,[col]].reset_index()

def aggregate_features(d, idx, l, new):
    u = []
    for el in zip([idx]*len(l), l):
        print(*el)
        u.append(extract_feature(d, *el))
    
    u = reduce(lambda x, y: pd.merge(x, y, on = idx), u)
    u.rename(columns={idx: new}, inplace=True)
    u.columns = [c.upper() for c in u.columns]
    return u
    
def upload_to_s3(b, f): boto3.Session().resource('s3').Bucket(b).Object(f).upload_file(f)

### Loading data sample and select only transactions ending up in a booking

In [3]:
p = 0.03  # 3% of the lines
# keep the header, then take only 3% of lines
# if random from [0,1] interval is greater than 0.03 the row will be skipped
df = pd.read_csv(
         '/home/ec2-user/SageMaker/data/train.csv',
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p
)

In [4]:
df.shape

(1131133, 24)

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
date_time,2014-08-09 18:08:18,2014-08-16 19:42:57,2014-09-15 18:28:49,2014-10-29 14:12:29,2014-07-17 23:59:01
site_name,2,2,2,2,24
posa_continent,3,3,3,3,2
user_location_country,66,66,66,66,3
user_location_region,442,462,311,174,64
user_location_city,35390,41898,45008,40365,12576
orig_destination_distance,913.626,2105.18,,8455.62,
user_id,93,1482,1561,1713,2451
is_mobile,0,0,0,0,0
is_package,0,0,0,0,0


In [6]:
df.user_id.unique().shape

(522759,)

In [7]:
df.is_booking.sum()

89926

In [8]:
df.is_booking.sum()/len(df)

0.07950081909023961

In [9]:
df = df.loc[df.is_booking==1,:]

In [10]:
df.shape

(89926, 24)

### Getting user features

In [11]:
user_feats = ['site_name',
             'posa_continent',
             'user_location_country',
             'user_location_region']

In [12]:
users = aggregate_features(df, 'user_id', user_feats, 'USER_ID')

user_id site_name
user_id posa_continent
user_id user_location_country
user_id user_location_region


In [13]:
users.shape

(78504, 5)

In [14]:
users.head()

Unnamed: 0,USER_ID,SITE_NAME,POSA_CONTINENT,USER_LOCATION_COUNTRY,USER_LOCATION_REGION
0,31,2,3,66,189
1,52,2,3,66,311
2,127,34,3,205,155
3,153,2,3,23,78
4,206,2,3,66,220


In [15]:
users.to_csv('users.csv', index=False)

### Getting items (hotels) features

In [16]:
item_feats = ['srch_destination_type_id',
             'hotel_continent',
             'hotel_country',
             'hotel_market']

In [18]:
items = aggregate_features(df, 'hotel_cluster', item_feats, 'ITEM_ID')

hotel_cluster srch_destination_type_id
hotel_cluster hotel_continent
hotel_cluster hotel_country
hotel_cluster hotel_market


In [19]:
items.shape

(100, 5)

In [20]:
items.head()

Unnamed: 0,ITEM_ID,SRCH_DESTINATION_TYPE_ID,HOTEL_CONTINENT,HOTEL_COUNTRY,HOTEL_MARKET
0,0,1|3|4|5|6|8,0|2|3|4,8|32|50|51|52|87|88|96|151|174|198,58|80|110|197|201|202|212|213|214|355|357|367|...
1,1,1|3|4|5|6|8,2,50,623|628|633
2,2,1|3|4|5|6|8,0|2|3|4|5|6,3|5|7|8|11|12|13|15|17|21|25|31|34|46|48|50|58...,2|4|5|6|8|9|10|12|13|14|15|16|19|20|21|22|23|2...
3,3,1|3|4|5|6,0|2|3|4|5|6,5|7|8|12|13|15|17|21|22|26|31|34|46|48|50|63|6...,0|2|4|10|12|13|19|21|22|27|28|29|30|32|35|36|4...
4,4,1|3|4|5|6|8,0|2|3|4|5|6,8|13|46|50|51|63|70|77|102|105|126|168|171|198...,19|27|29|35|61|121|129|191|192|212|213|262|350...


In [21]:
items.to_csv('items.csv', index=False)

### Getting user-items interactions

In [22]:
inter = df.loc[:,['user_id', 'hotel_cluster', 'date_time']]
inter.rename(columns={'user_id': 'USER_ID', 'hotel_cluster': 'ITEM_ID', 'date_time': 'TIMESTAMP'}, inplace=True)
inter.shape

(89926, 3)

**Turning TIMESTAMP column into Unix time to be properly handled by Personalize**

In [23]:
inter.TIMESTAMP = pd.to_datetime(inter.TIMESTAMP)
inter.TIMESTAMP = inter.TIMESTAMP.apply(lambda x: time.mktime(x.timetuple()))

In [24]:
inter.head()

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP
21,4539,85,1394208000.0
22,4539,72,1394797000.0
29,6304,83,1413106000.0
61,9681,60,1417722000.0
62,9701,82,1409824000.0


In [25]:
inter.to_csv('inter.csv', index=False)

## AWS Personalize

### Moving CSV files to S3

In [10]:
upload_to_s3(bucket, 'inter.csv')
upload_to_s3(bucket, 'users.csv')
upload_to_s3(bucket, 'items.csv')

### Creating schemas for Personalize to properly read data

In [3]:
personalize = boto3.client('personalize')

**INTERACTIONS schema**

In [15]:
schema_inter = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": "USER_ID", "type": "string"},
        {"name": "ITEM_ID", "type": "string"},
        {"name": "TIMESTAMP", "type": "long"}
    ],
    "version": "1.0"
}

create_schema_inter = personalize.create_schema(name = "interact-schema", schema = json.dumps(schema_inter))
schema_arn_inter = create_schema_inter['schemaArn']
f'schema_arn_inter: {schema_arn_inter}'

'schema_arn_inter: arn:aws:personalize:eu-west-1:257446244580:schema/interact-schema'

**USERS schema**

In [17]:
schema_users = {
    "type": "record",
    "name": "Users",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": "USER_ID", "type": "string"},
        {"name": "SITE_NAME", "type": "string", "categorical": True},
        {"name": "POSA_CONTINENT", "type": "string", "categorical": True},
        {"name": "USER_LOCATION_COUNTRY", "type": "string", "categorical":True},
        {"name": "USER_LOCATION_REGION", "type": "string", "categorical": True}
    ],
    "version": "1.0"
}

create_schema_users = personalize.create_schema(name = "user-schema", schema = json.dumps(schema_users))
schema_arn_users = create_schema_users['schemaArn']
f'schema_arn_users: {schema_arn_users}'

'schema_arn_users: arn:aws:personalize:eu-west-1:257446244580:schema/user-schema'

**ITEMS schema**

In [18]:
schema_items = {
    "type": "record",
    "name": "Items",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": "ITEM_ID", "type": "string"},
        {"name": "SRCH_DESTINATION_TYPE_ID", "type": "string", "categorical": True},
        {"name": "HOTEL_CONTINENT", "type": "string", "categorical": True},
        {"name": "HOTEL_COUNTRY", "type": "string", "categorical":True},
        {"name": "HOTEL_MARKET", "type": "string", "categorical": True}
    ],
    "version": "1.0"
}

create_schema_items = personalize.create_schema(name = "item-schema", schema = json.dumps(schema_items))
schema_arn_items = create_schema_items['schemaArn']
f'schema_arn_items: {schema_arn_items}'

'schema_arn_items: arn:aws:personalize:eu-west-1:257446244580:schema/item-schema'

### Creating a dataset group

In [19]:
create_dataset_group_response = personalize.create_dataset_group(name = "expedia")
dataset_group_arn = create_dataset_group_response['datasetGroupArn']
f'dataset_group_arn: {dataset_group_arn}'

'dataset_group_arn: arn:aws:personalize:eu-west-1:257446244580:dataset-group/expedia'

### Creating datasets within dataset group

**INTERACTIONS dataset**

In [21]:
inter_ds = personalize.create_dataset(name = 'interactions-ds',
                                      schemaArn = schema_arn_inter,
                                      datasetGroupArn = dataset_group_arn,
                                      datasetType = 'Interactions')

f"inter_ds_arn: {inter_ds['datasetArn']}"

'inter_ds_arn: arn:aws:personalize:eu-west-1:257446244580:dataset/expedia/INTERACTIONS'

**USERS dataset**

In [23]:
users_ds = personalize.create_dataset(name = 'users-ds',
                                      schemaArn = schema_arn_users,
                                      datasetGroupArn = dataset_group_arn,
                                      datasetType = 'Users')

f"inter_ds_arn: {users_ds['datasetArn']}"

'inter_ds_arn: arn:aws:personalize:eu-west-1:257446244580:dataset/expedia/USERS'

**ITEMS dataset**

In [24]:
items_ds = personalize.create_dataset(name = 'items-ds',
                                      schemaArn = schema_arn_items,
                                      datasetGroupArn = dataset_group_arn,
                                      datasetType = 'Items')

f"inter_ds_arn: {items_ds['datasetArn']}"

'inter_ds_arn: arn:aws:personalize:eu-west-1:257446244580:dataset/expedia/ITEMS'

### Creating Import Jobs to load data from S3 into Personalize datasets

In [11]:
s3 = boto3.client("s3")

policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [
        {
            "Sid": "PersonalizeS3BucketAccessPolicy",
            "Effect": "Allow",
            "Principal": {
                "Service": "personalize.amazonaws.com"
            },
            "Action": [
                "s3:GetObject",
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::{}".format(bucket),
                "arn:aws:s3:::{}/*".format(bucket)
            ]
        }
    ]
}

s3.put_bucket_policy(Bucket=bucket, Policy=json.dumps(policy))

{'ResponseMetadata': {'RequestId': '129466020D1FB060',
  'HostId': 'nEN+qc0Caimz80HaFZlkSr4mj3E5pxcJZvlKu8SNidMfYQwav3YQXpLhnwx69EQDdvVA2vS8/Ps=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'nEN+qc0Caimz80HaFZlkSr4mj3E5pxcJZvlKu8SNidMfYQwav3YQXpLhnwx69EQDdvVA2vS8/Ps=',
   'x-amz-request-id': '129466020D1FB060',
   'date': 'Sat, 07 Dec 2019 00:04:35 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

**INTERACTIONS import job**

In [25]:
inter_ij = personalize.create_dataset_import_job(
    jobName = 'interactions-ij',
    datasetArn = inter_ds['datasetArn'],
    dataSource = {'dataLocation':f's3://{bucket}/inter.csv'},
    roleArn = 'arn:aws:iam::257446244580:role/expedia')

inter_ij_arn = inter_ij['datasetImportJobArn']
print ('Interactons dataset Import Job arn: ' + inter_ij_arn)

description = personalize.describe_dataset_import_job(datasetImportJobArn = inter_ij_arn)['datasetImportJob']

print('Name: ' + description['jobName'])
print('ARN: ' + description['datasetImportJobArn'])
print('Status: ' + description['status'])

Interactons dataset Import Job arn: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/interactions-ij
Name: interactions-ij
ARN: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/interactions-ij
Status: CREATE PENDING


**USERS import job**

In [26]:
users_ij = personalize.create_dataset_import_job(
    jobName = 'users-ij',
    datasetArn = users_ds['datasetArn'],
    dataSource = {'dataLocation':f's3://{bucket}/users.csv'},
    roleArn = 'arn:aws:iam::257446244580:role/expedia')

users_ij_arn = users_ij['datasetImportJobArn']
print ('Users dataset Import Job arn: ' + users_ij_arn)

description = personalize.describe_dataset_import_job(datasetImportJobArn = users_ij_arn)['datasetImportJob']

print('Name: ' + description['jobName'])
print('ARN: ' + description['datasetImportJobArn'])
print('Status: ' + description['status'])

Users dataset Import Job arn: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/users-ij
Name: users-ij
ARN: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/users-ij
Status: CREATE PENDING


**ITEMS import job**

In [29]:
schema_items_no_market = {
    "type": "record",
    "name": "Items",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": "ITEM_ID", "type": "string"},
        {"name": "SRCH_DESTINATION_TYPE_ID", "type": "string", "categorical": True},
        {"name": "HOTEL_CONTINENT", "type": "string", "categorical": True},
        {"name": "HOTEL_COUNTRY", "type": "string", "categorical":True}
    ],
    "version": "1.0"
}

create_schema_items_no_market = personalize.create_schema(name = "item-schema-no-market", schema = json.dumps(schema_items))
schema_arn_items_no_market = create_schema_items_no_market['schemaArn']
f'schema_arn_items_no_market: {schema_arn_items_no_market}'

'schema_arn_items_no_market: arn:aws:personalize:eu-west-1:257446244580:schema/item-schema-no-market'

In [27]:
items_ij = personalize.create_dataset_import_job(
    jobName = 'items-ij',
    datasetArn = items_ds['datasetArn'],
    dataSource = {'dataLocation':f's3://{bucket}/items.csv'},
    roleArn = 'arn:aws:iam::257446244580:role/expedia')

items_ij_arn = items_ij['datasetImportJobArn']
print ('Items dataset Import Job arn: ' + items_ij_arn)

description = personalize.describe_dataset_import_job(datasetImportJobArn = items_ij_arn)['datasetImportJob']

print('Name: ' + description['jobName'])
print('ARN: ' + description['datasetImportJobArn'])
print('Status: ' + description['status'])

Items dataset Import Job arn: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/items-ij
Name: items-ij
ARN: arn:aws:personalize:eu-west-1:257446244580:dataset-import-job/items-ij
Status: CREATE PENDING
