# Bulding an Expedia recommender engine with Amazon Personalize

### Imports

In [1]:
import pandas as pd
from functools import reduce
import random
import boto3

### Helper functions

In [2]:
def extract_feature(d, idx, col):
    a = d.groupby([idx, col])[idx].nunique().to_frame('col')
    a = a.reset_index().drop('col', axis=1).pivot(index=idx, columns=col,values=col)
    a = a.astype('str')
    a['new'] = a.apply('|'.join, axis=1)
    a[col] = a.new.apply(lambda x: x.replace('|nan', '').replace('nan|', '').replace('.0', ''))
    return a.loc[:,[col]].reset_index()

def aggregate_features(d, idx, l, new):
    u = []
    for el in zip([idx]*len(l), l):
        print(*el)
        u.append(extract_feature(d, *el))
    
    u = reduce(lambda x, y: pd.merge(x, y, on = idx), u)
    u.rename(columns={idx: new}, inplace=True)
    u.columns = [c.upper() for c in u.columns]
    return u
    
def upload_to_s3(b, f): boto3.Session().resource('s3').Bucket(b).Object(f).upload_file(f)

### Loading data sample and select only transactions ending up in a booking

In [3]:
p = 0.03  # 3% of the lines
# keep the header, then take only 3% of lines
# if random from [0,1] interval is greater than 0.03 the row will be skipped
df = pd.read_csv(
         '/home/ec2-user/SageMaker/data/train.csv',
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p
)

In [4]:
df.shape

(1129142, 24)

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
date_time,2014-08-09 18:05:16,2014-11-23 16:42:42,2014-10-10 17:01:59,2013-01-10 18:30:27,2013-12-02 02:08:28
site_name,2,30,2,37,24
posa_continent,3,4,3,1,2
user_location_country,66,195,66,69,3
user_location_region,442,991,462,923,64
user_location_city,35390,47725,41898,23649,12576
orig_destination_distance,913.193,,2671.96,,
user_id,93,1048,1482,2140,2451
is_mobile,0,0,0,1,0
is_package,0,0,1,0,0


In [6]:
df.user_id.unique().shape

(522553,)

In [7]:
df.is_booking.sum()

89913

In [8]:
df.is_booking.sum()/len(df)

0.0796294885851381

In [9]:
df = df.loc[df.is_booking==1,:]

In [10]:
df.shape

(89913, 24)

### Getting user features

In [11]:
user_feats = ['site_name',
             'posa_continent',
             'user_location_country',
             'user_location_region']

In [12]:
users = aggregate_features(df, 'user_id', user_feats, 'USER_ID')

user_id site_name
user_id posa_continent
user_id user_location_country
user_id user_location_region


In [13]:
users.shape

(78664, 5)

In [14]:
users.head()

Unnamed: 0,USER_ID,SITE_NAME,POSA_CONTINENT,USER_LOCATION_COUNTRY,USER_LOCATION_REGION
0,40,2,3,66,348
1,41,2,3,66,174
2,55,2,3,66,348
3,57,2,3,57,342
4,67,2,3,66,246


In [15]:
users.to_csv('users.csv', index=False)

### Getting items (hotels) features

In [16]:
item_feats = ['srch_destination_type_id',
             'hotel_continent',
             'hotel_country',
             'hotel_market']

In [17]:
items = aggregate_features(df, 'hotel_cluster', user_feats, 'ITEM_ID')

hotel_cluster site_name
hotel_cluster posa_continent
hotel_cluster user_location_country
hotel_cluster user_location_region


In [18]:
items.shape

(100, 5)

In [19]:
items.head()

Unnamed: 0,ITEM_ID,SITE_NAME,POSA_CONTINENT,USER_LOCATION_COUNTRY,USER_LOCATION_REGION
0,0,2|8|10|11|13|18|19|23|24|25|26|30|32|33|34|35|37,0|1|2|3|4,3|19|23|26|30|39|46|48|49|57|62|63|66|69|70|71...,0|11|13|19|23|25|26|38|39|45|47|48|49|50|51|60...
1,1,2|8|10|11|13|17|18|19|21|23|24|25|26|28|29|30|...,0|1|2|3|4,1|3|23|24|44|46|48|52|62|66|68|69|71|77|80|82|...,0|10|11|17|20|22|23|25|26|31|37|38|39|40|45|48...
2,2,2|7|8|9|10|11|13|14|15|16|17|18|20|21|22|23|24...,0|1|2|3|4,0|1|3|5|11|12|21|22|23|27|28|32|45|46|47|48|50...,0|1|2|3|4|5|7|9|10|11|12|13|14|17|20|21|22|23|...
3,3,2|6|8|9|10|11|13|14|15|16|17|18|20|22|23|24|25...,0|1|2|3|4,0|1|3|12|19|23|24|28|30|32|46|48|50|55|62|63|6...,0|12|13|14|18|20|22|23|25|31|32|38|40|45|47|48...
4,4,2|8|11|13|15|17|18|19|21|23|24|25|26|29|30|31|...,0|1|2|3|4,0|1|3|19|23|24|25|28|29|30|39|44|46|48|50|52|5...,0|11|12|14|16|17|20|22|32|36|38|39|40|41|44|45...


In [20]:
items.to_csv('items.csv', index=False)

### Getting user-items interactions

In [25]:
inter = df.loc[:,['user_id', 'hotel_cluster', 'date_time']]
inter.rename(columns={'user_id': 'USER_ID', 'hotel_cluster': 'ITEM_ID', 'date_time': 'TIMESTAMP'}, inplace=True)
inter.shape

(89913, 3)

In [26]:
inter.head()

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP
9,3925,42,2013-11-09 07:45:52
23,4539,40,2014-09-17 18:02:52
34,6929,33,2014-02-20 09:37:23
62,8798,73,2014-02-01 10:06:49
73,10106,97,2014-09-17 06:58:44


In [27]:
inter.to_csv('inter.csv', index=False)

## AWS Personalize

### Moving CSV files to S3

In [28]:
bucket = 'francesco-ml-tests'
upload_to_s3(bucket, 'inter.csv')
upload_to_s3(bucket, 'users.csv')
upload_to_s3(bucket, 'items.csv')

### Creating schemas for Personalize to properly read the data

In [29]:
personalize = boto3.client('personalize')

In [30]:
# schema = {
#     "type": "record",
#     "name": "Interactions",
#     "namespace": "com.amazonaws.personalize.schema",
#     "fields": [
#         {
#             "name": "USER_ID",
#             "type": "string"
#         },
#         {
#             "name": "ITEM_ID",
#             "type": "string"
#         },
#         {
#             "name": "TIMESTAMP",
#             "type": "long"
#         }
#     ],
#     "version": "1.0"
# }

# create_schema_response = personalize.create_schema(
#     name = "personalize-demo-schema",
#     schema = json.dumps(schema)
# )

# schema_arn = create_schema_response['schemaArn']
# print(json.dumps(create_schema_response, indent=2))