# Bulding an Expedia recommender engine with Amazon Personalize

### Imports

In [26]:
import pandas as pd
from functools import reduce
import random
import boto3
import time

### Helper functions

In [2]:
def extract_feature(d, idx, col):
    a = d.groupby([idx, col])[idx].nunique().to_frame('col')
    a = a.reset_index().drop('col', axis=1).pivot(index=idx, columns=col,values=col)
    a = a.astype('str')
    a['new'] = a.apply('|'.join, axis=1)
    a[col] = a.new.apply(lambda x: x.replace('|nan', '').replace('nan|', '').replace('.0', ''))
    return a.loc[:,[col]].reset_index()

def aggregate_features(d, idx, l, new):
    u = []
    for el in zip([idx]*len(l), l):
        print(*el)
        u.append(extract_feature(d, *el))
    
    u = reduce(lambda x, y: pd.merge(x, y, on = idx), u)
    u.rename(columns={idx: new}, inplace=True)
    u.columns = [c.upper() for c in u.columns]
    return u
    
def upload_to_s3(b, f): boto3.Session().resource('s3').Bucket(b).Object(f).upload_file(f)

### Loading data sample and select only transactions ending up in a booking

In [3]:
p = 0.03  # 3% of the lines
# keep the header, then take only 3% of lines
# if random from [0,1] interval is greater than 0.03 the row will be skipped
df = pd.read_csv(
         '/home/ec2-user/SageMaker/data/train.csv',
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p
)

In [4]:
df.shape

(1129976, 24)

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
date_time,2014-11-22 21:15:06,2014-11-23 17:18:41,2014-09-15 15:08:49,2013-01-10 18:26:40,2014-07-18 01:00:38
site_name,30,30,2,37,24
posa_continent,4,4,3,1,2
user_location_country,195,195,66,69,3
user_location_region,991,991,311,923,64
user_location_city,47725,47725,45008,23649,12576
orig_destination_distance,,,,,
user_id,1048,1048,1561,2140,2451
is_mobile,1,0,0,1,0
is_package,0,0,0,0,0


In [6]:
df.user_id.unique().shape

(522877,)

In [7]:
df.is_booking.sum()

89656

In [8]:
df.is_booking.sum()/len(df)

0.07934327808732221

In [9]:
df = df.loc[df.is_booking==1,:]

In [10]:
df.shape

(89656, 24)

### Getting user features

In [11]:
user_feats = ['site_name',
             'posa_continent',
             'user_location_country',
             'user_location_region']

In [12]:
users = aggregate_features(df, 'user_id', user_feats, 'USER_ID')

user_id site_name
user_id posa_continent
user_id user_location_country
user_id user_location_region


In [13]:
users.shape

(78509, 5)

In [14]:
users.head()

Unnamed: 0,USER_ID,SITE_NAME,POSA_CONTINENT,USER_LOCATION_COUNTRY,USER_LOCATION_REGION
0,8,2,3,66,348
1,115,2,3,154,196
2,165,2,3,66,246
3,181,2,3,66,348
4,188,2,3,70,47


In [15]:
users.to_csv('users.csv', index=False)

### Getting items (hotels) features

In [16]:
item_feats = ['srch_destination_type_id',
             'hotel_continent',
             'hotel_country',
             'hotel_market']

In [17]:
items = aggregate_features(df, 'hotel_cluster', user_feats, 'ITEM_ID')

hotel_cluster site_name
hotel_cluster posa_continent
hotel_cluster user_location_country
hotel_cluster user_location_region


In [18]:
items.shape

(100, 5)

In [19]:
items.head()

Unnamed: 0,ITEM_ID,SITE_NAME,POSA_CONTINENT,USER_LOCATION_COUNTRY,USER_LOCATION_REGION
0,0,2|8|9|10|11|17|18|19|23|24|25|26|30|33|34|37|40,0|1|2|3|4,0|3|19|23|32|46|52|63|66|69|70|71|74|77|83|85|...,0|11|21|38|40|41|47|48|49|50|51|62|63|64|65|68...
1,1,2|8|10|11|13|17|18|19|23|24|25|26|28|30|31|32|...,0|1|2|3|4,0|1|3|15|19|23|28|29|30|39|46|48|50|55|57|62|6...,0|11|12|13|20|23|25|26|38|41|43|45|47|48|49|50...
2,2,2|7|8|9|10|11|13|14|15|17|18|20|22|23|24|25|26...,0|1|2|3|4,0|1|3|4|5|12|19|21|22|23|24|26|28|29|30|32|39|...,0|4|5|8|9|11|12|13|17|20|21|22|23|24|29|31|32|...
3,3,2|8|10|11|13|14|15|17|22|23|24|25|27|28|30|32|...,0|1|2|3|4,0|1|3|5|11|12|19|22|23|24|28|29|30|46|48|50|52...,0|9|10|11|12|13|14|17|20|23|25|31|32|36|38|39|...
4,4,2|8|10|11|13|17|18|22|23|24|25|26|27|28|30|32|...,0|1|2|3|4,0|1|3|5|12|19|23|30|32|39|46|47|48|49|51|62|66...,0|9|11|13|18|20|21|22|23|31|37|38|39|40|41|45|...


In [20]:
items.to_csv('items.csv', index=False)

### Getting user-items interactions

In [21]:
inter = df.loc[:,['user_id', 'hotel_cluster', 'date_time']]
inter.rename(columns={'user_id': 'USER_ID', 'hotel_cluster': 'ITEM_ID', 'date_time': 'TIMESTAMP'}, inplace=True)
inter.shape

(89656, 3)

In [33]:
inter.TIMESTAMP = pd.to_datetime(inter.TIMESTAMP)
inter.TIMESTAMP = inter.TIMESTAMP.apply(lambda x: time.mktime(x.timetuple()))

In [34]:
inter.head()

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP
13,3925,83,1411228000.0
32,6549,78,1408355000.0
68,9681,60,1417722000.0
69,9681,61,1417965000.0
81,14099,91,1394569000.0


In [35]:
inter.to_csv('inter.csv', index=False)

## AWS Personalize

### Moving CSV files to S3

In [41]:
bucket = 'francesco-ml-tests'
upload_to_s3(bucket, 'inter.csv')
upload_to_s3(bucket, 'users.csv')
upload_to_s3(bucket, 'items.csv')

### Creating schemas for Personalize to properly read the data

In [29]:
personalize = boto3.client('personalize')

In [30]:
# schema = {
#     "type": "record",
#     "name": "Interactions",
#     "namespace": "com.amazonaws.personalize.schema",
#     "fields": [
#         {
#             "name": "USER_ID",
#             "type": "string"
#         },
#         {
#             "name": "ITEM_ID",
#             "type": "string"
#         },
#         {
#             "name": "TIMESTAMP",
#             "type": "long"
#         }
#     ],
#     "version": "1.0"
# }

# create_schema_response = personalize.create_schema(
#     name = "personalize-demo-schema",
#     schema = json.dumps(schema)
# )

# schema_arn = create_schema_response['schemaArn']
# print(json.dumps(create_schema_response, indent=2))