# Bulding an Expedia recommender engine with Amazon Personalize

### Imports

In [1]:
import pandas as pd
from functools import reduce
import random
import boto3

### Helper functions

In [2]:
def extract_feature(d, idx, col):
    a = d.groupby([idx, col])[idx].nunique().to_frame('col')
    a = a.reset_index().drop('col', axis=1).pivot(index=idx, columns=col,values=col)
    a = a.astype('str')
    a['new'] = a.apply('|'.join, axis=1)
    a[col] = a.new.apply(lambda x: x.replace('|nan', '').replace('nan|', '').replace('.0', ''))
    return a.loc[:,[col]].reset_index()

def aggregate_features(d, idx, l, new):
    u = []
    for el in zip([idx]*len(l), l):
        print(*el)
        u.append(extract_feature(d, *el))
    
    u = reduce(lambda x, y: pd.merge(x, y, on = idx), u)
    u.rename(columns={idx: new}, inplace=True)
    u.columns = [c.upper() for c in u.columns]
    return u
    
def upload_to_s3(b, f): boto3.Session().resource('s3').Bucket(b).Object(f).upload_file(f)

### Loading data sample and select only transactions ending up in a booking

In [3]:
p = 0.03  # 3% of the lines
# keep the header, then take only 3% of lines
# if random from [0,1] interval is greater than 0.03 the row will be skipped
df = pd.read_csv(
         '/home/ec2-user/SageMaker/data/train.csv',
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p
)

In [4]:
df.shape

(1129696, 24)

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
date_time,2014-01-08 13:38:03,2014-07-18 01:30:54,2014-12-25 03:25:58,2014-12-29 01:18:50,2014-12-30 01:19:53
site_name,2,24,24,24,24
posa_continent,3,2,2,2,2
user_location_country,66,3,3,3,3
user_location_region,462,64,64,64,64
user_location_city,41898,12576,9448,9448,9448
orig_destination_distance,2454.86,,,,
user_id,1482,2451,2451,2451,2451
is_mobile,0,0,1,0,1
is_package,1,0,0,0,0


In [6]:
df.user_id.unique().shape

(523315,)

In [7]:
df.is_booking.sum()

90223

In [8]:
df.is_booking.sum()/len(df)

0.07986484859643657

In [9]:
df = df.loc[df.is_booking==1,:]

In [10]:
df.shape

(90223, 24)

### Getting user features

In [11]:
user_feats = ['site_name',
             'posa_continent',
             'user_location_country',
             'user_location_region']

In [12]:
users = aggregate_features(df, 'user_id', user_feats, 'USER_ID')

user_id site_name
user_id posa_continent
user_id user_location_country
user_id user_location_region


In [13]:
users.shape

(78886, 5)

In [14]:
users.head()

Unnamed: 0,USER_ID,SITE_NAME,POSA_CONTINENT,USER_LOCATION_COUNTRY,USER_LOCATION_REGION
0,26,2,3,66,315
1,57,2,3,57,342
2,129,2,3,66,220
3,137,2,3,66,351
4,198,2,3,12,790


In [23]:
#users.to_csv('/home/ec2-user/SageMaker/data/users.csv', index=False)

### Getting items (hotels) features

In [15]:
item_feats = ['srch_destination_type_id',
             'hotel_continent',
             'hotel_country',
             'hotel_market']

In [16]:
items = aggregate_features(df, 'hotel_cluster', user_feats, 'ITEM_ID')

hotel_cluster site_name
hotel_cluster posa_continent
hotel_cluster user_location_country
hotel_cluster user_location_region


In [17]:
items.shape

(100, 5)

In [18]:
items.head()

Unnamed: 0,ITEM_ID,SITE_NAME,POSA_CONTINENT,USER_LOCATION_COUNTRY,USER_LOCATION_REGION
0,0,2|8|11|13|17|19|22|24|25|26|28|29|30|34|37,0|1|2|3|4,0|3|5|23|29|30|46|52|57|58|66|68|69|70|71|76|7...,0|3|11|21|25|26|38|44|47|48|49|50|51|57|58|63|...
1,1,2|8|9|10|11|13|17|18|19|23|24|25|26|28|29|30|3...,0|1|2|3|4,0|1|3|5|19|23|28|30|32|39|46|48|50|52|58|62|66...,0|9|11|13|16|17|20|23|26|32|37|38|41|45|47|48|...
2,2,2|7|8|9|11|13|14|15|17|18|22|23|24|25|26|27|28...,0|1|2|3|4,0|1|3|5|12|15|16|23|24|29|32|35|39|44|46|48|50...,0|3|4|9|12|13|16|17|18|20|23|24|26|27|28|29|31...
3,3,2|8|10|11|13|15|17|18|22|23|24|25|26|28|29|30|...,0|1|2|3|4,0|1|3|6|12|19|23|24|28|29|44|46|47|48|50|52|55...,0|11|13|20|22|23|25|32|38|39|41|44|45|47|48|49...
4,4,2|8|10|11|13|17|21|23|24|25|26|27|28|29|30|32|...,0|1|2|3|4,0|1|3|4|6|19|20|23|29|39|46|48|52|57|58|62|63|...,0|12|14|21|23|38|41|45|47|48|49|50|51|52|60|63...


In [24]:
#items.to_csv('/home/ec2-user/SageMaker/data/items.csv', index=False)

### Getting user-items interactions

In [21]:
inter = df.loc[:,['user_id', 'hotel_cluster', 'date_time']]
inter.rename({'user_id': 'USER_ID', 'hotel_cluster': 'ITEM_ID', 'date_time': 'TIMESTAMP'}, inplace=True)
inter.shape

(90223, 3)

In [7]:
inter.head()

NameError: name 'inter' is not defined

In [25]:
#inter.to_csv('/home/ec2-user/SageMaker/data/inter.csv', index=False)

## AWS Personalize

### Moving CSV files to S3

In [6]:
bucket = 'francesco-ml-tests'
upload_to_s3(bucket, 'inter.csv')
upload_to_s3(bucket, 'users.csv')
upload_to_s3(bucket, 'items.csv')

### Creating schemas for Personalize to properly read the data

In [1]:
personalize = boto3.client('personalize')

In [None]:
schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {
            "name": "USER_ID",
            "type": "string"
        },
        {
            "name": "ITEM_ID",
            "type": "string"
        },
        {
            "name": "TIMESTAMP",
            "type": "long"
        }
    ],
    "version": "1.0"
}

create_schema_response = personalize.create_schema(
    name = "personalize-demo-schema",
    schema = json.dumps(schema)
)

schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))