#### Importing the libs

In [21]:
import pandas as pd
import os
import numpy as np
import csv

import utils
from joblib import Parallel, delayed

#### Loading the data

In [22]:
# Load the raw interaction log and the per-item property metadata from the local Data directory.
train_data = pd.read_csv("./Data/train.csv")
metadata_data = pd.read_csv("./Data/item_metadata.csv")

In [23]:
# Preview the first five rows of the interaction log.
train_data.head(5)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [24]:
# Distinct users in order of first appearance; the index is the row of each user's first action.
train_data['user_id'].drop_duplicates()

0           00RL8Z82B2Z1
16          02HGRBA06ODU
81          02SRUT1NQYH1
116         03K8AXBL4BX2
123         03P4VFKK12UO
                ...     
15932956    ZU702MR210D2
15932966    ZV6EJXAD9929
15932967    ZVIDWWE0KWNB
15932971    ZYDH48FW5Q3Q
15932973    ZYNMLE3MV3LK
Name: user_id, Length: 730803, dtype: object

In [25]:
# Example user whose last recorded row is used below as the reference session.
user_id = "0473FZ8UNXRS"

In [26]:
# Preview the raw metadata: one row per item, properties as a single "|"-separated string.
metadata_data.head(5)

Unnamed: 0,item_id,properties
0,5101,Satellite TV|Golf Course|Airport Shuttle|Cosme...
1,5416,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
2,5834,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
3,5910,Satellite TV|Sailing|Cosmetic Mirror|Telephone...
4,6066,Satellite TV|Sailing|Diving|Cosmetic Mirror|Sa...


### Need to transform the metadata dataframe
The `properties` attribute is a string whose elements are separated by "|". I want an item × property indicator matrix: one row per item, one column per property, with 1 where the item has that property and 0 otherwise.

#### 1. Split the properties attribute into a list

In [27]:
# Turn the "|"-separated properties string into a list of property names per item.
# The column is overwritten in place, so it no longer holds the raw string afterwards.
metadata_data["properties"] = metadata_data["properties"].str.split("|")

#### 2. Process the list into columns and binary values

In [28]:
# Build the item x property table:
#   - explode() yields one row per (item, property) pair;
#   - pivot_table(aggfunc="size") counts those rows per pair, filling 0 where an
#     item does not list a property;
#   - reset_index() turns item_id back into a regular column.
# NOTE(review): the values are counts, so they are strictly 0/1 only if no item
# lists the same property twice — confirm against the raw metadata.
metadata_data = metadata_data.explode("properties").pivot_table(
    index="item_id", columns="properties", aggfunc="size", fill_value=0
).reset_index()

In [29]:
# Preview the pivoted metadata: one row per item, one column per property.
metadata_data.head(5)

properties,item_id,1 Star,2 Star,3 Star,4 Star,5 Star,Accessible Hotel,Accessible Parking,Adults Only,Air Conditioning,...,Terrace (Hotel),Theme Hotel,Towels,Very Good Rating,Volleyball,Washing Machine,Water Slide,Wheelchair Accessible,WiFi (Public Areas),WiFi (Rooms)
0,5001,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
1,5002,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2,5003,0,0,0,1,0,1,0,0,1,...,1,0,1,1,0,0,0,1,1,1
3,5004,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,1,1,1
4,5005,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


### Get a list of hotels from the train data

In [30]:
# Last recorded row for the example user, kept as a one-row DataFrame.
is_example_user = train_data['user_id'] == user_id
example_session = train_data.loc[is_example_user].tail(1)
example_session

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
180,0473FZ8UNXRS,bcc452f3350eb,1541062532,3,clickout item,3143258,AU,"Legian, Indonesia",desktop,,1258184|3866722|8929970|2315702|116619|1511641...,51|43|69|49|62|50|55|42|87|46|43|114|194|50|19...


In [31]:
# Split the "|"-separated impressions string into a list of item ids (still strings).
impressions = example_session["impressions"].str.split("|")
impressions

180    [1258184, 3866722, 8929970, 2315702, 116619, 1...
Name: impressions, dtype: object

In [32]:
# Unwrap the single row's list of impressed item ids (strings) and keep an integer copy.
impressionsList = impressions.iloc[0]
intImpressionsList = list(map(int, impressionsList))
len(impressionsList)

25

## Search for users who interacted with any hotel from the impressions list

In [33]:
# Rows whose reference is one of the impressed hotels, reduced to one row per (user, hotel) pair.
in_impressions = train_data['reference'].isin(impressionsList)
clicked_df = train_data.loc[in_impressions].drop_duplicates(subset=['user_id', 'reference'])

# Distinct users behind those rows, in order of first appearance.
users = clicked_df['user_id'].unique().tolist()

# The example user must always be part of the population.
if user_id not in users:
    users.append(user_id)

## Get all hotels clicked by those users

In [34]:
# Full history of the selected users, reduced to one row per (user, reference) pair.
from_selected_users = train_data['user_id'].isin(users)
clicked_df = train_data.loc[from_selected_users].drop_duplicates(subset=['user_id', 'reference'])

clicked_df

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
178,0473FZ8UNXRS,bcc452f3350eb,1541062453,1,interaction item rating,749441,AU,"Seminyak, Indonesia",desktop,,,
179,0473FZ8UNXRS,bcc452f3350eb,1541062460,2,search for item,1258184,AU,"Legian, Indonesia",desktop,,,
180,0473FZ8UNXRS,bcc452f3350eb,1541062532,3,clickout item,3143258,AU,"Legian, Indonesia",desktop,,1258184|3866722|8929970|2315702|116619|1511641...,51|43|69|49|62|50|55|42|87|46|43|114|194|50|19...
32729,F45QW49X6VOX,a434d14e81300,1541243154,1,clickout item,116620,AU,"Legian, Indonesia",tablet,,116620|111929|93911|3866722|1511641|3143258|34...,60|55|101|45|50|96|133|230|107|198|198|327|188...
32730,F45QW49X6VOX,a434d14e81300,1541243545,2,clickout item,114598,AU,"Legian, Indonesia",tablet,,114598|93911|1288161|111929|5192338|99551|3866...,94|101|98|55|132|186|45|53|50|120|73|133|230|9...
...,...,...,...,...,...,...,...,...,...,...,...,...
15820183,P33KQNRB4KJX,3088fae09edc3,1541318072,118,interaction item image,4940052,JP,"Ubud, Indonesia",desktop,,,
15820190,P33KQNRB4KJX,3088fae09edc3,1541318201,125,interaction item image,153964,JP,"Ubud, Indonesia",desktop,,,
15820216,P33KQNRB4KJX,3088fae09edc3,1541318488,151,interaction item image,113891,JP,"Ubud, Indonesia",desktop,,,
15820241,P33KQNRB4KJX,3088fae09edc3,1541318593,176,change of sort order,rating and recommended,JP,"Ubud, Indonesia",desktop,,,


## Prepare the list of all clicked hotels

In [35]:
# Distinct numeric references (item ids) in order of first appearance; non-numeric
# references (e.g. POI names or sort orders) are excluded.
references = clicked_df["reference"]
hotels_clicked = references[references.str.isnumeric()].drop_duplicates().tolist()

hotels_clicked

['749441',
 '1258184',
 '3143258',
 '116620',
 '114598',
 '93911',
 '5203238',
 '1289618',
 '344371',
 '1838723',
 '4573192',
 '6473852',
 '3134160',
 '2660484',
 '1293950',
 '2109344',
 '1217454',
 '1714515',
 '343816',
 '96264',
 '739901',
 '4461496',
 '5007734',
 '130904',
 '5618704',
 '6918776',
 '9130052',
 '97829',
 '97908',
 '1950413',
 '2023137',
 '121077',
 '2024225',
 '2564644',
 '3785158',
 '1450939',
 '8862702',
 '8270752',
 '98113',
 '6236626',
 '5081942',
 '6092202',
 '104163',
 '2399520',
 '3154900',
 '119269',
 '823931',
 '3866722',
 '2599578',
 '1951305',
 '1773331',
 '1016689',
 '97907',
 '5722996',
 '1989485',
 '3167682',
 '1888239',
 '2832458',
 '3152684',
 '2590018',
 '153964',
 '1411216',
 '1723173',
 '153963',
 '3156051',
 '2073706',
 '3252443',
 '4852816',
 '1370149',
 '1123340',
 '1157071',
 '3223738',
 '909075',
 '5142264',
 '8706814',
 '1767221',
 '4120180',
 '111929',
 '2045937',
 '9144056',
 '9701474',
 '6784968',
 '1288161',
 '2179654',
 '344506',
 '132881

## Prepare matrix

In [38]:
# Write the user x hotel relation file: one CSV row per user (same order as
# `users`), listing the positions in `hotels_clicked` of every hotel that user
# interacted with.
data_uid = f'clicked_all_history_user_{user_id}'

path = f'../ProcessedData/metadata/{data_uid}'

os.makedirs(path, exist_ok=True)

# Map each hotel id to its column position once, so each lookup is a dict access
# instead of a list.index() scan dispatched through joblib workers.
hotel_index = {hotel: position for position, hotel in enumerate(hotels_clicked)}

# Keep only numeric references (hotel ids) and group them per user up front,
# rather than re-filtering clicked_df on every loop iteration.
numeric_refs = clicked_df[clicked_df["reference"].str.isnumeric()]
refs_by_user = numeric_refs.groupby("user_id")["reference"].apply(list).to_dict()

# newline='' stops Python from translating the explicit '\n' line terminator.
with open(f'{path}/{data_uid}-relations.csv', 'w', newline='') as f:
    write = csv.writer(f, lineterminator='\n')
    for position, user in enumerate(users):
        clicked_elements = refs_by_user.get(user, [])

        # Positions that will be 1 for this user. Membership is tested explicitly:
        # position 0 is a valid column, and the previous truthiness filter
        # (filter(None, ...)) silently dropped it. Sorting makes the row order
        # deterministic.
        user_relation_ids = sorted(
            {hotel_index[elem] for elem in clicked_elements if elem in hotel_index}
        )

        # Lightweight progress indicator.
        if position % 100 == 0:
            print(position)

        write.writerow(user_relation_ids)

0
100
200
300


In [39]:
# Persist the row labels (users) and column labels (hotel ids) that go with the
# relations file, each as a single CSV row.
# newline='' stops Python from translating the explicit '\n' line terminator.
for suffix, labels in (("objects", users), ("attributes", hotels_clicked)):
    with open(f'{path}/{data_uid}-{suffix}.csv', 'w', newline='') as f:
        csv.writer(f, lineterminator='\n').writerow(labels)