#### Importing the libs

In [24]:
import pandas as pd
import os
import numpy as np
import csv

import utils
from joblib import Parallel, delayed

#### Loading the data

In [2]:
# Read the session log and the item property table from the local Data folder.
train_data, metadata_data = (
    pd.read_csv(csv_path)
    for csv_path in ("./Data/train.csv", "./Data/item_metadata.csv")
)

In [3]:
# Peek at the first five logged actions.
train_data.iloc[:5]

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [4]:
# Peek at the first five items and their "|"-separated property strings.
metadata_data.iloc[:5]

Unnamed: 0,item_id,properties
0,5101,Satellite TV|Golf Course|Airport Shuttle|Cosme...
1,5416,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
2,5834,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
3,5910,Satellite TV|Sailing|Cosmetic Mirror|Telephone...
4,6066,Satellite TV|Sailing|Diving|Cosmetic Mirror|Sa...


### Need to transform the metadata dataframe
The `properties` attribute is a single string whose elements are separated by "|". I want a matrix relating each item to all the properties: one row per item, one column per property, with a 1 where the item has that property and a 0 otherwise.

#### 1. Split the properties attribute into a list

In [5]:
# Turn each "|"-separated property string into a Python list of property names.
metadata_data = metadata_data.assign(
    properties=metadata_data["properties"].str.split("|")
)

#### 2. Process the list into columns and binary values

In [6]:
# explode() yields one row per (item, property) pair; pivot_table(aggfunc="size")
# then counts the pairs, giving one column per property and 0 where an item
# lacks it. Values are counts, so they are 0/1 only if no item lists a
# property twice (assumed here, TODO confirm).
metadata_data = (
    metadata_data
    .explode("properties")
    .pivot_table(index="item_id", columns="properties", aggfunc="size", fill_value=0)
    .reset_index()
)

In [7]:
# Peek at the first five rows of the item x property table.
metadata_data.iloc[:5]

properties,item_id,1 Star,2 Star,3 Star,4 Star,5 Star,Accessible Hotel,Accessible Parking,Adults Only,Air Conditioning,...,Terrace (Hotel),Theme Hotel,Towels,Very Good Rating,Volleyball,Washing Machine,Water Slide,Wheelchair Accessible,WiFi (Public Areas),WiFi (Rooms)
0,5001,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
1,5002,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2,5003,0,0,0,1,0,1,0,0,1,...,1,0,1,1,0,0,0,1,1,1
3,5004,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,1,1,1
4,5005,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


### Get a list of hotels from the train data

In [8]:
# Keep only the last logged row of one sample user; its "impressions" field
# supplies the hotel list used in the following cells.
sample_user_rows = train_data.loc[train_data["user_id"] == "00RL8Z82B2Z1"]
example_session = sample_user_rows.iloc[-1:]
example_session

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
15,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,55109|129343|54824|2297972|109014|1257342|1031...,162|25|150|143|101|49|118|131|18|100|101|143|5...


In [9]:
# Split the "|"-separated impressions string into a list of item-id strings
# (still a one-element Series at this point).
impressions = example_session["impressions"].str.split(pat="|")
impressions

15    [55109, 129343, 54824, 2297972, 109014, 125734...
Name: impressions, dtype: object

In [27]:
# Unwrap the single row into a plain list of item-id strings, and keep an
# integer copy of the same ids alongside it.
impressionsList = impressions.iloc[0]
intImpressionsList = list(map(int, impressionsList))
len(impressionsList)

25

## Search for users who clicked any of these hotels

In [20]:
# Rows whose reference is one of the example impressions, keeping a single row
# per (user, item) pair. No filter on action_type is applied, so every kind of
# interaction with these items is kept, not only "clickout item".
clicked_df = (
    train_data
    .loc[train_data["reference"].isin(impressionsList)]
    .drop_duplicates(subset=["user_id", "reference"])
)
clicked_df

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
15,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,55109|129343|54824|2297972|109014|1257342|1031...,162|25|150|143|101|49|118|131|18|100|101|143|5...
198476,OIPLP05G2DNX,cd70b75089786,1541219585,17,interaction item image,10077318,AU,"Sydney, Australia",mobile,,,
205163,2VT2BFH97JSO,5b7c65d248b8a,1541294748,2,clickout item,54833,NZ,"Sydney, Australia",tablet,,54833|10335350|8229216|7939292|10226346|841908...,103|101|197|176|160|80|103|67|66|199|56|313|10...
207617,9112310YDHJ9,a1587e7118d4b,1541326844,17,clickout item,109014,AU,"Sydney, Australia",mobile,Hotel|4 Star|5 Star|Free WiFi (Combined)|Air C...,54921|55109|54885|109014|55146|4040790|55093|1...,77|156|161|138|157|218|226|202|234|252|185|265...
236432,32CP913ITOZ0,835577240e392,1541473490,2,clickout item,54824,AU,"Sydney, Australia",desktop,,54902|5790318|3924838|54824|109040|54803|45515...,187|90|113|90|137|162|106|75|171|101|219|96|15...
...,...,...,...,...,...,...,...,...,...,...,...,...
15808794,RZS33KEWGVPN,d3dea95ded878,1541223061,3,clickout item,1166793,AU,"Sydney, Australia",tablet,,54794|1039380|117990|55051|2445766|1342536|108...,164|151|140|252|339|179|199|238|199|208|165|16...
15812906,4M3IQ2JL5B7Y,ec216358ec2d2,1541329208,2,clickout item,54885,AU,"Sydney, Australia",mobile,,4040790|109034|55093|54982|117990|54901|14211|...,143|105|134|257|108|147|249|133|119|157|154|14...
15866156,0G22RO7KVWI6,470eb5337c42f,1541161304,10,clickout item,9132132,AU,"Sydney, Australia",desktop,Sort by Price|Good Rating,1431482|9132132|1088584|109033|1253731|6657740...,32|63|66|69|69|69|74|89|90|92|92|97|97|98|98|1...
15929499,OWSKI3H77PFS,22759f347ddbf,1541464976,3,interaction item info,54804,PH,"Sydney, Australia",desktop,,,


In [26]:
# Distinct user ids, in order of first appearance in clicked_df.
users = clicked_df["user_id"].unique().tolist()
len(users)

448

In [30]:
# Output folder for the user x impression relation files; `path` is reused by
# the cells that write the objects and attributes files.
path = '../ProcessedData/metadata/clicked_user'

os.makedirs(path, exist_ok=True)

# Map every impressed item id to its position in impressionsList, built once so
# each lookup is a dict hit instead of a list scan. setdefault keeps the first
# position when an id repeats, which is what list.index() would return.
impression_positions = {}
for position, item_id in enumerate(impressionsList):
    impression_positions.setdefault(item_id, position)

# One CSV row per user (same order as `users`), holding the positions of the
# impressions that user interacted with. newline='' keeps the text layer from
# translating the '\n' line terminator, as the csv module documentation requires.
with open(f'{path}/clicked_user-relations.csv', 'w', newline='') as f:
    write = csv.writer(f, lineterminator='\n')
    for user_number, user in enumerate(users):

        user_actions = clicked_df[clicked_df["user_id"] == user]
        clicked_elements = user_actions[user_actions["reference"].str.isnumeric()]["reference"].tolist()

        # Positions that will be 1 for this user. The membership test is on the
        # dict, not on truthiness, so position 0 (the first impression) is kept;
        # filter(None, ...) used to drop it. Sorting makes the row deterministic.
        user_relation_ids = sorted({
            impression_positions[clicked_element]
            for clicked_element in clicked_elements
            if clicked_element in impression_positions
        })

        # Lightweight progress report every 100 users.
        if user_number % 100 == 0:
            print(user_number)

        write.writerow(user_relation_ids)

0
100
200
300
400


In [31]:
# Write all user ids as a single CSV row; the order matches the rows of the
# relations file. newline='' keeps the text layer from translating the '\n'
# line terminator, as the csv module documentation requires.
with open(f'{path}/clicked_user-objects.csv', 'w', newline='') as f:
    write = csv.writer(f, lineterminator='\n')
    write.writerow(users)

In [32]:
# Write the impressed item ids as a single CSV row; a position in this row is
# the index stored in the relations file. newline='' keeps the text layer from
# translating the '\n' line terminator, as the csv module documentation requires.
with open(f'{path}/clicked_user-attributes.csv', 'w', newline='') as f:
    write = csv.writer(f, lineterminator='\n')
    write.writerow(impressionsList)