#### Importing the libs

In [1]:
import pandas as pd
import os
import numpy as np
import csv

import utils

#### Loading the data

In [2]:
train_data = pd.read_csv("./Data/train.csv")
metadata_data = pd.read_csv("./Data/item_metadata.csv")

In [3]:
train_data.head(5)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [4]:
metadata_data.head(5)

Unnamed: 0,item_id,properties
0,5101,Satellite TV|Golf Course|Airport Shuttle|Cosme...
1,5416,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
2,5834,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
3,5910,Satellite TV|Sailing|Cosmetic Mirror|Telephone...
4,6066,Satellite TV|Sailing|Diving|Cosmetic Mirror|Sa...


### Need to tranform the metadata dataframe
The properties attribute has a string with the elements separated with "|" and I want a correlation matrix between the item and all the properties

#### 1. Split the properties attribute into a list

In [5]:
metadata_data["properties"] = metadata_data["properties"].str.split("|")

#### 2. Process the list into columns and binary values

In [6]:
metadata_data = metadata_data.explode("properties").pivot_table(
    index="item_id", columns="properties", aggfunc="size", fill_value=0
).reset_index()

In [7]:
metadata_data.head(5)

properties,item_id,1 Star,2 Star,3 Star,4 Star,5 Star,Accessible Hotel,Accessible Parking,Adults Only,Air Conditioning,...,Terrace (Hotel),Theme Hotel,Towels,Very Good Rating,Volleyball,Washing Machine,Water Slide,Wheelchair Accessible,WiFi (Public Areas),WiFi (Rooms)
0,5001,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
1,5002,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2,5003,0,0,0,1,0,1,0,0,1,...,1,0,1,1,0,0,0,1,1,1
3,5004,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,1,1,1
4,5005,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


### Get a list of hotels from the train data

In [8]:
example_session = train_data[train_data['user_id'] == '00RL8Z82B2Z1'].tail(1)
example_session

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
15,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,55109|129343|54824|2297972|109014|1257342|1031...,162|25|150|143|101|49|118|131|18|100|101|143|5...


In [9]:
impressions = example_session["impressions"].str.split("|")
impressions

15    [55109, 129343, 54824, 2297972, 109014, 125734...
Name: impressions, dtype: object

In [10]:
impressionsList = impressions.tolist()[0]
impressionsList = [int(num) for num in impressionsList]
impressionsList

[55109,
 129343,
 54824,
 2297972,
 109014,
 1257342,
 1031578,
 109018,
 1332971,
 666916,
 54833,
 54885,
 2237222,
 10077318,
 1166793,
 9132132,
 1474297,
 3909420,
 6622154,
 55091,
 8444418,
 54804,
 2050977,
 666936,
 55088]

In [11]:
utils.load_df_csv(metadata_data, impressionsList, "00RL8Z82B2Z1")