# Prepare database

In [1]:
import data_model
from neomodel import db

In [2]:
data_model.connect("bolt://neo4j:neo4j2@localhost:7687")

# Prepare test

In [3]:
import pandas as pd
import os
import numpy as np
import csv

from joblib import Parallel, delayed

### Loading data

In [4]:
train_data = pd.read_csv("./Data/train.csv")
metadata_data = pd.read_csv("./Data/item_metadata.csv")

In [5]:
train_data.head(5)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


## Extract test users
### Extract user list

In [6]:
all_users = train_data.drop_duplicates(subset=['user_id']).user_id.tolist()
all_users

['00RL8Z82B2Z1',
 '02HGRBA06ODU',
 '02SRUT1NQYH1',
 '03K8AXBL4BX2',
 '03P4VFKK12UO',
 '0473FZ8UNXRS',
 '066TUPQWUEV5',
 '06S61EKCW1JY',
 '06SZHKMYOOI8',
 '098CQXLJZ868',
 '09L0Y03JYTAC',
 '0FXZGX34RQMS',
 '0FZ7FKT6CB3D',
 '0IVOT7X0FJWE',
 '0K009FGORI30',
 '0KCQVHPEIWT0',
 '0L2TX0JNYVQ6',
 '0LEN9Z40SUSJ',
 '0LPWXP7ZIW88',
 '0MQZRPGN1GOS',
 '0NA8E4AD2VY7',
 '0NN8D1GWBW1F',
 '0O1CTEP95YKQ',
 '0P3WZY097PQI',
 '0Q3605EUBZG3',
 '0RH6DTZGIKY8',
 '0S2J3RLMF1LO',
 '0SO132S6MH3O',
 '0SVH5QYX372P',
 '0T819ODEIRA4',
 '0VNR91BTRLCP',
 '0VOL2T63B8FS',
 '0X3OMZZH37ZS',
 '0YBM60V6E5V6',
 '0Z9ZRJCL362A',
 '0ZD5QS10KOWM',
 '10GT4T9XLWZL',
 '10NPM3E92EQB',
 '12JXY7YLPVZD',
 '132NQM41P53Z',
 '13A1MZHQ298F',
 '13A3K98RFJXO',
 '15DDTTYLYLJJ',
 '18ET932PUJ6V',
 '19EMYD6N24VI',
 '19VHB90GOK1S',
 '1D32JQ688GK8',
 '1ESSLJKXNME4',
 '1GHT8M98OWVP',
 '1JL92Y9II096',
 '1JPHSHD43KQ1',
 '1KTPEDQB89UY',
 '1MYZFNQNA1JL',
 '1NRDBHYY4Q23',
 '1PFOT2YWS146',
 '1QF3QZ74IK8B',
 '1QV3DSC0DW9U',
 '1R2O77QTOU5A',
 '1TODIEGPC342

### Extract users from list

In [7]:
selected_users = np.random.choice(all_users, size=5, replace=False).tolist()
selected_users

['75M5HB45BO78',
 'J554Y3M811SS',
 'E1E0H28R5LQH',
 'LCYH8N0OR8OS',
 '40Q32HAQEIGV']

## Process train information
### Need to tranform the metadata dataframe
The properties attribute has a string with the elements separated with "|" and I want a correlation matrix between the item and all the properties

1. Split the properties attribute into a list
2. Process the list into columns and binary values

In [8]:
metadata_data["properties"] = metadata_data["properties"].str.split("|")

In [9]:
metadata_data = metadata_data.explode("properties").pivot_table(
    index="item_id", columns="properties", aggfunc="size", fill_value=0
).reset_index()

In [10]:
metadata_data.head(5)

properties,item_id,1 Star,2 Star,3 Star,4 Star,5 Star,Accessible Hotel,Accessible Parking,Adults Only,Air Conditioning,...,Terrace (Hotel),Theme Hotel,Towels,Very Good Rating,Volleyball,Washing Machine,Water Slide,Wheelchair Accessible,WiFi (Public Areas),WiFi (Rooms)
0,5001,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
1,5002,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2,5003,0,0,0,1,0,1,0,0,1,...,1,0,1,1,0,0,0,1,1,1
3,5004,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,1,1,1
4,5005,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


# Starting the testing
For each user:
1. Extract impressions
2. Search users who clicked any hotel in the impression
3. Extract click history for the users
4. Prepare the csv
5. Create the user experiment database
6. Execute Java to process FCA

In [11]:
def extract_user_impression(user):
    last_user_session = train_data[train_data['user_id'] == user].tail(1)
    impressions = last_user_session["impressions"].str.split("|").tolist()[0]

    # Transform list to int
    #impressions = [int(num) for num in impressions]

    return impressions

In [12]:
def search_users_clicked_hotel(hotel_list):
    clicked_df = train_data[train_data['reference'].isin(hotel_list)]
    clicked_df = clicked_df.drop_duplicates(subset=['user_id', 'reference'])
    return clicked_df['user_id'].drop_duplicates().tolist()

In [13]:
def clicked_history_from_user_list(users):
    clicked_df = train_data[train_data['user_id'].isin(users)]
    return clicked_df.drop_duplicates(subset=['user_id', 'reference'])

In [14]:
def save_data_csv(users, hotels_clicked, clicked_df, uid):
    data_uid = f'automatic_test_{uid}'

    path = f'../ProcessedData/metadata/{data_uid}'

    os.makedirs(path, exist_ok=True)

    with open(f'{path}/{data_uid}-relations.csv', 'w') as f:
        write = csv.writer(f, lineterminator='\n')
        for user in users:

            user_actions = clicked_df[clicked_df["user_id"] == user]
            clicked_elements = user_actions[user_actions["reference"].str.isnumeric()]["reference"].tolist()

            # Array with ids that will be 1
            user_relation_ids = Parallel(n_jobs=10)(
                delayed(lambda elem: hotels_clicked.index(elem) if elem in hotels_clicked else None)(clicked_element) for
                clicked_element in clicked_elements)

            user_relation_ids = set(filter(None, user_relation_ids))

            write.writerow(user_relation_ids)

    with open(f'{path}/{data_uid}-objects.csv', 'w') as f:
        write = csv.writer(f, lineterminator='\n')
        write.writerow(users)

    with open(f'{path}/{data_uid}-attributes.csv', 'w') as f:
        write = csv.writer(f, lineterminator='\n')
        write.writerow(hotels_clicked)

In [15]:
cont = 0
for user in selected_users:

    uid = f'test{cont}'

    user_impressions = extract_user_impression(user)

    print(f'{uid}: obtained user impressions: {user_impressions}')

    if len(user_impressions) == 0 or (type(user_impressions) != list and pd.isna(user_impressions)):
        continue

    print(f'{uid}: Getting users who clicked the hotels')

    users_clicked_hotels = search_users_clicked_hotel(user_impressions)

    print(f'{uid}: Obtained users who clicked')

    if len(users_clicked_hotels) == 0 or (type(users_clicked_hotels) != list and pd.isna(users_clicked_hotels)):
        continue

    # add the current user if not in list
    if user not in users_clicked_hotels:
        users_clicked_hotels.append(user)

    hotel_df = clicked_history_from_user_list(users_clicked_hotels)
    clicked_hotels = hotel_df[hotel_df["reference"].str.isnumeric()].drop_duplicates(subset=['reference'])['reference'].tolist()

    if len(clicked_hotels) == 0 or (type(clicked_hotels) != list and pd.isna(clicked_hotels)):
        continue

    save_data_csv(users_clicked_hotels, clicked_hotels, hotel_df, uid)

    data_model.create_database(uid, db)

    # Execute java

    cont += 1



test0: obtained user impressions: ['2610110', '5773468', '3940568', '9713532', '1796791', '2441706', '2718025', '3215419', '7991658', '5582458', '1376136', '5818212', '6702206', '4277534', '3995530', '4076064', '7722648', '9307388']
test0: Getting users who clicked the hotels
test0: Obtained users who clicked
test1: obtained user impressions: ['1152096', '7098548', '1907569', '4134572', '5078314', '107973', '2342618', '3371806', '4605404', '1828333', '5667788', '2774378', '5857682', '4681742', '2732236', '5425212', '8777766', '5069578', '7151848', '9527094', '7912978']
test1: Getting users who clicked the hotels
test1: Obtained users who clicked
test2: obtained user impressions: ['2856914', '3533712', '49931', '395086', '49526', '481491', '749636', '150080', '2417008', '50119', '6626144', '141088', '2243464', '153525', '83161', '153527', '2858242', '1066786', '49881', '2001201', '49426', '719596', '49256', '5426070', '4856670']
test2: Getting users who clicked the hotels
test2: Obtaine