#### Import library

In [2]:
import pandas as pd
import os
import numpy as np
import csv

import utils
import data_model
from data_model import Attribute, Object, FormalConcept

from neomodel import Q,db

from neo4j import GraphDatabase

#### Loading the data

In [3]:
train_data = pd.read_csv("./Data/train.csv")
metadata_data = pd.read_csv("./Data/item_metadata.csv")

In [4]:
train_data.head(5)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [5]:
metadata_data.head(5)

Unnamed: 0,item_id,properties
0,5101,Satellite TV|Golf Course|Airport Shuttle|Cosme...
1,5416,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
2,5834,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
3,5910,Satellite TV|Sailing|Cosmetic Mirror|Telephone...
4,6066,Satellite TV|Sailing|Diving|Cosmetic Mirror|Sa...


In [6]:
metadata_data = utils.process_attributes(metadata_data)

metadata_data.head(5)

properties,item_id,1 Star,2 Star,3 Star,4 Star,5 Star,Accessible Hotel,Accessible Parking,Adults Only,Air Conditioning,...,Terrace (Hotel),Theme Hotel,Towels,Very Good Rating,Volleyball,Washing Machine,Water Slide,Wheelchair Accessible,WiFi (Public Areas),WiFi (Rooms)
0,5001,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
1,5002,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2,5003,0,0,0,1,0,1,0,0,1,...,1,0,1,1,0,0,0,1,1,1
3,5004,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,1,1,1
4,5005,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


In [7]:
train_data[train_data['user_id'] == '00RL8Z82B2Z1'].tail(1)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
15,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,55109|129343|54824|2297972|109014|1257342|1031...,162|25|150|143|101|49|118|131|18|100|101|143|5...


In [8]:
example_session = train_data[train_data['user_id'] == '00RL8Z82B2Z1'].iloc[:-1]

expected_result = train_data[train_data['user_id'] == '00RL8Z82B2Z1'].tail(1)

example_session

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
5,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
6,00RL8Z82B2Z1,aff3928535f48,1541037532,7,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
7,00RL8Z82B2Z1,aff3928535f48,1541037532,8,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
8,00RL8Z82B2Z1,aff3928535f48,1541037542,9,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
9,00RL8Z82B2Z1,aff3928535f48,1541037542,10,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [9]:
clicked_elements = example_session[example_session["reference"].str.isnumeric()]["reference"].tolist()
clicked_elements

['666856',
 '666856',
 '666856',
 '109038',
 '666856',
 '109038',
 '666856',
 '109038',
 '109038',
 '109038',
 '109038',
 '109038',
 '109038']

In [10]:
element_metadata = metadata_data[metadata_data["item_id"] == 666856]
element_metadata = element_metadata.loc[:, ~(element_metadata == 0).any()]
element_metadata["5 Star"]

KeyError: '5 Star'

In [11]:
attr_frequency = {}

for clicked_element in clicked_elements:
    element_metadata = metadata_data[metadata_data["item_id"] == int(clicked_element)]
    element_metadata = element_metadata.loc[:, ~(element_metadata == 0).any()]

    for attr, value in element_metadata.iloc[:, 1:].iteritems():

        if attr in attr_frequency:
            attr_frequency[attr] += 1
        else:
            attr_frequency[attr] = 1

attr_frequency = {k: b for k, b in sorted(attr_frequency.items(), key=lambda element: element[1], reverse=True)}

attr_frequency

{'Business Hotel': 13,
 'Car Park': 13,
 'Central Heating': 13,
 'Computer with Internet': 13,
 'Desk': 13,
 'Family Friendly': 13,
 'Fridge': 13,
 'From 2 Stars': 13,
 'From 3 Stars': 13,
 'Good Rating': 13,
 'Laundry Service': 13,
 'Non-Smoking Rooms': 13,
 'Openable Windows': 13,
 'Porter': 13,
 'Satisfactory Rating': 13,
 'Self Catering': 13,
 'Terrace (Hotel)': 13,
 'Wheelchair Accessible': 13,
 'WiFi (Public Areas)': 13,
 'WiFi (Rooms)': 13,
 '3 Star': 8,
 'Accessible Hotel': 8,
 'Accessible Parking': 8,
 'Air Conditioning': 8,
 'Airport Hotel': 8,
 'Beach': 8,
 'Beach Bar': 8,
 'Business Centre': 8,
 'Cable TV': 8,
 'Cot': 8,
 'Eco-Friendly hotel': 8,
 'Electric Kettle': 8,
 'Express Check-In / Check-Out': 8,
 'Fan': 8,
 'Flatscreen TV': 8,
 'Free WiFi (Combined)': 8,
 'Free WiFi (Public Areas)': 8,
 'Free WiFi (Rooms)': 8,
 'Gay-friendly': 8,
 'Hotel': 8,
 'Hotel Bar': 8,
 'Ironing Board': 8,
 'Large Groups': 8,
 'Lift': 8,
 'Luxury Hotel': 8,
 'On-Site Boutique Shopping': 8,
 

In [25]:
query = '''MATCH (a:Attribute)<-[:INTENSION]-(f:FormalConcept)-[:EXTENSION]->(o:Object)
WITH o, collect(a.name) as attrs, $attrs as required
WHERE apoc.coll.containsAll(attrs, required)
RETURN o;'''


data_model.connect("bolt://neo4j:neo4j2@localhost:7687")

objects = []

cont = 0

while len(objects) < 25:
    attrs = list(attr_frequency.keys())

    if cont > 0:
        attrs = attrs[:cont*-1]

    print (attrs)

    results, meta = db.cypher_query(query, {"attrs":attrs})
    new_objects = [Object.inflate(row[0]) for row in results]

    for new_object in new_objects:
        if not new_object in objects:
            objects.append(new_object)

    print (len(objects))

    cont += 1

objects

['Business Hotel', 'Car Park', 'Central Heating', 'Computer with Internet', 'Desk', 'Family Friendly', 'Fridge', 'From 2 Stars', 'From 3 Stars', 'Good Rating', 'Laundry Service', 'Non-Smoking Rooms', 'Openable Windows', 'Porter', 'Satisfactory Rating', 'Self Catering', 'Terrace (Hotel)', 'Wheelchair Accessible', 'WiFi (Public Areas)', 'WiFi (Rooms)', '3 Star', 'Accessible Hotel', 'Accessible Parking', 'Air Conditioning', 'Airport Hotel', 'Beach', 'Beach Bar', 'Business Centre', 'Cable TV', 'Cot', 'Eco-Friendly hotel', 'Electric Kettle', 'Express Check-In / Check-Out', 'Fan', 'Flatscreen TV', 'Free WiFi (Combined)', 'Free WiFi (Public Areas)', 'Free WiFi (Rooms)', 'Gay-friendly', 'Hotel', 'Hotel Bar', 'Ironing Board', 'Large Groups', 'Lift', 'Luxury Hotel', 'On-Site Boutique Shopping', 'Radio', 'Restaurant', 'Room Service', 'Room Service (24/7)', 'Safe (Hotel)', 'Safe (Rooms)', 'Satellite TV', 'Senior Travellers', 'Shower', 'Sitting Area (Rooms)', 'Tennis Court', 'Concierge', 'Hostel', 

[<Object: {'name': '54885', 'id': 26547}>,
 <Object: {'name': '1031578', 'id': 26552}>,
 <Object: {'name': '109018', 'id': 26550}>,
 <Object: {'name': '55088', 'id': 26548}>,
 <Object: {'name': '54804', 'id': 2965}>,
 <Object: {'name': '3909420', 'id': 29395}>,
 <Object: {'name': '55091', 'id': 27154}>,
 <Object: {'name': '55109', 'id': 26549}>,
 <Object: {'name': '2297972', 'id': 26553}>,
 <Object: {'name': '54824', 'id': 29392}>,
 <Object: {'name': '1166793', 'id': 27671}>,
 <Object: {'name': '54833', 'id': 26546}>,
 <Object: {'name': '109014', 'id': 26565}>,
 <Object: {'name': '129343', 'id': 29409}>,
 <Object: {'name': '2050977', 'id': 28085}>,
 <Object: {'name': '1332971', 'id': 29394}>,
 <Object: {'name': '6622154', 'id': 26773}>,
 <Object: {'name': '666936', 'id': 29393}>,
 <Object: {'name': '10077318', 'id': 26554}>,
 <Object: {'name': '1474297', 'id': 27377}>,
 <Object: {'name': '8444418', 'id': 14637}>,
 <Object: {'name': '666916', 'id': 26551}>,
 <Object: {'name': '2237222',

In [24]:
expected_result["reference"].tolist()[0]

'1257342'