In [11]:
import pandas as pd, numpy as np
import os, json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torch_data
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
os.getcwd()
from sentence_transformers import SentenceTransformer


In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: every tunable value lives in this cell.
# ---------------------------------------------------------------------------

# Train / validation fractions; the test fraction is whatever remains.
TRAIN_RATIO = 0.7
VALID_RATIO = 0.2
TEST_RATIO = 1 - (TRAIN_RATIO + VALID_RATIO)

# Dataset selection. These values are interpolated into RESULT_TRAIN_FILE below.
DATASET = 'Houston'
START_POI = 0
END_POI = 400
TOTAL_DAYS = 400

# Windowing / preprocessing / loader settings.
WINDOW_SIZE = 168        # presumably the number of input time steps per sample -- TODO confirm
HORIZON = 6              # presumably the number of steps to predict -- TODO confirm
NORM_METHOD = 'z_score'
BATCH_SIZE = 1

# Names of the categorical columns.
CAT_COLS = ['top_category', 'sub_category']

# Path prefix for the training results of this dataset / POI range.
RESULT_TRAIN_FILE = f'output/{DATASET}/train_{START_POI}_{END_POI}'

In [3]:
# Load the cached table for the configured dataset / POI range / day count.
# NOTE(security): unpickling can execute arbitrary code -- only load cache files
# that were produced by this project.
df = pd.read_pickle(f'./cache_data/data-{START_POI}-{END_POI}-{TOTAL_DAYS}-{DATASET}.pkl')

# Phone numbers and postal codes: use 0 as a temporary placeholder for missing
# values so the columns can be cast to int (presumably so they are rendered
# without a float suffix -- TODO confirm). The 0 placeholder is turned into
# 'unknown' in the loop below.
numeric_id_cols = ['phone_number', 'postal_code']
for col in numeric_id_cols:
    df[col] = df[col].fillna(0).astype(int)

# Every column that is interpolated into a sentence gets 'unknown' in place of
# a missing value.
columns_to_fill = ['location_name', 'street_address', 'city', 'region', 'postal_code', 'open_hours', 'phone_number', 'top_category', 'sub_category']
for col in columns_to_fill:
    df[col] = df[col].fillna('unknown')
    if col in numeric_id_cols:
        df[col] = df[col].replace(0, 'unknown')


def _describe_poi(row):
    """Return the text description of a single POI row.

    `row` must provide every key listed in `columns_to_fill`.
    """
    poi_details = f"This point of interest is {row['location_name']} located at {row['street_address']}, {row['city']}, {row['region']}, {row['postal_code']}. It is open for business during {row['open_hours']} and can be contacted by phone at {row['phone_number']}. This location belongs to the top category {row['top_category']}, with sub-category {row['sub_category']}."
    # A fully missing address would read "unknown, unknown, unknown, unknown";
    # collapse it to a single "unknown".
    return poi_details.replace('unknown, unknown, unknown, unknown', 'unknown')


def _describe_meta_node(row, city):
    """Return the text description of a meta-node row for the given city.

    A row whose top_category is 'Global' stands for all POIs in the city;
    any other row stands for one top-level category in the city.
    """
    if row['top_category'] == 'Global':
        return f"This is the meta node that represents all the points of interests in {city}."
    return f"This is the meta node that represents all the points of interests in {city} that belong to the category {row['top_category']}."


# The first (END_POI - START_POI) rows are POIs; every remaining row is a
# meta-node. Both parts are selected by *position*, so the split does not
# depend on the DataFrame having a default 0..n-1 index.
n_poi = END_POI - START_POI
poi_rows = df.iloc[:n_poi]
meta_rows = df.iloc[n_poi:]

# The city named in the meta-node sentences is taken from the first row
# (looked up once, by position, instead of once per meta-node by label).
city_name = df['city'].iloc[0] if len(df) else 'unknown'

# One sentence per row, in row order: POIs first, then meta-nodes.
sentences = [_describe_poi(row) for _, row in poi_rows.iterrows()]
sentences += [_describe_meta_node(row, city_name) for _, row in meta_rows.iterrows()]

# Every row must be described exactly once.
assert len(sentences) == len(df), "sentence count does not match row count"

# Preview only; the full list holds one long string per row.
sentences[:3]

['This point of interest is George Bush Intercontinental Airport located at 2800 N Terminal Rd, Houston, TX, 77032. It is open for business during unknown and can be contacted by phone at unknown. This location belongs to the top category Support Activities for Air Transportation, with sub-category Other Airport Operations.',
 'This point of interest is American Express Centurion Lounge located at 2800 N Terminal Rd Terminal D, Houston, TX, 77032. It is open for business during { "Mon": [["5:30", "21:00"]], "Tue": [["5:30", "21:00"]], "Wed": [["5:30", "21:00"]], "Thu": [["5:30", "21:00"]], "Fri": [["5:30", "21:00"]], "Sat": [["5:30", "21:00"]], "Sun": [["5:30", "21:00"]] } and can be contacted by phone at unknown. This location belongs to the top category Gasoline Stations, with sub-category Gasoline Stations with Convenience Stores.',
 'This point of interest is Simon mall located at 5085 Westheimer Rd, Houston, TX, 77056. It is open for business during { "Mon": [["10:00", "19:00"]], 

In [45]:
# Load the pretrained sentence encoder and embed every description in `sentences`.
encoder_name = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(encoder_name)
embeddings = model.encode(sentences)

# Show the shape of the result (the recorded run produced (445, 768)).
embeddings.shape

(445, 768)

In [46]:
# Dot product between the embeddings of sentences 93 and 296.
# NOTE(review): this equals cosine similarity only if the encoder returns
# unit-length vectors -- confirm for the model in use.
first_vec, second_vec = embeddings[93], embeddings[296]
(first_vec * second_vec).sum()

0.60260606