# Imports

In [None]:
import pickle
import random
import numpy

In [None]:
# Read the training data: the pickle holds a two-element (X, y) pair.
# The context manager guarantees the file handle is closed once loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('feature_train_data.pickle', 'rb') as f:
    (X, y) = pickle.load(f)

In [None]:

# dictlist holds one dictionary per store, indexed by store ID (1115 slots).
# Each dictionary maps a data point's key -- the tuple of feature[2:7], i.e.
# (day_of_week, promo, year, month, day) -- to the sale recorded for it.
# If a store sees the same key twice, the later sale overwrites the earlier one.
dictlist = [dict() for _ in range(1115)]
for row, target in zip(X, y):
    store_sales = dictlist[row[1]]  # row[1] is the store ID
    store_sales[tuple(row[2:7])] = target

# Load the pickled embeddings and keep the first entry, which is used as the
# store embeddings further down.
with open("embeddings.pickle", 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)
store_embeddings = embeddings[0]

In [None]:
# For a selected pair of stores, calculate the average absolute difference between
# their sales over all instances where their features agree.
def distance(store_pairs, dictlist):
    '''Distance as defined in the paper.

    Mean absolute difference between the sales of two stores, taken over
    every key (feature tuple) that is present for both stores.

    store_pairs: a pair (a, b) of indices into dictlist.
    dictlist: sequence of dictionaries, one per store, mapping key -> sale.

    Returns the average absolute difference, or NaN when the two stores have
    no key in common (previously this case raised ZeroDivisionError).
    '''
    a, b = store_pairs
    sales_a = dictlist[a]
    sales_b = dictlist[b]
    # Only data points that exist for both stores are comparable.
    absdiffs = [abs(sale - sales_b[key])
                for key, sale in sales_a.items()
                if key in sales_b]
    if not absdiffs:
        # No overlapping data points: the average is undefined.
        return float('nan')
    return sum(absdiffs) / float(len(absdiffs))

In [None]:
# For a selected pair of stores, calculate the Euclidean distance between their embeddings
def embed_distance(store_pairs, em):
    '''Distance in the embedding space.

    store_pairs: a pair of indices into em.
    em: indexable collection of embeddings; the two selected entries are
        subtracted from each other, so they must support "-".

    Returns numpy.linalg.norm of the difference between the two entries.
    '''
    first, second = store_pairs
    offset = em[first] - em[second]
    return numpy.linalg.norm(offset)

In [None]:
# Keep drawing two distinct random store indices until n different pairs have
# been collected. Each pair is stored as (smaller index, larger index).
n = 10000
pairs = set()
while len(pairs) < n:
    a, b = random.sample(range(1115), 2)
    if a >= b:
        # Draws that come out in descending order are discarded, not swapped.
        continue
    pairs.add((a, b))

In [None]:
# Calculate both distances for every sampled pair and write them to
# distances.csv, one pair per line: sales distance, then embedding distance,
# separated by a single space (the default separator of print).
with open('distances.csv', 'w') as f:
    for pair in pairs:
        print(distance(pair, dictlist),
              embed_distance(pair, store_embeddings),
              file=f)