### Create customer embeddings from purchase history

In [1]:
import pandas as pd, os, numpy as np
import plotly.express as px
pd.options.display.max_columns = 50
import swifter, datetime, pickle as pkl
import tensorflow_hub as hub
from tqdm.notebook import tqdm

In [2]:
# Load the training table from parquet. Later cells read its 'date',
# 'cust_id' and 'article_id' columns.
df = pd.read_parquet('./data/train.parquet')

In [3]:
# Load the pickled lookup `emb_map`; the customer loop further down indexes it
# with article_id values to fetch one embedding vector per purchased article.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load pickles produced by a trusted step of this pipeline.
with open('./data/emb_map.pkl', 'rb') as handle:
    emb_map = pkl.load(handle)

In [4]:
# Inclusive cutoff date: only rows dated on or before this day are kept below.
d_end = datetime.datetime(2019, 9, 22).date()

In [5]:
# Keep rows dated on or before d_end and renumber the index from 0.
# Assumes df['date'] holds values comparable with a datetime.date -- TODO confirm.
trdf = df[(df['date'] <= d_end)].reset_index(drop=True).copy()

In [6]:
# Sanity check: (rows, columns) remaining after the date filter.
trdf.shape

(16889901, 13)

In [7]:
# Load the pickled collection `cb`; it is turned into a set below and matched
# against the 'cust_id' column, so it holds customer ids.
# NOTE(review): presumably the customers we need embeddings for (file name
# suggests "customers who bought") -- confirm with the step that writes it.
with open('./data/cust_bought.pkl', 'rb') as handle:
    cb = pkl.load(handle)

In [8]:
# De-duplicated customer ids, used for the membership filter on trdf below.
cbs = set(cb)

In [9]:
# Restrict the transactions to customers listed in cbs and renumber the index.
trdf = trdf[trdf['cust_id'].isin(cbs)].reset_index(drop=True).copy()

In [10]:
# Sanity check: (rows, columns) remaining after the customer filter.
trdf.shape

(3502676, 13)

In [11]:
# Number of distinct customers that still have at least one row in trdf;
# this is how many embeddings the loop below will produce.
trdf['cust_id'].nunique()

89033

In [12]:
# Size of the requested customer set, for comparison with the distinct count
# in trdf: ids in cbs that have no rows in trdf will not get an embedding.
len(cbs)

97992

In [18]:
# Sort in place by customer, then by date ascending. getCustEmb weights later
# vectors more heavily, so each customer's rows must be oldest-first.
# NOTE(review): rows sharing the same cust_id and date have no defined
# relative order here -- confirm that is acceptable.
trdf.sort_values(['cust_id', 'date'], ascending=[True, True], inplace=True)

In [20]:
# Distinct customer ids, in order of first appearance in the sorted trdf.
cids = trdf['cust_id'].unique()
# Per-customer groups over just the two columns the embedding loop needs.
cgroups = trdf[['article_id', 'cust_id']].groupby('cust_id')

In [21]:
# Output lookup: customer id -> customer embedding, filled by the loop below.
cemb_map = {}

In [22]:
def moveVector(sourceVector, destinationVector, direction, magnitude = None):
    """Step from ``sourceVector`` along the line towards ``destinationVector``.

    Args:
        sourceVector: Starting point (scalar or array-like supporting ``+``/``-``).
        destinationVector: Point that defines the line to move along.
        direction: Multiplier applied to the step; a negative value moves
            away from the destination instead of towards it.
        magnitude: Fraction of the source-to-destination gap to cover.
            ``None`` (the default) means 0.5.

    Returns:
        ``sourceVector + direction * magnitude * (destinationVector - sourceVector)``
        as a new value; the inputs are not modified.
    """
    if magnitude is None:
        magnitude = 0.5
    # Signed fraction of the gap that this step covers.
    step = direction * magnitude
    return sourceVector + step * (destinationVector - sourceVector)

In [23]:
# Worked example of the update rule with magnitude 0.5, for purchases A, B, C (oldest first):
#   after A: x = A
#   after B: x = A + 0.5(B - A) = 0.5A + 0.5B
#   after C: x = (0.5A + 0.5B) + 0.5(C - (0.5A + 0.5B)) = 0.5A + 0.5B + 0.5C - 0.25A - 0.25B = 0.25A + 0.25B + 0.5C

In [25]:
def getCustEmb(purchases, magnitude=0.5):
    """Fold a customer's purchase embeddings into one unit-length vector.

    The state starts at the first purchase vector and, for every following
    purchase, moves ``magnitude`` of the way towards that purchase's vector.
    With the default of 0.5 each newer purchase weighs as much as all earlier
    ones combined (three purchases A, B, C give 0.25*A + 0.25*B + 0.5*C), so
    purchases must be supplied oldest-first.

    Args:
        purchases: Non-empty iterable (list of vectors or 2-D array) of
            equally shaped embedding vectors, oldest first.
        magnitude: Fraction of the gap to the next purchase covered at each
            step. Defaults to 0.5.

    Returns:
        The folded vector scaled to unit L2 norm. If the folded vector is all
        zeros it is returned unscaled, because normalising it would only
        produce NaNs.

    Raises:
        ValueError: If ``purchases`` is empty.
    """
    statevector = None
    for v in purchases:
        if statevector is None:
            # asarray lets plain lists work too, without copying real arrays.
            statevector = np.asarray(v)
        else:
            # Same step as moveVector(statevector, v, direction=1, magnitude).
            statevector = statevector + magnitude * (v - statevector)

    if statevector is None:
        raise ValueError("getCustEmb needs at least one purchase embedding")

    norm = np.linalg.norm(statevector)
    if norm == 0:
        # Copy so the caller never receives an alias of an input vector.
        return statevector.copy()
    return statevector / norm

In [26]:
# Build one embedding per customer. Each result is stored twice: in the list
# `all_embs` (same order as `cids`) and in the dict `cemb_map` keyed by id.
all_embs = []
for cid in tqdm(cids, total=len(cids)):
    # This customer's rows; trdf was sorted by date above, so presumably
    # these come back oldest-first -- verify if the sort is ever changed.
    curr_group = cgroups.get_group(cid)
    keys = curr_group['article_id'].values
    # Look up each purchased article's embedding. NOTE(review): if emb_map is
    # a plain dict, an article_id missing from it raises KeyError here.
    embs = [emb_map[x] for x in keys]
    embs = np.array(embs)
    cemb = getCustEmb(embs)
    all_embs.append(cemb)
    cemb_map[cid] = cemb

  0%|          | 0/89033 [00:00<?, ?it/s]

In [28]:
# Sanity check: one embedding per entry in cids.
len(all_embs)

89033

In [29]:
# Persist the customer id -> embedding dict for downstream steps.
# protocol=3 pins the pickle format explicitly instead of using the
# interpreter's default protocol.
with open('./data/cemb_map.pkl', 'wb') as handle:
    pkl.dump(cemb_map, handle, protocol=3)