Generates the classifier predictions for users

### Plan

Candidate Generation:
1. Generate candidates that the customer has bought in the last month
2. Generate candidates that are the most popular last month
3. Generate randomly sampled candidates that do not overlap with the two sets above

In [1]:
import pandas as pd, os, numpy as np
import plotly.express as px
# Show up to 50 columns when a DataFrame is displayed in the notebook.
pd.options.display.max_columns = 50
import swifter, datetime, pickle as pkl
from tqdm.notebook import tqdm

In [2]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import regularizers

In [None]:
# Load the saved Keras model from ../models/dnn1. Further down it is called via
# model.predict((customer_features, article_features)) to score candidates.
model = keras.models.load_model('../models/dnn1')

In [4]:
# Transactions table. The columns used in this notebook are 'date', 'cust_id'
# and 'article_id'.
tdf = pd.read_parquet('../data/train.parquet')

### Get top-k most popular items last month

In [5]:
# Keep only the transactions dated on or after 2020-09-01. Everything derived
# below (popularity ranking, each customer's recent purchases) uses this window.
# NOTE(review): the comparison assumes tdf['date'] holds datetime.date values — confirm.
d_start = datetime.date(2020, 9, 1)
is_recent = tdf['date'] >= d_start
tdf = tdf.loc[is_recent].reset_index(drop=True).copy()

In [6]:
# Collapse repeat purchases so that every (customer, article) pair appears once;
# the popularity count that follows then counts distinct buyers per article.
dd = tdf.loc[:, ['cust_id', 'article_id']].drop_duplicates().copy()

In [7]:
# Count how many rows of dd each article appears in (dd holds unique
# (cust_id, article_id) pairs here, so this is the number of distinct buyers),
# ordered from most to least bought.
# rename_axis('index') pins the name of the id column produced by reset_index():
# pandas < 2.0 called it 'index', pandas >= 2.0 calls it 'article_id', which
# would make the dd['index'] lookups further down raise KeyError.
dd = dd['article_id'].value_counts().rename_axis('index').reset_index().copy()

In [8]:
# Sanity check: (number of distinct articles, number of columns).
dd.shape

(26252, 2)

In [9]:
# First 20 entries of dd's 'index' column. dd is expected to be ordered by
# descending purchase count with the article ids in 'index', so these are the
# 20 most-purchased articles.
top_20_most_purchased = dd['index'].values[:20]

In [10]:
# Display the selected article ids.
top_20_most_purchased

array([751471001, 918522001, 909370001, 915526001, 918292001, 915529003,
       924243001, 751471043, 448509014, 706016001, 863595006, 865799006,
       898694001, 673677002, 896152002, 762846027, 714790020, 863583001,
       762846031, 850917001], dtype=int64)

In [11]:
# Entries 21..200 of the popularity ranking (180 articles, disjoint from the
# top 20). This is the pool the per-customer random candidates are drawn from.
most_purchased = dd['index'].values[20:200]

### Load Articles Embeddings Map

In [12]:
# Article lookup table; getArticleFeatures() indexes it by article id.
# NOTE: pickle executes code on load — only read files produced by this project.
with open('../data/emb_map.pkl', 'rb') as emb_file:
    adf_dict = pkl.load(emb_file)

### Load Customer embeddings

In [13]:
# Customer lookup table; getCustomerFeatures() indexes it by customer id.
# NOTE: pickle executes code on load — only read files produced by this project.
with open('../data/all_cemb_map.pkl', 'rb') as cemb_file:
    c_dict = pkl.load(cemb_file)

In [14]:
# Distinct article ids and customer ids present in the filtered transactions,
# each in order of first appearance.
all_products, all_custs = (
    tdf[column].unique() for column in ('article_id', 'cust_id')
)

### Build a DataFrame with each customer's last 12 distinct purchased products

In [15]:
def _last_12_distinct(articles):
    """Return up to the last 12 distinct article ids of one customer's rows.

    Duplicates are dropped keeping each id's first occurrence, the remaining
    ids stay in their original row order, and only the final 12 are returned
    as a list.
    """
    values = articles.values
    # return_index=True gives the position of each id's first occurrence;
    # sorting those positions restores row order (np.unique sorts by value).
    _, first_positions = np.unique(values, return_index=True)
    in_row_order = values[np.sort(first_positions)]
    return list(in_row_order)[-12:]


# Replace the transaction table with one row per customer: 'cust_id' plus the
# list produced by the helper above in 'article_id'.
tdf = tdf.groupby(["cust_id"])["article_id"].agg(_last_12_distinct).reset_index()

In [16]:
# Preview the per-customer article lists.
tdf.head()

Unnamed: 0,cust_id,article_id
0,0,[568601043]
1,2,[794321007]
2,6,"[719530003, 448509014]"
3,30,[685814001]
4,38,"[777148006, 835801001, 923134005, 865929003, 5..."


In [17]:
# Mapping cust_id -> that customer's list of recent distinct article ids
# (the 'article_id' column built by the groupby above).
bdf_dict = tdf.set_index('cust_id')['article_id'].to_dict()

### Create Data

In [18]:
def getArticleFeatures(aid):
    """Return the entry stored in ``adf_dict`` for article id ``aid``.

    Raises ``KeyError`` if ``aid`` is not a key of ``adf_dict``.
    """
    article_features = adf_dict[aid]
    return article_features

def getCustomerFeatures(cid):
    """Return the entry stored in ``c_dict`` for customer id ``cid``.

    Raises ``KeyError`` if ``cid`` is not a key of ``c_dict``.
    """
    customer_features = c_dict[cid]
    return customer_features

def getRandomSample(exclude, n, pool=None):
    """Draw ``n`` distinct article ids uniformly at random, avoiding ``exclude``.

    The previous body ignored ``exclude`` and relied on names that are not
    defined in this notebook (``getPositiveSamples``, ``avail_articles``) plus
    the loop global ``cid``; it now uses its own arguments.

    Parameters
    ----------
    exclude : iterable or None
        Article ids that must not be returned. ``None`` excludes nothing.
    n : int
        Number of ids to draw (without replacement).
    pool : sequence, optional
        Ids to sample from. Defaults to the notebook-level ``all_products``.

    Returns
    -------
    numpy.ndarray
        ``n`` ids taken from ``pool``, none of which is in ``exclude``.

    Raises
    ------
    ValueError
        If fewer than ``n`` ids remain once ``exclude`` is removed.
    """
    if pool is None:
        pool = all_products
    pool = np.asarray(pool)

    excluded = set() if exclude is None else set(exclude)
    keep = np.array([x not in excluded for x in pool], dtype=bool)
    eligible = pool[keep]

    if n > len(eligible):
        raise ValueError(
            f"cannot sample {n} items: only {len(eligible)} remain after exclusion"
        )
    return np.random.choice(eligible, n, replace=False)
    

In [19]:
# Row-aligned outputs: P[i] is the model score for customer C[i] and article A[i].
P, A, C = [], [], []
for cid in tqdm(all_custs, total=len(all_custs)):
    c_feats = getCustomerFeatures(cid)

    # 1) The global top-20 articles are candidates for every customer.
    candidates = list(top_20_most_purchased)
    seen_candidates = set(candidates)

    # 2) The customer's own recent purchases, skipping ids already included.
    for aid in bdf_dict[cid]:
        if aid not in seen_candidates:
            candidates.append(aid)
            seen_candidates.add(aid)

    # 3) 20 further articles drawn uniformly, without replacement, from the
    #    next-most-popular pool. Ids already chosen get probability 0, so the
    #    draw can never repeat a candidate.
    p = np.array([0 if x in seen_candidates else 1 for x in most_purchased])
    p = p / p.sum()
    candidates.extend(np.random.choice(most_purchased, 20, replace=False, p=p))

    # One row per candidate: customer features followed by article features.
    X = np.array([
        np.concatenate((c_feats, getArticleFeatures(aid))) for aid in candidates
    ])
    # Columns 0..511 go to the model's first input, 512..1023 to the second
    # (assumes both feature vectors are 512-dimensional — confirm).
    cfx = X[:, :512]
    afx = X[:, 512:1024]

    # NOTE(review): predict() is called once per customer with a small batch;
    # batching several customers per call may be considerably faster.
    preds = model.predict((cfx, afx)).flatten()
    P += preds.tolist()
    C += [cid] * len(candidates)
    A += candidates

  0%|          | 0/189510 [00:00<?, ?it/s]

In [20]:
# Persist the prediction scores P (pickle protocol 3).
with open('../data/P.pkl', 'wb') as out_file:
    pkl.dump(P, out_file, protocol=3)

In [21]:
# Persist the candidate article ids A, row-aligned with P (pickle protocol 3).
with open('../data/A.pkl', 'wb') as out_file:
    pkl.dump(A, out_file, protocol=3)

In [22]:
# Persist the customer ids C, row-aligned with P and A (pickle protocol 3).
with open('../data/C.pkl', 'wb') as out_file:
    pkl.dump(C, out_file, protocol=3)