Creates the training data required for the model in user-product paired format. Pairs are generated using 3 sampling strategies.

In [1]:
import pandas as pd, os, numpy as np
import plotly.express as px
pd.options.display.max_columns = 50
import swifter, datetime, pickle as pkl
from tqdm.notebook import tqdm

In [2]:
df = pd.read_parquet('../data/train.parquet')

In [3]:
d_end = datetime.datetime(2019, 9, 22).date()
d_start = datetime.datetime(2019, 9, 1).date()

In [4]:
tdf = df[(df['date'] >= d_start) & (df['date'] <= d_end)].reset_index(drop=True).copy()

In [5]:
avail_articles = tdf['article_id'].unique()

In [6]:
bdf = pd.read_parquet('../data/bought_articles_in_order.parquet')

In [7]:
with open('../data/emb_map.pkl', 'rb') as handle:
    adf_dict = pkl.load(handle)

In [8]:
with open('../data/cemb_map.pkl', 'rb') as handle:
    cdf_dict = pkl.load(handle)

In [9]:
def getPositiveSamples(cid):
    purchases = bdf_dict[cid]
    return purchases
    
def positiveCount(cid):
    return len(bdf_dict[cid])
    
def getArticleFeatures(aid):
    return adf_dict[aid]

def getCustomerFeatures(cid):
    return cdf_dict[cid]

def getNegatives(cid, n):
    positives = set(getPositiveSamples(cid))
    num_valid = len(avail_articles) - len(positives)
    p = np.array([0 if x in positives else 1 for x in avail_articles])
    p = p/sum(p)
    negatives = np.random.choice(avail_articles, n, replace=False, p=p)
    return negatives
    

In [10]:
custs = set(bdf['cust_id']).intersection(set(list(cdf_dict.keys())))

In [11]:
custs = list(custs)

In [12]:
custs = np.random.choice(custs, size=40000, replace=False)

In [13]:
bdf_dict = bdf.set_index('cust_id')['article_id'].to_dict()

In [14]:
X,Y,Q = [], [], []
for cid in tqdm(custs, total=len(custs)):
    x_i, y_i, q_i = [], [], []
    
    c_feats = getCustomerFeatures(cid)
    positives = getPositiveSamples(cid)
    
    for aid in positives:
        a_feats = getArticleFeatures(aid)
        combined_feats = np.concatenate((c_feats, a_feats))
        
        x_i.append(combined_feats)
        y_i.append(1)
        q_i.append(cid)
        
    n_negatives = 15
    
    negatives = getNegatives(cid, n_negatives)
    
    for aid in negatives:
        a_feats = getArticleFeatures(aid)
        combined_feats = np.concatenate((c_feats, a_feats))
        
        x_i.append(combined_feats)
        y_i.append(0)
        q_i.append(cid)
    
    X.append(x_i)
    Y += y_i
    Q += q_i
    
    print(len(Y))

  0%|          | 0/40000 [00:00<?, ?it/s]

21
37
54
71
97
115
134
159
202
226
257
275
295
312
330
351
367
384
401
419
436
453
470
487
504
520
537
556
575
592
608
627
644
662
687
705
725
744
760
780
798
815
832
848
865
883
903
919
943
974
991
1012
1033
1074
1095
1112
1132
1149
1166
1190
1209
1229
1246
1278
1294
1314
1333
1349
1368
1386
1403
1420
1437
1456
1473
1489
1508
1525
1553
1569
1587
1605
1640
1656
1674
1708
1729
1748
1764
1782
1798
1816
1834
1851
1872
1904
1931
1950
1972
1994
2026
2044
2060
2081
2097
2114
2132
2150
2169
2187
2203
2223
2239
2256
2273
2292
2308
2324
2340
2365
2382
2399
2416
2439
2456
2475
2497
2529
2558
2577
2599
2616
2634
2650
2666
2682
2699
2721
2739
2758
2774
2796
2815
2833
2850
2868
2892
2909
2931
2950
2967
2985
3006
3025
3048
3064
3083
3105
3122
3139
3155
3173
3190
3208
3227
3244
3261
3279
3301
3318
3334
3351
3369
3389
3410
3426
3448
3471
3490
3506
3528
3545
3562
3579
3596
3615
3635
3654
3670
3687
3706
3725
3746
3763
3780
3803
3820
3838
3854
3871
3892
3911
3929
3950
3975
3991
4009
4025
4043
4059
4076
4

In [15]:
X = np.concatenate((X))

In [16]:
X.shape

(754154, 1024)

In [17]:
fdf = pd.DataFrame(X)

In [18]:
cfeats = ['c_' + str(x) for x in range(512)]
afeats = ['a_' + str(x) for x in range(512)]

In [19]:
fdf.columns = cfeats + afeats

In [20]:
fdf['Y'] = Y

In [21]:
fdf['Q'] = Q

In [22]:
fdf['Y'].value_counts()

0    600000
1    154154
Name: Y, dtype: int64

In [23]:
fdf.to_parquet('../data/sampleTrain.parquet')