In [56]:
from random import Random, random, shuffle

import numpy as np
import pandas as pd

In [2]:
# Load the raw transactions; the path is relative to the notebook's working directory.
df = pd.read_excel('Online Retail.xlsx')

In [27]:
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [28]:
# Drop (in place) every row with a missing value in any column; per the counts
# above, only Description and CustomerID contain gaps.
df.dropna(inplace=True)

In [29]:
df.shape

(406829, 8)

In [30]:
df.CustomerID.nunique()

4372

In [34]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [37]:
# Make every stock code a string so codes can be joined into text and later used
# as Word2Vec tokens (presumably the Excel column mixes numeric and text codes — verify).
df['StockCode'] = df['StockCode'].astype(str)

In [43]:
# One space-separated "sentence" of stock codes per customer.
df_orders = (
    df.groupby('CustomerID')['StockCode']
      .agg(' '.join)
      .reset_index(name='StockCodeSentennce')
)

In [44]:
df_orders.shape

(4372, 2)

In [45]:
df_orders.head()

Unnamed: 0,CustomerID,StockCodeSentennce
0,12346.0,23166 23166
1,12347.0,85116 22375 71477 22492 22771 22772 22773 2277...
2,12348.0,84992 22951 84991 84991 21213 21213 22616 2198...
3,12349.0,23112 23460 21564 21411 21563 22131 22195 4819...
4,12350.0,21908 22412 79066K 79191C 22348 84086C 22551 2...


In [46]:
df_orders.dtypes

CustomerID            float64
StockCodeSentennce     object
dtype: object

In [51]:
# Customer IDs (df_orders has one row per customer) as a plain list, so it can be shuffled in place.
customers = df_orders['CustomerID'].tolist()

In [58]:
# Shuffle with a fixed seed so the train/test customer split (and everything
# trained on it) is the same after every kernel restart.
Random(14).shuffle(customers)

In [59]:
# the first 90 % of the (shuffled) customer IDs are used for training
n_train = round(0.9 * len(customers))
customers_train = customers[:n_train]

In [60]:
# split rows into train / test by customer: the mask is True for rows
# belonging to a training customer
is_train_row = df['CustomerID'].isin(customers_train)
train_df = df.loc[is_train_row]
test_df = df.loc[~is_train_row]

In [62]:
# purchase history of customers: one list of stock codes per training customer,
# in the same order as customers_train

# group once instead of re-filtering train_df for every single customer
codes_by_customer_train = train_df.groupby('CustomerID')['StockCode'].apply(list)
purchase_hist_train = codes_by_customer_train.loc[customers_train].tolist()

In [77]:
# purchase history of customers for test data now: one list of stock codes per
# held-out customer

# group once instead of re-filtering test_df per customer; sort=False keeps the
# customers in order of first appearance, i.e. the order of test_df['CustomerID'].unique()
purchase_hist_test = (
    test_df.groupby('CustomerID', sort=False)['StockCode'].apply(list).tolist()
)

In [79]:
len(purchase_hist_train) # a list of customers with a list of stockcodes of purchases 

3935

In [82]:
from gensim.models import Word2Vec
import nltk

In [83]:
# Train word2vec on the purchase histories: each customer's list of stock codes
# is one "sentence" and each stock code is one "word".
# Parameter meanings, per the gensim Word2Vec documentation:
#   window            - max distance between the current and the predicted token
#   sg = 1            - skip-gram training (0 would be CBOW)
#   hs = 0            - no hierarchical softmax; negative sampling is used instead
#   negative          - number of "noise" tokens drawn per positive example
#   alpha / min_alpha - initial learning rate and the value it decays to
#   seed              - RNG seed
# NOTE(review): gensim documents that a fully reproducible run also needs
# workers=1 — confirm whether exact reproducibility matters here.
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # number of negative samples
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

# scan the corpus once to build the vocabulary
model.build_vocab(purchase_hist_train, progress_per=200)

# 10 passes over the corpus; total_examples is the sentence count recorded by build_vocab
model.train(purchase_hist_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(3667413, 3704730)

In [84]:
print(model) # 3180 vocabulary, with vector size of 100 each 

Word2Vec<vocab=3180, vector_size=100, alpha=0.03>


In [94]:
# extract all vectors: the embedding matrix, one row per vocabulary item and
# one column per embedding dimension
X = model.wv.vectors

X.shape

(3180, 100)

Making Predictions from Embeddings

In [97]:
# Code/description pairs seen in the training rows; .copy() gives an independent
# frame so the in-place de-duplication that follows does not act on a slice of train_df.
products = train_df[['StockCode', 'Description']].copy()

In [98]:
# drop duplicates: keep one row per StockCode (its last occurrence), so each
# code ends up with a single description
products.drop_duplicates(inplace=True, subset='StockCode', keep='last')

In [100]:
# Lookup table for turning recommended stock codes back into readable names.
products_dict=products.groupby('StockCode')['Description'].apply(list).to_dict() # {StockCode: [Description]} - a one-element list per code after de-duplication

In [103]:
products_dict['85116']

['BLACK CANDELABRA T-LIGHT HOLDER']

In [105]:
# seeing vectors for a specific product
model.wv['85116']

array([-1.12337932e-01,  1.17319867e-01,  8.15182924e-02, -5.83326370e-02,
        6.55032322e-02, -5.50647266e-02, -2.12121353e-01,  2.32561193e-02,
       -6.70790905e-03, -5.33204973e-02,  4.28800322e-02, -2.04933390e-01,
        1.05022892e-01, -1.97778158e-02,  2.50801817e-02, -1.14791933e-02,
        8.07166621e-02,  8.37571770e-02, -1.07036069e-01, -4.79292087e-02,
       -1.78245962e-01, -6.48487806e-02,  3.43301333e-02,  1.37956040e-02,
       -7.04274103e-02,  1.14728563e-01, -7.15274960e-02,  5.58164343e-02,
        9.51602310e-02,  6.37888908e-02,  1.58054270e-02,  2.76659336e-02,
        3.20722535e-02,  1.39598459e-01,  1.41642347e-01,  5.95028736e-02,
        9.14749280e-02, -5.19945547e-02,  2.70709302e-02,  1.57832071e-01,
        1.18664742e-01, -2.93566380e-02,  9.09362137e-02, -3.66026945e-02,
       -4.32023890e-02, -6.13529794e-02,  2.04884093e-02,  2.19273165e-01,
        6.11920623e-05, -1.26724288e-01,  1.40771698e-02, -1.46797001e-01,
        6.22835904e-02,  

In [106]:
sim_prods = model.wv.most_similar('85116')

In [108]:
sim_prods[:5]

[('84952C', 0.507075846195221),
 ('84952B', 0.4939388632774353),
 ('47343A', 0.48911792039871216),
 ('90129F', 0.46960723400115967),
 ('84625A', 0.4571723937988281)]

In [143]:
# most similar products 
# function to get recommendation for products 

def recommender(stockcode, n=5, wv=None, catalog=None):
    """Return the catalogue entries of the ``n`` products most similar to ``stockcode``.

    Parameters
    ----------
    stockcode : str
        Stock code to look up; it must be present in the embedding vocabulary.
    n : int, default 5
        Number of neighbours to return.
    wv : optional
        Object exposing ``most_similar(key, topn=...)``. Defaults to the
        notebook-level ``model.wv``.
    catalog : dict, optional
        Mapping of stock code -> description entry. Defaults to the
        notebook-level ``products_dict``.

    Returns
    -------
    list
        One ``catalog`` entry per neighbour, most similar first. With the
        default ``products_dict`` each entry is itself a one-element list.

    Raises
    ------
    KeyError
        If ``stockcode`` is unknown to ``wv`` or a neighbour is missing
        from ``catalog``.
    """
    vectors = model.wv if wv is None else wv
    names_by_code = products_dict if catalog is None else catalog
    # Ask for exactly n neighbours: most_similar defaults to topn=10, so slicing
    # its default result silently capped every request at 10 items.
    neighbours = vectors.most_similar(stockcode, topn=n)
    return [names_by_code[code] for code, _similarity in neighbours]


In [144]:
recommender('22941')

[['CHRISTMAS LIGHTS 10 SANTAS '],
 ['CHRISTMAS LIGHTS 10 VINTAGE BAUBLES'],
 ['RIBBON REEL SNOWY VILLAGE'],
 ['MINI LIGHTS WOODLAND MUSHROOMS'],
 ['15CM CHRISTMAS GLASS BALL 20 LIGHTS']]

In [145]:
products_dict['22941']

['CHRISTMAS LIGHTS 10 REINDEER']

In [48]:
temp_string = "85116 22375 71477 22492 22771 22772 22773 2277"

In [49]:
temp_toke = nltk.word_tokenize(temp_string)

[source](https://www.analyticsvidhya.com/blog/2019/07/how-to-build-recommendation-system-word2vec-python/)
* Some of the code in the source article targets an outdated gensim Word2Vec API and was updated here.

In [134]:
# First StockCode / Description recorded (in row order) for each held-out customer;
# the StockCode is used below as the query item for the recommender.
# NOTE(review): "first" is by row position — confirm the rows are in chronological order.
test_df_firstbuy = test_df.groupby('CustomerID')[['StockCode','Description']].first()

In [137]:
test_df_firstbuy_code = test_df_firstbuy['StockCode'].tolist()

In [149]:
# One entry per held-out customer, aligned with test_df_firstbuy_code:
# the recommendations for their first purchase, or None when none can be made.
reco_list = []
for code in test_df_firstbuy_code:
    try:
        reco_list.append(recommender(code))
    except KeyError:
        # Some codes are missing from the vocabulary. Catch only the lookup
        # failure so real errors and Ctrl-C are not silently swallowed.
        reco_list.append(None)


In [150]:
test_df_firstbuy['recommendation'] = reco_list

In [151]:
test_df_firstbuy

Unnamed: 0_level_0,StockCode,Description,recommendation
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12428.0,21578,WOODLAND DESIGN COTTON TOTE BAG,"[[SAVE THE PLANET COTTON TOTE BAG], [RED RETRO..."
12430.0,23144,ZINC T-LIGHT HOLDER STARS SMALL,"[[ZINC T-LIGHT HOLDER STAR LARGE], [HANGING HE..."
12431.0,22941,CHRISTMAS LIGHTS 10 REINDEER,"[[CHRISTMAS LIGHTS 10 SANTAS ], [CHRISTMAS LIG..."
12471.0,22752,SET 7 BABUSHKA NESTING BOXES,"[[SPOTS ON RED BOOKCOVER TAPE], [SILK PURSE BA..."
12473.0,21498,RED RETROSPOT WRAP,"[[BLUE POLKADOT WRAP], [SKULLS AND CROSSBONES ..."
...,...,...,...
18242.0,84879,ASSORTED COLOUR BIRD ORNAMENT,"[[PAINTED METAL PEARS ASSORTED], [HEART IVORY ..."
18256.0,22600,CHRISTMAS RETROSPOT STAR WOOD,"[[CHRISTMAS RETROSPOT ANGEL WOOD], [CHRISTMAS ..."
18262.0,21400,RED PUDDING SPOON,"[[BLUE PUDDING SPOON], [RED EGG SPOON], [BLU..."
18263.0,21790,VINTAGE SNAP CARDS,"[[VINTAGE HEADS AND TAILS CARD GAME ], [WOODEN..."
