# Installing packages

In [None]:
# %pip install numpy
# %pip install pandas
# %pip install pot
# %pip install gensim
# %pip install scikit-learn
# %pip install scipy
# %pip install tqdm
# %pip install prophet

# importing packages

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import gensim
from sklearn.model_selection import train_test_split
from collections import Counter
from gensim.models import Word2Vec
from scipy.spatial.distance import cdist, pdist, squareform, euclidean
import ot 
from tqdm import tqdm
import os
import pickle
import ast
import datetime
from prophet import Prophet
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import recall_score, precision_score, mean_squared_error, r2_score, mean_absolute_error
from math import sqrt

## Pre-Processing

In [4]:
warnings.filterwarnings('ignore')

dtrans = pd.read_csv("./Dataset/Dunnhumby Dataset/transaction_data.csv")
dproducts = pd.read_csv("./Dataset/Dunnhumby Dataset/product.csv")

dproducts = dproducts[["PRODUCT_ID","SUB_COMMODITY_DESC"]]

df = dtrans[["household_key","BASKET_ID","DAY","PRODUCT_ID","QUANTITY","WEEK_NO"]]

df = pd.merge(dproducts,df, on='PRODUCT_ID', how='inner')

df['user_id'] = df['household_key'].rank(method='dense').astype(int)

df['order_id'] = df['BASKET_ID'].rank(method='dense').astype(int)

df['product_id'] = df['PRODUCT_ID'].rank(method='dense').astype(int)

df = df.drop(["household_key","BASKET_ID","PRODUCT_ID"],axis = 1)

df.columns  = ["commodity","day",'quantity',"weekno","user_id","order_id","product_id"]

dunnhumby_data_sorted = df.sort_values(by=['user_id', 'day'])
dunnhumby_data_sorted

dunnhumby_data_sorted[dunnhumby_data_sorted["user_id"]==1]

train = dunnhumby_data_sorted.iloc[0:0]
prior = dunnhumby_data_sorted.iloc[0:0]
print("Train & test files are created")

for i in range(1,dunnhumby_data_sorted["user_id"].nunique()+1):
    latestOrder = dunnhumby_data_sorted[dunnhumby_data_sorted["user_id"]==i].iloc[-1].copy()
    priorOrders = dunnhumby_data_sorted[dunnhumby_data_sorted["user_id"]==i].iloc[:-1].copy()

    train.loc[len(train)] = latestOrder
    prior = pd.concat([priorOrders,prior], ignore_index=True)
    
orders = pd.concat([train,prior],ignore_index=True)
orders = df.sort_values(by=['user_id', 'day'])

orders['order_count'] = orders.groupby(['user_id', 'order_id'])['order_id'].transform('count')

ordersClean = orders[orders['order_count'] > 5].copy()

ordersClean = ordersClean.drop(columns=['order_count'])

ordersClean.reset_index(drop=True, inplace=True)

ordersClean = ordersClean.drop(["commodity","product_id","quantity"],axis =1)

ordersClean = ordersClean.groupby('order_id').apply(lambda x: x.drop_duplicates(subset=['user_id'])).reset_index(drop=True)

orderNew = orders.iloc[0:0]

for i in range(1,ordersClean["user_id"].nunique()+1):
    user = ordersClean[ordersClean["user_id"]==i]
    if len(user) > 10:
        user['days_since_prior_order'] = user['day'] - user['day'].shift(1)
        user['order_number'] = range(1, len(user) + 1)

        orderNew = pd.concat([user,orderNew], ignore_index=True)
orderNew.to_csv("./Dataset/Customized Dunnhumby Dataset/orders.csv")
print("Order file is created")

common_order_ids = orderNew['order_id'].unique()

train = train[train['order_id'].isin(common_order_ids)]
prior = prior[prior['order_id'].isin(common_order_ids)]

combined = pd.concat([train,prior],ignore_index=True)

train.to_csv("./Dataset/Customized Dunnhumby Dataset/train.csv")
prior.to_csv("./Dataset/Customized Dunnhumby Dataset/prior.csv")
combined.to_csv("./Dataset/Customized Dunnhumby Dataset/combined.csv")

Train & test files are created
Order file is created


In [5]:
path_train = "./Dataset/Customized Dunnhumby Dataset/train.csv"
path_prior = "./Dataset/Customized Dunnhumby Dataset/prior.csv"
path_products = "./Dataset/Dunnhumby Dataset/product.csv"

train_orders = pd.read_csv(path_train)
prior_orders = pd.read_csv(path_prior)
products = pd.read_csv(path_products)

#Turn the product ID to a string
#This is necessary because Gensim's Word2Vec expects sentences, so we have to resort to this dirty workaround
train_orders["product_id"] = train_orders["product_id"].astype(str)
prior_orders["product_id"] = prior_orders["product_id"].astype(str)

train_products = train_orders.groupby("order_id").apply(lambda order: order['product_id'].tolist())
prior_products = prior_orders.groupby("order_id").apply(lambda order: order['product_id'].tolist())

#Create the final sentences
sentences = prior_products._append(train_products).values

#Train Word2Vec model
model = gensim.models.Word2Vec(sentences, vector_size=50, window=5, min_count=50, workers=4)

model.save("product2vec.model")
model.wv.save_word2vec_format("product2vec.model.bin", binary=True)

In [6]:
class BasketConstructor(object):
    '''
        Group products into baskets(type: list)
    '''
    def __init__(self, raw_data_dir, cache_dir):
        self.raw_data_dir = raw_data_dir
        self.cache_dir = cache_dir
    
    def get_orders(self):
        '''
            get order context information
        '''
        orders = pd.read_csv("./Dataset/Customized Dunnhumby Dataset/orders.csv")
        orders = orders.fillna(0.0)

        orders['days'] = orders.groupby(['user_id'])['days_since_prior_order'].cumsum()
        orders['days_last'] = orders.groupby(['user_id'])['days'].transform(max)
        orders['days_up_to_last'] = orders['days_last'] - orders['days']
        del orders['days_last']
        del orders['days']
        return orders
    
    def get_orders_items(self, prior_or_train):
        '''
            get detailed information of prior or train orders 
        '''
        orders_products = pd.read_csv(self.raw_data_dir + '%s.csv'%prior_or_train)
        return orders_products
    
    def get_users_orders(self, prior_or_train):
        '''
            get users' prior detailed orders
        '''
        if os.path.exists(self.cache_dir + 'users_orders.pkl'):
            with open(self.cache_dir + 'users_orders.pkl', 'rb') as f:
                users_orders = pickle.load(f)
        else:
            orders = self.get_orders()
            order_products_prior = self.get_orders_items(prior_or_train)
            users_orders = pd.merge(order_products_prior, orders[['user_id', 'order_id', 'order_number', 'days_up_to_last']], 
                        on = ['order_id', 'user_id'], how = 'left')
            with open(self.cache_dir + 'users_orders.pkl', 'wb') as f:
                pickle.dump(users_orders, f, pickle.HIGHEST_PROTOCOL)
        return users_orders
    
    def get_users_products(self, prior_or_train):
        '''
            get users' all purchased products
        '''
        if os.path.exists(self.cache_dir + 'users_products.pkl'):
            with open(self.cache_dir + 'users_products.pkl', 'rb') as f:
                users_products = pickle.load(f)
        else:
            users_products = self.get_users_orders(prior_or_train)[['user_id', 'product_id']].drop_duplicates()
            users_products['product_id'] = users_products.product_id.astype(int)
            users_products['user_id'] = users_products.user_id.astype(int)
            users_products = users_products.groupby(['user_id'])['product_id'].apply(list).reset_index()
            with open(self.cache_dir + 'users_products.pkl', 'wb') as f:
                pickle.dump(users_products, f, pickle.HIGHEST_PROTOCOL)
        return users_products

    def get_items(self, gran):
        '''
            get items' information
            gran = [departments, aisles, products]
        '''
        items = pd.read_csv(self.raw_data_dir + '%s.csv'%gran)
        return items
    
    def get_baskets(self, prior_or_train, reconstruct = False, none_idx = 49689):
        '''
            get users' baskets
        '''
        filepath = self.cache_dir + './basket_' + prior_or_train + '.pkl'
       
        if os.path.exists(filepath):
            with open(filepath, 'rb') as f:
                up_basket = pickle.load(f)
        else:          
            up = self.get_users_orders(prior_or_train).sort_values(['user_id', 'order_number', 'product_id'], ascending = True)
            uid_oid = up[['user_id', 'order_number']].drop_duplicates()
            up = up[['user_id', 'order_number', 'product_id']]
            up_basket = up.groupby(['user_id', 'order_number'])['product_id'].apply(list).reset_index()
            up_basket = pd.merge(uid_oid, up_basket, on = ['user_id', 'order_number'], how = 'left')
            for row in up_basket.loc[up_basket.product_id.isnull(), 'product_id'].index:
                up_basket.at[row, 'product_id'] = [none_idx]
            up_basket = up_basket.sort_values(['user_id', 'order_number'], ascending = True).groupby(['user_id'])['product_id'].apply(list).reset_index()
            up_basket.columns = ['user_id', 'basket']
            with open(filepath, 'wb') as f:
                pickle.dump(up_basket, f, pickle.HIGHEST_PROTOCOL)
        return up_basket
        
    def get_item_history(self, prior_or_train, reconstruct = False, none_idx = 49689):
        filepath = self.cache_dir + './item_history_' + prior_or_train + '.pkl'
        if (not reconstruct) and os.path.exists(filepath):
            with open(filepath, 'rb') as f:
                item_history = pickle.load(f)
        else:
            up = self.get_users_orders(prior_or_train).sort_values(['user_id', 'order_number', 'product_id'], ascending = True)
            item_history = up.groupby(['user_id', 'order_number'])['product_id'].apply(list).reset_index()
            item_history.loc[item_history.order_number == 1, 'product_id'] = item_history.loc[item_history.order_number == 1, 'product_id'] + [none_idx]
            item_history = item_history.sort_values(['user_id', 'order_number'], ascending = True)
            # accumulate 
            item_history['product_id'] = item_history.groupby(['user_id'])['product_id'].transform(pd.Series.cumsum)
            # get unique item list
            item_history['product_id'] = item_history['product_id'].apply(set).apply(list)
            item_history = item_history.sort_values(['user_id', 'order_number'], ascending = True)
            # shift each group to make it history
            item_history['product_id'] = item_history.groupby(['user_id'])['product_id'].shift(1)
            for row in item_history.loc[item_history.product_id.isnull(), 'product_id'].index:
                item_history.at[row, 'product_id'] = [none_idx]
            item_history = item_history.sort_values(['user_id', 'order_number'], ascending = True).groupby(['user_id'])['product_id'].apply(list).reset_index()
            item_history.columns = ['user_id', 'history_items']

            with open(filepath, 'wb') as f:
                pickle.dump(item_history, f, pickle.HIGHEST_PROTOCOL)
        return item_history 


class TaFengBasketConstructor(object):
    def __init__(self):
        pass
    
    def get_baskets(self):
        with open('./data/TaFeng/user_tran', 'r') as fd:
            lines = [l.strip().split()[2:-1] for l in fd.readlines()]
        with open('./data/TaFeng/user_tran', 'r') as fd:
            user_ids = [l.strip().split()[0] for l in fd.readlines()]
        with open('./data/TaFeng/user_tran', 'r') as fd:
            transaction_id = [l.strip().split()[1] for l in fd.readlines()]

        df = pd.DataFrame()
        df['User Id'] = user_ids
        df['Transaction Id'] = transaction_id
        df['Products'] = lines

        df = df.groupby(['User Id', 'Transaction Id']).agg(sum).reset_index()
        
        all_baskets = []
        for user in df['User Id']:
            all_baskets.append([])
            df_tmp = df[df['User Id'] == user]
            for trans in df_tmp['Transaction Id']:
                all_baskets[-1].append(df_tmp[df_tmp['Transaction Id'] == trans]['Products'].values[0])
        
        return all_baskets

In [7]:
class EmbeddingWrapper(object):
    def __init__(self, type):
        if type == 'product':
            self.model = Word2Vec.load("product2vec.model")
            
        elif type == 'aisles':
            self.model = Word2Vec.load("aisles2vec_min_count_50.model")
            self.products = pd.read_csv("/kaggle/input/instacart-market-basket-analysis/products.csv")
            self.p2aisles = dict(zip(self.products.product_id.astype(str), self.products.aisle_id.astype(str)))
            
            
        elif type == 'tafeng_products':
            self.model = Word2Vec.load("tafeng2vec_min_count_50.model")

        self.vocab_len = len(self.model.wv.index_to_key)
        self.word2index = dict(zip([self.model.wv.index_to_key[i] for i in range(self.vocab_len)],
                              [i for i in range(self.vocab_len)]))
        self.word_index_df = pd.DataFrame(data=list(self.word2index.items()), columns=['product_id', 'emb_id'])
        
    def p2aisle_f(self, i):
        return self.p2aisles[i]

    def lookup_ind_f(self, i):
        return self.word2index[i]

    def get_closest_of_set(self, item_id, set_of_candidates):
        vec_of_interest = self.model.wv.vectors[item_id]
        closest = np.argmin([euclidean(vec_of_interest, self.model.wv.vectors[x]) for x in set_of_candidates])
        return set_of_candidates[closest]
    
    def find_closest_from_preds(self, pred, candidates_l_l):
        closest_from_history = []
        for p in pred:
            closest_from_history.append(self.get_closest_of_set(p, [x for seq in candidates_l_l for x in seq]))
        return closest_from_history
        
    def basket_dist_REMD(self, baskets):
        #Relaxed EMD as lower bound. It is basically a nearest neighborhood search to 
        #find the closest word in doc B for each word in doc A and then take sum of all minimum distances.    
        basket1_vecs = self.model.wv.vectors[[x for x in baskets[0]]]
        basket2_vecs = self.model.wv.vectors[[x for x in baskets[1]]]
        
        distance_matrix = cdist(basket1_vecs, basket2_vecs)
        
        return max(np.mean(np.min(distance_matrix, axis=0)),
                   np.mean(np.min(distance_matrix, axis=1)))
        
    def basket_dist_EMD(self, baskets):
        basket1 = baskets[0]
        basket2 = baskets[1]
        dictionary = np.unique(list(basket1) + list(basket2))
        vocab_len_ = len(dictionary)
        product2ind = dict(zip(dictionary, np.arange(vocab_len_)))

        # Compute distance matrix.
        dictionary_vecs = self.model.wv.vectors[[x for x in dictionary]]
        distance_matrix = squareform(pdist(dictionary_vecs))

        if np.sum(distance_matrix) == 0.0:
            # `emd` gets stuck if the distance matrix contains only zeros.
            return float('inf')

        def nbow(document):
            bow = np.zeros(vocab_len_, dtype=np.float32)
            for d in document:
                bow[product2ind[d]] += 1.
            return bow / len(document)

        # Compute nBOW representation of documents.
        d1 = nbow(basket1)
        d2 = nbow(basket2)

        # Compute WMD.
        return ot.emd2(d1, d2, distance_matrix)

    def remove_products_wo_embeddings(self, all_baskets, user_ids):
        all_baskets_filtered = []
        new_user_ids = []
        for (s, u_id) in zip(all_baskets, user_ids):
            s_cp = []
            for b in s:
                b_cp = [x for x in b if x in self.model.wv.index_to_key]
                if len(b_cp) > 0:
                    s_cp.append(b_cp)
            if len(s_cp) > 0:
                all_baskets_filtered.append(s_cp)
                new_user_ids.append(u_id)
        return all_baskets_filtered, new_user_ids
    
#     def pad_sequences(self, sequences, max_len, padding_value=0):
#         return [sequence + [padding_value] * (max_len - len(sequence)) for sequence in sequences]

#     def remove_products_wo_embeddings_gpu(self, all_baskets):
#         max_len = max(len(seq) for s in all_baskets for seq in s)

#         padded_all_baskets = [
#             self.pad_sequences(s, max_len, padding_value='') for s in all_baskets
#         ]

#         tf_remove_products_wo_embeddings = tf.function(self.remove_products_wo_embeddings)

#         ragged_all_baskets = tf.ragged.constant(padded_all_baskets, dtype=tf.int32)

#         result_on_gpu = tf_remove_products_wo_embeddings(ragged_all_baskets)

#         result_on_cpu = result_on_gpu.numpy()

#         return result_on_cpu

In [15]:
def nested_change(item, func):
    if isinstance(item, list):
        return [nested_change(x, func) for x in item]
    return func(item)

def remove_products_which_are_uncommon(all_baskets, user_ids, max_num=500):
    print('Removing all but {} most common products'.format(max_num))
    p = []
    for s in all_baskets:
        for b in s:
            p.extend(b)
    product_counter = Counter(p)
    most_common_products = [x for x, _ in product_counter.most_common(max_num)]
    new_user_ids = []
    all_baskets_filtered = []
    for (s, u_id) in zip(all_baskets, user_ids):
        s_cp = []
        for b in s:
            b_cp = [x for x in b if x in most_common_products]
            if len(b_cp) > 0:
                s_cp.append(b_cp)
        if len(s_cp) > 0:
            new_user_ids.append(u_id)
            all_baskets_filtered.append(s_cp)
    return all_baskets_filtered, new_user_ids

def remove_short_baskets(all_baskets, user_ids, l_b = 0, l_s = 3):
    new_user_ids = []
    all_baskets_filtered = []
    for (s, u_id) in zip(all_baskets, user_ids):
        s_cp = []
        for b in s:
            if len(b) > l_b:
                s_cp.append(b)
        if len(s_cp) > l_s:
            new_user_ids.append(u_id)
            all_baskets_filtered.append(s_cp)
    return all_baskets_filtered, new_user_ids

# def split_data(all_baskets):
#     users = []
#     train_ub, test_ub = train_test_split(all_baskets, test_size=0.07, random_state=0)
#     train_ub, val_ub = train_test_split(train_ub, test_size=0.07, random_state=0)
    
#     train_user_id = train_ub.user_id.values.tolist()
#     test_user_id = test_ub.user_id.values.tolist()
#     val_user_id = val_ub.user_id.values.tolist()
    
#     users.append(train_user_id)
#     users.append(test_user_id)
#     users.append(val_user_id)
    
#     print(train_ub)
#     train_ub = nested_change(list(train_ub), int)
#     print("train done")
#     test_ub = nested_change(list(test_ub), int)
#     print("test done")
#     val_ub = nested_change(list(val_ub), int)
#     print("val done")
    
#     test_ub_input = [x[:-1] for x in test_ub]
#     test_ub_target = [x[-1] for x in test_ub]
    
#     val_ub_input = [x[:-1] for x in val_ub]
#     val_ub_target = [x[-1] for x in val_ub]
    
#     return train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target, val_user_id, test_user_id

def convertType(item):
    return [ast.literal_eval(x) for x in item]

def split_data(all_baskets):
    users = []
    train_ub, test_ub = train_test_split(all_baskets, test_size=0.07, random_state=0)
    train_ub, val_ub = train_test_split(train_ub, test_size=0.07, random_state=0)
    
    train_user_id = train_ub.user_id.values.tolist()
    test_user_id = test_ub.user_id.values.tolist()
    val_user_id = val_ub.user_id.values.tolist()
    
    users.append(train_user_id)
    users.append(test_user_id)
    users.append(val_user_id)
    
    train_ub = convertType(train_ub['basket'])
#     train_ub = pd.DataFrame({'basket':list(train_ubC),'user_id':train_ub['user_id']})
    print("train done")
    
    test_ub = convertType(test_ub['basket'])
#     test_ub = pd.DataFrame({'basket':list(test_ubC),'user_id':test_ub['user_id']})
    print("test done")
    
    val_ub = convertType(val_ub['basket'])
#     val_ub = pd.DataFrame({'basket':val_ubC,'user_id':val_ub['user_id']})
    print("val done")
    
    test_ub_input = [x[:-1] for x in test_ub]
    test_ub_target = [x[-1] for x in test_ub]
    
    val_ub_input = [x[:-1] for x in val_ub]
    val_ub_target = [x[-1] for x in val_ub]
    
    return train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target, val_user_id, test_user_id

In [9]:
class KnnDtw(object):    
    def __init__(self, n_neighbors):
        self.n_neighbors = n_neighbors  
        self.length_to_consider = 10
    
    def _spring_dtw_distance(self, ts_a, ts_b, best_for_ts_a, d, d_lower_bound):
        """Returns the DTW subsequence similarity distance between two 2-D
        timeseries numpy arrays.
        
        Following Subsequence Matching in Data Streams, Machiko Toyoda, Yasushi Sakurai

        Arguments
        ---------
        ts_a, ts_b : array of shape [n_samples, n_timepoints]
            Two arrays containing n_samples of timeseries data
            whose DTW distance between each sample of A and B
            will be compared
            
        best_for_ts_a: list of length n_neighbors. The entries denote the
            stortest distances found so far. This is for stopping the 
            calculation early utilizing a lower bound approximation.
        
        d : DistanceMetric object the distance measure used for market baskets A_i - B_j 
        in the DTW dynamic programming function
        
        d_lower_bound : Lower bound of DistanceMetric object the distance measure used for market baskets A_i - B_j 
        in the DTW dynamic programming function
        
        Returns
        -------
        DTW distance between A and B
        """
        
        # Create cost matrix via broadcasting with large int
        M, N = len(ts_a), len(ts_b)

        #Compute REMD distances
        REMD_gen = map(d_lower_bound, [(i,j) for i in ts_a for j in ts_b])
        d_REMD_min = np.fromiter(REMD_gen, dtype=np.float32)

        #Break here if there is no chance that this is the shortest
        if np.sum(d_REMD_min[np.argpartition(d_REMD_min, M)][:M]) > max(best_for_ts_a):
            return np.inf, ts_b[0]

        cost = np.inf * np.ones((M, N))

        #Compute all distances
        d_mat = np.zeros((M,N))
        for i in range(M):
            for j in range(N):
                d_mat[i,j] = d((ts_a[i], ts_b[j]))

        # Initialize the first row and column
        cost[0, 0] = d((ts_a[0], ts_b[0]))
        for i in range(1, M):
            cost[i, 0] = cost[i-1, 0] + d_mat[i, 0]

        for j in range(1, N):
            cost[0, j] = d_mat[0, j]
            
        # Populate rest of cost matrix within window
        for i in range(1, M):
            w = 1.
            for j in range(1, N):
                choices = cost[i-1, j-1], cost[i, j-1], cost[i-1, j]
                cost[i, j] = min(choices) + w * d_mat[i,j]

        min_idx = np.argmin(cost[-1,:-1])
        # Return DTW distance, prediction for next basket
        return cost[-1,min_idx], ts_b[min_idx + 1]
  
    def _dist_matrix(self, x, y, d, d_lower_bound):
#         x_s = np.shape(x)
#         y_s = np.shape(y)
        x_s = [len(x)]
        y_s = [len(y)]
        dm = np.inf * np.ones((x_s[0], y_s[0])) 
        next_baskets = np.empty((x_s[0], y_s[0]), dtype=object)
        
        for i in tqdm(range(0, x_s[0])):
            # Ensure all elements of x have the same length
            max_length = max(len(seq) for seq in x[i] if seq is not None)
            x[i] = [np.array(seq)[-max_length:] for seq in x[i]]

            best_dist = [np.inf] * max(self.n_neighbors)
            for j in range(0, y_s[0]):
                # Ensure all elements of y have the same length
                max_length_y = max(len(seq_y) for seq_y in y[j])
                y[j] = [np.array(seq_y)[-max_length_y:] for seq_y in y[j]]

                dist, pred = self._spring_dtw_distance(x[i], y[j], best_dist, d, d_lower_bound)
                if dist < np.max(best_dist):
                    best_dist[np.argmax(best_dist)] = dist               
                dm[i, j] = dist
                next_baskets[i, j] = pred
    
        return dm, next_baskets
        
        
    def predict(self, tr_d, te_d, d, d_lower_bound):
        dm, predictions = self._dist_matrix(te_d, tr_d, d, d_lower_bound)
        
        preds_total_l = []
        distances_total_l = []
        for k in self.n_neighbors:
            # Identify the k nearest neighbors
            knn_idx = dm.argsort()[:, :k]
            preds_k_l = []
            distances_k_l = []
                
            for i in range(len(te_d)):
                preds = [predictions[i][knn_idx[i][x]] for x in range(knn_idx.shape[1])]
                distances = np.mean([dm[i][knn_idx[i][x]] for x in range(knn_idx.shape[1])])
                pred_len = int(np.mean([len(te_d[i][x]) for x in range(len(te_d[i]))]))
                preds = [x for x, y in Counter([n for s in preds for n in s]).most_common(pred_len)]                
                preds_k_l.append(preds)
                distances_k_l.append(distances)
            preds_total_l.append(preds_k_l)
            distances_total_l.append(distances_k_l)
            
        return preds_total_l, distances_total_l

# FB Prophet

In [10]:
def load_data():
    df = pd.read_csv("./Dataset/Customized Dunnhumby Dataset/combined.csv")
    return df

def preprocess_data(retail_dataframe, cust_list):
    df = retail_dataframe[['SHOP_DATE', 'order_id', 'user_id', 'product_id', 'quantity']]
    df = df[df['user_id'].isin(cust_list)]
    df = df.dropna()
    df['SHOP_DATE'] = df['SHOP_DATE'].astype(str)
    df['user_id'] = df['user_id'].astype(str)
    df['product_id'] = df['product_id'].astype(str)
    df['Date'] = pd.to_datetime(df['SHOP_DATE'], format='%Y%m%d').df.strftime('%Y/%m/%d')
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.groupby(['Date', 'order_id', 'user_id', 'product_id'])['quantity'].apply(sum).reset_index()
    df = df[['Date', 'user_id', 'product_id', 'quantity']].rename({'Date': 'ds', 'quantity': 'y'}, axis='columns')
    return df

def predict(pred_df):
    cust_list = pred_df['User'].tolist()
    fullcsv_df = load_data()
    
    train_df = preprocess_data(fullcsv_df, cust_list)
    FINAL_DF = pd.DataFrame()
    
    for cust_id in cust_list:
        prod_list = pred_df.loc[pred_df['User'] == cust_id, 'product_id'].iloc[0]
        def_new = train_df[train_df['user_id'] == cust_id]
        def_new = df_new[df_new['product_id'].isin(prod_list)]
        split_group = df_new.groupby('product_id')
        splits = [split_group.get_groups(x) for x in split_group.groups]
        
        for p_df in splits:
            productcode = p_df['product_id'].iloc[0]
            
            if(len(p_df) > 1):
                product_df = p_df[['ds', 'y']]
                last_date = product_df['ds'].max().strftime('%Y-%m-%d')
                m = Prophet(interval_width=0.95)
                m.fit(product_df)
                future = m.make_future_dataframe(periods=1, freq='W', include_history=False)
                forecast = m.predict(future)
                forecast = forecast[['yhat', 'ds']]
                forecast['user_id'] = cust_id
                forecast['product_id'] = productcode
                forecast = forecast.rename(columns={'yhat': 'quantity'})
                
                FINAL_DF = pd.concat([FINAL_DF, forecast], ignore_index=True)
                
    return FINAL_DF

# Evaluation Metrics Functions

In [11]:
def eval_item_pred(val_pred_array, val_target_array):
    jac_coef_list = []
    for p_list, r_list in zip(val_pred_array, val_target_array):
        p = len(list(set(p_list).intersection(r_list)))
        q = len([i for i in p_list if i not in r_list])
        r = len([i for i in r_list if i in p_list])
        j = p/(p+q+r)
        jac_coef_list.append(j)
    jaccard_coefficient = sum(jac_coef_list) / len(jac_coef_list)
    
    multibinarizer = MultiLabelBinarizer()
    
    target_array = multibinarizer.fit(val_target_array).transform(val_taget_array)
    pred_array = multibinarizer.transform(val_pred_array)
    
    p = precision_score(target_array, pred_array, average='samples')
    r = recall_score(target_array, pred_array, average='samples')
    
    f1_score = 2 * ((p*r)/(p+r))
    
    return jaccard_coefficient, f1_score

def eval_quantity_pred(result_df):
    df = pd.read_csv('./Dataset/Customized Dunnhumby Dataset/combined.csv', index_col=0)
    df['Date'] = pd.to_datetime(df['Date'])
    result_df_new = result_df
    result_df_new['product_id'] = result_df_new['product_id'].astype(str)
    user_list = list(set(result_df_new.user_id.to_list()))
    df = df[df['user_id'].isin(user_list)]
    df_fb = df[df.groupby('user_id')['Date'].transform('max') == df['Date']]
    df_fb = df_fb.rename(columns={'quantity': 'actual_quantity'})
    df_fb["product_id"] = df_fb["product_id"].astype(str)
    final_df = pd.merge(result_df_new, df_fb, on=['user_id', 'product_id'])
    final_df['bool'] = np.where((final_df['quantity'] == final_df['actual_quantity']), True, False)
    
    mse = mean_squared_error(final_df.actual_quantity, final_df.quantity)
    r2_s = r2_score(final_df.actual_quantity, final_df.quantity)
    mae = mean_absolute_error(final_df.actual_quantity, final_df.quantity)
    rmse = sqrt(mean_squared_error(final_df.actual_quantity, final_df.quantity))
    hit_count = final_df['bool'].value_counts()[True]
    hit_precentage = hit_count/len(final_df)
    
    return mse, r2_s, mae, rmse, hit_precentage

In [16]:
def run():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./Dataset/Customized Dunnhumby Dataset/', './Dataset/Customized Dunnhumby Dataset/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)
    user_ids = ub_basket.user_id.values.tolist()
    all_baskets = ub_basket.basket.values
    all_baskets = nested_change(list(all_baskets), str)
    all_baskets, user_ids = embedding_wrapper.remove_products_wo_embeddings(all_baskets, user_ids)
    all_baskets, user_ids = remove_products_which_are_uncommon(all_baskets, user_ids)
    all_baskets, user_ids = remove_short_baskets(all_baskets, user_ids)
    all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)
    
    max_length = max(len(basket) for basket in all_baskets)
    all_baskets_padded = [basket + [] * (max_length - len(basket)) for basket in all_baskets]
    all_baskets_df = pd.DataFrame(all_baskets_padded, columns=[f'item_{i}' for i in range(max_length)])
    all_baskets_array = all_baskets_df.astype(str).agg(', '.join, axis=1)
    all_baskets_df = pd.DataFrame(all_baskets_array, columns=['basket'])
    all_baskets_df['user_id'] = user_ids

    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target, val_user, test_user = split_data(all_baskets_df)
    train_ub = [list(filter(lambda x: x is not None, sublist)) for sublist in train_ub]
    val_ub_input = [list(filter(lambda x: x is not None, sublist)) for sublist in val_ub_input]
    
    # Item prediction using KNN-DTW
    knndtw = KnnDtw(n_neighbors=[5])
    print(train_ub)
    print("Predicting..")
    preds_all, distances = knndtw.predict(train_ub, val_ub_input, embedding_wrapper.basket_dist_EMD, 
                                          embedding_wrapper.basket_dist_REMD)
    
    preds_all = np.array(preds_all)
    distances = np.array(distances)
    [preds_all] = preds_all
    [distances] = distances
    
    final_pred_df = pd.DataFrame({'User': val_user, 'Pred_Basket': preds_all, 'distances': distances})
    target_df = pd.DataFrame({'User': val_user, 'Basket': val_ub_target})
    
    emd_df = embedding_wrapper.word_index_df
    emd_df["product_id"] = emd_df["product_id"].astype(str)
    
    final_pred__list = final_pred_df['Pred_Basket'].to_list()
    final_target__list = target_df['Basket'].to_list()
    
    final_pred_df['Basket'] = final_pred__list
    target_df['Basket'] = final_target__list
    
    new_df = final_pred_df.explode('Basket').reeset_index(drop=True)
    new_target_df = target_df.explode('Basket').reeset_index(drop=True)

    new_df['product_id'] = new_df['Basket'].map(emd_df.set_index('emd_id')['product_id'])
    new_target_df['product_id'] = new_target_df['Basket'].map(emd_df.set_index('emd_id')['product_id'])
    new_basket_df = new_df.groupby(['User', 'distances'])['product_id'].apply(list).reset_index()
    
    result_df = predict(new_basket_df)
    
    jaccard_coefficient, f1_score = eval_item_pred(final_pred__list, val_ub_target)
    mse, r2_s, mae, rmse, hit_percentage = eval_quantity_pred(result_df)
    
    print(result_df)
    return result_df
    

if __name__ == "__main__":
    run()

Removing all but 500 most common products
train done
test done
val done
[[[11, 493, 5, 217, 65, 86, 1, 138, 14], [157, 217, 86, 4], [157, 217, 65, 101, 86, 82, 1, 229, 442, 404], [346, 436], [5, 217, 178, 101, 86, 1, 60, 69], [11, 361, 86, 203, 95, 33, 6], [71], [5, 2, 85], [1], [16, 2, 278], [5, 101, 2, 278, 60, 69], [11, 5, 239, 86, 2, 135, 306, 240, 85], [11, 5, 239, 19, 37, 240, 20], [5, 65, 105, 19, 86, 20], [11, 10, 5, 65, 101, 2, 69], [5, 217, 65, 16, 460, 2, 85], [5, 65, 178, 16, 105, 97, 2, 85], [5, 58], [21, 5, 59], [11, 217, 65, 3, 82, 404]], [[393, 1, 198], [148, 57, 118, 233, 1, 47], [3, 1, 0], [233, 133], [231, 1, 478], [23, 3, 1, 318], [146, 31, 1], [23, 318], [145, 220, 233, 1], [1], [140, 134, 148, 1, 53, 0, 176, 47, 18], [280, 66, 478, 176], [134, 2, 0, 246], [148, 2, 53, 33], [134, 496, 3, 2], [118, 2, 82], [25, 134, 89, 31], [148, 2, 31, 326, 47, 149], [134, 2, 33, 318], [19, 2, 323], [180, 134, 148, 338, 2, 478], [134, 338, 278, 149, 323], [439, 469, 23, 283, 134, 

 38%|█████████████▊                      | 51/133 [7:37:39<12:15:50, 538.42s/it]


KeyboardInterrupt: 