In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve

In [4]:
retail_data = pd.read_csv('res_wo_ind.csv')
retail_data.head(5)

Unnamed: 0,CardHolder,ChequeId,CALDAY,Discount,SalesCount,SalesSumma,Material,Subcategory,net_weight,rpa_wgh1,rpa_wgh2,rpa_wgh3,rpa_wgh4,unit_of_wt,vendor,engname_rpa3,engname_rpa2,engname_rpa4,Class
0,10008254,1013172924,20200130,397.28,1.258,927.0,299994,130102,0.0,FOOD,13.0,1301,130102.0,G,,Fresh salmon,Fresh and processed fish products,"Cut salmon, service",FOOD
1,13185016,493914559,20181116,174.99,0.982,755.35,299994,130102,0.0,FOOD,13.0,1301,130102.0,G,,Fresh salmon,Fresh and processed fish products,"Cut salmon, service",FOOD
2,14790718,246904176,20180404,92.96,0.65,522.84,299994,130102,0.0,FOOD,13.0,1301,130102.0,G,,Fresh salmon,Fresh and processed fish products,"Cut salmon, service",FOOD
3,14790718,246904334,20180404,81.33,0.57,458.68,299994,130102,0.0,FOOD,13.0,1301,130102.0,G,,Fresh salmon,Fresh and processed fish products,"Cut salmon, service",FOOD
4,15088785,178450469,20180128,49.9,1.044,939.18,299994,130102,0.0,FOOD,13.0,1301,130102.0,G,,Fresh salmon,Fresh and processed fish products,"Cut salmon, service",FOOD


In [5]:
retail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995136 entries, 0 to 995135
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   CardHolder    995136 non-null  int64  
 1   ChequeId      995136 non-null  int64  
 2   CALDAY        995136 non-null  int64  
 3   Discount      995136 non-null  float64
 4   SalesCount    995136 non-null  float64
 5   SalesSumma    995136 non-null  float64
 6   Material      995136 non-null  int64  
 7   Subcategory   995136 non-null  int64  
 8   net_weight    995136 non-null  float64
 9   rpa_wgh1      995136 non-null  object 
 10  rpa_wgh2      995136 non-null  float64
 11  rpa_wgh3      995136 non-null  int64  
 12  rpa_wgh4      995136 non-null  float64
 13  unit_of_wt    995136 non-null  object 
 14  vendor        910395 non-null  float64
 15  engname_rpa3  995136 non-null  object 
 16  engname_rpa2  995136 non-null  object 
 17  engname_rpa4  995136 non-null  object 
 18  Clas

In [6]:
cleaned_retail = retail_data.loc[pd.isnull(retail_data.CardHolder) == False]

In [7]:
item_lookup = cleaned_retail[['Material', 'engname_rpa3']].drop_duplicates() # Only get unique item/description pairs
# item_lookup['StockCode'] = item_lookup.StockCode.astype(str)
item_lookup.head()

Unnamed: 0,Material,engname_rpa3
0,299994,Fresh salmon
82,299997,Fresh salmon
1707,14228,Fresh salmon
4077,299996,Fresh salmon
4144,14233,Fresh salmon


In [9]:
cleaned_retail['SalesCount'] = cleaned_retail.SalesCount.astype(int) # Convert to int for customer ID
cleaned_retail_2 = cleaned_retail[['Material', 'SalesCount', 'CardHolder']] # Get rid of unnecessary info
grouped_cleaned = cleaned_retail_2.groupby(['CardHolder', 'Material']).sum().reset_index() # Group together
grouped_cleaned.SalesCount.loc[grouped_cleaned.SalesCount == 0] = 1 # Replace a sum of zero purchases with a one to
# indicate purchased
grouped_purchased = grouped_cleaned.query('SalesCount > 0')

In [10]:
grouped_purchased.head()

Unnamed: 0,CardHolder,Material,SalesCount
0,10008254,299994,1
1,10011935,12276,3
2,10011935,12825,1
3,10011935,14228,1
4,10011935,18234,2


In [16]:
customers = list(np.sort(grouped_purchased.CardHolder.unique())) # Get our unique customers
products = list(grouped_purchased.Material.unique()) # Get our unique products that were purchased
SalesCount = list(grouped_purchased.SalesCount) # All of our purchases

rows = grouped_purchased.CardHolder.astype(pd.api.types.CategoricalDtype(categories=customers)).cat.codes 
# Get the associated row indices
cols = grouped_purchased.Material.astype(pd.api.types.CategoricalDtype(categories=products)).cat.codes 
# Get the associated column indices
purchases_sparse = sparse.csr_matrix((SalesCount, (rows, cols)), shape=(len(customers), len(products)))

In [17]:
purchases_sparse

<18977x7251 sparse matrix of type '<class 'numpy.longlong'>'
	with 547196 stored elements in Compressed Sparse Row format>

In [18]:
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] # Number of possible interactions in the matrix
num_purchases = len(purchases_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_purchases/matrix_size))
sparsity

99.6023349244195

In [19]:
import random

In [20]:
def make_train(ratings, pct_test = 0.2):
    
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,item index into list
    random.seed(0) # Set the random seed to zero for reproducibility
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of user-item pairs without replacement
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices
    training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered  

In [21]:
product_train, product_test, product_users_altered = make_train(purchases_sparse, pct_test = 0.2)

In [22]:
def implicit_weighted_ALS(training_set, lambda_val = 0.1, alpha = 40, iterations = 10, rank_size = 20, seed = 0):
    
    
    conf = (alpha*training_set) # To allow the matrix to stay sparse, I will add one later when each row is taken 
                                # and converted to dense. 
    num_user = conf.shape[0]
    num_item = conf.shape[1] # Get the size of our original ratings matrix, m x n
    
    # initialize our X/Y feature vectors randomly with a set seed
    rstate = np.random.RandomState(seed)
    
    X = sparse.csr_matrix(rstate.normal(size = (num_user, rank_size))) # Random numbers in a m x rank shape
    Y = sparse.csr_matrix(rstate.normal(size = (num_item, rank_size))) # Normally this would be rank x n but we can 
                                                                 # transpose at the end. Makes calculation more simple.
    X_eye = sparse.eye(num_user)
    Y_eye = sparse.eye(num_item)
    lambda_eye = lambda_val * sparse.eye(rank_size) # Our regularization term lambda*I. 
    
    # We can compute this before iteration starts. 
    
    # Begin iterations
   
    for iter_step in range(iterations): # Iterate back and forth between solving X given fixed Y and vice versa
        # Compute yTy and xTx at beginning of each iteration to save computing time
        print('number of iteration :{}'.format(iter_step))
        yTy = Y.T.dot(Y)
        xTx = X.T.dot(X)
        # Being iteration to solve for X based on fixed Y
        for u in range(num_user):
            conf_samp = conf[u,:].toarray() # Grab user row from confidence matrix and convert to dense
            pref = conf_samp.copy() 
            pref[pref != 0] = 1 # Create binarized preference vector 
            CuI = sparse.diags(conf_samp, [0]) # Get Cu - I term, which is just CuI since we never added 1
            yTCuIY = Y.T.dot(CuI).dot(Y) # This is the yT(Cu-I)Y term 
            yTCupu = Y.T.dot(CuI + Y_eye).dot(pref.T) # This is the yTCuPu term, where we add the eye back in
                                                      # Cu - I + I = Cu
            X[u] = spsolve(yTy + yTCuIY + lambda_eye, yTCupu) 
            # Solve for Xu = ((yTy + yT(Cu-I)Y + lambda*I)^-1)yTCuPu, equation 4 from the paper  
        # Begin iteration to solve for Y based on fixed X 
        for i in range(num_item):
            conf_samp = conf[:,i].T.toarray() # transpose to get it in row format and convert to dense
            pref = conf_samp.copy()
            pref[pref != 0] = 1 # Create binarized preference vector
            CiI = sparse.diags(conf_samp, [0]) # Get Ci - I term, which is just CiI since we never added 1
            xTCiIX = X.T.dot(CiI).dot(X) # This is the xT(Cu-I)X term
            xTCiPi = X.T.dot(CiI + X_eye).dot(pref.T) # This is the xTCiPi term
            Y[i] = spsolve(xTx + xTCiIX + lambda_eye, xTCiPi)
            # Solve for Yi = ((xTx + xT(Cu-I)X) + lambda*I)^-1)xTCiPi, equation 5 from the paper
    # End iterations
    return X, Y.T # Transpose at the end to make up for not being transposed at the beginning. 
                         # Y needs to be rank x n. Keep these as separate matrices for scale reasons. 

In [23]:
user_vecs, item_vecs = implicit_weighted_ALS(product_train, lambda_val = 0.1, alpha = 15, iterations = 5,
                                            rank_size = 20)

number of iteration :0
number of iteration :1
number of iteration :2
number of iteration :3
number of iteration :4


In [24]:
from sklearn import metrics

In [25]:
def auc_score(predictions, test):

    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr) 

In [26]:
def calc_mean_auc(training_set, altered_users, predictions, test_set):

    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # Get sum of item iteractions to find most popular
    item_vecs = predictions[1]
    for user in altered_users: # Iterate through each user that had an item altered
        training_row = training_set[user,:].toarray().reshape(-1) # Get the training set row
        zero_inds = np.where(training_row == 0) # Find where the interaction had not yet occurred
        # Get the predicted values based on our user/item vectors
        user_vec = predictions[0][user,:]
        pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_items[zero_inds] # Get the item popularity for our chosen items
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))  

In [30]:
calc_mean_auc(product_train, product_users_altered, 
              [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs)], product_test)

(0.865, 0.891)

In [28]:
user_vecs.shape

(18977, 20)

In [29]:
item_vecs.T.shape

(7251, 20)

In [31]:
customers_arr = np.array(customers) # Array of customer IDs from the ratings matrix
products_arr = np.array(products) 

In [36]:
def get_items_purchased(customer_id, mf_train, customers_list, products_list, item_lookup):
    
    cust_ind = np.where(customers_list == customer_id)[0][0] # Returns the index row of our customer id
    purchased_ind = mf_train[cust_ind,:].nonzero()[1] # Get column indices of purchased items
    prod_codes = products_list[purchased_ind] # Get the stock codes for our purchased items
    return item_lookup.loc[item_lookup.Material.isin(prod_codes)][]

In [75]:
customers_arr[:7]

array([10008254, 10011935, 10036221, 10050032, 10050960, 10069565,
       10071527])

In [83]:
get_items_purchased(10069565, product_train, customers_arr, products_arr, item_lookup)

Unnamed: 0,Material,engname_rpa3
4144,14233,Fresh salmon
14873,18234,Factory-made bakery
15416,51291,Factory-made bakery
16060,18236,Factory-made bakery
64449,371391,Special-purpose materials
91376,299948,Cookies
249896,371339,Yoghurts and curd desserts
250471,371338,Yoghurts and curd desserts
291722,17859,Greens
303819,17858,Greens


In [35]:
item_lookup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7267 entries, 0 to 995135
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Material      7267 non-null   int64 
 1   engname_rpa3  7267 non-null   object
dtypes: int64(1), object(1)
memory usage: 170.3+ KB


In [74]:
from sklearn.preprocessing import MinMaxScaler

In [67]:
def rec_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10):
    
    
    cust_ind = np.where(customer_list == customer_id)[0][0] # Returns the index row of our customer id
    pref_vec = mf_train[cust_ind,:].toarray() # Get the ratings from the training set ratings matrix
    pref_vec = pref_vec.reshape(-1) + 1 # Add 1 to everything, so that items not purchased yet become equal to 1
    pref_vec[pref_vec > 1] = 0 # Make everything already purchased zero
    rec_vector = user_vecs[cust_ind,:].dot(item_vecs) # Get dot product of user vector and all item vectors
    print(type(rec_vector.todense()))
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.todense().reshape(-1,1))[:,0] 
    recommend_vector = pref_vec*rec_vector_scaled 
    # Items already purchased have their recommendation multiplied by zero
    product_idx = np.argsort(recommend_vector)[::-1][:num_items] # Sort the indices of the items into order 
    # of best recommendations
    rec_list = [] # start empty list to store items
    for index in product_idx:
        code = item_list[index]
        rec_list.append([code, item_lookup.engname_rpa3.loc[item_lookup.Material == code].iloc[0]]) 
        # Append our descriptions to the list
    codes = [item[0] for item in rec_list]
    descriptions = [item[1] for item in rec_list]
    final_frame = pd.DataFrame({'Material': codes, 'engname_rpa3': descriptions}) # Create a dataframe 
    return final_frame[['Material', 'engname_rpa3']] # Switch order of columns around

In [84]:
rec_items(10069565, product_train, user_vecs, item_vecs, customers_arr, products_arr, item_lookup,
                       num_items = 20)

<class 'numpy.matrix'>


Unnamed: 0,Material,engname_rpa3
0,168210,Cucumbers
1,51593,Greens
2,19377,Exotic fruit
3,52602,Tomato
4,19560,Greens
5,12275,Exotic fruit
6,370331,Sour cream
7,463130,Sour cream
8,127515,Chicken eggs
9,50069,"""Peaches, nectarines """
