In [0]:
# Collaborative filtering 알고리즘 구현
## data: Implicit data를 활용
## model-based filtering: matrix factorization으로 잠재변수 활용
## matrix factorization: SVD 기법, latent factor를 찾아내는 작업

# ALS: latent feature를 찾아내기 위한 기법
## matrix factorization은 두개의 latent 행렬로 구성
## 한번에 두 행렬의 값을 찾는 것이 아닌, 한 행렬을 고정한 상태에서 다른 행렬의 값을 찾는 것을 반복

In [0]:
# 알고리즘 학습에 사용할 데이터
## "Online Retail" from UCI Machine Learning repository
## https://archive.ics.uci.edu/ml/datasets/Online+Retail

# 알고리즘 reference
## Collaborative Filtering for Implicict Feedpack Datasets 구현
## https://nbviewer.jupyter.org/github/jmsteinw/Notebooks/blob/master/RecEngine_NB.ipynb

In [0]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve

website_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
retail_data = pd.read_excel(website_url)

In [4]:
retail_data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [5]:
retail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [6]:
retail_data.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [0]:
# customerID가 null인것은 사용 불가
cleaned_retail = retail_data.loc[pd.isnull(retail_data.CustomerID) == False]

In [8]:
# item의 정보를 담고있는 것은 따로 빼내기. 추후 추천 후 확인에 활용
item_lookup = cleaned_retail[['StockCode', 'Description']].drop_duplicates()
item_lookup['StockCode'] = item_lookup.StockCode.astype(str)
item_lookup.head()

Unnamed: 0,StockCode,Description
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,71053,WHITE METAL LANTERN
2,84406B,CREAM CUPID HEARTS COAT HANGER
3,84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,84029E,RED WOOLLY HOTTIE WHITE HEART.


In [9]:
item_lookup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3916 entries, 0 to 540421
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   StockCode    3916 non-null   object
 1   Description  3916 non-null   object
dtypes: object(2)
memory usage: 91.8+ KB


In [10]:
# sparse matrix 구성하기위한 사전작업
cleaned_retail['CustomerID'] = cleaned_retail.CustomerID.astype(int)
cleaned_retail = cleaned_retail[['StockCode', 'Quantity', 'CustomerID']] # sparse matrix 구성에 필요한것만 남긴다

grouped_cleaned = cleaned_retail.groupby(['CustomerID', 'StockCode']).sum().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [11]:
grouped_cleaned.Quantity.loc[grouped_cleaned.Quantity == 0] = 1 # Replace a sum of zero purchases with a one to
grouped_purchased = grouped_cleaned.query('Quantity > 0') # Only get customers where purchase totals were positive

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [12]:
grouped_purchased.head()

Unnamed: 0,CustomerID,StockCode,Quantity
0,12346,23166,1
1,12347,16008,24
2,12347,17021,36
3,12347,20665,6
4,12347,20719,40


In [13]:
grouped_purchased.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266723 entries, 0 to 267614
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   CustomerID  266723 non-null  int64 
 1   StockCode   266723 non-null  object
 2   Quantity    266723 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 8.1+ MB


In [14]:
customers = list(np.sort(grouped_purchased.CustomerID.unique()))
products = list(grouped_purchased.StockCode.unique())
quantity = list(grouped_purchased.Quantity)

rows = grouped_purchased.CustomerID.astype('category').cat.codes 
cols = grouped_purchased.StockCode.astype('category').cat.codes 
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))
purchases_sparse

<4338x3664 sparse matrix of type '<class 'numpy.longlong'>'
	with 266723 stored elements in Compressed Sparse Row format>

In [15]:
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] # Number of possible interactions in the matrix
num_purchases = len(purchases_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_purchases/matrix_size))
sparsity

98.32190920694744

In [0]:
# training/test set 생성: 특정 값들만 숨겨야 함

import random

def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy()
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    training_set = ratings.copy() 
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,item index into list
    random.seed(0) # Set the random seed to zero for reproducibility
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of user-item pairs without replacement
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices
    training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered  

In [0]:
product_train, product_test, product_users_altered = make_train(purchases_sparse, pct_test = 0.2)

In [20]:
!pip install implicit

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/5a/d8/6b4f1374ffa2647b72ac76960c71b984c6f3238090359fb419d03827d87a/implicit-0.4.2.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 2.8MB/s 
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.4.2-cp36-cp36m-linux_x86_64.whl size=3468836 sha256=902e23b5c5e5a58c6fe555f2662fc333517773255379c2b35b2ddf9783051401
  Stored in directory: /root/.cache/pip/wheels/1b/48/b1/1aebe3acc3afb5589e72d3e7c3ffc3f637dc4721c1a974dff7
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.4.2


In [21]:
import implicit

alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares((product_train*alpha).astype('double'), 
                                                          factors=20, 
                                                          regularization = 0.1, 
                                                         iterations = 50)

This method is deprecated. Please use the AlternatingLeastSquares class instead
GPU training requires factor size to be a multiple of 32. Increasing factors from 20 to 32.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [0]:
from sklearn import metrics

def auc_score(predictions, test):
  fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
  return metrics.auc(fpr, tpr)  

def calc_mean_auc(training_set, altered_users, predictions, test_set):
  store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
  popularity_auc = [] # To store popular AUC scores
  pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # Get sum of item iteractions to find most popular
  item_vecs = predictions[1]
  for user in altered_users: # Iterate through each user that had an item altered
      training_row = training_set[user,:].toarray().reshape(-1) # Get the training set row
      zero_inds = np.where(training_row == 0) # Find where the interaction had not yet occurred
      # Get the predicted values based on our user/item vectors
      user_vec = predictions[0][user,:]
      pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
      # Get only the items that were originally zero
      # Select all ratings from the MF prediction for this user that originally had no iteraction
      actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
      # Select the binarized yes/no interaction pairs from the original full data
      # that align with the same pairs in training 
      pop = pop_items[zero_inds] # Get the item popularity for our chosen items
      store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
      popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
  # End users iteration
  
  return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))  
  # Return the mean AUC rounded to three decimal places for both test and popularity benchmark

In [24]:
calc_mean_auc(product_train, product_users_altered, 
              [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs.T)], product_test)

(0.871, 0.815)

In [0]:
# 실제 응용

customers_arr = np.array(customers)
products_arr = np.array(products)

In [0]:
def get_items_purchased(customer_id, mf_train, customers_list, products_list, item_lookup):
    cust_ind = np.where(customers_list == customer_id)[0][0] # Returns the index row of our customer id
    purchased_ind = mf_train[cust_ind,:].nonzero()[1] # Get column indices of purchased items
    prod_codes = products_list[purchased_ind] # Get the stock codes for our purchased items
    return item_lookup.loc[item_lookup.StockCode.isin(prod_codes)]

In [27]:
get_items_purchased(12346, product_train, customers_arr, products_arr, item_lookup)

Unnamed: 0,StockCode,Description
31495,22258,FELT FARM ANIMAL RABBIT


In [0]:
from sklearn.preprocessing import MinMaxScaler

def rec_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10):
    cust_ind = np.where(customer_list == customer_id)[0][0] # Returns the index row of our customer id
    pref_vec = mf_train[cust_ind,:].toarray() # Get the ratings from the training set ratings matrix
    pref_vec = pref_vec.reshape(-1) + 1 # Add 1 to everything, so that items not purchased yet become equal to 1
    pref_vec[pref_vec > 1] = 0 # Make everything already purchased zero
    rec_vector = user_vecs[cust_ind,:].dot(item_vecs.T) # Get dot product of user vector and all item vectors
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 
    recommend_vector = pref_vec*rec_vector_scaled 
    # Items already purchased have their recommendation multiplied by zero
    product_idx = np.argsort(recommend_vector)[::-1][:num_items] # Sort the indices of the items into order 
    # of best recommendations
    rec_list = [] # start empty list to store items
    for index in product_idx:
        code = item_list[index]
        rec_list.append([code, item_lookup.Description.loc[item_lookup.StockCode == code].iloc[0]]) 
        # Append our descriptions to the list
    codes = [item[0] for item in rec_list]
    descriptions = [item[1] for item in rec_list]
    final_frame = pd.DataFrame({'StockCode': codes, 'Description': descriptions}) # Create a dataframe 
    return final_frame[['StockCode', 'Description']] # Switch order of columns around

In [30]:
rec_items(12346, product_train, user_vecs, item_vecs, 
          customers_arr, products_arr, item_lookup, num_items = 10)

Unnamed: 0,StockCode,Description
0,22264,FELT FARM ANIMAL WHITE BUNNY
1,22247,BUNNY DECORATION MAGIC GARDEN
2,21819,GLITTER CHRISTMAS STAR
3,22425,ENAMEL COLANDER CREAM
4,22393,PAPERWEIGHT VINTAGE COLLAGE
5,84678,CLASSICAL ROSE SMALL VASE
6,23400,SHELF WITH 4 HOOKS HOME SWEET HOME
7,23540,WALL ART THE MAGIC FOREST
8,21636,MADRAS NOTEBOOK MEDIUM
9,23418,LAVENDER TOILETTE BOTTLE


In [31]:
get_items_purchased(12353, product_train, customers_arr, products_arr, item_lookup)

Unnamed: 0,StockCode,Description
3594,22559,SEASIDE FLYING DISC
3637,21158,MOODY GIRL DOOR HANGER
231161,23125,6PC WOOD PLATE SET DISPOSABLE
325437,23411,TRELLIS COAT RACK


In [32]:
rec_items(12353, product_train, user_vecs, item_vecs, 
          customers_arr, products_arr, item_lookup, num_items = 10)

Unnamed: 0,StockCode,Description
0,84598,BOYS ALPHABET IRON ON PATCHES
1,22190,LOCAL CAFE MUG
2,22191,IVORY DINER WALL CLOCK
3,21865,PINK UNION JACK PASSPORT COVER
4,84946,ANTIQUE SILVER TEA GLASS ETCHED
5,21065,BOOM BOX SPEAKER GIRLS
6,85071B,RED CHARLIE+LOLA PERSONAL DOORSIGN
7,35913B,WHITE/PINK CHICK EASTER DECORATION
8,84944,SET OF 6 KASHMIR FOLKART BAUBLES
9,35912B,WHITE/PINK CHICK DECORATION


In [0]:
# 부록: ALS 알고리즘 처음부터 직접짜기
# reference: https://nbviewer.jupyter.org/github/jmsteinw/Notebooks/blob/master/RecEngine_NB.ipynb
# 직접 구현한 ALS 코드를 통한 학습: 느림. implicit 라이브러리를 사용하는 것이 빠름

def implicit_weighted_ALS(training_set, lambda_val = 0.1, alpha = 40, iterations = 10, rank_size = 20, seed = 0):
  # first set up our confidence matrix
    
    conf = (alpha*training_set) # To allow the matrix to stay sparse, I will add one later when each row is taken 
                                # and converted to dense. 
    num_user = conf.shape[0]
    num_item = conf.shape[1] # Get the size of our original ratings matrix, m x n
    
    # initialize our X/Y feature vectors randomly with a set seed
    rstate = np.random.RandomState(seed)
    
    X = sparse.csr_matrix(rstate.normal(size = (num_user, rank_size))) # Random numbers in a m x rank shape
    Y = sparse.csr_matrix(rstate.normal(size = (num_item, rank_size))) # Normally this would be rank x n but we can 
                                                                 # transpose at the end. Makes calculation more simple.
    X_eye = sparse.eye(num_user)
    Y_eye = sparse.eye(num_item)
    lambda_eye = lambda_val * sparse.eye(rank_size) # Our regularization term lambda*I. 
    
    # We can compute this before iteration starts. 
    
    # Begin iterations
   
    for iter_step in range(iterations): # Iterate back and forth between solving X given fixed Y and vice versa
        # Compute yTy and xTx at beginning of each iteration to save computing time
        yTy = Y.T.dot(Y)
        xTx = X.T.dot(X)
        # Being iteration to solve for X based on fixed Y
        for u in range(num_user):
            conf_samp = conf[u,:].toarray() # Grab user row from confidence matrix and convert to dense
            pref = conf_samp.copy() 
            pref[pref != 0] = 1 # Create binarized preference vector 
            CuI = sparse.diags(conf_samp, [0]) # Get Cu - I term, which is just CuI since we never added 1
            yTCuIY = Y.T.dot(CuI).dot(Y) # This is the yT(Cu-I)Y term 
            yTCupu = Y.T.dot(CuI + Y_eye).dot(pref.T) # This is the yTCuPu term, where we add the eye back in
                                                      # Cu - I + I = Cu
            X[u] = spsolve(yTy + yTCuIY + lambda_eye, yTCupu) 
            # Solve for Xu = ((yTy + yT(Cu-I)Y + lambda*I)^-1)yTCuPu, equation 4 from the paper  
        # Begin iteration to solve for Y based on fixed X 
        for i in range(num_item):
            conf_samp = conf[:,i].T.toarray() # transpose to get it in row format and convert to dense
            pref = conf_samp.copy()
            pref[pref != 0] = 1 # Create binarized preference vector
            CiI = sparse.diags(conf_samp, [0]) # Get Ci - I term, which is just CiI since we never added 1
            xTCiIX = X.T.dot(CiI).dot(X) # This is the xT(Cu-I)X term
            xTCiPi = X.T.dot(CiI + X_eye).dot(pref.T) # This is the xTCiPi term
            Y[i] = spsolve(xTx + xTCiIX + lambda_eye, xTCiPi)
            # Solve for Yi = ((xTx + xT(Cu-I)X) + lambda*I)^-1)xTCiPi, equation 5 from the paper
    # End iterations
    
    return X, Y.T
    # Transpose at the end to make up for not being transposed at the beginning. 
    # Y needs to be rank x n. Keep these as separate matrices for scale reasons. 
