## 前處理

In [3]:
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import pandas as pd
import numpy as np

In [4]:
retail_data = pd.read_csv("rs.csv")

In [5]:
retail_data.head()

Unnamed: 0.1,Unnamed: 0,user,item,qty,datetime
0,0,32011003635952,16198,1,2014-01-01 00:39:00
1,1,32009100468450,18107,4,2014-01-01 01:07:00
2,2,32013007873699,13612,2,2014-01-01 07:00:00
3,3,32011004803503,42496,1,2014-01-01 07:06:00
4,4,32011004221857,8875,4,2014-01-01 07:09:00


In [6]:
item_lookup = retail_data[['item']].drop_duplicates()# 物品清單(無重複)
item_lookup['item'] = item_lookup.item

In [7]:
item_lookup.head()

Unnamed: 0,item
0,16198
1,18107
2,13612
3,42496
4,8875


In [8]:
retail_data['user'] = retail_data.user.astype("str")
retail_data = retail_data[[ 'user','item', 'qty',]] # 只保留我們想要的值
grouped_cleaned = retail_data.groupby(['user', 'item']).sum().reset_index() # 將所有購物紀錄結合起來
grouped_cleaned.qty.loc[grouped_cleaned.qty == 0] = 1 # 將 0值轉成 1
grouped_purchased = grouped_cleaned.query('qty > 0') # 只保留正值

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [9]:
grouped_purchased.head()

Unnamed: 0,user,item,qty
0,32009000000620,6095,1
1,32009000000620,6937,1
2,32009000000620,7650,2
3,32009000000620,8838,4
4,32009000000620,8841,5


In [10]:
customers = list(np.sort(grouped_purchased.user.unique())) #顧客清單
products = list(grouped_purchased.item.unique()) # 產品清單
quantity = list(grouped_purchased.qty) # 溝買紀錄
#
rows = grouped_purchased.user.astype('category', categories = customers).cat.codes 
cols = grouped_purchased.item.astype('category', categories = products).cat.codes 
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))

In [11]:
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] # Number of possible interactions in the matrix
num_purchases = len(purchases_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_purchases/matrix_size))
sparsity
#95.12% 關聯矩陣為稀疏矩陣

95.12475

## 分成訓練集和測試集

In [12]:
import random

In [13]:
def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    nonzero_inds = training_set.nonzero() 
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # 利用zip結合user,item
    random.seed(0) # 設定seed
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # 四捨五入
    samples = random.sample(nonzero_pairs, num_samples) 
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices
    training_set[user_inds, item_inds] = 0 
    training_set.eliminate_zeros() 
    return training_set, test_set, list(set(user_inds)) 

In [14]:
product_train, product_test, product_users_altered = make_train(purchases_sparse, pct_test = 0.2)

In [16]:
## ALS 找出 user及 item 的 向量
import implicit
alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares((product_train*alpha).astype('double'), 
                                                          factors=20, 
                                                          regularization = 0.1, 
                                                         iterations = 50)

This method is deprecated. Please use the AlternatingLeastSquares class instead


## 測AUC

In [17]:
from sklearn import metrics

In [18]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)   

In [19]:
def calc_mean_auc(training_set, altered_users, predictions, test_set):
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # Get sum of item iteractions to find most popular
    item_vecs = predictions[1]
    for user in altered_users: # Iterate through each user that had an item altered
        training_row = training_set[user,:].toarray().reshape(-1) # Get the training set row
        zero_inds = np.where(training_row == 0) # Find where the interaction had not yet occurred
        # Get the predicted values based on our user/item vectors
        user_vec = predictions[0][user,:]
        pred = user_vec.dot(item_vecs.T).toarray()[0,zero_inds].reshape(-1)
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_items[zero_inds] # Get the item popularity for our chosen items
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))  
   # Return the mean AUC rounded to three decimal places for both test and popularity benchmark

In [20]:
calc_mean_auc(product_train, product_users_altered, 
              [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs)], product_test)

(0.702, 0.693)

## 推薦系統

In [21]:
customers_arr = np.array(customers) 
products_arr = np.array(products)

In [22]:
#實際買的
def get_items_purchased(customer_id, mf_train, customers_list, products_list, item_lookup):
    cust_ind = np.where(customers_list == customer_id)[0][0]
    purchased_ind = mf_train[cust_ind,:].nonzero()[1]
    prod_codes = products_list[purchased_ind] #得到你買的item id
    return item_lookup.loc[item_lookup.item.isin(prod_codes)]

In [23]:
customers_arr[:5]

array(['32009000000620', '32009000001559', '32009000002334',
       '32009000006899', '32009000030191'], 
      dtype='<U14')

In [24]:
# 會員32009000000620所買物品
get_items_purchased("32009000000620", product_train, customers_arr, products_arr, item_lookup)

Unnamed: 0,item
4,8875
13,16450
24,19803
33,11645
47,41855
57,19802
67,19805
70,19941
74,8841
92,10751


In [25]:
from sklearn.preprocessing import MinMaxScaler

In [26]:
# 推薦所買的
def rec_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10):
    cust_ind = np.where(customer_list == customer_id)[0][0] # Returns the index row of our customer id
    pref_vec = mf_train[cust_ind,:].toarray() # Get the ratings from the training set ratings matrix
    pref_vec = pref_vec.reshape(-1) + 1 # Add 1 to everything, so that items not purchased yet become equal to 1
    pref_vec[pref_vec > 1] = 0 # Make everything already purchased zero
    rec_vector = user_vecs[cust_ind,:].dot(item_vecs.T) # Get dot product of user vector and all item vectors
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 
    recommend_vector = pref_vec*rec_vector_scaled 
    # Items already purchased have their recommendation multiplied by zero
    product_idx = np.argsort(recommend_vector)[::-1][:num_items] # Sort the indices of the items into order 
    # of best recommendations
    rec_list = [] # start empty list to store items
    for index in product_idx:
        code = item_list[index]
        rec_list.append(code) 
        # Append our descriptions to the list
    codes = [item for item in rec_list]
    final_frame = pd.DataFrame({'item': codes}) # Create a dataframe 
    return final_frame[['item']] # Switch order of columns around

In [27]:
# 推薦會員32009000000620前十名
rec_items("32009000000620", product_train, user_vecs, item_vecs, customers_arr, products_arr, item_lookup,
                       num_items = 10)

Unnamed: 0,item
0,8838
1,9065
2,9616
3,12163
4,16594
5,19835
6,16792
7,19804
8,8937
9,8842


In [29]:
rec_items("32009000030191", product_train, user_vecs, item_vecs, customers_arr, products_arr, item_lookup,
                       num_items = 5)

Unnamed: 0,item
0,9629
1,9063
2,9465
3,9563
4,9065
