# 載入模組

In [14]:
import pandas as pd
import os
import time
from surprise import SVD, SlopeOne, CoClustering, NMF, NormalPredictor, BaselineOnly
from surprise import KNNWithZScore, KNNBasic, KNNBaseline, KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import dump

# 資料前處理

In [2]:
# 讀取資料
df = pd.read_csv("retail_data_1029_2020.csv",encoding="utf-8")
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,customer_id,gender,invoice_no,invoice_date,invoice_time,product_name,product_code,unit_price,quantity,subtotal,region_id,region_name,category_name,date_time
0,1,Male,536544,2018-11-29,14:32:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,3,105,324,平鎮區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2018-11-29 14:32:00
1,1,Male,536544,2018-11-29,14:32:00,沙威隆抗菌潔淨沐浴乳茶樹精油,82580,149,1,149,324,平鎮區,美妝護理_個人清潔_沐浴用品_沐浴乳,2018-11-29 14:32:00
2,1,Male,536544,2018-11-29,14:32:00,韓國鄉村泡菜,82583,255,1,255,324,平鎮區,生鮮食品_冷藏食品_調理食品_沙拉．味噌．泡菜．醬料,2018-11-29 14:32:00
3,1,Male,536544,2018-11-29,14:32:00,瑪榭舒適萊卡透氣襪-女(紅)-紅色,21774,59,1,59,324,平鎮區,服飾鞋包_流行鞋襪_襪子_女襪,2018-11-29 14:32:00
4,1,Male,536544,2018-11-29,14:32:00,康寶全新鮮味炒手素食500g,21787,165,1,165,324,平鎮區,米油沖泡_調味品．罐頭．湯品_調味粉．醬_味精,2018-11-29 14:32:00


### 新增一欄商品購買率(某商品總數量 / 全商品總數量)

In [3]:
# 商品購買率的 DataFrame
quantity_total = df['quantity'].sum()
buy_rate_df = df.groupby('product_code').agg({'quantity': lambda x : (x.sum())/quantity_total})
buy_rate_df.rename(columns={'quantity':'bought_rate'}, inplace=True)

# 將購買率於原資料做合併
df = pd.merge(df,buy_rate_df, on='product_code')
df.head()

Unnamed: 0,customer_id,gender,invoice_no,invoice_date,invoice_time,product_name,product_code,unit_price,quantity,subtotal,region_id,region_name,category_name,date_time,bought_rate
0,1,Male,536544,2018-11-29,14:32:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,3,105,324,平鎮區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2018-11-29 14:32:00,0.002732
1,100,Female,539453,2018-12-15,17:08:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,4,140,320,中壢區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2018-12-15 17:08:00,0.002732
2,1000,Male,561820,2019-07-27,16:00:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,1,35,333,龜山區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2019-07-27 16:00:00,0.002732
3,1009,Male,562417,2019-08-02,16:32:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,4,140,327,新屋區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2019-08-02 16:32:00,0.002732
4,1010,Female,562420,2019-08-02,16:38:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,1,35,320,中壢區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2019-08-02 16:38:00,0.002732


### 依照欄位數值大小, 均分成五等分, 依大小回傳1, 2, 3, 4, 5

In [4]:
# 找出均分的門檻值
quantiles = df.quantile(q=[0.2,0.4,0.6,0.8]).to_dict()
print(quantiles)

# 按照門檻值, 由小到大, 搭配 apply, 回傳 1, 2, 3, 4, 5
def quantity_class(x,k,d):
    if x < d[k][0.2]:
        return 1
    elif x < d[k][0.4]:
        return 2
    elif x < d[k][0.6]:
        return 3
    elif x < d[k][0.8]:
        return 4
    else:
        return 5
def price_class(x,k,d):
    if x < d[k][0.2]:
        return 1
    elif x < d[k][0.4]:
        return 2
    elif x < d[k][0.6]:
        return 3
    elif x < d[k][0.8]:
        return 4
    else:
        return 5
def buy_index_class(x,k,d):
    if x < d[k][0.2]:
        return 1
    elif x < d[k][0.4]:
        return 2
    elif x < d[k][0.6]:
        return 3
    elif x < d[k][0.8]:
        return 4
    else:
        return 5

df['quantity_quartile'] = df['quantity'].apply(quantity_class, args=('quantity',quantiles,))
df['price_quartile'] = df['unit_price'].apply(price_class, args=('unit_price',quantiles,))
df['bought_rate_quartile'] = df['bought_rate'].apply(buy_index_class, args=('bought_rate',quantiles,))
df.head()

{'invoice_no': {0.2: 545530.0, 0.4: 555851.0, 0.6: 565401.0, 0.8: 573904.0}, 'unit_price': {0.2: 69.0, 0.4: 109.0, 0.6: 166.0, 0.8: 279.0}, 'quantity': {0.2: 1.0, 0.4: 1.0, 0.6: 2.0, 0.8: 3.0}, 'subtotal': {0.2: 109.0, 0.4: 198.0, 0.6: 338.0, 0.8: 660.0}, 'region_id': {0.2: 325.0, 0.4: 328.0, 0.6: 333.0, 0.8: 336.0}, 'bought_rate': {0.2: 0.00020217995646609, 0.4: 0.0004165885393313387, 0.6: 0.0007198584740304738, 0.8: 0.0013736823655054907}}


Unnamed: 0,customer_id,gender,invoice_no,invoice_date,invoice_time,product_name,product_code,unit_price,quantity,subtotal,region_id,region_name,category_name,date_time,bought_rate,quantity_quartile,price_quartile,bought_rate_quartile
0,1,Male,536544,2018-11-29,14:32:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,3,105,324,平鎮區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2018-11-29 14:32:00,0.002732,5,1,5
1,100,Female,539453,2018-12-15,17:08:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,4,140,320,中壢區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2018-12-15 17:08:00,0.002732,5,1,5
2,1000,Male,561820,2019-07-27,16:00:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,1,35,333,龜山區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2019-07-27 16:00:00,0.002732,3,1,5
3,1009,Male,562417,2019-08-02,16:32:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,4,140,327,新屋區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2019-08-02 16:32:00,0.002732,5,1,5
4,1010,Female,562420,2019-08-02,16:38:00,王子麵-火鍋/滷味專用50g*5入組,22469,35,1,35,320,中壢區,米油沖泡_泡麵．麵條_米粉．麵條_米粉及其它口味,2019-08-02 16:38:00,0.002732,3,1,5


### 轉換資料型態

In [5]:
df['customer_id'] = df['customer_id'].apply(lambda x : str(x))
df['product_code'] = df['product_code'].apply(lambda x : str(x))

# 產生用戶對商品的評分

### 設計權重(0.1 ~ 0.8排列組合)

In [6]:
weight_list =[
             [0.1,0.1,0.8,(0.1,0.1,0.8)],
             [0.1,0.2,0.7,(0.1,0.2,0.7)],
             [0.1,0.3,0.6,(0.1,0.3,0.6)],
             [0.1,0.4,0.5,(0.1,0.4,0.5)],
             [0.1,0.5,0.4,(0.1,0.5,0.4)],
             [0.1,0.6,0.3,(0.1,0.6,0.3)],
             [0.1,0.7,0.2,(0.1,0.7,0.2)],
             [0.1,0.8,0.1,(0.1,0.8,0.1)],

             [0.2,0.1,0.7,(0.2,0.1,0.7)],
             [0.2,0.2,0.6,(0.2,0.2,0.6)],
             [0.2,0.3,0.5,(0.2,0.3,0.5)],
             [0.2,0.4,0.4,(0.2,0.4,0.4)],
             [0.2,0.5,0.3,(0.2,0.5,0.3)],
             [0.2,0.6,0.2,(0.2,0.6,0.2)],
             [0.2,0.7,0.1,(0.2,0.7,0.1)],

             [0.3,0.1,0.6,(0.3,0.1,0.6)],
             [0.3,0.2,0.5,(0.3,0.2,0.5)],
             [0.3,0.3,0.4,(0.3,0.3,0.4)],
             [0.3,0.4,0.3,(0.3,0.4,0.3)],
             [0.3,0.5,0.2,(0.3,0.5,0.2)],
             [0.3,0.6,0.1,(0.3,0.6,0.1)],

             [0.4,0.1,0.5,(0.4,0.1,0.5)],
             [0.4,0.2,0.4,(0.4,0.2,0.4)],
             [0.4,0.3,0.3,(0.4,0.3,0.3)],
             [0.4,0.4,0.2,(0.4,0.4,0.2)],
             [0.4,0.5,0.1,(0.4,0.5,0.1)],

             [0.5,0.1,0.4,(0.5,0.1,0.4)],
             [0.5,0.2,0.3,(0.5,0.2,0.3)],
             [0.5,0.3,0.2,(0.5,0.3,0.2)],
             [0.5,0.4,0.1,(0.5,0.4,0.1)],

             [0.6,0.1,0.3,(0.6,0.1,0.3)],
             [0.6,0.2,0.2,(0.6,0.2,0.2)],
             [0.6,0.3,0.1,(0.6,0.3,0.1)],

             [0.7,0.1,0.2,(0.7,0.1,0.2)],
             [0.7,0.2,0.1,(0.7,0.2,0.1)],

             [0.8,0.1,0.1,(0.8,0.1,0.1)],    
             ]

### 依照設定的權重，產出評分，這邊以(0.2, 0.3, 0.5為例)

In [7]:
def get_rating(df,quantity_weight, price_weight, buy_rate_weight):
    #依權重新增'rating'欄位
    df['rating'] = df['quantity_quartile'] * quantity_weight \
                 + df['price_quartile'] * price_weight \
                 + df['bought_rate_quartile'] * buy_rate_weight
    # 建造customerID的list, 並去掉重複值
    customer_id_list = df['customer_id'].tolist()
    customer_id_list = list(set(customer_id_list))
    
    # 把顧客購物清單上, 重複的商品rating做平均, 回傳新的df, 欄位為['customer_id','product_code','rating']
    rating_df = df.groupby(by = ['customer_id','product_code'], as_index=False).agg({'rating': lambda x : x.mean()})
    return rating_df
rating_df = get_rating(df,0.2,0.3,0.5)
rating_df.head()

Unnamed: 0,customer_id,product_code,rating
0,1,11001,3.0
1,1,16236,1.9
2,1,16237,3.7
3,1,16238,3.1
4,1,17003,4.5


### 固定用KNN演算法，找出RMSE最小的權重組合

In [8]:
t1 = time.time()
rmse_w_df = pd.DataFrame(columns=['test_rmse','weight'])
for w in weight_list:
    rating_df = get_rating(df,w[0], w[1], w[2])
    # 需要reader, 並設定評分範圍(1~5)
    reader = Reader(rating_scale=(1, 5))
    # 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
    data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)
    
    #設定演算法
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    bsl_options = {'method': 'als', #sgd 隨機梯度下降法   #als交替最小二乘法
                   'n_epochs': 20,}
    algo = KNNBaseline(40,1,sim_options=sim_options,bsl_options=bsl_options)
    
    #Perform cross validation
    results = cross_validate(algo, data, measures=['RMSE','MAE'], cv=3, verbose=False)
    
    # get rmse
    rmse = pd.DataFrame.from_dict(results).mean(axis=0)['test_rmse']
    
    rmse_w_df = rmse_w_df.append({'test_rmse':rmse,'weight':w[3]},ignore_index=True)
print('花了',time.time()-t1, '秒')
rmse_w_df.sort_values('test_rmse')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline si

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computi

Unnamed: 0,test_rmse,weight
3,0.119237,"(0.1, 0.4, 0.5)"
4,0.120656,"(0.1, 0.5, 0.4)"
2,0.121115,"(0.1, 0.3, 0.6)"
1,0.127454,"(0.1, 0.2, 0.7)"
5,0.128018,"(0.1, 0.6, 0.3)"
0,0.137903,"(0.1, 0.1, 0.8)"
6,0.139991,"(0.1, 0.7, 0.2)"
11,0.142034,"(0.2, 0.4, 0.4)"
10,0.143969,"(0.2, 0.3, 0.5)"
12,0.144504,"(0.2, 0.5, 0.3)"


In [9]:
# 由上可看出權重(0.1, 0.4, 0.5)的RMSE最小，所以用此評分去跑模型
rating_df = get_rating(df,0.1,0.4,0.5)
rating_df.head()

Unnamed: 0,customer_id,product_code,rating
0,1,11001,3.0
1,1,16236,1.7
2,1,16237,3.6
3,1,16238,3.3
4,1,17003,4.5


## 帶入最好的評分資料，找出最適合的模型, 用RMSE檢測

In [10]:
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))

In [11]:
# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

In [15]:
# 使用Surprise庫, 進行不同算法的預測, 並以rmse檢測
t1 = time.time()
benchmark = []

#Iterate over all algorithms

for algorithm in [BaselineOnly(), SVD(), SlopeOne(), CoClustering(), NMF(), NormalPredictor(), 
                  KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore()] :
    # 觀察進度用
    print('正在執行：',str(algorithm).split(' ')[0].split('.')[-1])
    
    #Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
algorithm_test_df = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
print('time:',time.time()-t1)
algorithm_test_df

正在執行： BaselineOnly
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
正在執行： SVD
正在執行： SlopeOne
正在執行： CoClustering
正在執行： NMF
正在執行： NormalPredictor
正在執行： KNNBaseline
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
正在執行： KNNBasic
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
正在執行： KNNWithMeans
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
正在執行： KNNW

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline,0.068848,5.151736,29.797526
KNNBasic,0.072872,4.434458,25.082671
NMF,0.093943,15.616691,1.138872
SVD,0.124356,15.38964,1.217127
BaselineOnly,0.147441,0.685961,0.81616
KNNWithMeans,0.188197,4.614262,28.237112
KNNWithZScore,0.201675,5.393145,30.086217
SlopeOne,0.257305,3.173763,16.727873
CoClustering,0.533752,6.587989,0.929713
NormalPredictor,1.133228,0.401926,1.256713


### 由上可知，KNNBaseline的RMSE數值是最小的，選用此來當作演算法。
### 調整裡面的參數，找到最RMSE最小的參數組合。(鄰近目標, 相似度計算方式, 誤差優化器, 跌代次數)

### 第一組cosine + user_based + als

In [16]:
t1 = time.time()

rating_df = get_rating(df,0.1, 0.4, 0.5)
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))
# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

#設定演算法
sim_options = {'name': 'cosine', 'user_based': True}
bsl_options = {'method': 'als', #sgd 随机梯度下降法   #als交替最小二乘法
               'n_epochs': 20,}
algo = KNNBaseline(40,1,sim_options=sim_options,bsl_options=bsl_options)

#Perform cross validation
results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

# get rmse
rmse = pd.DataFrame.from_dict(results).mean(axis=0)['test_rmse']

print('花了',time.time()-t1, '秒')
print('rmse:',rmse)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
花了 176.97992372512817 秒
rmse: 0.07330058173318056


### 第二組cosine + user_based + sgd

In [17]:
t1 = time.time()

rating_df = get_rating(df,0.1, 0.4, 0.5)
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))
# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

#設定演算法
sim_options = {'name': 'cosine', 'user_based': True}
bsl_options = {'method': 'sgd', #sgd 随机梯度下降法   #als交替最小二乘法
               'n_epochs': 20,}
algo = KNNBaseline(40,1,sim_options=sim_options,bsl_options=bsl_options)

#Perform cross validation
results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

# get rmse
rmse = pd.DataFrame.from_dict(results).mean(axis=0)['test_rmse']

print('花了',time.time()-t1, '秒')
print('rmse:',rmse)

Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
花了 171.79921555519104 秒
rmse: 0.06666711032616866


### 第三組cosine + item_based + als

In [18]:
t1 = time.time()

rating_df = get_rating(df,0.1, 0.4, 0.5)
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))
# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

#設定演算法
sim_options = {'name': 'cosine', 'user_based': False}
bsl_options = {'method': 'als', #sgd 随机梯度下降法   #als交替最小二乘法
               'n_epochs': 20,}
algo = KNNBaseline(40,1,sim_options=sim_options,bsl_options=bsl_options)

#Perform cross validation
results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

# get rmse
rmse = pd.DataFrame.from_dict(results).mean(axis=0)['test_rmse']

print('花了',time.time()-t1, '秒')
print('rmse:',rmse)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
花了 143.0716187953949 秒
rmse: 0.1523224052439256


### 第四組cosine + item_based + sgd

In [19]:
t1 = time.time()

rating_df = get_rating(df,0.1, 0.4, 0.5)
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))
# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

#設定演算法
sim_options = {'name': 'cosine', 'user_based': False}
bsl_options = {'method': 'sgd', #sgd 随机梯度下降法   #als交替最小二乘法
               'n_epochs': 20,}
algo = KNNBaseline(40,1,sim_options=sim_options,bsl_options=bsl_options)

#Perform cross validation
results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

# get rmse
rmse = pd.DataFrame.from_dict(results).mean(axis=0)['test_rmse']

print('花了',time.time()-t1, '秒')
print('rmse:',rmse)

Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
花了 144.48915481567383 秒
rmse: 0.10815593641106382


### 第五組pearson_baseline + user_based + als

In [20]:
t1 = time.time()

rating_df = get_rating(df,0.1, 0.4, 0.5)
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))
# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

#設定演算法
sim_options = {'name': 'pearson_baseline', 'user_based': True}
bsl_options = {'method': 'als', #sgd 随机梯度下降法   #als交替最小二乘法
               'n_epochs': 20,}
algo = KNNBaseline(40,1,sim_options=sim_options,bsl_options=bsl_options)

#Perform cross validation
results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

# get rmse
rmse = pd.DataFrame.from_dict(results).mean(axis=0)['test_rmse']

print('花了',time.time()-t1, '秒')
print('rmse:',rmse)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
花了 167.42470049858093 秒
rmse: 0.07294185677283781


### 第六組pearson_baseline + user_based + sgd

In [21]:
t1 = time.time()

rating_df = get_rating(df,0.1, 0.4, 0.5)
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))
# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

#設定演算法
sim_options = {'name': 'pearson_baseline', 'user_based': True}
bsl_options = {'method': 'sgd', #sgd 随机梯度下降法   #als交替最小二乘法
               'n_epochs': 20,}
algo = KNNBaseline(40,1,sim_options=sim_options,bsl_options=bsl_options)

#Perform cross validation
results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

# get rmse
rmse = pd.DataFrame.from_dict(results).mean(axis=0)['test_rmse']

print('花了',time.time()-t1, '秒')
print('rmse:',rmse)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
花了 170.33070945739746 秒
rmse: 0.06630383250632088


### 第七組pearson_baseline + item_based + als

In [22]:
t1 = time.time()

rating_df = get_rating(df,0.1, 0.4, 0.5)
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))
# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

#設定演算法
sim_options = {'name': 'pearson_baseline', 'user_based': False}
bsl_options = {'method': 'als', #sgd 随机梯度下降法   #als交替最小二乘法
               'n_epochs': 20,}
algo = KNNBaseline(40,1,sim_options=sim_options,bsl_options=bsl_options)

#Perform cross validation
results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

# get rmse
rmse = pd.DataFrame.from_dict(results).mean(axis=0)['test_rmse']

print('花了',time.time()-t1, '秒')
print('rmse:',rmse)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
花了 143.32236289978027 秒
rmse: 0.11953677479680225


### 第八組pearson_baseline + item_based + sgd

In [23]:
t1 = time.time()

rating_df = get_rating(df,0.1, 0.4, 0.5)
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))
# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

#設定演算法
sim_options = {'name': 'pearson_baseline', 'user_based': False}
bsl_options = {'method': 'sgd', #sgd 随机梯度下降法   #als交替最小二乘法
               'n_epochs': 20,}
algo = KNNBaseline(40,1,sim_options=sim_options,bsl_options=bsl_options)

#Perform cross validation
results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

# get rmse
rmse = pd.DataFrame.from_dict(results).mean(axis=0)['test_rmse']

print('花了',time.time()-t1, '秒')
print('rmse:',rmse)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
花了 143.76690769195557 秒
rmse: 0.09735261447124512


In [24]:
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))

# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)


sim_options = {'name': 'pearson_baseline', 'user_based': False}
bsl_options = {'method': 'als', #sgd 随机梯度下降法   #als交替最小二乘法
               'n_epochs': 20,}
algo = KNNBaseline(40,1,sim_options=sim_options,bsl_options=bsl_options)



# 使用KFord作為交叉驗證
kf = KFold(n_splits=5)

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.1104
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.1082
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.1068
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.1072
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.1082


# Item_Based Recommender

### 數據讀取 訓練模型

In [25]:
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))

# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

### 找到相似項目 : sim_options中的user_based設置為false，基於項目相似度做計算

In [26]:
# 模型訓練
sim_options = {'name': 'cosine', 'user_based': False}
bsl_options = {'method': 'als','n_epochs': 20}
all_trainset = data.build_full_trainset()
item_algo = KNNBaseline(40, 1, sim_options=sim_options, bsl_options=bsl_options)
item_algo.fit(all_trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x294017f0cc8>

In [27]:
# 找出相似物品的Top-N list
def getSimilarItems(top_k,item_id):
    item_inner_id = item_algo.trainset.to_inner_iid(item_id)
    item_neighbors = item_algo.get_neighbors(item_inner_id, k=top_k)
    f_item_neighbors = (item_algo.trainset.to_raw_iid(inner_id)
                       for inner_id in item_neighbors)
    return list(f_item_neighbors)
# 測試
getSimilarItems(10,'23843')

['22980',
 '22982',
 '11001',
 '16236',
 '16237',
 '16238',
 '17003',
 '17011F',
 '17012A',
 '17012B']

In [28]:
# 刪除list裡的重複值
def deleteDuplicatedElementFromList(l):
    resultList = []
    for item in l:
        if not item in resultList:
            resultList.append(item)
    return resultList

In [29]:
def convertProductCode2Name(L):
    # 推薦列表轉 DataFrame
    recommend_item_df = pd.DataFrame(L,columns = ['product_code'])
    
    # 建立商品Code與Name對應的 Dataframe
    product_code_name_df = pd.DataFrame({'product_code':df['product_code'],'product_name':df['product_name']}).drop_duplicates()
    
    # 將兩個 DataFrame做 merge
    recommend_item_df = pd.merge(recommend_item_df,product_code_name_df, on = ['product_code'])
    
    # 商品名輸出成list
    recommend_item_list = list(recommend_item_df['product_name'])
    return recommend_item_list

In [30]:
def itemBasedRecommender(customer_id):
    # 建立每個顧客的購買清單 DataFrame
    customer_itemList_df = df.groupby('customer_id', as_index = False).agg({'product_code': lambda x : ' '.join(x).split(' ')})
    
    # 建立特定顧客的購買物品list
    customer_items_list = customer_itemList_df[customer_itemList_df['customer_id'] == customer_id]['product_code'].values[0]
    
    # 每個購買產品, 推薦10個相似物, 放進lsit [[A1,A2,..,A10], [B1,B2,...,B10],...]
    total_list = []
    for item in customer_items_list:
        tmp_item_list = getSimilarItems(10, item)
        total_list.append(tmp_item_list)
        
    # 用 S 型的方式將推薦品放進list [A1,B1,C1,A2,B2,C2,.....]    
    recommend_list = []
    for t in range(10):
        for l in total_list:
            recommend_list.append(l[t])
            
    # 推薦列表中, 刪除已購買過的產品
    for i in customer_items_list:
        if i in recommend_list:
            recommend_list.remove(i)
    # 將推薦商品code_list 轉成 name_list
    recommend_list = convertProductCode2Name(recommend_list)
    recommend_list = deleteDuplicatedElementFromList(recommend_list)
    return recommend_list[:10]

In [31]:
# 測試 推薦 customer_id = '1'
itemBasedRecommender('1')

['家樂福巴黎美妝馬賽香皂-羅勒哈密瓜-100gx2',
 '乖乖桶-720g',
 '日安黑檀筷-5雙入',
 '滿漢大餐麻辣鍋牛肉(碗) 204g',
 '義美純豬肉鬆-海苔芝麻175g',
 'keyway名廚標準量水杯600cc',
 '動物系列兒童雨衣-小豬粉 L',
 '黑蒜頭100g',
 'DG舒適條紋女踝襪(黑)',
 '韓國isLeaf極緻水感保濕面膜22ml-膠原蛋白']

# User_Based Recommender

### 數據讀取 訓練模型

In [32]:
# 需要reader, 並設定評分範圍(1~5)
reader = Reader(rating_scale=(1, 5))

# 資料欄位必須照著順序, 依序為['user_Id','item_Id','rating']
data = Dataset.load_from_df(rating_df[['customer_id', 'product_code', 'rating']], reader)

### 找到相似用戶

In [33]:
# 訓練模型
sim_options={'name':'cosine','user_based': True}
bsl_options = {'method': 'als','n_epochs': 20}
all_trainset = data.build_full_trainset()
user_algo = KNNBasic(k=40, min_k=3, sim_options = sim_options, bsl_options = bsl_options)
user_algo.fit(all_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x2940ab711c8>

In [34]:
# 找出相似用戶的Top-N
def getSimilarUsers(top_k,u_id):
    user_inner_id = user_algo.trainset.to_inner_uid(u_id)
    user_neighbors = user_algo.get_neighbors(user_inner_id, k=top_k)
    user_neighbors = (user_algo.trainset.to_raw_uid(inner_id) for inner_id in user_neighbors)
    return list(user_neighbors)
# 測試
getSimilarUsers(10, '1')

['14195',
 '14836',
 '15049',
 '16462',
 '270',
 '839',
 '1006',
 '1011',
 '103',
 '104']

In [35]:
# 刪除list裡的重複值
def deleteDuplicatedElementFromList(l):
    resultList = []
    for item in l:
        if not item in resultList:
            resultList.append(item)
    return resultList

In [36]:
def convertProductCode2Name(L):
    # 推薦列表轉 DataFrame
    recommend_item_df = pd.DataFrame(L,columns = ['product_code'])
    
    # 建立商品Code與Name對應的 Dataframe
    product_code_name_df = pd.DataFrame({'product_code':df['product_code'],'product_name':df['product_name']}).drop_duplicates()
    
    # 將兩個 DataFrame做 merge
    recommend_item_df = pd.merge(recommend_item_df,product_code_name_df, on = ['product_code'])
    
    # 商品名輸出成list
    recommend_item_list = list(recommend_item_df['product_name'])
    return recommend_item_list

In [37]:
def userBasedRecommender(customer_id):
    # 建立每個顧客的購買清單 DataFrame
    customer_itemList_df = df.groupby('customer_id', as_index = False).agg({'product_code': lambda x : ' '.join(x).split(' ')})
    
    # 建立 Top 10 的 similar users list
    similar_users_list = getSimilarUsers(10, customer_id)
    
     # 建立特定顧客的購買物品list
    customer_items_list = customer_itemList_df[customer_itemList_df['customer_id'] == customer_id]['product_code'].values[0]
    
    # 建立 similar users 購買物品 list [[A1,A2,.], [B1,B2,..],...]
    similar_user_item_list = []
    for user in similar_users_list:
        item_list = customer_itemList_df[customer_itemList_df['customer_id'] == user]['product_code'].values[0]
        similar_user_item_list.append(item_list)
        
    # 建立推薦物品list [A1,A2,...,B1,B2,...]    
    recommend_list = []
    for l in similar_user_item_list:
        for item in l:
            recommend_list.append(item)
    # 推薦列表中, 刪除已購買過的產品
    for i in customer_items_list:
        if i in recommend_list:
            recommend_list.remove(i)
    # 將推薦商品code_list 轉成 name_list
    recommend_list = convertProductCode2Name(recommend_list)
    recommend_list = deleteDuplicatedElementFromList(recommend_list)
    return recommend_list[:10]

In [38]:
# 測試
userBasedRecommender('1200')

['哈根達斯 冰淇淋品脫 抹茶 473ml',
 '愛貓機能餐罐(鮪魚+牛肉)85g',
 '瑪榭腳踝加強直條紋輕護足弓襪-LF-顏色隨機',
 '西班牙Torres 鵝肝風味洋芋片-150g',
 '米森有機黑森林野莓茶4g*8包',
 'Farcent香水室內擴香-自由雛菊-120ml',
 'BVD W跟超低襪口男隱形襪',
 '滿鍋香-濃香原味150g',
 '寶多福健康犬餐-熟齡小型犬-3.5kg',
 '樂天小熊餅家庭號-香濃煉乳風味-195g']

## 輸出模型

In [None]:
user_algo.predict(iid = '17011F', uid = '14987')

In [None]:
import os

from surprise import dump

# Compute predictions of the 'original' algorithm.
predictions = user_algo.test(all_trainset.build_testset())

# Dump algorithm and reload it.
file_name = os.path.expanduser('~/dump_file')
dump.dump(file_name, algo=user_algo)
loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo[1].test(all_trainset.build_testset())
assert predictions == predictions_loaded_algo
print('Predictions are the same')

In [None]:
predictions_loaded_algo = loaded_algo[1].test(all_trainset.build_testset())

In [None]:
assert predictions == predictions_loaded_algo