In [1]:
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))
import numpy as np
from dataaccessframeworks.read_data import get_movielens, user_filter, training_testing, get_yelp, get_douban

## 0. Get Data

### MovieLens

In [7]:
data = get_movielens()
# str to int
user_movie = np.array([list(map(int, data)) for data in data['user_movie']])
# 濾除使用者評分小於三筆的資料
filter_data = user_filter(user_movie, 0)
print(f"使用者評分大於三次的共有：{filter_data.shape}")
# 取得電影個數及電影個數
len_users, movies = np.unique(filter_data[:,0]), np.unique(filter_data[:,1])
# 取得訓練資料及測試資料
training_data,  testing_data = training_testing(filter_data)
print("users: ", len(len_users))
print("items: ", len(movies))

user_movie:[['196', '242', '3', '881250949'], ['186', '302', '3', '891717742'], ['22', '377', '1', '878887116']]
movie_genre:[['1', '3'], ['1', '4'], ['1', '5']]
user_age:[['1', '3'], ['2', '6'], ['3', '3']]
user_occupation:[['1', '1'], ['2', '2'], ['3', '3']]
使用者評分大於三次的共有：(100000, 4)
users:  943
items:  1682


### Yelp

In [8]:
from sklearn.preprocessing import LabelEncoder

data = get_yelp()
# str to int
user_business = np.array([list(map(int, data)) for data in data['user_business']])
# 濾除使用者評分小於三筆的資料
filter_data = user_filter(user_business, 0)
# user label encoder
le = LabelEncoder()
filter_data[:, 0] = le.fit_transform(filter_data[:, 0])
filter_data[:, 0] += 1
# item label encoder
ile = LabelEncoder()
filter_data[:, 1] = ile.fit_transform(filter_data[:, 1])
filter_data[:, 1] += 1
# if want to inverse label 
# le.inverse_transform(yelp_training_encoder)
print(f"使用者評分大於三次的共有：{filter_data.shape}")
# 取得business個數及users個數
yelp_users, business = np.unique(filter_data[:,0]), np.unique(filter_data[:,1])
# 取得訓練資料及測試資料
yelp_training_data,  yelp_testing_data = training_testing(filter_data)
print("users: ", len(yelp_users))
print("items: ", len(business))

business_category:[['1', '334', '1'], ['1', '426', '1'], ['2', '211', '1']]
business_city:[['1', '31', '1'], ['2', '35', '1'], ['3', '35', '1']]
user_business:[['1', '8391', '5'], ['1', '8971', '5'], ['2', '186', '5']]
user_compliment:[['2', '1', '1'], ['2', '2', '1'], ['2', '3', '1']]
使用者評分大於三次的共有：(184835, 3)
users:  7326
items:  14127


### Douban

In [2]:
from sklearn.preprocessing import LabelEncoder

data = get_douban()
# str to int
user_book = np.array([list(map(int, data)) for data in data['user_book']])
# 濾除使用者評分小於三筆的資料
filter_data = user_filter(user_book, 0)
# user label encoder
le = LabelEncoder()
filter_data[:, 0] = le.fit_transform(filter_data[:, 0])
filter_data[:, 0] += 1
# item label encoder
ile = LabelEncoder()
filter_data[:, 1] = ile.fit_transform(filter_data[:, 1])
filter_data[:, 1] += 1
print(f"使用者評分大於三次的共有：{filter_data.shape}")
# 取得business個數及users個數
douban_users, books = np.unique(filter_data[:,0]), np.unique(filter_data[:,1])
# 取得訓練資料及測試資料
douban_training_data,  douban_testing_data = training_testing(filter_data)
print("users: ", len(douban_users))
print("items: ", len(books))

book_author:[['12131', '3871'], ['20995', '10690'], ['9905', '3845']]
book_publisher:[['12131', '108'], ['20995', '1470'], ['9905', '1696']]
book_year:[['9905', '16'], ['21153', '15'], ['12823', '15']]
user_book:[['10855', '938', '4'], ['10027', '3', '3'], ['741', '2426', '5']]
user_group:[['3587', '232'], ['3587', '666'], ['3587', '226']]
user_location:[['3587', '33'], ['3210', '179'], ['7993', '394']]
使用者評分大於三次的共有：(788898, 3)
users:  11266
items:  22347


## 1. User-based Collaborative Filtering (U-CF)

In [4]:
%load_ext autoreload
%autoreload 2
from models.collaborative_filtering import get_user_item_matrix, predict
import util.utility as util
from imp import reload
import heapq
import copy
from tqdm import tqdm
from models.evaluation import recall_k
from sklearn.metrics import ndcg_score
import configparser
import wandb
config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(os.getcwd()), 'config.ini'))

['/home/baron/HW/Recommender_System/config.ini']

In [8]:
users = douban_users
items = books

user_matrix = get_user_item_matrix(douban_training_data, users, items)
cos, pcc = util.get_sim_array(user_matrix)
pcc_dis = pcc -  np.identity(len(users))


data transfer user matrix: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 11266/11266 [00:40<00:00, 275.48it/s]
  c /= stddev[:, None]
  c /= stddev[None, :]


In [14]:
dis = pcc_dis[5253]
sim_dis_idx = dis.tolist()
# top_sim_index = list(map(sim_dis_idx.index, heapq.nlargest(20,dis)))
# top_sim_index
if np.isnan(dis).all():
    print('hi')
dis

array([-0.00176019,  0.03153739, -0.00064693, ..., -0.00084405,
        0.04534444,  0.03106998])

In [15]:
def user_sim_score(users, items, train_data, test_data, k=int(config['CF']['user_K'])):
    # make matrix
    user_matrix = get_user_item_matrix(train_data, users, items)
    test_matrix = get_user_item_matrix(test_data, users, items)
    # 計算bias
    bias_matrix = util.get_bias(user_matrix, users, items)
    # 計算相似度
#     cos, pcc = util.get_sim_array(user_matrix)
#     cosine_dis = cos -  np.identity(len(users))
#     pcc_dis = pcc -  np.identity(len(users))
    
#     sim = {"cos":cosine_dis, "pcc":pcc_dis}
    sim = ["cos", "pcc"]
    evaluation = dict()
    for s in sim.keys():
        delta_list = list()
        predict_array = np.zeros((test_matrix.shape))
        # sim_dis = sim[s]
        sim_array = util.get_sim_array(user_matrix, sim=s)
        sim_dis = sim_array -  np.identity(len(users))
        for i in tqdm(range(len(users)), desc=f"UCF predicting {s} score with {k}"):
            # Suv: 取出前K個最相似的使用者相似度 ex:K=3, output=[0.378, 0.353, 0.336]
            Suv = heapq.nlargest(k ,sim_dis[i])
            # 若i不存在，則跳過
            if np.isnan(sim_dis[i]).all():
                continue
            # top_sim_index: 取出與使用者i最為相似的前K個使用者 ex:K=3, output=[915, 406, 214]
            sim_dis_idx = sim_dis[i].tolist()
            top_sim_index = list(map(sim_dis_idx.index, heapq.nlargest(k,sim_dis[i])))
            # recall
            prediction = list()
            # 計算相似使用者與使用者i的評分誤差
            for item_idx in range(len(items)):
                # 取得使用者i的評分(ground truth)
                rth = test_matrix[i, item_idx]
                # 如果使用者i有進行評分，則才納入計算RMSE
                if rth != 0:
                    # 之後需剔除對電影m未評分的相似使用者，因此先進行複製，才不會影響下一部電影的計算
                    copy_Suv = copy.deepcopy(Suv)
                    # R: 若相似使用者對電影 m 有評分則進行調整
                    R = list()
                    # 判斷相似使用者是否對電影ｍ有評分，若有評分則將原始評分減去該使用者對電影m的bias
                    for c, j in enumerate(top_sim_index):
                        if  test_matrix[j, item_idx] == 0:
                            R.append(0)
                            copy_Suv[c] = 0
                        else:
                            R.append(test_matrix[j, item_idx] - bias_matrix[j, item_idx])
                    # 如果所有相似使用者都沒評分則跳過此次計算
                    if sum(R) != 0:
                        # 預測使用者i對於第m部電影的評分 + 使用者i對電影m的偏差
                        Rui = predict(copy_Suv, R) + bias_matrix[i, item_idx]
                        # 計算square error
                        delta_list.append(util.se(rth, Rui))
                        # 儲存預測結果, 並取四捨五入
                        predict_array[i, item_idx] = Rui
        # 各評估指標
        evaluation[f'{s}_rmse']= util.rmse(delta_list)
        evaluation[f'{s}_recall@10'] = recall_k(test_matrix, predict_array) 
        evaluation[f'{s}_NDCG@10']=ndcg_score(test_matrix, predict_array, k=10)
        
    return evaluation


print("==========\nMovielens:\n==========")
wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="ICF")
wandb_log = WandbLog()
movie_reuslt = item_sim_score(len_users, movies, training_data, testing_data)
wandb_log.log_evaluation(movie_reuslt)
print(movie_reuslt)
wandb.finish()

print("==========\nYelp:\n==========")
wandb.init(project=config['general']['yelp'],
                        entity=config['general']['entity'],
                        group="ICF")
wandb_log = WandbLog()
yelp_reuslt = item_sim_score(yelp_users, business, yelp_training_data, yelp_testing_data)
wandb_log.log_evaluation(yelp_reuslt)
print(yelp_reuslt)
wandb.finish()

print("==========\nDouban Book:\n==========")
wandb.init(project=config['general']['douban'],
                        entity=config['general']['entity'],
                        group="ICF")
wandb_log = WandbLog()
douban_reuslt = item_sim_score(douban_users, books, douban_training_data, douban_testing_data)
wandb_log.log_evaluation(douban_reuslt)
print(douban_reuslt)
wandb.finish()

data transfer user matrix: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 11266/11266 [00:41<00:00, 274.05it/s]
data transfer user matrix: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 11266/11266 [00:09<00:00, 1240.58it/s]
  return np.reshape(total/exist_number, (-1, 1))
  c /= stddev[:, None]
  c /= stddev[None, :]
UCF predicting cos score with 20: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 11266/11266 [02:40<00:00, 70.03it/s]
UCF predicting pcc score with 20: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 11266/11266 [02:42<00:00, 69.27it/s]


Douban Book:
{'cos_rmse': 0.9581459914947442, 'cos_recall@10': 0.17468548559149782, 'cos_NDCG@10': 0.314405097518533, 'pcc_rmse': 0.9643837375362581, 'pcc_recall@10': 0.1729472623095187, 'pcc_NDCG@10': 0.31003647129290945}


## 2. Item-based Collaborative Filtering (I-CF)

In [None]:
from scipy import sparse
from util.mywandb import WandbLog

def item_sim_score(users, items, train_data, test_data, k=int(config['CF']['user_K'])):
    # make matrix
    user_matrix = get_user_item_matrix(train_data, users, items)
    test_matrix = get_user_item_matrix(test_data, users, items)
    item_matrix = user_matrix.T 
    item_test = test_matrix.T
    #item_test = sparse.csr_matrix(item_test)
    del test_matrix
    
    # 計算bias
    bias_matrix = util.get_bias(user_matrix, users, items)
    item_bias = bias_matrix.T
    del bias_matrix
    del user_matrix
    
    # 計算相似度
    #cos, pcc = util.get_sim_array(item_matrix)
    #cosine_dis = cos -  np.identity(len(items))
    #cosine_dis = sparse.csr_matrix(cosine_dis)
    #pcc_dis = pcc -  np.identity(len(items))
    #pcc_dis = sparse.csr_matrix(pcc_dis)
    #sim = {"cos":cosine_dis, "pcc":pcc_dis}
    sim = ["cos", "pcc"]
    evaluation = dict()
    for s in sim:
        delta_list = list()
        predict_array = np.zeros((item_test.shape))
        # predict array to spase
        predict_array = sparse.csr_matrix(predict_array)
        sim_array = util.get_sim_array(item_matrix, sim=s)
        sim_dis = sim_array -  np.identity(len(items))
        # sim_dis = sim[s]
        for i in tqdm(range(len(items)), desc=f"ICF predicting {s} score with {k}"):
            # Siv: 取出前K個最相似的使用者相似度 ex:K=3, output=[0.378, 0.353, 0.336]
            Siv = heapq.nlargest(k ,sim_dis[i])
            # 若i不存在，則跳過
            if np.isnan(sim_dis[i]).all():
                continue
            sim_dis[i][np.isnan(sim_dis[i])] = 0
            # top_sim_index: 取出與使用者i最為相似的前K個使用者 ex:K=3, output=[915, 406, 214]
            sim_dis_idx = sim_dis[i].tolist()
            top_sim_index = list(map(sim_dis_idx.index, heapq.nlargest(k,sim_dis[i])))
            # recall
            prediction = list()
            # 計算相似電影與電影i的評分誤差
            for user_idx in range(len(users)):
                # 取得項目i的評分(ground truth)
                rth = item_test[i, user_idx]
                # 如果使用者i有進行評分，則才納入計算RMSE
                if rth != 0:
                    # 之後需剔除對電影m未評分的相似使用者，因此先進行複製，才不會影響下一部電影的計算
                    copy_Siv = copy.deepcopy(Siv)
                    # R: 若相似使用者對電影 m 有評分則進行調整
                    R = list()
                    # 判斷相似使用者是否對電影ｍ有評分，若有評分則將原始評分減去該使用者對電影m的bias
                    for c, j in enumerate(top_sim_index):
                        if  item_test[j, user_idx] == 0:
                            R.append(0)
                            copy_Siv[c] = 0
                        else:
                            R.append(item_test[j, user_idx] - item_bias[j, user_idx])
                    # 如果所有相似使用者都沒評分則跳過此次計算
                    if sum(R) != 0:
                        # 預測使用者i對於第m部電影的評分 + 使用者i對電影m的偏差
                        Rui = predict(copy_Siv, R) + item_bias[i, user_idx]
                        # 計算square error
                        delta_list.append(util.se(rth, Rui))
                        # 儲存預測結果, 並取四捨五入
                        if np.isnan(Rui):
                            Rui=0
                        predict_array[i, user_idx] = Rui
        
        
        # 各評估指標
        evaluation[f'{s}_rmse']= util.rmse(delta_list)
        print(evaluation)
        evaluation[f'{s}_recall@10'] = recall_k(item_test, predict_array) 
        print(evaluation)
        evaluation[f'{s}_NDCG@10']=ndcg_score(item_test, predict_array.toarray(), k=10)
        print(evaluation)
        
    return evaluation

print("==========\nMovielens:\n==========")
wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="ICF")
wandb_log = WandbLog()
movie_reuslt = item_sim_score(len_users, movies, training_data, testing_data)
wandb_log.log_evaluation(movie_reuslt)
print(movie_reuslt)
wandb.finish()

print("==========\nYelp:\n==========")
wandb.init(project=config['general']['yelp'],
                        entity=config['general']['entity'],
                        group="ICF")
wandb_log = WandbLog()
yelp_reuslt = item_sim_score(yelp_users, business, yelp_training_data, yelp_testing_data)
wandb_log.log_evaluation(yelp_reuslt)
print(yelp_reuslt)
wandb.finish()

print("==========\nDouban Book:\n==========")
wandb.init(project=config['general']['douban'],
                        entity=config['general']['entity'],
                        group="ICF")
wandb_log = WandbLog()
douban_reuslt = item_sim_score(douban_users, books, douban_training_data, douban_testing_data)
wandb_log.log_evaluation(douban_reuslt)
print(douban_reuslt)
wandb.finish()

Movielens:





data transfer user matrix: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 943/943 [00:00<00:00, 2043.92it/s]
data transfer user matrix: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 943/943 [00:00<00:00, 8334.25it/s]
  self._set_intXint(row, col, x.flat[0])
ICF predicting cos score with 20: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:10<00:00, 154.52it/s]


{'cos_rmse': 1.1316303906753467}
{'cos_rmse': 1.1316303906753467, 'cos_recall@10': 0.0036760656550237757}


  c /= stddev[:, None]
  c /= stddev[None, :]
ICF predicting pcc score with 20:   0%|                                                                                                                  | 0/1682 [00:00<?, ?it/s]

{'cos_rmse': 1.1316303906753467, 'cos_recall@10': 0.0036760656550237757, 'cos_NDCG@10': 0.5412726069543039}


  self._set_intXint(row, col, x.flat[0])
ICF predicting pcc score with 20: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:10<00:00, 163.25it/s]


{'cos_rmse': 1.1316303906753467, 'cos_recall@10': 0.0036760656550237757, 'cos_NDCG@10': 0.5412726069543039, 'pcc_rmse': 1.1115921286236343}
{'cos_rmse': 1.1316303906753467, 'cos_recall@10': 0.0036760656550237757, 'cos_NDCG@10': 0.5412726069543039, 'pcc_rmse': 1.1115921286236343, 'pcc_recall@10': 0.0036760656550237757}
{'cos_rmse': 1.1316303906753467, 'cos_recall@10': 0.0036760656550237757, 'cos_NDCG@10': 0.5412726069543039, 'pcc_rmse': 1.1115921286236343, 'pcc_recall@10': 0.0036760656550237757, 'pcc_NDCG@10': 0.5184692658439667}
{'cos_rmse': 1.1316303906753467, 'cos_recall@10': 0.0036760656550237757, 'cos_NDCG@10': 0.5412726069543039, 'pcc_rmse': 1.1115921286236343, 'pcc_recall@10': 0.0036760656550237757, 'pcc_NDCG@10': 0.5184692658439667}



0,1
cos_NDCG@10,0.54127
cos_recall@10,0.00368
cos_rmse,1.13163
pcc_NDCG@10,0.51847
pcc_recall@10,0.00368
pcc_rmse,1.11159


Yelp:


data transfer user matrix: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7326/7326 [00:05<00:00, 1222.35it/s]
data transfer user matrix: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7326/7326 [00:01<00:00, 5185.12it/s]
  self._set_intXint(row, col, x.flat[0])
ICF predicting cos score with 20: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 14127/14127 [02:15<00:00, 103.93it/s]


{'cos_rmse': 1.27345583713731}
{'cos_rmse': 1.27345583713731, 'cos_recall@10': 0.00029988668138726505}
{'cos_rmse': 1.27345583713731, 'cos_recall@10': 0.00029988668138726505, 'cos_NDCG@10': 0.04081184719369428}


  c /= stddev[:, None]
  c /= stddev[None, :]
  self._set_intXint(row, col, x.flat[0])
ICF predicting pcc score with 20: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 14127/14127 [02:31<00:00, 93.51it/s]


{'cos_rmse': 1.27345583713731, 'cos_recall@10': 0.00029988668138726505, 'cos_NDCG@10': 0.04081184719369428, 'pcc_rmse': nan}
{'cos_rmse': 1.27345583713731, 'cos_recall@10': 0.00029988668138726505, 'cos_NDCG@10': 0.04081184719369428, 'pcc_rmse': nan, 'pcc_recall@10': 0.00029988668138726505}
{'cos_rmse': 1.27345583713731, 'cos_recall@10': 0.00029988668138726505, 'cos_NDCG@10': 0.04081184719369428, 'pcc_rmse': nan, 'pcc_recall@10': 0.00029988668138726505, 'pcc_NDCG@10': 0.03952111283588812}
{'cos_rmse': 1.27345583713731, 'cos_recall@10': 0.00029988668138726505, 'cos_NDCG@10': 0.04081184719369428, 'pcc_rmse': nan, 'pcc_recall@10': 0.00029988668138726505, 'pcc_NDCG@10': 0.03952111283588812}



0,1
cos_NDCG@10,0.04081
cos_recall@10,0.0003
cos_rmse,1.27346
pcc_NDCG@10,0.03952
pcc_recall@10,0.0003


Douban Book:


data transfer user matrix: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 11266/11266 [00:40<00:00, 278.87it/s]
data transfer user matrix: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 11266/11266 [00:09<00:00, 1236.06it/s]
  return np.reshape(total/exist_number, (-1, 1))
  self._set_intXint(row, col, x.flat[0])
ICF predicting cos score with 20:  42%|██████████████████████████████████████████▊                                                           | 9368/22347 [03:51<05:09, 41.91it/s]