In [8]:
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))
import numpy as np
%load_ext autoreload
%autoreload 2
from imp import reload
from dataaccessframeworks.read_data import get_movielens, user_filter, training_testing, get_yelp, get_douban, training_testing_XY
from dataaccessframeworks.data_preprocessing import get_one_hot_feature, generate_eval_array
from models.collaborative_filtering import get_user_item_matrix, predict
from models.evaluation import recall_k
from sklearn.preprocessing import normalize
from sklearn.metrics import ndcg_score
import configparser
import wandb
from tqdm import tqdm
from util.mywandb import WandbLog
import util.utility as util
import itertools
from random import sample
from IPython.display import clear_output

config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(os.getcwd()), 'config.ini'))
LIBFM_PATH = '/home/baron/libfm/bin/'
os.environ['LIBFM_PATH'] = LIBFM_PATH

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
'''
data input:
[[user, item, rank], .....]
'''
def get_uij(data, users, items, sample_rate=1000):
    for ii, user in enumerate(users):
        items_data = data[data[:, 0]==user]
        item_compare = list()
        item_neg = list()
        neg_count = 0
        items_iter =[i for i in itertools.combinations(items, 2)]
        items_iter = sample(items_iter, sample_rate)
        for i, j in items_iter:
            # if i exist items, but j not exsit items, i>j
            if i in items_data[:, 1] and j not in items_data[:, 1]:
                item_compare.append([user, i, j, 1])
            # if j exist items, but i not exsit items, j>i
            elif i not in items_data[:, 1] and j in items_data[:, 1]:
                item_compare.append([user, j, i, 1])
            # if i exist items, and also j exsit items, compare i and j
            elif i in items_data[:, 1] and j in items_data[:, 1]:
                ri = items_data[(items_data[:, 0]==user) & (items_data[:, 1]==i)][0, 2]
                rj = items_data[(items_data[:, 0]==user) & (items_data[:, 1]==j)][0, 2]
                if ri > rj:
                    item_compare.append([user, i, j, 1])
                elif ri < rj:
                    item_compare.append([user, j, i, 1])
                else:
                    if neg_count < len(item_compare)//2:
                        item_neg.append([user, j, i, 0])
                        item_neg.append([user, i, j, 0])
                        neg_count+=1
            else:
                if neg_count < len(item_compare)//2:
                    item_neg.append([user, j, i, 0])
                    item_neg.append([user, i, j, 0])
                    neg_count+=1
        if ii==0:
            uij = np.array(item_compare)
            uij_neg = np.array(item_neg)
        else:
            if len(item_compare)!= 0:
                uij = np.vstack((uij, np.array(item_compare)))
            if len(item_neg)!= 0:
                uij_neg = np.vstack((uij_neg, np.array(item_neg)))
        
        if ii%300==0:
            print("[{}/{}] uij_pos: {}, uij_neg: {}".format(ii, len(users), uij.shape, uij_neg.shape))
    
    return uij, uij_neg

## 0. Get Data

In [10]:
from scipy.sparse import csr_matrix
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from dataaccessframeworks.data_preprocessing import get_feature_map, generate_with_feature, get_norating_data

def get_uij_one_hot_feature(data, user_item_col, uij_data, y_col=3, time_col=3, batch_size=10000):
    # 取得user及items feature map 
    users_dict, items_dict, features = get_feature_map(data, user_item_col)

    # 將user item 數值轉為integer
    # user_items = np.array([list(map(int, data))for data in data[user_item_col]])
    # 使用者評分次數小於三筆則剔除
    filter_data = user_filter(uij_data, 0)
    print(filter_data.shape)
    print(filter_data[:5])
    # user label encoder
    le = LabelEncoder()
    filter_data[:, 0] = le.fit_transform(filter_data[:, 0])
    # item label encoder
    ile = LabelEncoder()
    filter_data[:, 1] = ile.fit_transform(filter_data[:, 1])
    filter_data[:, 2] = ile.fit_transform(filter_data[:, 2])
    
    # 做特徵的onehot encoding 
    one_hot_encoder_data, y, concat_data = get_uij_onehot_encoding(filter_data, users_dict, items_dict, features, le, ile, batch_size, y_col)

    return one_hot_encoder_data, y, concat_data

# 取得user及items的one hot encoding map
def get_uij_onehot_encoding(data, users_dict, items_dict, features, le, ile, batch_size, y_col):
    #users_onehot = get_users_onehot(data)
    sparse_, dense = get_uij_feature_onehot(data, users_dict, items_dict, features, le, ile, batch_size)
    
    # 取得y
    y = data[:,y_col].reshape(-1,1)
    
    # return np.concatenate((sparse_, dense), axis=1), y, concat_data
    return sparse.hstack((sparse_, dense), format='csr'), y, data

# 取得feature one hot
def get_uij_feature_onehot(data, users_feature, items_feature, features_map, le, ile, batch_size):
    # 取得user & item個數
    user_number = np.max(data[:,0]) + 1
    item_number = np.max(data[:,1]) + 1
    i_feature = items_feature[1].keys()
    # one hot encoding
    for b in range(0, data.shape[0], batch_size):
        user_one_hot = np.eye(user_number)[data[b:b+batch_size,0]]
        itemi_one_hot = np.eye(item_number)[data[b:b+batch_size,1]]
        itemj_one_hot = np.eye(item_number)[data[b:b+batch_size,2]]
        sparse_ = np.concatenate((user_one_hot, itemi_one_hot, itemj_one_hot), axis=1)
        dense = np.empty((user_one_hot.shape[0], 1), int)

        # create items feature 
        i_feature = items_feature[1].keys()
        for fe in i_feature:
            # sparse
            if fe.split("_")[1] != 'year':
                f_map = features_map[fe]
                feature_lengh = f_map[list(f_map.keys())[0]].shape[1]
                tmp = np.zeros((len(data[b:b+batch_size, 1]), feature_lengh*2))
                for i, item_ij in enumerate(data[b:b+batch_size, 1:3]):
                    item_i, item_j = item_ij
                    item_i = ile.inverse_transform(np.array([item_i])).item()
                    item_j = ile.inverse_transform(np.array([item_j])).item()
                    if item_i not in features_map[fe].keys():
                        # 取第一個鍵值得長度
                        # f_map = features_map[fe]
                        # item_feature_onehot = np.zeros((f_map[list(f_map.keys())[0]].shape[1]))
                        pass
                    else:
                        # item_feature_onehot = features_map[fe][item].toarray()
                        tmp[i, :feature_lengh] = features_map[fe][item_i].toarray()
                    if item_j not in features_map[fe].keys():
                        # 取第一個鍵值得長度
                        # f_map = features_map[fe]
                        # item_feature_onehot = np.zeros((f_map[list(f_map.keys())[0]].shape[1]))
                        pass
                    else:
                        # item_feature_onehot = features_map[fe][item].toarray()
                        tmp[i, feature_lengh:] = features_map[fe][item_j].toarray()
                # sparse_ = np.concatenate((sparse_, tmp), axis=1)
                sparse_ = np.hstack((sparse_, tmp))
            # dense
            else:
                # i = 0
                f_map = features_map[fe]
                feature_lengh = f_map[list(f_map.keys())[0]].shape[1]
                tmp = np.zeros((len(data[b:b+batch_size, 1]), feature_lengh*2))
                for i, item_ij in enumerate(data[b:b+batch_size, 1:3]):
                    item_i, item_j = item_ij
                    item_i = ile.inverse_transform(np.array([item_i])).item()
                    item_j = ile.inverse_transform(np.array([item_j])).item()
                    if item_i not in features_map[fe].keys():
                        # 取第一個鍵值得長度
                        # f_map = features_map[fe]
                        # item_feature_onehot = np.zeros((f_map[list(f_map.keys())[0]].shape[1]))
                        pass
                    else:
                        # item_feature_onehot = features_map[fe][item].toarray()
                        tmp[i, :feature_lengh] = features_map[fe][item_i].toarray()
                    if item_j not in features_map[fe].keys():
                        # 取第一個鍵值得長度
                        # f_map = features_map[fe]
                        # item_feature_onehot = np.zeros((f_map[list(f_map.keys())[0]].shape[1]))
                        pass
                    else:
                        # item_feature_onehot = features_map[fe][item].toarray()
                        tmp[i, feature_lengh:] = features_map[fe][item_j].toarray()
                # dense = np.concatenate((dense, tmp), axis=1)
                dense = np.hstack((dense, tmp))

        # create user feature
        u_feature = users_feature[1].keys()
        for fe in u_feature:
            # sparse
            if fe.split("_")[1] != 'age':
                f_map = features_map[fe]
                tmp = np.zeros((len(data[b:b+batch_size, 1]), f_map[list(f_map.keys())[0]].shape[1]))
                for i, user in enumerate(data[b:b+batch_size, 0]):
                    # i = 0
                    user = le.inverse_transform(np.array([user])).item()
                    if user not in features_map[fe].keys():
                        # 取第一個鍵值得長度
                        # f_map = features_map[fe]
                        # user_feature_onehot = np.zeros((f_map[list(f_map.keys())[0]].shape[1]))
                        pass
                    else:
                        tmp[i] = features_map[fe][user].toarray()
                # sparse_ = np.concatenate((sparse_, tmp), axis=1)
                sparse_ = np.hstack((sparse_, tmp))
                
            # dense
            else:
                f_map = features_map[fe]
                tmp = np.zeros((len(data[b:b+batch_size, 1]), f_map[list(f_map.keys())[0]].shape[1]))
                for i, user in enumerate(data[b:b+batch_size, 0]):
                    # i = 0
                    user = le.inverse_transform(np.array([user])).item()
                    if user not in features_map[fe].keys():
                        # 取第一個鍵值得長度
                        # f_map = features_map[fe]
                        # user_feature_onehot = np.zeros((f_map[list(f_map.keys())[0]].shape[1]))
                        pass
                    else:
                        tmp[i] = features_map[fe][user].toarray()
                # dense = np.concatenate((dense, tmp), axis=1)
                dense = np.hstack((dense, tmp))
        if b==0:
            sparse_matrix = csr_matrix(sparse_)
            dense_matrix = dense
        else:
            sparse_matrix = sparse.vstack((sparse_matrix, csr_matrix(sparse_)))
            dense_matrix = np.vstack((dense_matrix, dense))
        print("[{}/{}] sparse_matrix shape is {}".format(b, data.shape[0], sparse_matrix.shape))
    
    return sparse_matrix, dense_matrix


### MovieLens

In [11]:
data = get_movielens()
# str to int
user_movie = np.array([list(map(int, data)) for data in data['user_movie']])
# 濾除使用者評分小於三筆的資料
filter_data = user_filter(user_movie, 0)
print(f"使用者評分大於三次的共有：{filter_data.shape}")
# 是否加上假資料
fake=True
if fake:
    # 取得加上使用者未評分的sample假資料
    filter_data = get_norating_data(filter_data)
    
# 取得電影個數及電影個數
len_users, movies = np.unique(filter_data[:,0]), np.unique(filter_data[:,1])
# 取得訓練資料及測試資料
training_data,  testing_data = training_testing(filter_data)

users_dict, items_dict, features = get_feature_map(data, 'user_movie')
movielens_training_df = generate_with_feature(training_data, users_dict, items_dict, init_col=["user", "movie", "rating"])
movielens_testing_df = generate_with_feature(testing_data, users_dict, items_dict, init_col=["user", "movie", "rating"])



# normalize rating value
# training_data[:, 2:3] = normalize(training_data[:, 2:3], axis=0)
# testing_data[:, 2:3] = normalize(testing_data[:, 2:3], axis=0)
# train_min = training_data[:, 2:3].min()
# train_max = training_data[:, 2:3].max()
# training_rating = (training_data[:, 2] - train_min)/(train_max-train_min)
# test_min = testing_data[:, 2:3].min()
# test_max = testing_data[:, 2:3].max()
# testing_rating = (testing_data[:, 2:3] - test_min)/(test_max-test_min)
print("users: ", len(len_users))
print("items: ", len(movies))

# generarte one hot encoding
bpr = False
if bpr:
    # get uij index
    uij_pos, uij_neg = get_uij(training_data, len_users, movies)
    print("uij_positive: {}, uij_negative: {}".format(uij_pos.shape, uij_neg.shape))
    train_uij = np.vstack((uij_pos, uij_neg))
    test_uij_pos, test_uij_neg = get_uij(testing_data, len_users, movies)
    print("testing uij_positive: {}, testing uij_negative: {}".format(test_uij_pos.shape, test_uij_neg.shape))
    test_uij = np.vstack((test_uij_pos, test_uij_neg))
    one_hot_x, y, add_fake_data = get_uij_one_hot_feature(data,  'user_movie', train_uij, batch_size=100000)
else:
    one_hot_x, y, add_fake_data = get_one_hot_feature(data,  'user_movie', batch_size=100000)

X_train, X_test, y_train, y_test = training_testing_XY(one_hot_x, y, random_state=int(config['model']['random_state']))
training_index, test_index, _, _ = training_testing_XY(add_fake_data, y, random_state=int(config['model']['random_state']))
print(X_train.shape, X_test.shape)
print(one_hot_x.shape)

user_movie:[['196' '242' '3']
 ['186' '302' '3']
 ['22' '377' '1']]
movie_genre:[['1' '3']
 ['1' '4']
 ['1' '5']]
user_age:[['1' '3']
 ['2' '6']
 ['3' '3']]
user_occupation:[['1' '1']
 ['2' '2']
 ['3' '3']]
使用者評分大於三次的共有：(100000, 3)
users:  943
items:  1682
(100000, 3)
[0/194300] sparse_matrix shape is (100000, 2666)
[100000/194300] sparse_matrix shape is (194300, 2666)
(155440, 2676) (38860, 2676)
(194300, 2676)


### Yelp

In [None]:
from sklearn.preprocessing import LabelEncoder

data = get_yelp()
# str to int
user_business = np.array([list(map(int, data)) for data in data['user_business']])
# 濾除使用者評分小於三筆的資料
filter_data = user_filter(user_business, 0)
# user label encoder
le = LabelEncoder()
filter_data[:, 0] = le.fit_transform(filter_data[:, 0])
filter_data[:, 0] += 1
# item label encoder
ile = LabelEncoder()
filter_data[:, 1] = ile.fit_transform(filter_data[:, 1])
filter_data[:, 1] += 1
# if want to inverse label 
# le.inverse_transform(yelp_training_encoder)
print(f"使用者評分大於三次的共有：{filter_data.shape}")
# 是否加上假資料
fake=True
if fake:
    # 取得加上使用者未評分的sample假資料
    filter_data = get_norating_data(filter_data)

# 取得business個數及users個數
yelp_users, business = np.unique(filter_data[:,0]), np.unique(filter_data[:,1])
# 取得訓練資料及測試資料
yelp_training_data,  yelp_testing_data = training_testing(filter_data)
users_dict, items_dict, features = get_feature_map(data, 'user_business')
yelp_training_df = generate_with_feature(yelp_training_data, users_dict, items_dict, init_col=["user", "business", "rating"])
yelp_testing_df = generate_with_feature(yelp_testing_data, users_dict, items_dict, init_col=["user", "business", "rating"])

print("users: ", len(yelp_users))
print("items: ", len(business))
# generarte one hot encoding
bpr = False
if bpr:
    # get uij index
    uij_pos, uij_neg = get_uij(training_data, yelp_users, business)
    print("uij_positive: {}, uij_negative: {}".format(uij_pos.shape, uij_neg.shape))
    train_uij = np.vstack((uij_pos, uij_neg))
    test_uij_pos, test_uij_neg = get_uij(testing_data, yelp_users, business)
    print("testing uij_positive: {}, testing uij_negative: {}".format(test_uij_pos.shape, test_uij_neg.shape))
    test_uij = np.vstack((test_uij_pos, test_uij_neg))
    one_hot_x, y, add_fake_data = get_uij_one_hot_feature(data,  'user_business', train_uij)
else:
    one_hot_x, y, add_fake_data = get_one_hot_feature(data,  'user_business')

# generarte one hot encoding
X_train_yelp, X_test_yelp, y_train_yelp, y_test_yelp = training_testing_XY(one_hot_x, y, random_state=int(config['model']['random_state']))
training_index_yelp, test_index_yelp, _, _ = training_testing_XY(add_fake_data, y, random_state=int(config['model']['random_state']))
print(X_train_yelp.shape, X_test_yelp.shape)
print(one_hot_x.shape)

### Douban

In [None]:
from sklearn.preprocessing import LabelEncoder

data = get_douban()
# str to int
user_book = np.array([list(map(int, data)) for data in data['user_book']])
# 濾除使用者評分小於三筆的資料
filter_data = user_filter(user_book, 0)
# user label encoder
le = LabelEncoder()
filter_data[:, 0] = le.fit_transform(filter_data[:, 0])
filter_data[:, 0] += 1
# item label encoder
ile = LabelEncoder()
filter_data[:, 1] = ile.fit_transform(filter_data[:, 1])
filter_data[:, 1] += 1
print(f"使用者評分大於三次的共有：{filter_data.shape}")
# 是否加上假資料
fake=True
if fake:
    # 取得加上使用者未評分的sample假資料
    filter_data = get_norating_data(filter_data)

# 取得business個數及users個數
douban_users, books = np.unique(filter_data[:,0]), np.unique(filter_data[:,1])
# 取得訓練資料及測試資料
douban_training_data,  douban_testing_data = training_testing(filter_data)
users_dict, items_dict, features = get_feature_map(data, 'user_book')
douban_training_df = generate_with_feature(douban_training_data, users_dict, items_dict, init_col=["user", "book", "rating"])
douban_testing_df = generate_with_feature(douban_testing_data, users_dict, items_dict, init_col=["user", "book", "rating"])

print("users: ", len(douban_users))
print("items: ", len(books))
# generarte one hot encoding
bpr = False
if bpr:
    # get uij index
    uij_pos, uij_neg = get_uij(training_data, douban_users, books)
    print("uij_positive: {}, uij_negative: {}".format(uij_pos.shape, uij_neg.shape))
    train_uij = np.vstack((uij_pos, uij_neg))
    test_uij_pos, test_uij_neg = get_uij(testing_data, douban_users, books)
    print("testing uij_positive: {}, testing uij_negative: {}".format(test_uij_pos.shape, test_uij_neg.shape))
    test_uij = np.vstack((test_uij_pos, test_uij_neg))
    one_hot_x, y, add_fake_data = get_uij_one_hot_feature(data,  'user_book', train_uij)
else:
    one_hot_x, y, add_fake_data = get_one_hot_feature(data,  'user_book')

# generarte one hot encoding
X_train_douban, X_test_douban, y_train_douban, y_test_douban = training_testing_XY(one_hot_x, y, random_state=int(config['model']['random_state']))
training_index_douban, test_index_douban, _, _ = training_testing_XY(add_fake_data, y, random_state=int(config['model']['random_state']))


## 1. User-based Collaborative Filtering (U-CF)

In [None]:
import heapq
import copy
from tqdm import tqdm


In [None]:
def user_sim_score(users, items, train_data, test_data, k=int(config['CF']['user_K'])):
    # make matrix
    user_matrix = get_user_item_matrix(train_data, users, items)
    test_matrix = get_user_item_matrix(test_data, users, items)
    # 計算bias
    bias_matrix = util.get_bias(user_matrix, users, items)
    # 計算相似度
#     cos, pcc = util.get_sim_array(user_matrix)
#     cosine_dis = cos -  np.identity(len(users))
#     pcc_dis = pcc -  np.identity(len(users))
    
#     sim = {"cos":cosine_dis, "pcc":pcc_dis}
    sim = ["cos", "pcc"]
    evaluation = dict()
    for s in sim:
        delta_list = list()
        predict_array = np.zeros((test_matrix.shape))
        # sim_dis = sim[s]
        sim_array = util.get_sim_array(user_matrix, sim=s)
        sim_dis = sim_array -  np.identity(len(users))
        for i in tqdm(range(len(users)), desc=f"UCF predicting {s} score with {k}"):
            # Suv: 取出前K個最相似的使用者相似度 ex:K=3, output=[0.378, 0.353, 0.336]
            Suv = heapq.nlargest(k ,sim_dis[i])
            # 若i不存在，則跳過
            if np.isnan(sim_dis[i]).all():
                continue
            # top_sim_index: 取出與使用者i最為相似的前K個使用者 ex:K=3, output=[915, 406, 214]
            sim_dis_idx = sim_dis[i].tolist()
            top_sim_index = list(map(sim_dis_idx.index, heapq.nlargest(k,sim_dis[i])))
            # recall
            prediction = list()
            # 計算相似使用者與使用者i的評分誤差
            for item_idx in range(len(items)):
                # 取得使用者i的評分(ground truth)
                rth = test_matrix[i, item_idx]
                # 如果使用者i有進行評分，則才納入計算RMSE
                if rth != 0:
                    # 之後需剔除對電影m未評分的相似使用者，因此先進行複製，才不會影響下一部電影的計算
                    copy_Suv = copy.deepcopy(Suv)
                    # R: 若相似使用者對電影 m 有評分則進行調整
                    R = list()
                    # 判斷相似使用者是否對電影ｍ有評分，若有評分則將原始評分減去該使用者對電影m的bias
                    for c, j in enumerate(top_sim_index):
                        if  test_matrix[j, item_idx] == 0:
                            R.append(0)
                            copy_Suv[c] = 0
                        else:
                            R.append(test_matrix[j, item_idx] - bias_matrix[j, item_idx])
                    # 如果所有相似使用者都沒評分則跳過此次計算
                    if sum(R) != 0:
                        # 預測使用者i對於第m部電影的評分 + 使用者i對電影m的偏差
                        Rui = predict(copy_Suv, R) + bias_matrix[i, item_idx]
                        # 計算square error
                        delta_list.append(util.se(rth, Rui))
                        # 儲存預測結果, 並取四捨五入
                        predict_array[i, item_idx] = Rui
        # 各評估指標
        evaluation[f'{s}_rmse']= util.rmse(delta_list)
        evaluation[f'{s}_recall@10'] = recall_k(test_matrix, predict_array) 
        evaluation[f'{s}_NDCG@10']=ndcg_score(test_matrix, predict_array, k=10)
        
    return evaluation


print("==========\nMovielens:\n==========")
wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="UCF")
wandb_log = WandbLog()
movie_reuslt = user_sim_score(len_users, movies, training_data, testing_data)
wandb_log.log_evaluation(movie_reuslt)
print(movie_reuslt)
wandb.finish()

print("==========\nYelp:\n==========")
wandb.init(project=config['general']['yelp'],
                        entity=config['general']['entity'],
                        group="UCF")
wandb_log = WandbLog()
yelp_reuslt = user_sim_score(yelp_users, business, yelp_training_data, yelp_testing_data)
wandb_log.log_evaluation(yelp_reuslt)
print(yelp_reuslt)
wandb.finish()

print("==========\nDouban Book:\n==========")
wandb.init(project=config['general']['douban'],
                        entity=config['general']['entity'],
                        group="UCF")
wandb_log = WandbLog()
douban_reuslt = user_sim_score(douban_users, books, douban_training_data, douban_testing_data)
wandb_log.log_evaluation(douban_reuslt)
print(douban_reuslt)
wandb.finish()

## 2. Item-based Collaborative Filtering (I-CF)

In [None]:
from scipy import sparse
import pandas as pd
from util.mywandb import WandbLog

def item_sim_score(users, items, train_data, test_data, k=int(config['CF']['user_K'])):
    # make matrix
    user_matrix = get_user_item_matrix(train_data, users, items)
    test_matrix = get_user_item_matrix(test_data, users, items)
    item_matrix = user_matrix.T 
    item_test = test_matrix.T
    #item_test = sparse.csr_matrix(item_test)
    del test_matrix
    
    # 計算bias
    bias_matrix = util.get_bias(user_matrix, users, items)
    item_bias = bias_matrix.T
    del bias_matrix
    del user_matrix
    
    # 計算相似度
    #cos, pcc = util.get_sim_array(item_matrix)
    #cosine_dis = cos -  np.identity(len(items))
    #cosine_dis = sparse.csr_matrix(cosine_dis)
    #pcc_dis = pcc -  np.identity(len(items))
    #pcc_dis = sparse.csr_matrix(pcc_dis)
    #sim = {"cos":cosine_dis, "pcc":pcc_dis}
    sim = ["cos", "pcc"]
    evaluation = dict()
    for s in sim:
        delta_list = list()
        predict_array = np.zeros((item_test.shape))
        # predict array to spase
        predict_array = sparse.csr_matrix(predict_array)
        sim_array = util.get_sim_array(item_matrix, sim=s)
        sim_dis = sim_array -  np.identity(len(items))
        # sim_dis = sim[s]
        for i in tqdm(range(len(items)), desc=f"ICF predicting {s} score with {k}"):
            # Siv: 取出前K個最相似的使用者相似度 ex:K=3, output=[0.378, 0.353, 0.336]
            Siv = heapq.nlargest(k ,sim_dis[i])
            # 若i不存在，則跳過
            if np.isnan(sim_dis[i]).all():
                continue
            sim_dis[i][np.isnan(sim_dis[i])] = 0
            # top_sim_index: 取出與使用者i最為相似的前K個使用者 ex:K=3, output=[915, 406, 214]
            sim_dis_idx = sim_dis[i].tolist()
            top_sim_index = list(map(sim_dis_idx.index, heapq.nlargest(k,sim_dis[i])))
            # recall
            prediction = list()
            # 計算相似電影與電影i的評分誤差
            for user_idx in range(len(users)):
                # 取得項目i的評分(ground truth)
                rth = item_test[i, user_idx]
                # 如果使用者i有進行評分，則才納入計算RMSE
                if rth != 0:
                    # 之後需剔除對電影m未評分的相似使用者，因此先進行複製，才不會影響下一部電影的計算
                    copy_Siv = copy.deepcopy(Siv)
                    # R: 若相似使用者對電影 m 有評分則進行調整
                    R = list()
                    # 判斷相似使用者是否對電影ｍ有評分，若有評分則將原始評分減去該使用者對電影m的bias
                    for c, j in enumerate(top_sim_index):
                        if  item_test[j, user_idx] == 0:
                            R.append(0)
                            copy_Siv[c] = 0
                        else:
                            R.append(item_test[j, user_idx] - item_bias[j, user_idx])
                    # 如果所有相似使用者都沒評分則跳過此次計算
                    if sum(R) != 0:
                        # 預測使用者i對於第m部電影的評分 + 使用者i對電影m的偏差
                        Rui = predict(copy_Siv, R) + item_bias[i, user_idx]
                        # 計算square error
                        delta_list.append(util.se(rth, Rui))
                        # 儲存預測結果, 並取四捨五入
                        if np.isnan(Rui):
                            Rui=0
                        predict_array[i, user_idx] = Rui
        
        
        # 各評估指標
        delta_list = pd.Series(delta_list, dtype=object).fillna(0).tolist()
        evaluation[f'{s}_rmse']= util.rmse(delta_list)
        evaluation[f'{s}_recall@10'] = recall_k(item_test, predict_array) 
        evaluation[f'{s}_NDCG@10']=ndcg_score(item_test, predict_array.toarray(), k=10)
        
    return evaluation

# print("==========\nMovielens:\n==========")
# wandb.init(project=config['general']['movielens'],
#                         entity=config['general']['entity'],
#                         group="ICF")
# wandb_log = WandbLog()
# movie_reuslt = item_sim_score(len_users, movies, training_data, testing_data)
# wandb_log.log_evaluation(movie_reuslt)
# print(movie_reuslt)
# wandb.finish()

print("==========\nYelp:\n==========")
wandb.init(project=config['general']['yelp'],
                        entity=config['general']['entity'],
                        group="ICF")
wandb_log = WandbLog()
yelp_reuslt = item_sim_score(yelp_users, business, yelp_training_data, yelp_testing_data)
wandb_log.log_evaluation(yelp_reuslt)
print(yelp_reuslt)
wandb.finish()

# print("==========\nDouban Book:\n==========")
# wandb.init(project=config['general']['douban'],
#                         entity=config['general']['entity'],
#                         group="ICF")
# wandb_log = WandbLog()
# douban_reuslt = item_sim_score(douban_users, books, douban_training_data, douban_testing_data)
# wandb_log.log_evaluation(douban_reuslt)
# print(douban_reuslt)
# wandb.finish()

## 3. MF

In [None]:
import wandb
from util.mywandb import WandbLog

# 進行測試資料驗證評估
def test(test_data, p, q, gu=False, bu=False, bi=False):
    rmse_test = list()

    for test in test_data:
        user = test[0] - 1
        movie = test[1] - 1
        # 判斷是否有bias
        if gu and bu.any() and bi.any():
            rmse_test.append(util.se(test[2], (np.dot(p[user], q[movie]) + gu + bu[user] + bi[movie])))
        else:
            rmse_test.append(util.se(test[2], (np.dot(p[user], q[movie]))))
    return util.rmse(rmse_test)

def execute_matrix_factorization(users, items, train_data, test_data):
    # 存放測試資料集的rmse結果
    MF_bias_testing = list()
    # init evaluation
    evaluation = dict()
    user_item = get_user_item_matrix(train_data, users, items)
    test_matrix = get_user_item_matrix(test_data, users, items)

    # init setting global mean
    gu= util.get_u(user_item)
    # init setting user mean as bias
    bu = np.array([util.get_ubias(user_item, i) - gu for i in range(len(users))])
    # init setting items mean as bias
    bi = np.array([util.get_ibias(user_item, m) - gu for m in range(len(items))])

    # init lentent vector
    K = int(config["MF"]["latent_vector_number"])
    # init user lentent matrix
    P = np.random.uniform(low=0, high=3, size=(users.max(), K))
    # init items lentent matrix
    Q = np.random.uniform(low=0, high=3, size=(items.max(), K))

    # parameter
    epochs = int(config["MF"]["epochs"])
    alpha = float(config["MF"]["alpha"])
    l = float(config["MF"]["learning_rate"])

    # 更新次數, init=100
    for epoch in range(epochs):
        # 存放 spuare error 結果
        se_list = list()
        # 針對user有評分過的rating位置進行更新(User Latent Matrix)
        for j in range(len(users)):
            # 找出被使用者j評分過的電影
            # movie_index = [i for i, e in enumerate(user_item[j]) if e != 0]
            movie_index = np.nonzero(user_item[j])[0]
            for m in movie_index:
                # 對u 做偏微分進行ＳＧＤ更新
                tmp_gu = gu - alpha * (((np.dot(P[j], Q[m]) + gu + bu[j] + bi[m]) - user_item[j,m]) + l*(gu))
                # 對bu 做偏微分進行ＳＧＤ更新
                tmp_bu = bu[j] - alpha * (((np.dot(P[j], Q[m]) + gu + bu[j] + bi[m]) - user_item[j,m]) + l*(bu[j]))
                # 對bi 做偏微分進行ＳＧＤ更新
                tmp_bi = bi[m] - alpha * (((np.dot(P[j], Q[m]) + gu + bu[j] + bi[m]) - user_item[j,m]) + l*(bi[m]))
                # 若user item 有值則對Q的相對欄位進行SGD更新, 將更新後user latent matrix先暫存
                tmp = Q[m] - alpha * (((np.dot(P[j], Q[m]) + gu + bu[j] + bi[m]) - user_item[j,m]) * P[j] + l*(Q[m]))
                # 更新 movie latent matrix
                P[j] -= alpha * (((np.dot(P[j], Q[m]) + gu + bu[j] + bi[m]) - user_item[j,m]) * Q[m] + l*(P[j]))
                # 更新 user latent matrix
                Q[m] = tmp
                # 更新bias
                gu = tmp_gu
                bu[j] = tmp_bu
                bi[m] = tmp_bi
                # 計算ＳＥ
                se_list.append(util.se(user_item[j, m], (np.dot(P[j], Q[m]) + gu + bu[j] + bi[m])))
                
        # 進行驗證資料測試
        MF_bias_testing.append(test(test_data, P, Q, gu, bu, bi))
        if epoch % 9 == 0:
            print(f"[{epoch}/{epochs}] gu={gu}, bu={np.mean(bu)}, bi={np.mean(bi)}, testing error={MF_bias_testing[-1]}")

    # 各評估指標
    evaluation['rmse']= MF_bias_testing[-1]
    evaluation['recall@10'] = recall_k(test_matrix, np.dot(P, Q.T))
    evaluation['NDCG@10'] = ndcg_score(test_matrix, np.dot(P, Q.T))
    
    return evaluation

# print("==========\nMovielens:\n==========")
# wandb.init(project=config['general']['movielens'],
#                         entity=config['general']['entity'],
#                         group="MF")
# wandb_log = WandbLog()
# movie_reuslt = execute_matrix_factorization(len_users, movies, training_data, testing_data)
# wandb_log.log_evaluation(movie_reuslt)
# print(movie_reuslt)
# wandb.finish()

print("==========\nYelp:\n==========")
wandb.init(project=config['general']['yelp'],
                        entity=config['general']['entity'],
                        group="MF")
wandb_log = WandbLog()
yelp_reuslt = execute_matrix_factorization(yelp_users, business, yelp_training_data, yelp_testing_data)
wandb_log.log_evaluation(yelp_reuslt)
print(yelp_reuslt)
wandb.finish()

print("==========\nDouban Book:\n==========")
wandb.init(project=config['general']['douban'],
                        entity=config['general']['entity'],
                        group="MF")
wandb_log = WandbLog()
douban_reuslt = execute_matrix_factorization(douban_users, books, douban_training_data, douban_testing_data)
wandb_log.log_evaluation(douban_reuslt)
print(douban_reuslt)
wandb.finish()

## 4. BPR-MF

In [15]:
import wandb
from util.mywandb import WandbLog

# 進行測試資料驗證評估
def test(test_uij, users, items, p, q, gu=False, bu=False, bi=False):
    rmse_test = list()
    
    
    for u, i, j, rank in test_uij:
        u_idx = u - 1
        i_idx = i - 1
        j_idx = j - 1
        rui = np.dot(p[u_idx], q[i_idx])
        ruj = np.dot(p[u_idx], q[j_idx])
        x_uij =  rui - ruj
        # sigmoid
        exp_x = np.exp(-x_uij)
        y_hat = 1/(1 + np.exp(exp_x))
        
        rmse_test.append(util.se(y_hat, rank))
        
    return util.rmse(rmse_test)


            
    
def execute_bpr_matrix_factorization(users, items, train_data, test_data):
    # 存放測試資料集的rmse結果
    MF_bias_testing = list()
    # init evaluation
    evaluation = dict()
    user_item = get_user_item_matrix(train_data, users, items)
    test_matrix = get_user_item_matrix(test_data, users, items)

    # init setting global mean
    gu= util.get_u(user_item)
    # init setting user mean as bias
    bu = np.array([util.get_ubias(user_item, i) - gu for i in range(len(users))])
    # init setting items mean as bias
    bi = np.array([util.get_ibias(user_item, m) - gu for m in range(len(items))])

    # init lentent vector
    K = int(config["MF"]["latent_vector_number"])
    # init user lentent matrix
    # P = np.random.uniform(low=0, high=3, size=(users.max(), K))
    P = np.random.randn(users.max(), K)/10
    # init items lentent matrix
    # Q = np.random.uniform(low=0, high=3, size=(items.max(), K))
    Q = np.random.randn(items.max(), K)/10
    
    # get uij index
    uij_pos, uij_neg = get_uij(train_data, users, items)
    print("uij_positive: {}, uij_negative: {}".format(uij_pos.shape, uij_neg.shape))
    # uij = np.vstack(uij_pos, uij_neg)
    test_uij_pos, test_uij_neg = get_uij(test_data, users, items)
    print("testing uij_positive: {}, testing uij_negative: {}".format(test_uij_pos.shape, test_uij_neg.shape))
    test_uij = np.vstack((test_uij_pos, test_uij_neg))

    # parameter
    epochs = int(config["MF"]["epochs"])
    alpha = float(config["MF"]["alpha"])
    l = float(config["MF"]["learning_rate"])

    # 更新次數, init=100
    for epoch in range(epochs):
        # 針對user有評分過的rating位置進行更新(User Latent Matrix)
        for u, i, j, rank in uij_pos:
            # 計算x_uij
            # rui = np.dot(P[u], Q[i]) + gu + bu[u] + bi[i]
            # ruj = np.dot(P[u], Q[j]) + gu + bu[u] + bi[j]
            i_idx = i-1
            j_idx = j-1
            u_idx = u-1
            rui = np.dot(P[u_idx], Q[i_idx])
            ruj = np.dot(P[u_idx], Q[j_idx])
            x_uij =  rui - ruj
            
            # sigmoid
            exp_x = np.exp(-x_uij)
            partial_BPR = 1/(1 + np.exp(exp_x))
            
            # 更新 user latent matrix
            New_P= alpha * (partial_BPR*(Q[i_idx]-Q[j_idx]) + l*(P[u_idx]))
            # 若user item 有值則對Q的相對欄位進行SGD更新, 將更新後user latent matrix先暫存
            Q[i_idx] -=  alpha * (partial_BPR*P[u_idx] + l*(Q[i_idx]))
            Q[j_idx] -=  alpha * (partial_BPR*-P[u_idx] + l*(Q[j_idx]))
            # 更新 user latent matrix
            P[u_idx] = New_P
            # # 更新bias
            # # 對u 做偏微分進行ＳＧＤ更新
            # gu = gu - alpha * ((rui - user_item[j,m]) + l*(gu))
            # # 對bu 做偏微分進行ＳＧＤ更新
            # bu[j] = bu[j] - alpha * ((rui - user_item[j,m]) + l*(bu[j]))
            # # 對bi 做偏微分進行ＳＧＤ更新
            # bi[m] = bi[m] - alpha * ((rui - user_item[j,m]) + l*(bi[m]))
            

                
        # 進行驗證資料測試
        MF_bias_testing.append(test(test_uij, users, items, P, Q))
        if epoch % 9 == 0:
            print(f"[{epoch}/{epochs}] testing error={MF_bias_testing[-1]}")
    
    rui = np.dot(P[u_idx], Q[i_idx])
    ruj = np.dot(P[u_idx], Q[j_idx])
    x_uij =  rui - ruj
    print("{} user like item{} more than item{}, score is {}: ".format(u, i, j, x_uij))
    # 各評估指標
    evaluation['rmse']= MF_bias_testing[-1]
    evaluation['recall@10'] = recall_k(test_matrix, np.dot(P, Q.T))
    evaluation['NDCG@10'] = ndcg_score(test_matrix, np.dot(P, Q.T))
    
    return evaluation, P, Q

print("==========\nMovielens:\n==========")
wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="BPR-MF")
wandb_log = WandbLog()
movie_reuslt, P, Q = execute_bpr_matrix_factorization(len_users, movies, training_data, testing_data)
print(movie_reuslt)
wandb_log.log_evaluation(movie_reuslt)
wandb.finish()

# print("==========\nYelp:\n==========")
# wandb.init(project=config['general']['yelp'],
#                         entity=config['general']['entity'],
#                         group="MF")
# wandb_log = WandbLog()
# yelp_reuslt = execute_matrix_factorization(yelp_users, business, yelp_training_data, yelp_testing_data)
# wandb_log.log_evaluation(yelp_reuslt)
# print(yelp_reuslt)
# wandb.finish()

# print("==========\nDouban Book:\n==========")
# wandb.init(project=config['general']['douban'],
#                         entity=config['general']['entity'],
#                         group="MF")
# wandb_log = WandbLog()
# douban_reuslt = execute_matrix_factorization(douban_users, books, douban_training_data, douban_testing_data)
# wandb_log.log_evaluation(douban_reuslt)
# print(douban_reuslt)
# wandb.finish()

Movielens:





data transfer user matrix: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 943/943 [00:00<00:00, 2353.96it/s]
data transfer user matrix: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 943/943 [00:00<00:00, 8973.98it/s]


[0/943] uij_pos: (239, 4), uij_neg: (238, 4)
[300/943] uij_pos: (28185, 4), uij_neg: (27998, 4)
[600/943] uij_pos: (59223, 4), uij_neg: (58840, 4)
[900/943] uij_pos: (85840, 4), uij_neg: (85286, 4)
uij_positive: (89267, 4), uij_negative: (88690, 4)
[0/943] uij_pos: (44, 4), uij_neg: (44, 4)
[300/943] uij_pos: (7471, 4), uij_neg: (7316, 4)
[600/943] uij_pos: (15455, 4), uij_neg: (15150, 4)
[900/943] uij_pos: (22434, 4), uij_neg: (21972, 4)
testing uij_positive: (23270, 4), testing uij_negative: (22786, 4)
[0/100] testing error=0.5530065562885556
[9/100] testing error=0.5530065399186914
[18/100] testing error=0.5530065492137272
[27/100] testing error=0.5530065526123277
[36/100] testing error=0.5530065536114757
[45/100] testing error=0.5530065539055203
[54/100] testing error=0.5530065539935322
[63/100] testing error=0.5530065540197368
[72/100] testing error=0.553006554027163
[81/100] testing error=0.5530065540289968
[90/100] testing error=0.5530065540292651
[99/100] testing error=0.553006

0,1
NDCG@10,0.24916
recall@10,0.00042
rmse,0.55301


## 5. FM

In [None]:
import pywFM
import random
from IPython.display import clear_output

def execute_factorization_machine(X, y, X_test, y_test, training_index, test_index, users, items):
    rating_testing_array = generate_eval_array(y_test, test_index, users, items)
    
    # kfold = 5
    kfold = list()
    recall = list()
    ndcg = list()
    result = dict()
    sum_predict_values = 0 
    for i in range(5):
        print(f"Start {i} FM Cross-Validation")
        random_state = random.randint(0, 50)
        X_train, X_val, y_train, y_val = training_testing_XY(X, y, test_size=float(config["model"]["val_rate"]), random_state=random_state)
        _, val_index, _, _ = training_testing_XY(training_index, y, test_size=float(config["model"]["val_rate"]), random_state=random_state)

        # reshape y
        y_train = y_train.reshape(1, -1)[0]
        y_test = y_test.reshape(1, -1)[0]
        y_val = y_val.reshape(1, -1)[0]

        # define model
        fm = pywFM.FM(task='regression')

        model = fm.run(X_train, y_train, X_val, y_val)
        predict_values = model.predictions
        predict = generate_eval_array(predict_values, val_index, users, items)
        kfold.append(util.rmse(predict_values - y_val))
        recall.append(recall_k(rating_testing_array, predict))
        ndcg.append(ndcg_score(rating_testing_array, predict))
        #sum_predict_values += predict_values
        clear_output()

    result['rmse'] = sum(kfold)/len(kfold) 
    result['recall@10'] = sum(recall)/len(recall)
    result['NDCG@10'] = sum(ndcg)/len(ndcg)

    return result

# print("==========\nMovielens:\n==========")
# wandb.init(project=config['general']['movielens'],
#                         entity=config['general']['entity'],
#                         group="FM")
# wandb_log = WandbLog()
# movie_reuslt = execute_factorization_machine(X_train, y_train, X_test, y_test, training_index, test_index, len_users, movies)

# wandb_log.log_evaluation(movie_reuslt)
# print(movie_reuslt)
# wandb.finish()

# print("==========\nYelp:\n==========")
# wandb.init(project=config['general']['yelp'],
#                         entity=config['general']['entity'],
#                         group="FM")
# wandb_log = WandbLog()
# yelp_reuslt = execute_factorization_machine(X_train_yelp, y_train_yelp, X_test_yelp, y_test_yelp, training_index_yelp, test_index_yelp, yelp_users, business)

# wandb_log.log_evaluation(yelp_reuslt)
# print(yelp_reuslt)
# wandb.finish()

print("==========\nDouban Book:\n==========")
wandb.init(project=config['general']['douban'],
                        entity=config['general']['entity'],
                        group="FM")
wandb_log = WandbLog()
douban_reuslt = execute_factorization_machine(X_train_douban, y_train_douban, X_test_douban, y_test_douban, training_index_douban, test_index_douban, douban_users, books)
wandb_log.log_evaluation(douban_reuslt)
print(douban_reuslt)
wandb.finish()

## 6. BPR-FM

In [16]:
import pywFM
import random
from IPython.display import clear_output

def execute_bpr_factorization_machine(X, y, X_test, y_test, training_index, test_index, users, items):
    rating_testing_array = generate_eval_array(y_test, test_index, users, items)
    
    # kfold = 5
    kfold = list()
    recall = list()
    ndcg = list()
    result = dict()
    sum_predict_values = 0 
    for i in range(5):
        print(f"Start {i} FM Cross-Validation")
        random_state = random.randint(0, 50)
        X_train, X_val, y_train, y_val = training_testing_XY(X, y, test_size=float(config["model"]["val_rate"]), random_state=random_state)
        _, val_index, _, _ = training_testing_XY(training_index, y, test_size=float(config["model"]["val_rate"]), random_state=random_state)
        
        # reshape y
        y_train = y_train.reshape(1, -1)[0]
        y_test = y_test.reshape(1, -1)[0]
        y_val = y_val.reshape(1, -1)[0]

        # define model
        fm = pywFM.FM(task='classification')

        model = fm.run(X_train, y_train, X_val, y_val)
        predict_values = model.predictions
        predict = generate_eval_array(predict_values, val_index, users, items)
        print(predict_values - y_val)
        kfold.append(util.rmse(list(map(abs, predict_values - y_val))))
        recall.append(recall_k(rating_testing_array, predict))
        ndcg.append(ndcg_score(rating_testing_array, predict))
        #sum_predict_values += predict_values
        clear_output()

    result['rmse'] = sum(kfold)/len(kfold) 
    result['recall@10'] = sum(recall)/len(recall)
    result['NDCG@10'] = sum(ndcg)/len(ndcg)

    return result

print("==========\nMovielens:\n==========")
wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="BPR-FM")
wandb_log = WandbLog()
movie_reuslt = execute_bpr_factorization_machine(X_train, y_train, X_test, y_test, training_index, test_index, len_users, movies)

wandb_log.log_evaluation(movie_reuslt)
print(movie_reuslt)
wandb.finish()

# print("==========\nYelp:\n==========")
# wandb.init(project=config['general']['yelp'],
#                         entity=config['general']['entity'],
#                         group="BPR-FM")
# wandb_log = WandbLog()
# yelp_reuslt = execute_factorization_machine(X_train_yelp, y_train_yelp, X_test_yelp, y_test_yelp, training_index_yelp, test_index_yelp, yelp_users, business)

# wandb_log.log_evaluation(yelp_reuslt)
# print(yelp_reuslt)
# wandb.finish()

# print("==========\nDouban Book:\n==========")
# wandb.init(project=config['general']['douban'],
#                         entity=config['general']['entity'],
#                         group="BPR-FM")
# wandb_log = WandbLog()
# douban_reuslt = execute_factorization_machine(X_train_douban, y_train_douban, X_test_douban, y_test_douban, training_index_douban, test_index_douban, douban_users, books)
# wandb_log.log_evaluation(douban_reuslt)
# print(douban_reuslt)
# wandb.finish()

{'rmse': 0.7123524561238528, 'recall@10': 0.04213040476104326, 'NDCG@10': 0.29914077644948034}



0,1
NDCG@10,0.29914
recall@10,0.04213
rmse,0.71235


## 7.GBDT+LR

In [20]:
from sktools import GradientBoostingFeatureGenerator
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import random
from IPython.display import clear_output

def execute_gbdt_lr(X, y, X_test, y_test, training_index, test_index, users, items):
    rating_testing_array = generate_eval_array(y_test, test_index, users, items)
    
    # kfold = 5
    kfold = list()
    recall = list()
    ndcg = list()
    result = dict()
    sum_predict_values = 0 
    for i in range(5):
        print(f"Start {i} GBDT+LR Cross-Validation")
        random_state = random.randint(0, 50)
        X_train, X_val, y_train, y_val = training_testing_XY(X, y, test_size=float(config["model"]["val_rate"]), random_state=random_state)
        _, val_index, _, _ = training_testing_XY(training_index, y, test_size=float(config["model"]["val_rate"]), random_state=random_state)
        
        # reshape y
        y_train = y_train.ravel()
        y_test = y_test.reshape(1, -1)[0]
        y_val = y_val.reshape(1, -1)[0]

        # define model
        gbf = GradientBoostingFeatureGenerator(regression=True)
        lr = LogisticRegression()
        pipe = Pipeline([("gb_features", gbf), ("logistic", lr)])
        
        pipe.fit(X_train.toarray(), y_train)

        predict_values = pipe.predict(X_val.toarray())
        predict = generate_eval_array(predict_values, val_index, users, items)
        kfold.append(util.rmse(list(map(abs, predict_values - y_val))))
        recall.append(recall_k(rating_testing_array, predict))
        ndcg.append(ndcg_score(rating_testing_array, predict))
        #sum_predict_values += predict_values
        clear_output()

    result['rmse'] = sum(kfold)/len(kfold) 
    result['recall@10'] = sum(recall)/len(recall)
    result['NDCG@10'] = sum(ndcg)/len(ndcg)

    return result

print("==========\nMovielens:\n==========")
wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="GBDT_LR")
wandb_log = WandbLog()
movie_reuslt = execute_gbdt_lr(X_train, y_train, X_test, y_test, training_index, test_index, len_users, movies)
print(movie_reuslt)
wandb_log.log_evaluation(movie_reuslt)
wandb.finish()

# print("==========\nYelp:\n==========")
# wandb.init(project=config['general']['yelp'],
#                         entity=config['general']['entity'],
#                         group="BPR-FM")
# wandb_log = WandbLog()
# yelp_reuslt = execute_factorization_machine(X_train_yelp, y_train_yelp, X_test_yelp, y_test_yelp, training_index_yelp, test_index_yelp, yelp_users, business)

# wandb_log.log_evaluation(yelp_reuslt)
# print(yelp_reuslt)
# wandb.finish()

# print("==========\nDouban Book:\n==========")
# wandb.init(project=config['general']['douban'],
#                         entity=config['general']['entity'],
#                         group="BPR-FM")
# wandb_log = WandbLog()
# douban_reuslt = execute_factorization_machine(X_train_douban, y_train_douban, X_test_douban, y_test_douban, training_index_douban, test_index_douban, douban_users, books)
# wandb_log.log_evaluation(douban_reuslt)
# print(douban_reuslt)
# wandb.finish()

{'rmse': 1.3435173759332357, 'recall@10': 0.017182727960031178, 'NDCG@10': 0.2678576171164622}



0,1
NDCG@10,0.26786
recall@10,0.01718
rmse,1.34352


## 8. XGB-LR

In [21]:
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import random
from IPython.display import clear_output

def execute_xgb_lr(X, y, X_test, y_test, training_index, test_index, users, items):
    rating_testing_array = generate_eval_array(y_test, test_index, users, items)
    
    # kfold = 5
    kfold = list()
    recall = list()
    ndcg = list()
    result = dict()
    sum_predict_values = 0 
    for i in range(5):
        print(f"Start {i} XGB+LR Cross-Validation")
        random_state = random.randint(0, 50)
        X_train, X_val, y_train, y_val = training_testing_XY(X, y, test_size=float(config["model"]["val_rate"]), random_state=random_state)
        _, val_index, _, _ = training_testing_XY(training_index, y, test_size=float(config["model"]["val_rate"]), random_state=random_state)
        
        # reshape y
        y_train = y_train.ravel()
        y_test = y_test.reshape(1, -1)[0]
        y_val = y_val.reshape(1, -1)[0]

        # define model
        gbf = SelectFromModel(estimator=XGBRegressor(), max_features=100, threshold=-np.inf)
        lr = LogisticRegression()
        pipe = Pipeline([("xgb_features", gbf), ("logistic", lr)])
        
        pipe.fit(X_train.toarray(), y_train)

        predict_values = pipe.predict(X_val.toarray())
        predict = generate_eval_array(predict_values, val_index, users, items)
        kfold.append(util.rmse(list(map(abs, predict_values - y_val))))
        recall.append(recall_k(rating_testing_array, predict))
        ndcg.append(ndcg_score(rating_testing_array, predict))
        #sum_predict_values += predict_values
        clear_output()

    result['rmse'] = sum(kfold)/len(kfold) 
    result['recall@10'] = sum(recall)/len(recall)
    result['NDCG@10'] = sum(ndcg)/len(ndcg)

    return result

print("==========\nMovielens:\n==========")
wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="XGB_LR")
wandb_log = WandbLog()
movie_reuslt = execute_xgb_lr(X_train, y_train, X_test, y_test, training_index, test_index, len_users, movies)
print(movie_reuslt)
wandb_log.log_evaluation(movie_reuslt)
wandb.finish()

# print("==========\nYelp:\n==========")
# wandb.init(project=config['general']['yelp'],
#                         entity=config['general']['entity'],
#                         group="BPR-FM")
# wandb_log = WandbLog()
# yelp_reuslt = execute_factorization_machine(X_train_yelp, y_train_yelp, X_test_yelp, y_test_yelp, training_index_yelp, test_index_yelp, yelp_users, business)

# wandb_log.log_evaluation(yelp_reuslt)
# print(yelp_reuslt)
# wandb.finish()

# print("==========\nDouban Book:\n==========")
# wandb.init(project=config['general']['douban'],
#                         entity=config['general']['entity'],
#                         group="BPR-FM")
# wandb_log = WandbLog()
# douban_reuslt = execute_factorization_machine(X_train_douban, y_train_douban, X_test_douban, y_test_douban, training_index_douban, test_index_douban, douban_users, books)
# wandb_log.log_evaluation(douban_reuslt)
# print(douban_reuslt)
# wandb.finish()

{'rmse': 1.3514107576601617, 'recall@10': 0.017182727960031178, 'NDCG@10': 0.2678576171164622}



0,1
NDCG@10,0.26786
recall@10,0.01718
rmse,1.35141


## 9. NN-based RecSys Methods

### define model

In [13]:
from models.nn_based_models import DeepCTRModel


def deepfm(dataframe, testing_data, test_index, users, movies):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="DeepFM",
                        reinit=True)
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation'],
                        dense=['user_age'],
                        y=['rating'])
    result = deer.DeepFM(dataframe, testing_data, test_index, users, movies)
    clear_output()
    print(f"DeepFM={result}")
    run.finish()

def nfm(dataframe, testing_data, test_index, users, movies):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="NFM",
                        reinit=True)
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation'],
                        dense=['user_age'],
                        y=['rating'])
    result = deer.NFM(dataframe, testing_data, test_index, users, movies)
    clear_output()
    print(f"NFM={result}")
    run.finish()
    
def dcn(dataframe, testing_data, test_index, users, movies):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="DCN",
                        reinit=True)
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation'],
                        dense=['user_age'],
                        y=['rating'])
    result = deer.DCN(dataframe, testing_data, test_index, users, movies)
    clear_output()
    print(f"DCN={result}")
    run.finish()

def wd(dataframe, testing_data, test_index, users, movies):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="W&D",
                        reinit=True)
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation'],
                        dense=['user_age'],
                        y=['rating'])
    result = deer.WD(dataframe, testing_data, test_index, users, movies)
    clear_output()
    print(f"W&D={result}")
    run.finish()

def ccpm(dataframe, testing_data, test_index, users, movies):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="CCPM",
                        reinit=True)
    # no suppot dense
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation', 'user_age'],
                        y=['rating'])
    result = deer.CCPM(dataframe, testing_data, test_index, users, movies)
    clear_output()
    print(f"CCPM={result}")
    run.finish()
    
def fnn(dataframe, testing_data, test_index, users, movies):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="FNN",
                        reinit=True)
    deer= DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation'],
                        dense=['user_age'],
                        y=['rating'])
    result, _ = deer.FNN(dataframe, testing_data, test_index, users, movies)
    clear_output()
    print(f"FNN={result}")

    run.finish()

def ipnn(dataframe, testing_data, test_index, users, movies, inner=True, outter=False):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="IPNN",
                        reinit=True)
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation'],
                        dense=['user_age'],
                        y=['rating'])
    result, _ = deer.PNN(dataframe, testing_data, test_index, users, movies, inner=inner, outter=outter)
    clear_output()
    print(f"IPNN={result}")
    run.finish()

def opnn(dataframe, testing_data, test_index, users, movies, inner=False, outter=True):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="OPNN",
                        reinit=True)
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation'],
                        dense=['user_age'],
                        y=['rating'])
    result, _ = deer.PNN(dataframe, testing_data, test_index, users, movies, inner=inner, outter=outter)
    clear_output()
    print(f"OPNN={result}")
    run.finish()

def pin(dataframe, testing_data, test_index, users, movies, inner=True, outter=True):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="PIN",
                        reinit=True)
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation'],
                        dense=['user_age'],
                        y=['rating'])
    result = deer.PNN(dataframe, testing_data, test_index, users, movies, inner=inner, outter=outter)
    clear_output()
    print(f"PIN={result}")
    run.finish()

### run model

In [14]:
print("==========\nMovielens:\n==========")
# 1. FM-supported Neural Networks
fnn(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# 2. Product-based Neural Networks
ipnn(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
opnn(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
pin(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# 3. Convolutional Click Prediction Model 
ccpm(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# 4. neumf
# 5. Wide&Deep
wd(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# 6. Deep Drossing
dcn(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# 7. Neural Factorization Machine
nfm(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# 8. Deep Factorization Machine
deepfm(movielens_training_df, movielens_testing_df, test_index, len_users, movies)

# print("==========\nYelp:\n==========")
# # 1. FM-supported Neural Networks
# fnn(yelp_training_df, yelp_testing_df, test_index, yelp_users, business)
# # 2. Product-based Neural Networks
# ipnn(yelp_training_df, yelp_testing_df, test_index, yelp_users, business)
# opnn(yelp_training_df, yelp_testing_df, test_index, yelp_users, business)
# # 3. Convolutional Click Prediction Model 
# ccpm(yelp_training_df, yelp_testing_df, test_index, yelp_users, business)
# # 4. neumf
# # 5. Wide&Deep
# wd(yelp_training_df, yelp_testing_df, test_index, yelp_users, business)
# # 6. Deep Drossing
# dcn(yelp_training_df, yelp_testing_df, test_index, yelp_users, business)
# # 7. Neural Factorization Machine
# nfm(yelp_training_df, yelp_testing_df, test_index, yelp_users, business)
# # 8. Deep Factorization Machine
# deepfm(yelp_training_df, yelp_testing_df, test_index, yelp_users, business)

# print("==========\nDouban Book:\n==========")
# # 1. FM-supported Neural Networks
# fnn(douban_training_df, douban_testing_df, test_index, douban_users, books)
# # 2. Product-based Neural Networks
# ipnn(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# opnn(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# # 3. Convolutional Click Prediction Model 
# ccpm(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# # 4. neumf
# # 5. Wide&Deep
# wd(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# # 6. Deep Drossing
# dcn(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# # 7. Neural Factorization Machine
# nfm(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# # 8. Deep Factorization Machine
# deepfm(movielens_training_df, movielens_testing_df, test_index, len_users, movies)

DeepFM={'rmse': 1.4254262683002223, 'recall@10': 0.49286467198741046, 'ndcg@10': 0.9012225480734772}



0,1
ndcg@10,0.90122
recall@10,0.49286
rmse,1.42543


## 10. Recent NN-based RecSys Methods

### define model

In [15]:
from models.nn_based_models import DeepCTRModel

def din(train_df, test_df, test_index, users, movies, watch_history = ['movie', 'movie_genre'], target="rating"):
    # run = wandb.init(project=config['general']['movielens'],
    #                     entity=config['general']['entity'],
    #                     group="DIN",
    #                     reinit=True)
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation'],
                        dense=['user_age'],
                        y=['rating'])
    result = deer.DIN(train_df, test_df, test_index, users, movies, watch_history, target)
    clear_output()
    print(f"DIN={result}")
    # run.finish()

def xdeepfm(dataframe, testing_data, test_index, users, movies):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="xDeepFM",
                        reinit=True)
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation'],
                        dense=['user_age'],
                        y=['rating'])
    result = deer.xDeepFM(dataframe, testing_data, test_index, users, movies)
    clear_output()
    print(f"xDeepFM={result}")
    run.finish()
    
def afm(dataframe, testing_data, test_index, users, movies):
    run = wandb.init(project=config['general']['movielens'],
                        entity=config['general']['entity'],
                        group="AFM",
                        reinit=True)
    # no dense
    deer = DeepCTRModel(sparse=['user', 'movie', 'movie_genre', 'user_occupation', 'user_age'],
                        y=['rating'])
    result = deer.AFM(dataframe, testing_data, test_index, users, movies)
    clear_output()
    print(f"AFM={result}")
    run.finish()

### run model

In [None]:
# 1. Attentional Factorization Machines
afm(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# 3. xDeepFM
xdeepfm(movielens_training_df, movielens_testing_df, test_index, len_users, movies)
# 4. Deep Interest Network
din(movielens_training_df, movielens_testing_df, test_index, len_users, movies)

trasfer history items: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38860/38860 [00:02<00:00, 17112.88it/s]


[0/5 Cross Validation]


trasfer history items: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 155440/155440 [00:09<00:00, 16969.20it/s]


Epoch 1/10
486/486 - 336s - loss: 2.5677 - mse: 2.5677 - val_loss: 2.3579 - val_mse: 2.3579
Epoch 2/10
486/486 - 319s - loss: 2.1894 - mse: 2.1894 - val_loss: 2.1716 - val_mse: 2.1716
Epoch 3/10
486/486 - 320s - loss: 2.0409 - mse: 2.0408 - val_loss: 2.0580 - val_mse: 2.0580
Epoch 4/10
486/486 - 320s - loss: 1.9524 - mse: 1.9524 - val_loss: 1.9982 - val_mse: 1.9982
Epoch 5/10
486/486 - 320s - loss: 1.8888 - mse: 1.8887 - val_loss: 1.9650 - val_mse: 1.9650
Epoch 6/10
486/486 - 320s - loss: 1.8421 - mse: 1.8420 - val_loss: 1.9931 - val_mse: 1.9930
Epoch 7/10
