In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Both splits are tab-separated, header-less files with the same four columns,
# so the parsing options are declared once and shared by the two reads.
read_options = dict(sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
data = pd.read_csv('./data/u2.base', **read_options)   # training split
test = pd.read_csv('./data/u2.test', **read_options)   # held-out split
# Preview the first rows of the training split as the cell output.
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,3,4,878542960
1,1,4,3,876893119
2,1,5,3,889751712
3,1,6,5,887431973
4,1,7,4,875071561


In [3]:
# Initialisation: dimensions of the user x item rating matrix.
# NOTE(review): 943 / 1682 are hard-coded; presumably the user and item totals
# of the data set behind u2.base / u2.test -- confirm before loading other data.
userNum = 943
itemNum = 1682

# Ids in the file are 1-based, matrix indices are 0-based.
user_idx = data['user_id'].to_numpy() - 1
item_idx = data['item_id'].to_numpy() - 1

# Validate the ids up front: an id of 0 would become index -1, which NumPy
# accepts and which would silently write into the last row / column.
if len(data) and (
    user_idx.min() < 0 or user_idx.max() >= userNum
    or item_idx.min() < 0 or item_idx.max() >= itemNum
):
    raise IndexError(
        'user_id must lie in 1..%d and item_id in 1..%d' % (userNum, itemNum)
    )

# Store the u2.base ratings in the matrix: R[u, i] is the rating user u gave
# item i, and 0 marks "no rating". One indexed assignment replaces the
# per-row Python loop.
R = np.zeros((userNum, itemNum))
R[user_idx, item_idx] = data['rating'].to_numpy()

print(R)

[[0. 0. 4. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]


In [4]:
# Summary statistics used by the baseline predictors.

# Rating indicator.
# y_ui[u, i] --- 1.0 if user u rated item i, else 0.0 (a stored 0 means "no rating").
y_ui = (R != 0).astype(float)

# Global average rating over all observed ratings.
r = np.sum(y_ui * R) / np.sum(y_ui)

# Number of observed ratings per user (row) and per item (column); the boolean
# masks select the rows / columns for which an average is defined.
ratings_per_user = y_ui.sum(axis=1)
ratings_per_item = y_ui.sum(axis=0)
user_has_ratings = ratings_per_user > 0
item_has_ratings = ratings_per_item > 0

# User average rating.
# r_u[u, 0] --- average rating given by user u; users without any rating
# fall back to the global average r.
r_u = np.full((userNum, 1), r, dtype=float)
r_u[user_has_ratings, 0] = (
    R.sum(axis=1)[user_has_ratings] / ratings_per_user[user_has_ratings]
)

# Item average rating.
# r_i[i, 0] --- average rating received by item i; items without any rating
# fall back to the global average r.
r_i = np.full((itemNum, 1), r, dtype=float)
r_i[item_has_ratings, 0] = (
    R.sum(axis=0)[item_has_ratings] / ratings_per_item[item_has_ratings]
)

# User bias.
# b_u[u, 0] --- mean of (rating - item average) over the items user u rated;
# stays 0 for users without ratings.
b_u = np.zeros((userNum, 1))
user_deviation = (y_ui * (R - r_i[:, 0])).sum(axis=1)   # r_i broadcast along rows
b_u[user_has_ratings, 0] = (
    user_deviation[user_has_ratings] / ratings_per_user[user_has_ratings]
)

# Item bias.
# b_i[i, 0] --- mean of (rating - user average) over the users who rated item i;
# stays 0 for items without ratings.
b_i = np.zeros((itemNum, 1))
item_deviation = (y_ui * (R - r_u)).sum(axis=0)         # r_u broadcast along columns
b_i[item_has_ratings, 0] = (
    item_deviation[item_has_ratings] / ratings_per_item[item_has_ratings]
)

In [5]:
def fill_unrated(prediction):
    """Return a new matrix equal to R with every unrated cell replaced.

    Parameters
    ----------
    prediction : scalar or ndarray
        Predicted ratings, broadcastable to ``R.shape`` (for example a
        ``(userNum, 1)`` column, a ``(1, itemNum)`` row, or a full matrix).

    Returns
    -------
    ndarray
        Array with the shape of ``R``: the observed rating where
        ``y_ui != 0``, the corresponding ``prediction`` value elsewhere.
    """
    return np.where(y_ui == 0, prediction, R)


# r_u / b_u are (userNum, 1) columns and r_i / b_i are (itemNum, 1) columns;
# transposing the item vectors to (1, itemNum) rows lets broadcasting build the
# full user x item prediction grid without Python loops.

# user average
user_average_matrix = fill_unrated(r_u)

# item average
item_average_matrix = fill_unrated(r_i.T)

# mean of user average and item average
mean_of_user_and_item_average_matrix = fill_unrated((r_u + r_i.T) / 2)

# user bias and item average
user_bias_and_item_average_matrix = fill_unrated(b_u + r_i.T)

# user average and item bias
user_average_and_item_bias_matrix = fill_unrated(r_u + b_i.T)

# global average, user bias and item bias
global_average_user_bias_and_item_bias_matrix = fill_unrated(r + b_u + b_i.T)

In [6]:
def var_name(var,all_var=locals()):
    """Return the name under which the object ``var`` is bound in ``all_var``.

    The lookup is by identity (``is``), not equality, and the first matching
    name in the mapping's iteration order is returned. Names that do not start
    with an underscore are preferred, so an interpreter-managed alias of the
    same object (e.g. IPython's ``_`` / ``_N`` output-cache variables) is only
    returned when no other name refers to it.

    Parameters
    ----------
    var : object
        The object whose variable name is wanted.
    all_var : mapping, optional
        Namespace to search. The default is the mapping returned by
        ``locals()`` at the moment this ``def`` statement is executed
        (default arguments are evaluated once, at definition time).

    Raises
    ------
    IndexError
        If no name in ``all_var`` refers to ``var``.
    """
    # Snapshot the items so the search is unaffected by concurrent rebinding.
    matches = [name for name, value in list(all_var.items()) if value is var]
    if not matches:
        raise IndexError('no name in the given namespace refers to this object')
    public = [name for name in matches if not str(name).startswith('_')]
    return (public or matches)[0]

def MAE(matrix, test):
    """Print the mean absolute error of ``matrix`` on the ratings in ``test``.

    Parameters
    ----------
    matrix : ndarray
        2-D array of predicted ratings, indexed ``matrix[user, item]`` with
        0-based indices.
    test : pandas.DataFrame
        Held-out ratings. Columns are read by position: first the 1-based
        user id, second the 1-based item id, third the true rating.

    Prints one line, ``<name> MAE:  <value>``, where ``<name>`` is obtained
    from ``var_name(matrix)``. Nothing is returned.

    Raises
    ------
    ZeroDivisionError
        If ``test`` has no rows.
    """
    if len(test) == 0:
        raise ZeroDivisionError('cannot compute the MAE of an empty test set')
    # Convert the 1-based ids to 0-based indices for all rows at once.
    user_idx = test.iloc[:, 0].to_numpy() - 1
    item_idx = test.iloc[:, 1].to_numpy() - 1
    true_rating = test.iloc[:, 2].to_numpy()
    # Integer-array indexing picks matrix[user_idx[k], item_idx[k]] for every k.
    total_abs_error = np.abs(matrix[user_idx, item_idx] - true_rating).sum()
    print(var_name(matrix), 'MAE: ' , total_abs_error / len(test))

# Report the test-set MAE of every baseline matrix; each call prints one line,
# in call order. The matrices are passed by their own global names because MAE
# labels its output through var_name's identity-based lookup.
MAE(user_average_matrix, test)
MAE(item_average_matrix, test)
MAE(mean_of_user_and_item_average_matrix, test)
MAE(user_bias_and_item_average_matrix, test)
MAE(user_average_and_item_bias_matrix, test)
MAE(global_average_user_bias_and_item_bias_matrix, test)

user_average_matrix MAE:  0.8383401457987351
item_average_matrix MAE:  0.8206951490543668
mean_of_user_and_item_average_matrix MAE:  0.79740308955009
user_bias_and_item_average_matrix MAE:  0.7469592188188907
user_average_and_item_bias_matrix MAE:  0.7596960148170049
global_average_user_bias_and_item_bias_matrix MAE:  0.7531006024906748
