In [None]:
import pandas as pd
import numpy as np

# Both rating files are tab-separated and share one column layout. The column
# names are supplied explicitly, so the first line of each file is read as data.
rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']

# Training split.
base = pd.read_csv('data/u2.base', sep='\t', names=rating_columns)
# Held-out split.
test = pd.read_csv('data/u2.test', sep='\t', names=rating_columns)

In [2]:
# Preview the first five training records to sanity-check the parsed columns.
base.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,3,4,878542960
1,1,4,3,876893119
2,1,5,3,889751712
3,1,6,5,887431973
4,1,7,4,875071561


In [None]:
# Largest user id present in the training split.
base.loc[:, 'user_id'].max()

In [None]:
# Largest item id present in the training split.
base.loc[:, 'item_id'].max()

## Notations

$r_{ui}$ : observed rating of user u on item i

$\hat{r}_{ui}$ : predicted rating of user u on item i

$n$ : user_number

$m$ : item_number

$p$ : observed_ratings_number

$\frac{p}{nm}$ : density (sparsity is $1-\frac{p}{nm}$)

## Problem Definition

Input : training data (observed records $R$)

Output : estimated rating $\hat{r}_{ui}$ for every unobserved record

Evaluation : difference between predicted $\hat{r}_{ui}$ and real $r_{ui}$

## Statistics

global rating_ave : $\bar{r}$ 全局均值

u rating_ave : $\bar{r}_u$ 用户u的偏好均值

i rating_ave : $\bar{r}_i$ 物品i的被偏好均值

bias of u : $b_u = \frac{1}{|I_u|}\sum_{i \in I_u}(r_{ui}-\bar{r}_i)$ ($I_u$ : items rated by u) 对于u，每个u打过分的商品i，会因为u的偏好而更宽松/严格吗

bias of i : $b_i = \frac{1}{|U_i|}\sum_{u \in U_i}(r_{ui}-\bar{r}_u)$ ($U_i$ : users who rated i) 对于i，每个给i打分的用户u，会因为i的性质而更受欢迎/不欢迎吗

## Prediction Rules

1. user_ave : $\hat{r}_{ui} = \bar{r}_{u}$ 取user对所有item的偏好程度均值


2. item_ave : $\hat{r}_{ui} = \bar{r}_{i}$ 取item被所有user的偏好程度均值

3. (user_ave+item_ave)/2 : $\hat{r}_{ui} = \frac{\bar{r}_u+\bar{r}_i}{2}$

4. $\hat{r}_{ui} = b_u+\bar{r}_i$

5. $\hat{r}_{ui} = b_i+\bar{r}_u$

6. 全局均值+用户bias+物品bias $\hat{r}_{ui} = \bar{r}+b_u+b_i$

In [None]:
# Matrix dimensions come from the largest ids seen in the training split.
# Ids are 1-based, so id k is stored at row/column k-1.
user_num = base['user_id'].max()
item_num = base['item_id'].max()
record_num = base.shape[0]

# Dense user x item rating matrix; 0 marks "no rating observed".
matrix = np.zeros((user_num, item_num))

# Write every observed rating in a single vectorized assignment instead of a
# per-record loop with scalar .iloc lookups. If a (user, item) pair occurs more
# than once, the last record wins, exactly as in a sequential loop.
user_idx = base['user_id'].to_numpy() - 1
item_idx = base['item_id'].to_numpy() - 1
matrix[user_idx, item_idx] = base['rating'].to_numpy()

In [None]:
# Independent copy of the rating matrix as it stands at this point.
raw_matrix = matrix.copy()

# Observed-rating mask, computed once: a 0 entry in `matrix` means "not rated".
rated = matrix != 0
user_count = rated.sum(axis=1)  # number of ratings given by each user
item_count = rated.sum(axis=0)  # number of ratings received by each item

# Mean observed rating per user. Users without any rating keep the initial 0
# because the division is only evaluated where the count is non-zero.
user_ave = np.zeros(user_num)
np.divide(matrix.sum(axis=1), user_count, out=user_ave, where=user_count != 0)

# Mean observed rating per item, with the same zero-count fallback.
item_ave = np.zeros(item_num)
np.divide(matrix.sum(axis=0), item_count, out=item_ave, where=item_count != 0)

# Mean over all observed ratings.
global_ave = matrix.sum() / rated.sum()

In [None]:
# Bias terms, averaged over observed ratings only (0 in `matrix` = not rated):
#   user_bias[u] = mean over items i rated by u of (r_ui - item_ave[i])
#   item_bias[i] = mean over users u who rated i of (r_ui - user_ave[u])
rated = matrix != 0
user_count = rated.sum(axis=1)
item_count = rated.sum(axis=0)

# Users/items without any observed rating keep a bias of 0 instead of
# evaluating 0/0 (NaN plus a RuntimeWarning). This mirrors the zero-count
# guard applied when user_ave / item_ave are computed.
user_bias = np.zeros(user_num)
np.divide(
    matrix.sum(axis=1) - rated.dot(item_ave),
    user_count,
    out=user_bias,
    where=user_count != 0,
)

item_bias = np.zeros(item_num)
np.divide(
    matrix.sum(axis=0) - rated.T.dot(user_ave),
    item_count,
    out=item_bias,
    where=item_count != 0,
)

In [None]:
# Largest user bias. ndarray.max() performs the reduction inside NumPy and
# propagates NaN, whereas the builtin max() iterates element by element and
# returns an order-dependent value when NaN is present.
user_bias.max()