In [1]:
import random
import math
import numpy as np

In [2]:
def load_movielens(path='./movielens/ml-100k'):
    # get movie titles
    movies = {}
    for line in open(path + '/u.item', encoding='latin-1'):
        id, title = line.split('|')[0:2]
        movies[id] = title
    # load data
    train = {}
    for line in open(path + '/ua.base', encoding='latin-1'):
        user, movieid, rating, ts = line.split('\t')
        train.setdefault(user, {})
        train[user][movieid] = float(rating)
    test = {}
    for line in open(path + '/ua.test', encoding='latin-1'):
        user, movieid, rating, ts = line.split('\t')
        test.setdefault(user, {})
        test[user][movieid] = float(rating)
    return train, test, movies

In [3]:
train, test, movies = load_movielens()

In [4]:
def gen_list(data, user_len=943, movie_len=1682):
    mat_data = np.zeros((user_len, movie_len), dtype=float)
    for u, item in data.items():
        for n, r in item.items():
            mat_data[int(u)-1][int(n)-1] = float(int(r))
    return mat_data

In [5]:
def sgd(data_matrix, k, alpha, lam, max_cycles):
    """使用梯度下降法进行矩阵分解。

    Args:
    - data_matrix: mat, 用户物品矩阵
    - k: int, 分解矩阵的参数
    - alpha: float, 学习率
    - lam: float, 正则化参数
    - max_cycles: int, 最大迭代次数

    Returns:
    p,q: mat, 分解后的矩阵
    """
    m, n = np.shape(data_matrix)
    # initiate p & q
    p = np.mat(np.random.random((m, k)))
    q = np.mat(np.random.random((k, n)))

    # start training
    for step in range(max_cycles):
        for i in range(m):
            for j in range(n):
                if data_matrix[i, j] > 0:
                    error = data_matrix[i, j]
                    for r in range(k):
                        error = error - p[i, r] * q[r, j]
                    for r in range(k):
                        p[i, r] = p[i, r] + alpha * (2 * error * q[r, j] - lam * p[i, r])
                        q[r, j] = q[r, j] + alpha * (2 * error * p[i, r] - lam * q[r, j])

        loss = 0.0
        for i in range(m):
            for j in range(n):
                if data_matrix[i, j] > 0:
                    error = 0.0
                    for r in range(k):
                        error = error + p[i, r] * q[r, j]
                    # calculate loss function
                    loss = (data_matrix[i, j] - error) * (data_matrix[i, j] - error)
                    for r in range(k):
                        loss = loss + lam * (p[i, r] * p[i, r] + q[r, j] * q[r, j]) / 2

        if loss < 0.001:
            break
        if step % 1 == 0:
            print("\titer: %d, loss: %f" % (step, loss))
    return p, q

In [6]:
mat_data = gen_list(train)

In [7]:
p, q = sgd(mat_data, 5, 0.001, 0.01, 50)

	iter: 0, loss: 2.090724
	iter: 1, loss: 1.585506
	iter: 2, loss: 1.380150
	iter: 3, loss: 1.273629
	iter: 4, loss: 1.204522
	iter: 5, loss: 1.150355
	iter: 6, loss: 1.102485
	iter: 7, loss: 1.057564
	iter: 8, loss: 1.014323
	iter: 9, loss: 0.972313
	iter: 10, loss: 0.931405
	iter: 11, loss: 0.891587
	iter: 12, loss: 0.852888
	iter: 13, loss: 0.815340
	iter: 14, loss: 0.778971
	iter: 15, loss: 0.743803
	iter: 16, loss: 0.709847
	iter: 17, loss: 0.677106
	iter: 18, loss: 0.645578
	iter: 19, loss: 0.615254
	iter: 20, loss: 0.586122
	iter: 21, loss: 0.558165
	iter: 22, loss: 0.531364
	iter: 23, loss: 0.505698
	iter: 24, loss: 0.481145
	iter: 25, loss: 0.457679
	iter: 26, loss: 0.435276
	iter: 27, loss: 0.413911
	iter: 28, loss: 0.393555
	iter: 29, loss: 0.374184
	iter: 30, loss: 0.355769
	iter: 31, loss: 0.338284
	iter: 32, loss: 0.321700
	iter: 33, loss: 0.305991
	iter: 34, loss: 0.291129
	iter: 35, loss: 0.277087
	iter: 36, loss: 0.263838
	iter: 37, loss: 0.251353
	iter: 38, loss: 0.239

In [25]:
def prediction(data_matrix, p, q, user):
    """为用户未互动的项打分

    Args:
    - data_matrix: mat, 原始用户物品矩阵
    - p: mat, 分解后的矩阵p
    - q: mat, 分解后的矩阵q
    - user: int, 用户的id

    Returns:
    - predict: list, 推荐列表
    """
    n = np.shape(data_matrix)[1]
    predict = {}
    for j in range(n):
        if data_matrix[int(user)-1, j] == 0:
            predict[j] = (p[int(user)-1,] * q[:, j])[0, 0]

    # 按照打分从大到小排序
    return sorted(predict.items(), key=lambda d: d[1], reverse=True)

In [26]:
rank = prediction(mat_data, p, q, 1)

In [27]:
 def top_k(rank, k):
    if len(rank) <= k:
        return rank
    else:
        return rank[:k]

In [28]:
print(top_k(rank, 5))

[(407, 5.0064800876901199), (356, 4.9315707855947544), (1448, 4.8885353582938498), (482, 4.8464312426977543), (284, 4.8212895109837737)]


In [29]:
def recall(train, test, N, p, q):
    hit = 0
    all = 0
    mat_train = gen_list(train)
    for user in train.keys():
        tu = test[user]
        rank = top_k(prediction(mat_data, p, q, user), N)
        for item, pui in rank:
            if str(item+1) in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)

In [30]:
print("top5的召回率为：", recall(train, test, 5, p, q))

top5的召回率为： 0.012195121951219513


In [35]:
def precision(train, test, N, p, q):
    hit = 0
    all = 0
    mat_train = gen_list(train)
    for user in train.keys():
        tu = test[user]
        rank = top_k(prediction(mat_data, p, q, user), N)
        for item, pui in rank:
            if str(item+1) in tu:
                hit += 1
        all += N
    return hit / (all * 1.0)

In [36]:
print("top5的准确率为：",precision(train, test, 5, p, q))

top5的准确率为： 0.024390243902439025


In [41]:
def coverage(train, test, N, p, q):
    recommend_items = set()
    all_items = set()
    mat_train = gen_list(train)
    for user in train.keys():
        for item in train[user].keys():
            all_items.add(item)
        rank = top_k(prediction(mat_data, p, q, user), N)
        for item, pui in rank:
            recommend_items.add(item)
    return len(recommend_items) / (len(all_items) * 1.0)

In [42]:
print("top5的覆盖率为：", coverage(train, test, 5, p, q))

top5的覆盖率为： 0.055952380952380955


In [43]:
def popularity(train, test, N, p, q):
    item_popularity = dict()
    mat_train = gen_list(train)
    for user, items in train.items():
        for item in items.keys():
            item_popularity[item] = item_popularity.get(item, 0) + 1
    ret = 0
    n = 0
    for user in train.keys():
        rank = top_k(prediction(mat_data, p, q, user), N)
        for item, pui in rank:
            ret += math.log(1 + 1/item_popularity[str(item + 1)])
            n += 1
    ret /= n * 1.0
    return ret

In [44]:
print('top5的新颖度为：', popularity(train, test, 5, p, q))

top5的新颖度为： 0.033497275576051755
