In [2]:
import random
import math
import numpy as np

In [3]:
def load_movielens(path='./movielens/ml-100k'):
    """Load the MovieLens `ua` train/test split and the movie titles.

    Args:
    - path: directory containing `u.item`, `ua.base` and `ua.test`

    :return: (train, test, movies). train and test map
        user id -> {movie id -> rating as float}; movies maps
        movie id -> title. Ids stay the strings read from the files.
    """
    def _read_ratings(filename):
        # One rating per line: user<TAB>movie<TAB>rating<TAB>timestamp.
        ratings = {}
        with open(path + filename, encoding='latin-1') as fh:
            for line in fh:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                user, movieid, rating, _ts = line.split('\t')
                ratings.setdefault(user, {})[movieid] = float(rating)
        return ratings

    # get movie titles: the first two '|'-separated fields are id and title
    movies = {}
    with open(path + '/u.item', encoding='latin-1') as fh:
        for line in fh:
            if not line.strip():
                continue
            movie_id, title = line.split('|')[0:2]
            movies[movie_id] = title

    # load data; `with` closes each file even if a line fails to parse
    train = _read_ratings('/ua.base')
    test = _read_ratings('/ua.test')
    return train, test, movies

In [4]:
# Load the train/test rating dictionaries and the movie-title lookup from the default path.
train, test, movies = load_movielens()

In [5]:
def cal_count(data):
    """Count the users and the distinct movies in a ratings dictionary.

    Args:
    - data: dict mapping user id -> {movie id -> rating}

    :return: (count_user, count_movie)
    """
    count_user = len(data)
    # Collect the keys straight into a set; sum(list_of_lists, []) re-copies
    # the growing list on every addition, which is quadratic.
    distinct_movies = {movie for rated in data.values() for movie in rated}
    return count_user, len(distinct_movies)

In [6]:
def gen_list(data, user_len=943, movie_len=1682):
    """Build a dense user x movie rating matrix from a ratings dictionary.

    Args:
    - data: dict mapping user id -> {movie id -> rating}; ids are converted
      with int() and treated as 1-based positions
    - user_len: number of rows in the result
    - movie_len: number of columns in the result

    :return: float ndarray of shape (user_len, movie_len); cells without a
        rating stay 0
    """
    ratings = np.zeros((user_len, movie_len), dtype=float)
    for user_id, user_ratings in data.items():
        for movie_id, rating in user_ratings.items():
            # int() drops any fractional part of the rating before it is stored
            ratings[int(user_id) - 1, int(movie_id) - 1] = float(int(rating))
    return ratings

In [7]:
def cos_sim(x, y):
    """Cosine similarity between two vectors.

    Args:
    - x: row vector (1-D array or 1 x n matrix)
    - y: row vector (1-D array or 1 x n matrix)

    :return: cosine similarity of x and y; 0 when either vector has zero
        norm (instead of the NaN that 0/0 would produce)
    """
    numerator = np.matmul(x, y.T)  # inner product of x and y
    denominator = np.sqrt(np.matmul(x, x.T)) * np.sqrt(np.matmul(y, y.T))
    if not np.any(denominator):
        # A zero-norm vector makes the inner product 0 as well; return a
        # zero of the same shape/type rather than dividing 0 by 0.
        return numerator * 0.0
    return numerator / denominator

def similarity(data):
    """Cosine similarity between every pair of rows of a matrix.

    Args:
    - data: 2-D array or matrix; each row is one vector

    :return: w, an m x m np.matrix (m = number of rows) with w[i, j] the
        cosine similarity of rows i and j. The diagonal is 0, and any pair
        involving an all-zero row is 0.
    """
    rows = np.asarray(data, dtype=float)

    # All pairwise inner products at once, then divide by the norm products.
    gram = np.matmul(rows, rows.T)
    norms = np.linalg.norm(rows, axis=1)
    norm_products = np.outer(norms, norms)

    # Divide only where the denominator is non-zero; all-zero rows keep the
    # pre-filled 0 instead of becoming NaN.
    w = np.divide(gram, norm_products,
                  out=np.zeros_like(gram), where=norm_products != 0)

    # A row is not compared with itself.
    np.fill_diagonal(w, 0)

    # np.mat was removed in NumPy 2.0; np.asmatrix yields the same matrix type.
    return np.asmatrix(w)

In [8]:
# Pairwise row (user) similarity over the dense training rating matrix.
w = similarity(gen_list(train))

In [9]:
def user_based_recommend(w, data, user):
    """Score the items a user has not interacted with and rank them.

    Args:
    - w: similarity matrix; row (user - 1) holds this user's similarities
    - data: 2-D user x item array; a 0 entry means no interaction
    - user: 1-based user id (anything int() accepts)

    :return: list of (item column index, score) pairs sorted by score,
        highest first; the score is the product of the user's similarity
        row with the item's column
    """
    _, n_items = np.shape(data)
    row = int(user) - 1
    seen = data[row]

    # Columns the user has no value for are the recommendation candidates.
    candidates = [col for col in range(n_items) if seen[col] == 0]

    scores = {
        col: np.matmul(w[row], np.copy(data[:, col]))
        for col in candidates
    }

    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [10]:
# Ranked (item column index, score) pairs for user id 1, using the similarity matrix `w`.
rank = user_based_recommend(w, gen_list(train), 1)

In [11]:
def top_k(rank, k):
    """Return the first k entries of a ranked list.

    Args:
    - rank: list ordered best-first
    - k: number of entries to keep

    :return: rank[:k] when the list is longer than k, otherwise the list
        itself (the same object, not a copy)
    """
    if k < len(rank):
        return rank[:k]
    return rank

In [12]:
# Show the five highest-scoring (item column index, score) pairs from `rank`.
print(top_k(rank, 5))

[(317, matrix([[ 407.49894862]])), (422, matrix([[ 386.42125607]])), (356, matrix([[ 368.23395573]])), (116, matrix([[ 363.54450284]])), (201, matrix([[ 353.84852558]]))]


In [13]:
def recall(train, test, N):
    """Recall of the top-N recommendations against the held-out ratings.

    Args:
    - train: dict user id -> {movie id -> rating} used to build the model
    - test: dict user id -> {movie id -> rating} held out for evaluation
    - N: number of recommendations per user

    :return: (recommended items found in the user's test set) /
        (total number of test items), summed over users; 0.0 when there
        are no test items to recover
    """
    hit = 0
    total = 0
    mat_train = gen_list(train)
    w = similarity(mat_train)
    for user in train.keys():
        # A user with no held-out ratings adds nothing to either count,
        # so skip them instead of raising KeyError on test[user].
        tu = test.get(user)
        if not tu:
            continue
        rank = top_k(user_based_recommend(w, mat_train, user), N)
        # Recommended keys are 0-based column indices; test keys are the
        # 1-based ids stored as strings.
        hit += sum(1 for item, _pui in rank if str(item + 1) in tu)
        total += len(tu)
    return hit / total if total else 0.0

In [14]:
# Top-5 recall on the held-out test ratings.
print("top5的召回率为：", recall(train, test, 5))

top5的召回率为： 0.11208907741251325


In [15]:
def precision(train, test, N):
    """Precision of the top-N recommendations against the held-out ratings.

    Args:
    - train: dict user id -> {movie id -> rating} used to build the model
    - test: dict user id -> {movie id -> rating} held out for evaluation
    - N: number of recommendations per user

    :return: (recommended items found in the user's test set) /
        (N * number of train users); 0.0 when that denominator is 0
    """
    hit = 0
    total = 0
    mat_train = gen_list(train)
    w = similarity(mat_train)
    for user in train.keys():
        # A train user absent from test simply has no hits; they still
        # count N recommendations toward the denominator.
        tu = test.get(user, {})
        rank = top_k(user_based_recommend(w, mat_train, user), N)
        # Recommended keys are 0-based column indices; test keys are the
        # 1-based ids stored as strings.
        hit += sum(1 for item, _pui in rank if str(item + 1) in tu)
        total += N
    return hit / total if total else 0.0

In [16]:
# Top-5 precision on the held-out test ratings.
print("top5的准确率为：",precision(train, test, 5))

top5的准确率为： 0.2241781548250265


In [17]:
def coverage(train, test, N):
    """Share of the training catalogue that shows up in top-N lists.

    Args:
    - train: dict user id -> {movie id -> rating}
    - test: unused; kept so all metric functions share one signature
    - N: number of recommendations per user

    :return: (number of distinct recommended items) /
        (number of distinct items rated in train)
    """
    mat_train = gen_list(train)
    w = similarity(mat_train)

    # Every item that appears in at least one user's training ratings.
    all_items = {item for rated in train.values() for item in rated}

    recommend_items = set()
    for user in train:
        rank = top_k(user_based_recommend(w, mat_train, user), N)
        recommend_items.update(item for item, _pui in rank)

    return len(recommend_items) / (len(all_items) * 1.0)

In [18]:
# Top-5 coverage: distinct recommended items relative to distinct items rated in train.
print("top5的覆盖率为：", coverage(train, test, 5))

top5的覆盖率为： 0.04345238095238095


In [40]:
def popularity(train, test, N):
    """Mean of log(1 + 1/popularity) over all top-N recommended items.

    An item's popularity is the number of users who rated it in train, so
    the value grows as the recommended items get less popular.

    Args:
    - train: dict user id -> {movie id -> rating}
    - test: unused; kept so all metric functions share one signature
    - N: number of recommendations per user

    :return: the mean described above; 0.0 when no recommended item has a
        training popularity
    """
    # Number of training users who rated each item.
    item_popularity = dict()
    for items in train.values():
        for item in items:
            item_popularity[item] = item_popularity.get(item, 0) + 1

    mat_train = gen_list(train)
    w = similarity(mat_train)
    ret = 0
    n = 0
    for user in train.keys():
        rank = top_k(user_based_recommend(w, mat_train, user), N)
        for item, _pui in rank:
            # Recommended keys are 0-based column indices; popularity keys
            # are the 1-based ids stored as strings.
            count = item_popularity.get(str(item + 1), 0)
            if count == 0:
                # A column nobody rated in train has no popularity; 1/count
                # is undefined, so leave it out instead of raising KeyError.
                continue
            ret += math.log(1 + 1/count)
            n += 1
    return ret / n if n else 0.0

In [39]:
# Top-5 value of popularity(): mean log(1 + 1/training popularity) of the recommended items.
print('top5的新颖度为：', popularity(train, test, 5))

top5的新颖度为： 0.0027819905185508557
