#### 隐语义模型(latent factor model)
该模型算法最早在文本挖掘领域被提出，用于找到文本的隐含语义。相关模型有pLSA、LDA、隐含类别模型、隐含主题模型、主题分解等。

LFM模型是如何用于RecSys的呢？<br>

与CF不同，我们考虑通过用户的兴趣分类，再从他的兴趣分类中，挑选他可能喜欢的物品。<br>

如何得到他的兴趣分类呢? <br>

主要通过分析其有过行为的物品，根据这些物品的分类，推断 <br>

物品的分类又是怎么得到的呢？  

LFM模型可以基于用户行为统计，对物品进行自动聚类  

LFM模型通过以下公式衡量用户对物品的兴趣：  
$$
Preference(u,i) = r_{ui} = p_u^Tq_i = \sum_{k=1}^{K}p_{u,k}q_{i,k}
$$
其中`p_uk`表示用户u对分类k的兴趣，`q_ik`表示物品i与分类k的关联

LFM模型在显性反馈数据上，可以很好地解决评分预测问题。在隐性反馈数据上，因为只有正样本，因此需要生成负样本。  
生成原则：  
- 对每个用户，要保证正负样本的平衡
- 对每个用户采样负样本时，要选取那些很热门，而用户没有行为的物品

In [4]:
import random
# 负采样
def randomSelectNegativeSample(items,item_pool):
    ret = {}
    for i in items.keys():
        ret[i] = 1
    n_negative = 0
    while n_negative < len(items):
        item = random.choice(item_pool)
        if item in ret:
            continue
        n_negative += 1
        ret[item] = 0
    return ret

LFM模型通过优化如下的平方损失函数，得到p、q参数：
$$
loss = \sum_{(u,i)\in K}(r_{ui} - r*_{ui})^2 = \sum_{(u,i)\in K}(r_{ui} - \sum_{k=1}^{K}p_{u,k}q_{i,k})^2 + 
\lambda \lVert p_u \rVert ^2 + \lambda \lVert q_i \rVert ^2
$$
其中，`λp+λq`是用来防止过拟合的正则化项。  

In [None]:
import numpy as np
def loss(train_set,l,user_p,movie_q):
    def predict(user,movie):
        return np.dot(user_p[user],movie_p[movie])
    C = 0
    for user,p_val in user_p.items():
        for movie,q_val in movie_q.items():
            user_train_set = train_set[user]
            if movie not in user_train_set:
                continue
            rui = train_set[movie]
            eui = rui - predict(user,movie)
            C += (np.square(eui) + l * np.sum(np.square(user_p[user]))) + l * np.sum(np.square(movie_p[movie]))
    return C        

使用随机梯度下降算法对损失函数进行优化。
$$
\frac{\partial C}{\partial p_{uk}} = -2q_{ik} + 2\lambda p_{uk} \\
\frac{\partial C}{\partial q_{ik}} = -2p_{uk} + 2\lambda q_{ik} \\
p_{uk} = p_{uk} + \alpha(q_{ik} - \lambda p_{uk}) \\
q_{ik} = q_{ik} + \alpha(p_{uk} - \lambda q_{ik})
$$

In [None]:
def train(user_p,movie_q,movies_pool,epoches,learning_rate,l):
    
    for epoch in range(epoches):
        print("第{}轮训练begainning...".format(epoch))
        for user,user_items in train_set.items():
            samples = randomSelectNegativeSample(user_items,movies_pool)
            for movie,rui in samples.items():
                user_latent = user_p[user]
                movie_latent = movie_q[movie]
                user_p[user] += learning_rate * (movie_latent - l*user_latent)
                movie_q[movie] += learning_rate * (user_latent - l*movie_latent)
        loss = loss(samples,l,user_p,movie_q)
        print("第{}轮,loss is {}".format(epoch,loss))

In [None]:
def init(train_data,k):
    user_p = {}
    movie_q = {}
    movie_pool = set()
    print("初始化参数...")
    for user,movies in train_data.items():
        user_p[user] = np.random.normal(size=(k))
        for movie in movies.items():
            movie_pool.add(movie[0])
    for movie in movie_pool:
        movie_q[movie] = np.random.normal(size=(k))
    return user_p,movie_q,movie_pool

In [None]:
import random
def load_data(users_path,movies_path,rating_path):
    
    def read_users():
        """(user_id:(gender,age,occupation))"""
        users = {}
        with open(users_path,'r') as f:
            users = {line.split("::")[0]:line.split("::")[1:-1] for line in f.readlines()}
        return users
    def read_movies():
        """(movie_id:(title,genres))"""
        movies = {}
        with open(movies_path,'r') as f:
            movies = {line.split("::")[0]: line.split("::")[1:] for line in f.readlines()}
        return movies
    def read_ratings():
        """{user_id:{movie_id: rating}}"""
        ratings = dict()
        with open(rating_path, 'r') as f:
            for line in f.readlines():
                user,movie,rating,_ = line.split('::')
                ratings.setdefault(user,{})
                ratings[user][movie] = int(rating)
        return ratings
    return read_users(),read_movies(),read_ratings()
def split_train_test(data,M,k,seed):
    """M折交叉验证"""
    train_set = dict()
    test_set = dict()
    random.seed(seed)
    test_count,train_count = 0,0
    for user,movie_info in data.items():
        for movie,rating in movie_info.items():
            if random.randint(0,M) == k:
                test_set.setdefault(user,{})
                test_set[user][movie] = int(rating)
                test_count += 1
            else:
                train_set.setdefault(user,{})
                train_set[user][movie] = int(rating)
                train_count += 1
    print(test_count,train_count)
    return train_set,test_set
USERS_PATH = '../dataset/ml-1m/users.dat'
MOVIES_PATH = '../dataset/ml-1m/movies.dat'
RATINGS_PATH = '../dataset/ml-1m/ratings.dat'
users,movies,ratings = load_data(USERS_PATH,MOVIES_PATH,RATINGS_PATH)
train_set,test_set = split_train_test(ratings,5,1,1)

In [None]:
user_p,movie_q,movies_pool = init(train_set,100)

In [None]:
user_p['1'],movie_q['661']

In [None]:
train(user_p,movie_q,list(movies_pool),500,learning_rate=0.02,l=0.01)

In [None]:
from collections import defaultdict
def recommand(user):
    def predict(user,movie):
        return np.dot(user_p[user],movie_p[movie])
    rank = defaultdict(int)
    for movie in list(movies_pool):
        rank[movie] = predict(user,movie)
    return sorted(rank.items(),key=itemgetter(1),reverse=True)

In [None]:
def hit(train,test,N):
    hit = 0
    for user in train.keys():
        real = test[user] if user in test else []
        recommand_items = recommand(user)[:N]
        for item in recommand_items:
            if item[0] in real:
                hit += 1
    print('hit:%d'% hit)
    return hit
# precision
def Precision(train,hit,N):
    print('sum:%d' % (len(train.keys()) * N))
    return hit / len(train.keys()) * N
# recall
def Recall(train,test,hit):
    recommands = 0
    for user in train.keys():
        real = test[user] if user in test else []
        recommands += len(real)
    print('sum_real: %d' % recommands)
    return hit / recommands 

In [None]:
hit_total = hit(train_set,test_set,10)

In [None]:
Precision(train_set,hit,10),Recall(train_set,hit,10)

In [6]:
import numpy as np
def loss(train_set,l,user_p,movie_q):
    def predict(user,movie):
        return np.dot(user_p[user],movie_p[movie])
    C = 0
    for user,p_val in user_p.items():
        for movie,q_val in movie_q.items():
            user_train_set = train_set[user]
            if movie not in user_train_set:
                continue
            rui = train_set[movie]
            eui = rui - predict(user,movie)
            C += (np.square(eui) + l * np.sum(np.square(user_p[user]))) + l * np.sum(np.square(movie_p[movie]))
    return C        

使用随机梯度下降算法对损失函数进行优化。
$$
\frac{\partial C}{\partial p_{uk}} = -2q_{ik} + 2\lambda p_{uk} \\
\frac{\partial C}{\partial q_{ik}} = -2p_{uk} + 2\lambda q_{ik} \\
p_{uk} = p_{uk} + \alpha(q_{ik} - \lambda p_{uk}) \\
q_{ik} = q_{ik} + \alpha(p_{uk} - \lambda q_{ik})
$$

In [25]:
def train(user_p,movie_q,movies_pool,epoches,learning_rate,l):
    
    for epoch in range(epoches):
        print("第{}轮训练begainning...".format(epoch))
        for user,user_items in train_set.items():
            samples = randomSelectNegativeSample(user_items,movies_pool)
            for movie,rui in samples.items():
                user_latent = user_p[user]
                movie_latent = movie_q[movie]
                user_p[user] += learning_rate * (movie_latent - l*user_latent)
                movie_q[movie] += learning_rate * (user_latent - l*movie_latent)
        loss = loss(samples,l,user_p,movie_q)
        print("第{}轮,loss is {}".format(epoch,loss))

In [32]:
def init(train_data,k):
    user_p = {}
    movie_q = {}
    movie_pool = set()
    print("初始化参数...")
    for user,movies in train_data.items():
        user_p[user] = np.random.normal(size=(k))
        for movie in movies.items():
            movie_pool.add(movie[0])
    for movie in movie_pool:
        movie_q[movie] = np.random.normal(size=(k))
    return user_p,movie_q,movie_pool

In [10]:
import random
def load_data(users_path,movies_path,rating_path):
    
    def read_users():
        """(user_id:(gender,age,occupation))"""
        users = {}
        with open(users_path,'r') as f:
            users = {line.split("::")[0]:line.split("::")[1:-1] for line in f.readlines()}
        return users
    def read_movies():
        """(movie_id:(title,genres))"""
        movies = {}
        with open(movies_path,'r') as f:
            movies = {line.split("::")[0]: line.split("::")[1:] for line in f.readlines()}
        return movies
    def read_ratings():
        """{user_id:{movie_id: rating}}"""
        ratings = dict()
        with open(rating_path, 'r') as f:
            for line in f.readlines():
                user,movie,rating,_ = line.split('::')
                ratings.setdefault(user,{})
                ratings[user][movie] = int(rating)
        return ratings
    return read_users(),read_movies(),read_ratings()
def split_train_test(data,M,k,seed):
    """M折交叉验证"""
    train_set = dict()
    test_set = dict()
    random.seed(seed)
    test_count,train_count = 0,0
    for user,movie_info in data.items():
        for movie,rating in movie_info.items():
            if random.randint(0,M) == k:
                test_set.setdefault(user,{})
                test_set[user][movie] = int(rating)
                test_count += 1
            else:
                train_set.setdefault(user,{})
                train_set[user][movie] = int(rating)
                train_count += 1
    print(test_count,train_count)
    return train_set,test_set
USERS_PATH = '../dataset/ml-1m/users.dat'
MOVIES_PATH = '../dataset/ml-1m/movies.dat'
RATINGS_PATH = '../dataset/ml-1m/ratings.dat'
users,movies,ratings = load_data(USERS_PATH,MOVIES_PATH,RATINGS_PATH)
train_set,test_set = split_train_test(ratings,5,1,1)

166907 833302


In [33]:
user_p,movie_q,movies_pool = init(train_set,100)

初始化参数...


In [35]:
user_p['1'],movie_q['661']

(array([-2.07494963,  0.87836205, -1.3196014 , -0.89899117, -0.05271024,
        -0.33034091,  1.3926425 ,  1.29885764, -1.40680886, -0.22223479,
        -0.04192161, -0.40425667, -0.70444912,  1.68396205,  0.00632103,
        -0.61012895, -0.12796785,  0.06183218,  0.9484937 ,  1.04422228,
        -0.22083501,  0.27943368, -0.14422907,  1.21328413, -1.49905804,
         0.97199992, -0.43069356,  0.41795874,  0.94921952,  1.27208229,
        -0.79262414, -1.09876046, -1.49824399,  0.09081167,  0.14226749,
         0.3310094 , -1.0137267 , -0.68520335, -0.24961466, -0.51489675,
         0.8029353 ,  1.40935446,  2.20096657, -0.32589617,  1.05467198,
         1.64032945, -0.05308882,  0.48315537, -1.6158723 , -0.3648277 ,
         1.05652649, -0.43412986,  0.10164002,  0.77415808, -0.02295959,
        -1.6724066 , -1.95448864,  0.64540759,  0.15180863, -0.19981573,
        -0.76902234,  0.07231744, -0.30062156,  0.65723427, -0.33843025,
         0.89257102,  2.50083823, -1.2124902 , -2.2

In [36]:
train(user_p,movie_q,list(movies_pool),500,learning_rate=0.02,l=0.01)

第0轮训练begainning...


KeyboardInterrupt: 

In [None]:
from collections import defaultdict
def recommand(user):
    def predict(user,movie):
        return np.dot(user_p[user],movie_p[movie])
    rank = defaultdict(int)
    for movie in list(movies_pool):
        rank[movie] = predict(user,movie)
    return sorted(rank.items(),key=itemgetter(1),reverse=True)

In [None]:
def hit(train,test,N):
    hit = 0
    for user in train.keys():
        real = test[user] if user in test else []
        recommand_items = recommand(user)[:N]
        for item in recommand_items:
            if item[0] in real:
                hit += 1
    print('hit:%d'% hit)
    return hit
# precision
def Precision(train,hit,N):
    print('sum:%d' % (len(train.keys()) * N))
    return hit / len(train.keys()) * N
# recall
def Recall(train,test,hit):
    recommands = 0
    for user in train.keys():
        real = test[user] if user in test else []
        recommands += len(real)
    print('sum_real: %d' % recommands)
    return hit / recommands 

In [None]:
hit_total = hit(train_set,test_set,10)

In [None]:
Precision(train_set,hit,10),Recall(train_set,hit,10)