# 隐语义模型

In [1]:
# 导入包
import random
import math
import numpy as np
import time
from tqdm import tqdm, trange

## 一. 通用函数定义

In [2]:
# 定义装饰器，监控运行时间
def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))
        return res
    return wrapper

### 1. 数据处理相关
1. load data
2. split data

In [3]:
class Dataset():
    
    def __init__(self, fp):
        # fp: data file path
        self.data = self.loadData(fp)
    
    @timmer
    def loadData(self, fp):
        data = []
        for l in open(fp):
            data.append(tuple(map(int, l.strip().split('::')[:2])))
        return data
    
    @timmer
    def splitData(self, M, k, seed=1):
        '''
        :params: data, 加载的所有(user, item)数据条目
        :params: M, 划分的数目，最后需要取M折的平均
        :params: k, 本次是第几次划分，k~[0, M)
        :params: seed, random的种子数，对于不同的k应设置成一样的
        :return: train, test
        '''
        train, test = [], []
        random.seed(seed)
        for user, item in self.data:
            # 这里与书中的不一致，本人认为取M-1较为合理，因randint是左右都覆盖的
            if random.randint(0, M-1) == k:  
                test.append((user, item))
            else:
                train.append((user, item))

        # 处理成字典的形式，user->set(items)
        def convert_dict(data):
            data_dict = {}
            for user, item in data:
                if user not in data_dict:
                    data_dict[user] = set()
                data_dict[user].add(item)
            data_dict = {k: list(data_dict[k]) for k in data_dict}
            return data_dict

        return convert_dict(train), convert_dict(test)

### 2. 评价指标
1. Precision
2. Recall
3. Coverage
4. Popularity(Novelty)

In [4]:
class Metric():
    
    def __init__(self, train, test, GetRecommendation):
        '''
        :params: train, 训练数据
        :params: test, 测试数据
        :params: GetRecommendation, 为某个用户获取推荐物品的接口函数
        '''
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.recs = self.getRec()
        
    # 为test中的每个用户进行推荐
    def getRec(self):
        recs = {}
        for user in self.test:
            rank = self.GetRecommendation(user)
            recs[user] = rank
        return recs
        
    # 定义精确率指标计算方式
    def precision(self):
        all, hit = 0, 0
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank:
                if item in test_items:
                    hit += 1
            all += len(rank)
        return round(hit / all * 100, 2)
    
    # 定义召回率指标计算方式
    def recall(self):
        all, hit = 0, 0
        for user in self.test:
            test_items = set(self.test[user])
            rank = self.recs[user]
            for item, score in rank:
                if item in test_items:
                    hit += 1
            all += len(test_items)
        return round(hit / all * 100, 2)
    
    # 定义覆盖率指标计算方式
    def coverage(self):
        all_item, recom_item = set(), set()
        for user in self.test:
            for item in self.train[user]:
                all_item.add(item)
            rank = self.recs[user]
            for item, score in rank:
                recom_item.add(item)
        return round(len(recom_item) / len(all_item) * 100, 2)
    
    # 定义新颖度指标计算方式
    def popularity(self):
        # 计算物品的流行度
        item_pop = {}
        for user in self.train:
            for item in self.train[user]:
                if item not in item_pop:
                    item_pop[item] = 0
                item_pop[item] += 1

        num, pop = 0, 0
        for user in self.test:
            rank = self.recs[user]
            for item, score in rank:
                # 取对数，防止因长尾问题带来的被流行物品所主导
                pop += math.log(1 + item_pop[item])
                num += 1
        return round(pop / num, 6)
    
    def eval(self):
        metric = {'Precision': self.precision(),
                  'Recall': self.recall(),
                  'Coverage': self.coverage(),
                  'Popularity': self.popularity()}
        print('Metric:', metric)
        return metric

## 二. LFM算法实现

In [5]:
def LFM(train, ratio, K, lr, step, lmbda, N):
    '''
    :params: train, 训练数据
    :params: ratio, 负采样的正负比例
    :params: K, 隐语义个数
    :params: lr, 初始学习率
    :params: step, 迭代次数
    :params: lmbda, 正则化系数
    :params: N, 推荐TopN物品的个数
    :return: GetRecommendation, 获取推荐结果的接口
    '''
    
    all_items = {}
    for user in train:
        for item in train[user]:
            if item not in all_items:
                all_items[item] = 0
            all_items[item] += 1
    all_items = list(all_items.items())
    items = [x[0] for x in all_items]
    pops = [x[1] for x in all_items]
    
    # 负采样函数(注意！！！要按照流行度进行采样)
    def nSample(data, ratio):
        new_data = {}
        # 正样本
        for user in data:
            if user not in new_data:
                new_data[user] = {}
            for item in data[user]:
                new_data[user][item] = 1
        # 负样本
        for user in new_data:
            seen = set(new_data[user])
            pos_num = len(seen)
            item = np.random.choice(items, int(pos_num * ratio * 3), pops)
            item = [x for x in item if x not in seen][:int(pos_num * ratio)]
            new_data[user].update({x: 0 for x in item})
        
        return new_data
                
    # 训练
    P, Q = {}, {}
    for user in train:
        P[user] = np.random.random(K)
    for item in items:
        Q[item] = np.random.random(K)
            
    for s in trange(step):
        data = nSample(train, ratio)
        for user in data:
            for item in data[user]:
                eui = data[user][item] - (P[user] * Q[item]).sum()
                P[user] += lr * (Q[item] * eui - lmbda * P[user])
                Q[item] += lr * (P[user] * eui - lmbda * Q[item])
        lr *= 0.9 # 调整学习率
        
    # 获取接口函数
    def GetRecommendation(user):
        seen_items = set(train[user])
        recs = {}
        for item in items:
            if item not in seen_items:
                recs[item] = (P[user] * Q[item]).sum()
        recs = list(sorted(recs.items(), key=lambda x: x[1], reverse=True))[:N]
        return recs
    
    return GetRecommendation

## 三. LFM实验
M=8, N=10, ratio=[1, 2, 3, 5, 10, 20]

In [6]:
class Experiment():
    
    def __init__(self, M, N, ratio=1,
                 K=100, lr=0.02, step=100, lmbda=0.01, fp='/Users/felix/PycharmProjects/RecommendSystemPractice/dataSet/ml-1m/ratingsTest.dat'):
        '''
        :params: M, 进行多少次实验
        :params: N, TopN推荐物品的个数
        :params: ratio, 正负样本比例
        :params: K, 隐语义个数
        :params: lr, 学习率
        :params: step, 训练步数
        :params: lmbda, 正则化系数
        :params: fp, 数据文件路径
        '''
        self.M = M
        self.K = K
        self.N = N
        self.ratio = ratio
        self.lr = lr
        self.step = step
        self.lmbda = lmbda
        self.fp = fp
        self.alg = LFM
    
    # 定义单次实验
    @timmer
    def worker(self, train, test):
        '''
        :params: train, 训练数据集
        :params: test, 测试数据集
        :return: 各指标的值
        '''
        getRecommendation = self.alg(train, self.ratio, self.K, 
                                     self.lr, self.step, self.lmbda, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()
    
    # 多次实验取平均
    @timmer
    def run(self):
        metrics = {'Precision': 0, 'Recall': 0, 
                   'Coverage': 0, 'Popularity': 0}
        dataset = Dataset(self.fp)
        for ii in range(self.M):
            train, test = dataset.splitData(self.M, ii)
            print('Experiment {}:'.format(ii))
            metric = self.worker(train, test)
            metrics = {k: metrics[k]+metric[k] for k in metrics}
        metrics = {k: metrics[k] / self.M for k in metrics}
        print('Average Result (M={}, N={}, ratio={}): {}'.format(\
                              self.M, self.N, self.ratio, metrics))

In [7]:
# LFM实验(运行时间较长，这里没贴实验结果)
M, N = 8, 10
for r in [1, 2, 3, 5, 10, 20]:
    exp = Experiment(M, N, ratio=r)
    exp.run()

 15%|█▌        | 15/100 [00:00<00:00, 145.51it/s]

Func loadData, run time: 0.0020852088928222656
Func splitData, run time: 0.00044798851013183594
Experiment 0:


100%|██████████| 100/100 [00:00<00:00, 153.42it/s]
 15%|█▌        | 15/100 [00:00<00:00, 149.64it/s]

Metric: {'Precision': 4.0, 'Recall': 5.0, 'Coverage': 20.35, 'Popularity': 0.725584}
Func worker, run time: 0.669050931930542
Func splitData, run time: 0.0005040168762207031
Experiment 1:


100%|██████████| 100/100 [00:00<00:00, 154.67it/s]
 15%|█▌        | 15/100 [00:00<00:00, 149.86it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 19.4, 'Popularity': 0.709366}
Func worker, run time: 0.6523849964141846
Func splitData, run time: 0.00037384033203125
Experiment 2:


100%|██████████| 100/100 [00:00<00:00, 150.00it/s]
 16%|█▌        | 16/100 [00:00<00:00, 156.36it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 15.77, 'Popularity': 0.71342}
Func worker, run time: 0.672137975692749
Func splitData, run time: 0.0003781318664550781
Experiment 3:


100%|██████████| 100/100 [00:00<00:00, 157.06it/s]
 16%|█▌        | 16/100 [00:00<00:00, 157.47it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 19.91, 'Popularity': 0.731338}
Func worker, run time: 0.6423921585083008
Func splitData, run time: 0.00040912628173828125
Experiment 4:


100%|██████████| 100/100 [00:00<00:00, 154.35it/s]
 16%|█▌        | 16/100 [00:00<00:00, 152.69it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 20.96, 'Popularity': 0.731338}
Func worker, run time: 0.6539490222930908
Func splitData, run time: 0.0003790855407714844
Experiment 5:


100%|██████████| 100/100 [00:00<00:00, 150.23it/s]
 16%|█▌        | 16/100 [00:00<00:00, 159.28it/s]

Metric: {'Precision': 4.0, 'Recall': 5.71, 'Coverage': 20.17, 'Popularity': 0.739447}
Func worker, run time: 0.6711249351501465
Func splitData, run time: 0.00038504600524902344
Experiment 6:


100%|██████████| 100/100 [00:00<00:00, 154.77it/s]
 16%|█▌        | 16/100 [00:00<00:00, 151.85it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 20.89, 'Popularity': 0.749912}
Func worker, run time: 0.6518208980560303
Func splitData, run time: 0.0003807544708251953
Experiment 7:


100%|██████████| 100/100 [00:00<00:00, 151.83it/s]
 12%|█▏        | 12/100 [00:00<00:00, 117.73it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 20.51, 'Popularity': 0.741803}
Func worker, run time: 0.6647751331329346
Average Result (M=8, N=10, ratio=1): {'Precision': 1.0, 'Recall': 1.33875, 'Coverage': 19.744999999999997, 'Popularity': 0.730276}
Func run, run time: 5.283588886260986
Func loadData, run time: 0.0015330314636230469
Func splitData, run time: 0.00034499168395996094
Experiment 0:


100%|██████████| 100/100 [00:00<00:00, 117.60it/s]
 12%|█▏        | 12/100 [00:00<00:00, 119.18it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 20.35, 'Popularity': 0.725584}
Func worker, run time: 0.8561370372772217
Func splitData, run time: 0.0003707408905029297
Experiment 1:


100%|██████████| 100/100 [00:00<00:00, 120.02it/s]
 12%|█▏        | 12/100 [00:00<00:00, 115.80it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 18.97, 'Popularity': 0.717475}
Func worker, run time: 0.8387999534606934
Func splitData, run time: 0.0003800392150878906
Experiment 2:


100%|██████████| 100/100 [00:00<00:00, 115.04it/s]
 12%|█▏        | 12/100 [00:00<00:00, 119.04it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 17.12, 'Popularity': 0.71342}
Func worker, run time: 0.8743607997894287
Func splitData, run time: 0.0003619194030761719
Experiment 3:


100%|██████████| 100/100 [00:00<00:00, 121.16it/s]
 12%|█▏        | 12/100 [00:00<00:00, 115.02it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 19.46, 'Popularity': 0.769529}
Func worker, run time: 0.8306560516357422
Func splitData, run time: 0.00036406517028808594
Experiment 4:


100%|██████████| 100/100 [00:00<00:00, 117.82it/s]
 12%|█▏        | 12/100 [00:00<00:00, 112.00it/s]

Metric: {'Precision': 2.0, 'Recall': 2.5, 'Coverage': 20.09, 'Popularity': 0.717475}
Func worker, run time: 0.8548691272735596
Func splitData, run time: 0.00036406517028808594
Experiment 5:


100%|██████████| 100/100 [00:00<00:00, 115.22it/s]
 12%|█▏        | 12/100 [00:00<00:00, 119.81it/s]

Metric: {'Precision': 2.0, 'Recall': 2.86, 'Coverage': 19.74, 'Popularity': 0.747557}
Func worker, run time: 0.8735339641571045
Func splitData, run time: 0.00036406517028808594
Experiment 6:


100%|██████████| 100/100 [00:00<00:00, 118.18it/s]
 12%|█▏        | 12/100 [00:00<00:00, 116.14it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 20.0, 'Popularity': 0.701256}
Func worker, run time: 0.8519141674041748
Func splitData, run time: 0.0003609657287597656
Experiment 7:


100%|██████████| 100/100 [00:00<00:00, 111.85it/s]
 10%|█         | 10/100 [00:00<00:00, 93.52it/s]

Metric: {'Precision': 2.0, 'Recall': 2.94, 'Coverage': 18.8, 'Popularity': 0.737092}
Func worker, run time: 0.8997890949249268
Average Result (M=8, N=10, ratio=2): {'Precision': 0.75, 'Recall': 1.0374999999999999, 'Coverage': 19.316250000000004, 'Popularity': 0.7286735}
Func run, run time: 6.885021924972534
Func loadData, run time: 0.0010287761688232422
Func splitData, run time: 0.00034308433532714844
Experiment 0:


100%|██████████| 100/100 [00:01<00:00, 95.29it/s]
 10%|█         | 10/100 [00:00<00:00, 97.68it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 19.91, 'Popularity': 0.693147}
Func worker, run time: 1.0546081066131592
Func splitData, run time: 0.00038504600524902344
Experiment 1:


100%|██████████| 100/100 [00:01<00:00, 99.71it/s]
 10%|█         | 10/100 [00:00<00:00, 95.79it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 21.55, 'Popularity': 0.733694}
Func worker, run time: 1.009375810623169
Func splitData, run time: 0.00037407875061035156
Experiment 2:


100%|██████████| 100/100 [00:01<00:00, 93.51it/s]
 11%|█         | 11/100 [00:00<00:00, 102.31it/s]

Metric: {'Precision': 2.5, 'Recall': 3.23, 'Coverage': 18.02, 'Popularity': 0.737941}
Func worker, run time: 1.074592113494873
Func splitData, run time: 0.00044798851013183594
Experiment 3:


100%|██████████| 100/100 [00:00<00:00, 100.26it/s]
 10%|█         | 10/100 [00:00<00:00, 98.01it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 21.27, 'Popularity': 0.77424}
Func worker, run time: 1.0029969215393066
Func splitData, run time: 0.00036597251892089844
Experiment 4:


100%|██████████| 100/100 [00:01<00:00, 98.03it/s]
 11%|█         | 11/100 [00:00<00:00, 100.74it/s]

Metric: {'Precision': 6.0, 'Recall': 7.5, 'Coverage': 21.4, 'Popularity': 0.725584}
Func worker, run time: 1.0256340503692627
Func splitData, run time: 0.000370025634765625
Experiment 5:


100%|██████████| 100/100 [00:01<00:00, 99.99it/s]
 10%|█         | 10/100 [00:00<00:00, 99.82it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 20.17, 'Popularity': 0.758022}
Func worker, run time: 1.0062470436096191
Func splitData, run time: 0.00036597251892089844
Experiment 6:


100%|██████████| 100/100 [00:01<00:00, 98.37it/s]
 10%|█         | 10/100 [00:00<00:00, 91.99it/s]

Metric: {'Precision': 4.0, 'Recall': 5.0, 'Coverage': 20.89, 'Popularity': 0.717475}
Func worker, run time: 1.0224580764770508
Func splitData, run time: 0.00036907196044921875
Experiment 7:


100%|██████████| 100/100 [00:01<00:00, 96.14it/s]
  9%|▉         | 9/100 [00:00<00:01, 81.10it/s]

Metric: {'Precision': 2.0, 'Recall': 2.94, 'Coverage': 20.51, 'Popularity': 0.733694}
Func worker, run time: 1.0459051132202148
Average Result (M=8, N=10, ratio=3): {'Precision': 1.8125, 'Recall': 2.33375, 'Coverage': 20.465, 'Popularity': 0.734224625}
Func run, run time: 8.246391773223877
Func loadData, run time: 0.0008797645568847656
Func splitData, run time: 0.0003399848937988281
Experiment 0:


100%|██████████| 100/100 [00:01<00:00, 80.23it/s]
  8%|▊         | 8/100 [00:00<00:01, 79.31it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 20.8, 'Popularity': 0.709366}
Func worker, run time: 1.2517290115356445
Func splitData, run time: 0.0003619194030761719
Experiment 1:


100%|██████████| 100/100 [00:01<00:00, 82.19it/s]
  9%|▉         | 9/100 [00:00<00:01, 80.87it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 19.4, 'Popularity': 0.709366}
Func worker, run time: 1.2221472263336182
Func splitData, run time: 0.00036406517028808594
Experiment 2:


100%|██████████| 100/100 [00:01<00:00, 78.88it/s]
  9%|▉         | 9/100 [00:00<00:01, 84.16it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 18.02, 'Popularity': 0.733694}
Func worker, run time: 1.2726459503173828
Func splitData, run time: 0.0003619194030761719
Experiment 3:


100%|██████████| 100/100 [00:01<00:00, 82.51it/s]
  9%|▉         | 9/100 [00:00<00:01, 82.68it/s]

Metric: {'Precision': 4.0, 'Recall': 5.0, 'Coverage': 21.72, 'Popularity': 0.755666}
Func worker, run time: 1.2177350521087646
Func splitData, run time: 0.00035572052001953125
Experiment 4:


100%|██████████| 100/100 [00:01<00:00, 80.73it/s]
  8%|▊         | 8/100 [00:00<00:01, 78.65it/s]

Metric: {'Precision': 0.0, 'Recall': 0.0, 'Coverage': 19.65, 'Popularity': 0.733694}
Func worker, run time: 1.2444572448730469
Func splitData, run time: 0.00036787986755371094
Experiment 5:


 71%|███████   | 71/100 [00:00<00:00, 77.50it/s]


KeyboardInterrupt: 

## 四. 总结
1. 负采样的时候要按照流行度进行采样。按照书中的说法：一般认为，很热门而用户却没有行为更加代表用户对这个物品不感兴趣，因为对于冷门的物品，用户可能是压根没有在网站中发现这个物品，所以谈不上是否感兴趣。