In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's "darkgrid" theme for all figures.
sns.set(style="darkgrid")
plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so Chinese labels render correctly
plt.rcParams['axes.unicode_minus']=False # do not use the Unicode minus glyph, so minus signs render correctly
from zipfile import ZipFile
import numpy as np
import pandas as pd
import datetime
import gc


In [2]:
import sys
import random
import math
import os
from operator import itemgetter

from collections import defaultdict

# Seed the stdlib RNG so the random train/test split in generate_dataset is reproducible.
random.seed(0)


class UserBasedCF(object):
    ''' TopN recommendation - User Based Collaborative Filtering '''

    def __init__(self):
        '''Create an empty recommender with the default hyper-parameters.'''
        # user -> {item: rating} maps holding the training / test interactions
        self.trainset = {}
        self.testset = {}
        # number of most-similar users consulted for each recommendation
        self.n_sim_user = 30
        # number of items returned by each recommendation
        self.n_rec_item = 30

        # user -> {other_user: similarity}; filled by calc_user_sim()
        self.user_sim_mat = {}
        # item -> number of training users that interacted with it
        self.item_popular = {}
        # number of distinct items in the training set
        self.item_count = 0
        # the configuration is reported on stderr
        print('相似用户数目为 = %d' % self.n_sim_user, file=sys.stderr)
        print('推荐Item数目为 = %d' %
              self.n_rec_item, file=sys.stderr)

    @staticmethod
    def _iter_user_item(frame):
        '''Yield (user, item) pairs taken from the first two columns of frame.'''
        # itertuples avoids the per-row Series that iterrows builds and the
        # deprecated integer-position lookup on a label-indexed Series.
        for row in frame.itertuples(index=False, name=None):
            yield row[0], row[1]

    def generate_dataset(self, train, test=None, pivot=0.90):
        ''' load rating data and split it to training set and test set

        Interactions are added to self.trainset / self.testset (existing
        content is kept). Every interaction is stored with rating 1.

        train: DataFrame whose first two columns are user and item.
        test:  optional DataFrame with the same layout. When omitted, each
               row of train is randomly assigned to the training set with
               probability pivot and to the test set otherwise.
        pivot: training-set probability used only when test is None.
        '''
        trainset_len = 0
        testset_len = 0
        rating = 1  # implicit feedback: every observed pair counts as 1
        if test is None:  # experiment mode: random hold-out split
            print('算法尝试！')
            for user, item in self._iter_user_item(train):
                # one random draw per row, in row order
                if random.random() < pivot:
                    self.trainset.setdefault(user, {})[item] = rating
                    trainset_len += 1
                else:
                    self.testset.setdefault(user, {})[item] = rating
                    testset_len += 1
        else:  # prediction mode: both sets are given explicitly
            print('预测尝试！')
            for user, item in self._iter_user_item(train):
                self.trainset.setdefault(user, {})[item] = rating
                trainset_len += 1
            for user, item in self._iter_user_item(test):
                self.testset.setdefault(user, {})[item] = rating
                testset_len += 1

        print('split training set and test set succ')
        print('train set = %s' % trainset_len)
        print('test set = %s' % testset_len)

    def calc_user_sim(self):
        ''' calculate user similarity matrix

        Fills self.item_popular, self.item_count and self.user_sim_mat from
        self.trainset. The similarity of two users is the number of items
        they share divided by sqrt(|items(u)| * |items(v)|). Derived state is
        rebuilt from scratch, so the method may be called repeatedly.
        '''
        print('构建物品-用户倒排表中，请等待......', file=sys.stderr)
        # Reset derived state in place (same dict objects) so a second call
        # does not double count popularity or re-normalise similarities.
        self.item_popular.clear()
        self.user_sim_mat.clear()

        # inverse table: item -> set of users that interacted with it,
        # e.g. {'914': {'1', '6', '10'}, '3408': {'1'}, ...}
        item2users = {}
        for user, items in self.trainset.items():
            for item in items:
                item2users.setdefault(item, set()).add(user)
        # popularity of an item = number of distinct training users
        for item, users in item2users.items():
            self.item_popular[item] = len(users)
        print('构建物品-用户倒排表成功', file=sys.stderr)

        # save the total item number, which will be used in evaluation
        self.item_count = len(item2users)
        print('总共被操作过的物品数目为 = %d' % self.item_count, file=sys.stderr)

        usersim_mat = self.user_sim_mat

        print('building user co-rated items matrix...', file=sys.stderr)
        # C[u][v] = |N(u) ∩ N(v)|: if u and v both appear in the user lists
        # of K items, then C[u][v] = K.
        for users in item2users.values():
            for u in users:
                # every training user gets a row, even without neighbours
                co_counts = usersim_mat.get(u)
                if co_counts is None:
                    co_counts = usersim_mat[u] = defaultdict(int)
                for v in users:
                    if u != v:
                        co_counts[v] += 1
        print('build user co-rated items matrix succ', file=sys.stderr)

        print('calculating user similarity matrix...', file=sys.stderr)
        simfactor_count = 0
        PRINT_STEP = 2000000
        # turn the co-occurrence counts into cosine similarities
        for u, related_users in usersim_mat.items():
            u_size = len(self.trainset[u])
            for v, count in related_users.items():
                # overwriting an existing key is safe while iterating
                related_users[v] = count / math.sqrt(
                    u_size * len(self.trainset[v]))
                # counter used only for progress reporting
                simfactor_count += 1
                if simfactor_count % PRINT_STEP == 0:
                    print('calculating user similarity factor(%d)' %
                          simfactor_count, file=sys.stderr)

        print('calculate user similarity matrix(similarity factor) succ',
              file=sys.stderr)
        print('Total similarity factor number = %d' %
              simfactor_count, file=sys.stderr)

    def recommend(self, user, predict=False):
        '''Return up to n_rec_item (item, score) pairs for user, best first.

        The score of an item is the sum of the similarities of those of the
        n_sim_user most similar users who interacted with it. Items the user
        already interacted with are deliberately NOT filtered out.
        A user without a row in the similarity matrix (e.g. one that is not
        in the training set) gets an empty list.

        predict: accepted for interface compatibility; not used.
        '''
        K = self.n_sim_user
        N = self.n_rec_item
        # item -> accumulated interest score
        rank = dict()

        # .get() instead of [] so unknown / cold-start users do not raise
        neighbours = sorted(self.user_sim_mat.get(user, {}).items(),
                            key=itemgetter(1), reverse=True)[0:K]
        for similar_user, similarity_factor in neighbours:
            for item in self.trainset.get(similar_user, ()):
                rank[item] = rank.get(item, 0) + similarity_factor
        # return the N best items
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]

    def predict(self):
        '''Recommend for every user of the test set.

        Returns a list of (user, rec_items) tuples, where rec_items is the
        list of (item, score) pairs produced by recommend().
        '''
        print('Predict start...')

        predict_result = []
        for i, user in enumerate(self.testset):
            if i % 500 == 0:
                print('recommended for %d users' % i)
            rec_items = self.recommend(user, predict=True)
            predict_result.append((user, rec_items))
        print('Predict end...')
        return predict_result

    def evaluate(self):
        ''' print evaluation result: precision, recall, coverage and popularity

        Recommendations are generated for every training user and compared
        with that user's test items. The metrics are printed to stderr and
        also returned as a dict with the keys 'precision', 'recall',
        'coverage' and 'popularity'. A metric whose denominator is zero is
        reported as 0.0 instead of raising ZeroDivisionError.
        '''
        print('Evaluation start...', file=sys.stderr)

        # variables for precision and recall
        hit = 0         # recommended items that are in the user's test items
        rec_count = 0   # total number of recommended items
        test_count = 0  # total number of test items
        # variables for coverage
        all_rec_items = set()
        # variables for popularity
        popular_sum = 0

        for i, user in enumerate(self.trainset):
            if i % 500 == 0:
                print('recommended for %d users' % i, file=sys.stderr)
            test_items = self.testset.get(user, {})
            rec_items = self.recommend(user, predict=False)
            for item, _ in rec_items:
                if item in test_items:
                    hit += 1
                all_rec_items.add(item)
                popular_sum += math.log(1 + self.item_popular[item])
            # count what was actually recommended; a user may get fewer than
            # n_rec_item items, which would otherwise deflate the averages
            rec_count += len(rec_items)
            test_count += len(test_items)

        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = (len(all_rec_items) / (1.0 * self.item_count)
                    if self.item_count else 0.0)
        popularity = popular_sum / (1.0 * rec_count) if rec_count else 0.0

        print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' %
              (precision, recall, coverage, popularity), file=sys.stderr)
        return {'precision': precision, 'recall': recall,
                'coverage': coverage, 'popularity': popularity}


In [3]:
import pandas as pd

In [4]:
# Load the training and test interaction tables from CSV.
# NOTE(review): these are absolute, machine-specific Windows paths; consider a
# configurable data directory so the notebook runs elsewhere.
train=pd.read_csv("H:\\pythonchengx_u\\Ecomme AI\\Date\\train_item")
test=pd.read_csv("H:\\pythonchengx_u\\Ecomme AI\\Date\\test_item")


In [5]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,user_id,item_id
0,1076667570,340804045
1,1076667570,273994489
2,1076667570,142049606
3,1076667570,1224612971
4,1076667570,525100650


In [6]:

# Create the recommender with its default settings (30 similar users, 30 recommended items).
usercf_pred = UserBasedCF()


相似用户数目为 = 30
推荐Item数目为 = 30


In [7]:
# Passing an explicit test frame skips the random split: all of `train` becomes the
# training set and all of `test` becomes the test set.
usercf_pred.generate_dataset(train[['user_id','item_id']],test[['user_id','item_id']])

预测尝试！
split training set and test set succ
train set = 6010979
test set = 5252516


In [8]:
# Build the user-user similarity matrix from the training set.
usercf_pred.calc_user_sim()


构建物品-用户倒排表中，请等待......
构建物品-用户倒排表成功
总共被操作过的物品数目为 = 1347477
building user co-rated items matrix...
build user co-rated items matrix succ
calculating user similarity matrix...
calculating user similarity factor(2000000)
calculating user similarity factor(4000000)
calculating user similarity factor(6000000)
calculating user similarity factor(8000000)
calculating user similarity factor(10000000)
calculating user similarity factor(12000000)
calculating user similarity factor(14000000)
calculating user similarity factor(16000000)
calculating user similarity factor(18000000)
calculating user similarity factor(20000000)
calculating user similarity factor(22000000)
calculating user similarity factor(24000000)
calculating user similarity factor(26000000)
calculating user similarity factor(28000000)
calculating user similarity factor(30000000)
calculating user similarity factor(32000000)
calculating user similarity factor(34000000)
calculating user similarity factor(36000000)
calculating user simi

In [9]:
# Generate recommendations for every user in the test set; `result` is a list of
# (user, [(item, score), ...]) tuples.
result = usercf_pred.predict()

Predict start...
recommended for 0 users
recommended for 500 users
recommended for 1000 users
recommended for 1500 users
recommended for 2000 users
recommended for 2500 users
recommended for 3000 users
recommended for 3500 users
recommended for 4000 users
recommended for 4500 users
recommended for 5000 users
recommended for 5500 users
recommended for 6000 users
recommended for 6500 users
recommended for 7000 users
recommended for 7500 users
recommended for 8000 users
recommended for 8500 users
recommended for 9000 users
recommended for 9500 users
recommended for 10000 users
recommended for 10500 users
recommended for 11000 users
recommended for 11500 users
recommended for 12000 users
recommended for 12500 users
recommended for 13000 users
recommended for 13500 users
recommended for 14000 users
recommended for 14500 users
recommended for 15000 users
recommended for 15500 users
recommended for 16000 users
recommended for 16500 users
recommended for 17000 users
recommended for 17500 users

In [10]:
import pickle
# Persist the fitted recommender. The context manager closes the file even when
# pickle.dump raises (a MemoryError was recorded for this cell), so the handle
# is not leaked.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
with open('H:\\pythonchengx_u\\Ecomme AI\\Date\\Acom_user_cf.pkl', 'wb') as output:
    pickle.dump(usercf_pred, output)

MemoryError: 

In [11]:
# One row per predicted user: the user id and its list of (item, score) pairs.
result_usercf = pd.DataFrame(result, columns=['user_id', 'rec_items'])
# Keep only the item ids (first element of each pair), preserving their order.
result_usercf['rec_items'] = result_usercf['rec_items'].apply(
    lambda pairs: [pair[0] for pair in pairs])

 


In [12]:
# Write the recommendations to CSV without the DataFrame index column.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
result_usercf.to_csv("H:\\pythonchengx_u\\Ecomme AI\\Date\\usercfresult_df",index=False)




In [13]:
# Preview the first rows of the recommendation table.
result_usercf.head()



Unnamed: 0,user_id,rec_items
0,1052625010,"[921324564, 987952948, 1238296290, 921452260, ..."
1,1067416116,"[838402642, 1117779853, 1223899852, 1189876080..."
2,1075634154,"[905722239, 1145490924, 990276720, 895964786, ..."
3,1013441031,"[218105839, 1154534608, 475136298, 1216921275,..."
4,129362515,"[1109578253, 1066362270, 1036120431, 111301320..."
