In [None]:
import csv
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import nltk
import string
from nltk.corpus import stopwords
from collections import Counter
from collections import defaultdict
from nltk.stem.porter import*
from nltk.text import TextCollection
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA

# Input data: training / validation / test rating files.
src_path = 'C:/Users/JyoXu/RS/Assign-2-data/'
train_path = f'{src_path}train.csv'
valid_path = f'{src_path}valid.csv'
test_path = f'{src_path}test.csv'

# Intermediate results (rating matrix, averages, similarity and neighbour
# tables, reduced tf-idf features, validation output).
dst_path = 'C:/Users/JyoXu/RS/Assign-2-result/'
matrix_path = f'{dst_path}matrix.csv'
user_ave_path = f'{dst_path}user_ave.csv'
item_ave_path = f'{dst_path}item_ave.csv'
isim_path = f'{dst_path}isim.csv'
usim_path = f'{dst_path}usim.csv'
csim_path = f'{dst_path}csim.csv'
pca_path = f'{dst_path}tf_idf_pca.csv'
item_review_path = f'{dst_path}item_review.csv'
ineighbor_path = f'{dst_path}ineighbor.csv'
uneighbor_path = f'{dst_path}uneighbor.csv'
cneighbor_path = f'{dst_path}cneighbor.csv'
valid_res_path = f'{dst_path}valid_res.csv'
review_path = f'{dst_path}review.csv'

# Submission files; the %-fields are filled with the four blending weights.
sub_path = 'C:/Users/JyoXu/RS/Assign-2-submission/'
saved_path = f'{sub_path}res_a%.1f b%.1f c%.1f d%.1f.csv'
################################

class CF:
    def __init__(self):
        pass
    
    def dump(self, src, to_path):
        writer = csv.writer(open(to_path,'w',newline=''))
        for i in range (np.shape(src)[0]):
            writer.writerow(src[i])
        return

    def split_valid_data(self, path, to_path, rate, round, n_rows=9029):
        """Move one fold of ratings from the training state into a validation CSV.

        Rows of ``path`` are skipped until the fold's start offset; after that,
        every row whose user and item each still have at least 5 training
        ratings is written to ``to_path`` and removed from the in-memory
        training structures, until the fold is full or the file ends.

        Args:
            path: CSV file the model was loaded from.
            to_path: Validation CSV to create (the header row is copied).
            rate: Fraction of ``n_rows`` that makes up one fold.
            round: Index of the fold; the fold starts at
                ``int(n_rows * rate * round)``. (The name shadows the builtin
                but is kept for keyword-argument compatibility.)
            n_rows: Number of data rows assumed in ``path``. Defaults to the
                previously hard-coded 9029.

        Returns:
            self, so that calls can be chained.
        """
        start = int(n_rows * rate * round)  # first data row of this fold
        total = int(n_rows * rate)          # number of rows to collect
        idx, cnt = 0, 0
        with open(path, 'r') as src, open(to_path, 'w', newline='') as out:
            writer = csv.writer(out)
            for row in csv.reader(src):
                if row[0] == 'reviewerID':
                    writer.writerow(row)
                    continue
                if idx < start:
                    idx += 1
                    continue
                # Checked before processing so that a fold size of 0 moves nothing.
                if cnt >= total:
                    break
                unum, inum = self.user_dict[row[0]], self.item_dict[row[1]]
                # Only move the rating if enough training data remains for both sides.
                if self.user_active[unum] >= 5 and self.item_active[inum] >= 5:
                    writer.writerow(row)
                    # Remove the rating from every training structure.
                    self.user_active[unum] -= 1
                    self.item_active[inum] -= 1
                    self.users_per_item[inum].remove(unum)
                    self.items_per_user[unum].remove(inum)
                    self.matrix[unum][inum] = 0
                    self.pos[unum][inum] = -1
                    cnt += 1
        self.dump(self.matrix, matrix_path)  # re-export the reduced user-item matrix
        return self

    def load_train_data(self, path):
        """Load the rating CSV and build every in-memory training structure.

        Each data row is read as ``reviewerID, itemID, rating, reviewText``;
        the header row is recognised by its first cell being ``'reviewerID'``.

        Populates:
            user_dict / item_dict: raw id -> dense index, in order of first appearance.
            matrix: users x items rating matrix (float32, 0 where no rating was read).
            pos: users x items index into ``reviewText`` (-1 where no review was read).
            reviewText: raw review strings in file order.
            users_per_item / items_per_user: interaction sets.
            user_active / item_active: number of ratings per user / item.

        Args:
            path: Training CSV file.

        Returns:
            self, so that calls can be chained.
        """
        # Parse the file once and close it; both passes below reuse the rows.
        with open(path, 'r') as fh:
            rows = [row for row in csv.reader(fh) if row[0] != 'reviewerID']

        self.user_dict, self.item_dict = {}, {}
        self.users_per_item = defaultdict(set)
        self.items_per_user = defaultdict(set)
        for row in rows:
            # setdefault only assigns the next free index on first sight of an id.
            self.user_dict.setdefault(row[0], len(self.user_dict))
            self.item_dict.setdefault(row[1], len(self.item_dict))

        n_users, n_items = len(self.user_dict), len(self.item_dict)
        self.reviewText = []
        self.matrix = np.zeros((n_users, n_items))
        self.pos = np.full((n_users, n_items), -1)
        self.user_active = np.zeros(n_users)
        self.item_active = np.zeros(n_items)

        for row in rows:
            u, i = self.user_dict[row[0]], self.item_dict[row[1]]
            # float() replaces the former eval(): the cell comes from a file and
            # must never be executed as code.
            self.matrix[u][i] = float(row[2])
            self.users_per_item[i].add(u)
            self.items_per_user[u].add(i)
            self.pos[u][i] = len(self.reviewText)
            self.reviewText.append(row[3])
            self.user_active[u] += 1
            self.item_active[i] += 1
        self.matrix = self.matrix.astype(np.float32)
        self.dump(self.matrix, matrix_path)  # export the user-item matrix
        return self

    def init(self):
        self.able = 0
        self.usim = np.zeros((len(self.user_dict), len(self.user_dict)))
        self.isim = np.zeros((len(self.item_dict), len(self.item_dict)))
        self.csim = np.zeros((len(self.item_dict), len(self.item_dict)))
        self.uneighbor = np.zeros((len(self.user_dict), len(self.user_dict)))
        self.ineighbor = np.zeros((len(self.item_dict), len(self.item_dict)))
        self.cneighbor = np.zeros((len(self.item_dict), len(self.item_dict)))
        return self
    
    def user_ave_cal(self, to_path):
        # 计算每个用户的平均给分
        self.user_ave_score = np.sum(self.matrix,axis=1) / self.user_active
        self.user_ave_score = np.reshape(self.user_ave_score,(len(self.user_ave_score),1))
        self.all_user_ave = np.sum(self.user_ave_score) / len(self.user_ave_score)
        print("all_user_ave %.5f"%self.all_user_ave)
        self.dump(self.user_ave_score, to_path) # 导出
        return self

    def item_ave_cal(self, to_path):
        # 计算每个商品的平均得分
        self.item_ave_score = np.sum(self.matrix,axis=0) / self.item_active
        self.item_ave_score = np.reshape(self.item_ave_score,(len(self.item_ave_score),1))
        self.all_item_ave = np.sum(self.item_ave_score) / len(self.item_ave_score)
        print("all_item_ave %.5f"%self.all_item_ave)
        self.dump(self.item_ave_score, to_path) # 导出
        return self
    
    def load_sim(self, dst, path):
        reader = csv.reader(open(path,'r'))  # 源
        i = 0
        for line in reader: # 每一行 sim[a]
            j = 0
            for e in line:  # 每一个元素，sim[a][b]
                dst[i][j] = e
                j += 1
            i += 1
        return self
    
    def cos_sim_cal(self, src, dst, to_path, act, cons=[]):
        for i in range(len(src)):
            if ( i % 200 == 0 ):
                print(i)
            for j in range(len(src)):
                if ( len(cons) != 0 ):
                    # 共同交互少于4，或活跃度太低不考虑相似度
                    if ( len(cons[i].intersection(cons[j]))<4 or act[i]<5 or act[j]<5):
                        dst[i][j] = 0
                        continue
                v1 = src[i] # 向量1
                v2 = src[j] # 向量2
                num = float(np.dot(v1, v2)) # 点乘
                denom = np.linalg.norm(v1) * np.linalg.norm(v2) # 取模
                dst[i][j] = (num / denom) if denom != 0 else 0
        self.dump(dst, to_path)
        return
        
    def clean_text(self):
        """Normalise every review and merge the reviews of each item.

        Each raw review is lower-cased, stripped of punctuation, tokenised,
        filtered against the English stop-word list and stemmed. The cleaned
        reviews are kept in ``self.text``; ``self.reviewText`` is then replaced
        by one string per item holding all of that item's cleaned reviews
        (empty for items without a review).

        Returns:
            self, so that calls can be chained.
        """
        strip_punct = str.maketrans('', '', string.punctuation)
        # Load the stop-word list once and use set membership; it used to be
        # re-read from the corpus for every single token.
        stop_words = set(stopwords.words('english'))
        stemmer = nltk.stem.SnowballStemmer('english')

        cleaned = []
        for n, review in enumerate(self.reviewText):
            if n % 100 == 0:
                print("stop word %d"%n)  # progress indicator
            tokens = word_tokenize(review.lower().translate(strip_punct))
            cleaned.append(' '.join(
                stemmer.stem(w) for w in tokens if w not in stop_words))
        print("clean text done")
        self.text = cleaned

        # Collect the reviews per item. argwhere walks the matrix row by row,
        # so reviews are gathered in ascending user order, as before.
        per_item = [[] for _ in range(len(self.item_dict))]
        for u, i in np.argwhere(self.pos != -1):
            per_item[i].append(self.text[int(self.pos[u][i])])
        # Join with a space: plain concatenation fused the last word of one
        # review with the first word of the next into a bogus token.
        self.reviewText = [' '.join(parts) for parts in per_item]
        return self
    
    def tf_idf(self):
        """Build the dense tf-idf matrix of the per-item review documents.

        Runs ``clean_text`` first, then stores the result in
        ``self.tf_idf_matrix`` (entry ``[i][j]`` is the tf-idf weight of term
        ``j`` in document ``i``) and prints the matrix and its shape.

        Returns:
            None.
        """
        self.clean_text()
        # Term counts per document: counts[i][j] is the frequency of term j in document i.
        counts = CountVectorizer().fit_transform(self.reviewText)
        # Re-weight the raw counts by tf-idf.
        weights = TfidfTransformer().fit_transform(counts)
        # Densify so the rows can be used as plain feature vectors.
        self.tf_idf_matrix = weights.toarray()
        print("tf-idf done")
        print(self.tf_idf_matrix)
        print(np.shape(self.tf_idf_matrix))
        return
    
    def pca(self, ndim = 2000):
        """Reduce ``self.tf_idf_matrix`` with PCA and export the result.

        The reduced matrix is stored in ``self.tf_idf_pca`` and written to
        ``pca_path``.

        Args:
            ndim: Requested number of components. An integer larger than the
                number of rows or columns of the matrix is lowered to that
                limit, because PCA rejects such a request with a ValueError.

        Returns:
            None.
        """
        if isinstance(ndim, int):
            n_samples, n_features = np.shape(self.tf_idf_matrix)
            ndim = min(ndim, n_samples, n_features)
        reducer = PCA(n_components=ndim)
        # Project the tf-idf feature vectors onto the principal components.
        self.tf_idf_pca = reducer.fit_transform(self.tf_idf_matrix)
        # Save the reduced tf-idf matrix.
        self.dump(self.tf_idf_pca, pca_path)
        return
    
    def sim_cal(self, loadcsim=1, load=1):
        """Obtain the three similarity tables and rank the neighbours.

        Args:
            loadcsim: Truthy to read the content (review-text) similarity from
                ``csim_path``; falsy to recompute it from tf-idf + PCA features.
            load: Truthy to read the user and item similarities from
                ``usim_path`` / ``isim_path``; falsy to recompute them from the
                rating matrix.

        Returns:
            self, so that calls can be chained.
        """
        if load:
            self.load_sim(self.usim, usim_path)
            self.load_sim(self.isim, isim_path)
        else:
            self.cos_sim_cal(self.matrix, self.usim, usim_path,
                             self.user_active, self.items_per_user)
            self.cos_sim_cal(np.transpose(self.matrix), self.isim, isim_path,
                             self.item_active, self.users_per_item)

        if loadcsim:
            self.load_sim(self.csim, csim_path)
        else:
            self.tf_idf()
            self.pca(2000)
            print("pca done")
            # The activity argument was missing here, which made this branch
            # fail with a TypeError. No interaction sets are passed, so no
            # filtering takes place and the counts are not consulted.
            self.cos_sim_cal(self.tf_idf_pca, self.csim, csim_path, self.item_active)
        # Rank every row's neighbours by similarity.
        self.neighbor_sort(self.usim, self.uneighbor, uneighbor_path)
        self.neighbor_sort(self.isim, self.ineighbor, ineighbor_path)
        self.neighbor_sort(self.csim, self.cneighbor, cneighbor_path)
        return self
    
    # Rank each row's neighbours from most to least similar.
    def neighbor_sort(self, src, dst, to_path):
        for i in range (len(dst)):
            dst[i] = src[i].argsort()[::-1]
        self.dump(dst, to_path)
        return self
    
    def predict_1rating(self, unum, inum):
        '''
        根据共现矩阵，计算一次rating
        '''
        utotal, uweigh, itotal, iweigh, pred_ui = 0, 0, 0, 0, 0
        usim, isim = [], []
        # ucf ###########
        cnt = 0
        for u in range(len(self.user_dict)):    # 遍历用户邻居表
            user = int(self.uneighbor[unum][u]) # 当前考察的邻居
            if ( user == unum ):                # 跳过自己
                continue
            # 调整阈值 ##############################################
            if ( self.usim[user][unum] <= 0.2 or cnt == 5 ): # 相似用户
                break
            #########################################################
            r_ui = self.matrix[user][inum]      # 用户邻居user对inum的打分
            if ( r_ui > 0 ):                    # 打过分
                r_u = self.user_ave_score[user] # 该邻居打分均值
                utotal += self.usim[unum][user] * ( r_ui - r_u )
                uweigh += self.usim[unum][user]
                usim.append(self.usim[unum][user])
                cnt += 1
        if ( abs(uweigh) < 1e-10 ): # 没有相似用户评价过inum
            ucfpred = self.theta*(self.gama*self.user_ave_score[unum]+(1-self.gama)*self.item_ave_score[inum])+(1-self.theta)*self.all_user_ave
        else:
            ucfpred = self.user_ave_score[unum] + utotal / uweigh
        if ( ucfpred < 1 ):
            ucfpred = 1
        elif ( ucfpred > 5 ):
            ucfpred = 5
        # icf ###########
        cnt = 0
        for i in range(len(self.item_dict)):    # 遍历物品邻居表
            item = int(self.ineighbor[inum][i]) # 当前考察的邻居
            if ( item == inum ):                # 跳过自己
                continue
            # 调整阈值 ##############################################
            if ( self.isim[item][inum] <= 0.2 or cnt == 5 ): # 相似物品
                break
            #########################################################
            r_ui = self.matrix[unum][item]      # u对inum的邻居item的打分
            if ( r_ui > 0 ):                    # 打过分
                itotal += self.isim[item][inum] * r_ui
                iweigh += self.isim[item][inum]
                isim.append(self.isim[item][inum])
                cnt += 1
        if ( abs(iweigh) < 1e-10 ): # unum没有交互过相似物品
            icfpred = self.theta*(self.gama*self.user_ave_score[unum]+(1-self.gama)*self.item_ave_score[inum])+(1-self.theta)*self.all_item_ave
        else:
            icfpred = self.item_ave_score[inum] + itotal / iweigh
        if ( icfpred < 1 ):
            icfpred = 1
        elif ( icfpred > 5 ):
            icfpred = 5
        # content-based ###########
        for i in range(len(self.item_dict)):    # 遍历物品邻居表
            item = int(self.cneighbor[inum][i]) # 当前考察的邻居
            if ( item == inum ):                # 跳过自己
                continue
            # 调整阈值 ##############################################
            if ( self.csim[item][inum] <= 0.2 ): # 相似物品
                break
            #########################################################
            r_ui = self.matrix[unum][item]      # u对inum的邻居item的打分
            if ( r_ui > 0 ):                    # 打过分
                itotal += self.csim[item][inum] * r_ui
                iweigh += self.csim[item][inum]
                isim.append(self.csim[item][inum])
        if ( abs(iweigh) < 1e-10 ): # unum没有交互过相似物品
            cbpred = self.all_user_ave
        else:
            cbpred = itotal / iweigh
        if ( cbpred < 1 ):
            cbpred = 1
        elif ( cbpred > 5 ):
            cbpred = 5
        # 综合 ######################
        pred_ui = self.beta * ( self.alpha * ucfpred + ( 1 - self.alpha ) * icfpred ) + ( 1 - self.beta ) * cbpred
        #print(unum, inum, usim, isim, pred_ui)
        self.able += 1

        # bounding
        if ( pred_ui < 1 ):
            pred_ui = self.all_user_ave
        if ( pred_ui > 5 ):
            pred_ui = 5
        return np.squeeze(pred_ui)

    def predict_RMSE(self, path, to_path):
        """Predict every rating of a validation file and report the RMSE.

        Resets and updates ``self.able`` (number of predictions) and
        ``self.accurate`` (predictions exactly equal to the true rating),
        prints the three figures, writes ``true, predicted`` pairs to
        ``to_path`` and draws a scatter plot of both series.

        Args:
            path: Validation CSV; rows are read as ``reviewerID, itemID,
                rating, ...`` and the header row is skipped.
            to_path: CSV file receiving one ``[true, predicted]`` row per
                rating. This argument used to be ignored in favour of a
                module-level path.

        Returns:
            The root-mean-square error (0 when the file has no data rows).
        """
        predicted_list, gt_rate_list = [], []
        self.able = 0
        self.accurate = 0
        # 1. Read the validation ratings and predict each of them.
        with open(path, 'r') as fh:
            for row in csv.reader(fh):
                if row[0] == 'reviewerID':
                    continue
                unum, inum = self.user_dict[row[0]], self.item_dict[row[1]]
                # float() replaces the former eval(): file content must never
                # be executed as code.
                gt_rate = float(row[2])
                predicted_score = self.predict_1rating(unum, inum)
                predicted_list.append(predicted_score)
                gt_rate_list.append(gt_rate)
                if predicted_score == gt_rate:
                    self.accurate += 1
        # 2. Compute the RMSE.
        data_len = len(predicted_list)
        RMSE = sum([((p - g) ** 2) / data_len
                    for p, g in zip(predicted_list, gt_rate_list)]) ** 0.5
        print('RMSE',RMSE)
        print('able',self.able)
        print('accurate',self.accurate)

        # Export the validation results.
        with open(to_path, 'w', newline='') as out:
            csv.writer(out).writerows(zip(gt_rate_list, predicted_list))

        # Visualise true against predicted ratings.
        xs = list(range(data_len))
        plt.style.use('ggplot')
        plt.figure(figsize=(15, 4))
        plt.scatter(xs, gt_rate_list, alpha=0.7, label='rate')
        plt.scatter(xs, predicted_list, alpha=0.7, label='predicted rate')
        plt.legend(loc=[1, 1], fontsize=10)
        plt.title('Prediction results on the validation set (RMSE: %.5f)'%RMSE)
        plt.xlabel('index')
        plt.ylabel('score')
        return RMSE
    
    def save_pred(self, path, to_path):
        f = csv.reader(open(path,'r'))
        writer = csv.writer(open(to_path,'w', newline=''))
        writer.writerow(['idx','overall'])
        cnt = 0
        self.able = 0
        for i in f:
            if i[0] == 'reviewerID':
                continue
            unum,inum = self.user_dict[i[0]],self.item_dict[i[1]]
            predicted_score = self.predict_1rating(unum, inum)
            writer.writerow([cnt,predicted_score])
            cnt += 1
        print('able',self.able)
        return self

In [None]:
# Load the training set (optionally carving a validation fold out of it).
model = CF().load_train_data(train_path)
# model.split_valid_data(train_path, valid_path, 0.1, 1)
model.init()

# Per-user and per-item mean ratings.
model.user_ave_cal(user_ave_path).item_ave_cal(item_ave_path)

In [None]:
# load:     whether to read the user and item similarities from disk
# loadcsim: whether to read the review-text similarity from disk
model.sim_cal(load=1, loadcsim=1)

In [None]:
# Blending weights; adjust according to the validation-set tuning results.
model.alpha = 0.5
model.beta = 0.9
model.gama = 0.5
model.theta = 0.7
# Save the predictions; the file name encodes the four weights.
model.save_pred(
    test_path,
    saved_path % (model.alpha, model.beta, model.gama, model.theta),
)

In [None]:
# Plot the saved predictions.
sub_path = 'C:/Users/JyoXu/RS/Assign-2-submission/'

# Base names (without extension) of the prediction files to plot.
score = ['res_a%.1f b%.1f c%.1f d%.1f'%(model.alpha,model.beta,model.gama,model.theta)]

plt.style.use('ggplot')
plt.figure(figsize=(15, 4))
for name in score:
    # Grow the series with the file instead of using fixed 1232-element
    # buffers, which overflowed on longer files and added spurious (0, 0)
    # points on shorter ones.
    idx, pre = [], []
    with open(sub_path + name + '.csv', 'r') as fh:
        for k in csv.reader(fh):
            if k[0] == 'idx':  # header row
                continue
            # float() instead of eval(): file content must never be executed.
            idx.append(float(k[0]))
            pre.append(float(k[1]))
    plt.scatter(idx, pre, alpha=0.8, label=name)

plt.legend(loc=[1, 1], fontsize=10)
plt.title('Prediction results')
plt.xlabel('index')
plt.ylabel('score')
plt.yticks((1,1.5,2,2.5,3,3.5,4,4.5,5))
plt.show()