##训练集分析

In [1]:
#导入必要的函数库
import numpy
import random
import pandas as pd
import math
import csv
from collections import defaultdict

In [2]:
#读取测试集数据
filename = "train.csv"
f = open(filename, "rt", encoding="utf-8")
header = f.readline()
header = header.strip().split(',')#列名
print(header)

['user_id', 'business_id', 'date', 'stars']


In [6]:
dataset = []

In [7]:
#将测试集数据放入dataset
for line in f:
    fields = line.strip().split(',')
    d = dict(zip(header, fields))
    d["stars"] = float(d["stars"])
    dataset.append(d)

In [8]:
UserPerItem = defaultdict(set)
ItemPerUser = defaultdict(set)

In [9]:
itemNames = {}

In [10]:
for d in dataset:
    user,item = d['user_id'], d['business_id']
    UserPerItem[item].add(user)
    ItemPerUser[user].add(item)

In [11]:
#余弦相似度
def cos(s1,s2):
    demon = 0.0
    number = len(s1.intersection(s2))#并集
    l1 = len(s1) #s1集合中元素个数
    l2 = len(s2) #s2集合中元素个数
    demon += math.sqrt(l1 * l2)
    if demon == 0:
        return 0
    return number / demon

In [12]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

In [13]:
for d in dataset:
    user,item = d['user_id'], d['business_id']
    reviewsPerItem[item].append(d)
    reviewsPerUser[user].append(d)

In [16]:
ratingMean = sum([d['stars'] for d in dataset]) / len(dataset)

In [13]:
ratingMean

3.9070852244074636

In [14]:
def prdictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['business_id']
        if i2 == item:continue
        ratings.append(d['stars'])
        similarities.append(cos(UserPerItem[item],UserPerItem[i2]))
    if(sum(similarities)>0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        return ratingMean

In [15]:
#准确率
s = len(dataset)
count = 0
for d in dataset:
    user,item,star = d['user_id'], d['business_id'], d['stars']
    star = float(star)
    p = round(prdictRating(user,item))
    if p - star <= 0.5:
        count += 1
print(count/s)

0.7294503277861826


In [19]:
def MSE(predictions,labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [20]:
alwaysPredictMean = [ratingMean for d in dataset]

In [21]:
cfPredicitions = [prdictRating(d['user_id'], d['business_id']) for d in dataset]

In [22]:
labels = [d['stars'] for d in dataset]

In [23]:
MSE(alwaysPredictMean, labels)

1.0033436473005906

In [24]:
MSE(cfPredicitions, labels)

1.0792256987993392

##读入测试集数据并预测

In [25]:
#读取测试集数据
filename2 = "test.csv"
f2 = open(filename2, "rt", encoding="utf-8")
header2 = f2.readline()
header2 = header2.strip().split(',')#列名
print(header2)
header2.append('pre_stars')
print(header2)

['', 'user_id', 'business_id', 'date', 'pre_stars']
['', 'user_id', 'business_id', 'date', 'pre_stars', 'pre_stars']


In [26]:
predata = []

In [27]:
for line in f2:
    fields = line.strip().split(',')
    d = dict(zip(header2, fields))
    d["pre_stars"] = 0
    predata.append(d)

In [28]:
for d in predata:
    u, i = d['user_id'], d['business_id']
    s = round(prdictRating(u,i))
    d['pre_stars'] = s

In [29]:
df = pd.DataFrame(predata)
df.to_csv(filename2)

##改进，即考虑商品热度进行衰减

In [14]:
def prdictRating(user,item):
    c = len(UserPerItem[item])
    c = 1 / (1 + math.log(c,10))
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['business_id']
        if i2 == item:continue
        ratings.append(d['stars'])
        similarities.append(cos(UserPerItem[item],UserPerItem[i2]) * c)
    if(sum(similarities)>0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        return ratingMean

In [17]:
#读取测试集数据
filename2 = "test2.csv"
f2 = open(filename2, "rt", encoding="utf-8")
header2 = f2.readline()
header2 = header2.strip().split(',')#列名
#print(header2)
header2.append('pre_stars')
#print(header2)

predata = []

for line in f2:
    fields = line.strip().split(',')
    d = dict(zip(header2, fields))
    d["pre_stars"] = 0
    predata.append(d)

for d in predata:
    u, i = d['user_id'], d['business_id']
    s = round(prdictRating(u,i))
    d['pre_stars'] = s

df = pd.DataFrame(predata)
df.to_csv(filename2)

['user_id', 'business_id', 'date']
['user_id', 'business_id', 'date', 'pre_stars']


# 对比test.csv 与 test2.csv

In [30]:
filename_1 = "test.csv"
filename_2 = "test2.csv"

file1 = open(filename_1, "rt", encoding="utf-8")
file2 = open(filename_2, "rt", encoding="utf-8")

headers1 = file1.readline()
headers1 = headers1.strip().split(',')#列名
headers2 = file2.readline()
headers2 = headers2.strip().split(',')#列名

data1 = []
data2 = []

for line in file1:
    fields = line.strip().split(',')
    d = dict(zip(headers1, fields))
    data1.append(d)

for line in file2:
    fields = line.strip().split(',')
    d = dict(zip(headers2, fields))
    data2.append(d)

s = len(data1)
count = 0
for i in range(len(data1)):
    s1 = float(data1[i]['pre_stars'])
    s2 = float(data2[i]['pre_stars'])
    if(s1 == s2):
        count += 1
print(count / s)

0.9989806320081549
