# Netflix评分预测

* Content：对Netflix数据集进行评分预测
* Author:  HuiHui
* Date:    2020-05-23
* Reference:https://www.kaggle.com/netflix-inc/netflix-prize-data/data

## Netflix数据集

* 该数据集收集了1998年10月至2005年12月之间Netflix用户所有的电影评分数据，涵盖了48万用户对17000部电影的评分记录（超过1亿条），评分范围1-5星
* 训练数据：包含了用户对17770部电影的评分  
&emsp;电影ID:  
&emsp;用户1ID,评分,日期  
&emsp;用户2ID,评分,日期  
注：电影编号从1到17770;用户ID的范围从1到2649429，有间隙，有480189个用户;评分是从1到5（整数）；日期的格式为YYYY-MM-DD。
* movie_titles.txt：  
&emsp;电影ID,发行年份,片名  
* qualifying.txt：  
&emsp;电影ID1：  
&emsp;客户1ID,日期1  
&emsp;客户2ID,日期2  
&emsp;电影ID2:  
&emsp;客户1ID,日期1  
注：您的程序必须根据训练数据集中的信息预测用户在qualifying数据集中为电影提供的所有评分  
* probe.txt：  
&emsp;电影ID1：  
&emsp;客户1ID  
&emsp;客户2ID  
&emsp;电影ID2:  
&emsp;客户1ID   
注：与qualifying数据集不同，其每对的评分和日期都包含在训练数据集中；可针对probe探测集计算RMSE，并与Cinematch的RMSE进行比较
* 提交文件：  
如果qualifying数据集中，  
&emsp;电影ID1：  
&emsp;客户1ID,日期1  
&emsp;客户2ID,日期2  
&emsp;电影ID2:  
&emsp;客户1ID,日期1  
则预测文件中对应，  
&emsp;电影ID1：  
&emsp;评分1  
&emsp;评分2  
&emsp;电影ID2:  
&emsp;评分1  
注：提交的预测文件的格式应遵循qualifying数据集中电影ID、用户ID、日期的顺序

## 提交备注

* 说明使用的数据集大小（如果对数据集进行切分）
* probe上的RMSE，也可以针对probe子集进行计算
* 说明使用的算法

## 导入相关库

In [5]:
# !/usr/bin/env python
# -*- coding=utf-8 -*-
import pandas as pd
import numpy as np
import time, datetime
from surprise import SVD,SVDpp #surprise是推荐算法python实现库
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split

## 数据处理：txt-->csv

In [2]:
def get_train_csv(path):
    """Parse a Netflix Prize ``combined_data_*.txt`` file into a flat table.

    The input interleaves header lines of the form ``<movieID>:`` with rating
    lines of the form ``<userID>,<rating>,<YYYY-MM-DD>``; every rating line
    belongs to the most recent header above it.

    Args:
        path: Path (or any object accepted by ``pandas.read_csv``) of the
            raw text file.

    Returns:
        A ``pandas.DataFrame`` with columns ``userID`` (int64), ``movieID``
        (int64), ``rating`` (int64) and ``datetime`` (datetime64), one row
        per rating line, indexed from 0.

    Raises:
        ValueError: If a rating line appears before the first movie header,
            or if a field cannot be converted to its target type.
    """
    # Parse as ordinary comma-separated text. A header line has a single
    # field, so it lands in the first column with NaN in the other two.
    # This avoids sep="\n", which recent pandas releases reject.
    raw = pd.read_csv(
        path,
        header=None,
        names=["userID", "rating", "datetime"],
        dtype=str,
    )

    first_field = raw["userID"].str.strip()
    is_header = first_field.str.endswith(":", na=False)

    # Strip the trailing ":" from header lines and carry each movie ID
    # forward onto the rating lines that follow it.
    movie_id = first_field.where(is_header).str[:-1].ffill()

    is_rating = ~is_header
    if movie_id[is_rating].isna().any():
        raise ValueError("rating line found before the first 'movieID:' header")

    data = pd.DataFrame({
        "userID": first_field[is_rating].astype("int64"),
        "movieID": movie_id[is_rating].astype("int64"),
        "rating": raw.loc[is_rating, "rating"].astype("int64"),
        "datetime": pd.to_datetime(
            raw.loc[is_rating, "datetime"].str.strip(), format="%Y-%m-%d"
        ),
    })
    # Renumber rows from 0 now that the header lines are gone.
    return data.reset_index(drop=True)

def get_qualifying_csv(path):
    """Parse the Netflix Prize ``qualifying.txt`` file into a flat table.

    The input interleaves header lines of the form ``<movieID>:`` with lines
    of the form ``<userID>,<YYYY-MM-DD>``; every such line belongs to the
    most recent header above it.

    Args:
        path: Path (or any object accepted by ``pandas.read_csv``) of the
            raw text file.

    Returns:
        A ``pandas.DataFrame`` with columns ``userID`` (int64), ``movieID``
        (int64) and ``datetime`` (datetime64), one row per user line, in
        file order and indexed from 0.

    Raises:
        ValueError: If a user line appears before the first movie header,
            or if a field cannot be converted to its target type.
    """
    # Parse as ordinary comma-separated text. A header line has a single
    # field, so it lands in the first column with NaN in the second.
    # This avoids sep="\n", which recent pandas releases reject.
    raw = pd.read_csv(
        path,
        header=None,
        names=["userID", "datetime"],
        dtype=str,
    )

    first_field = raw["userID"].str.strip()
    is_header = first_field.str.endswith(":", na=False)

    # Strip the trailing ":" from header lines and carry each movie ID
    # forward onto the user lines that follow it.
    movie_id = first_field.where(is_header).str[:-1].ffill()

    is_entry = ~is_header
    if movie_id[is_entry].isna().any():
        raise ValueError("user line found before the first 'movieID:' header")

    data = pd.DataFrame({
        "userID": first_field[is_entry].astype("int64"),
        "movieID": movie_id[is_entry].astype("int64"),
        "datetime": pd.to_datetime(
            raw.loc[is_entry, "datetime"].str.strip(), format="%Y-%m-%d"
        ),
    })
    # Renumber rows from 0 now that the header lines are gone.
    return data.reset_index(drop=True)

def get_probe_csv(path):
    """Parse the Netflix Prize ``probe.txt`` file into a flat table.

    The input interleaves header lines of the form ``<movieID>:`` with lines
    holding a single ``<userID>``; every user line belongs to the most
    recent header above it.

    Args:
        path: Path (or any object accepted by ``pandas.read_csv``) of the
            raw text file.

    Returns:
        A ``pandas.DataFrame`` with columns ``userID`` (int64) and
        ``movieID`` (int64), one row per user line, in file order and
        indexed from 0.

    Raises:
        ValueError: If a user line appears before the first movie header,
            or if a field cannot be converted to an integer.
    """
    # Every line holds exactly one field, so a single named column is enough.
    # This avoids sep="\n", which recent pandas releases reject.
    raw = pd.read_csv(path, header=None, names=["userID"], dtype=str)

    first_field = raw["userID"].str.strip()
    is_header = first_field.str.endswith(":", na=False)

    # Strip the trailing ":" from header lines and carry each movie ID
    # forward onto the user lines that follow it.
    movie_id = first_field.where(is_header).str[:-1].ffill()

    is_entry = ~is_header
    if movie_id[is_entry].isna().any():
        raise ValueError("user line found before the first 'movieID:' header")

    data = pd.DataFrame({
        "userID": first_field[is_entry].astype("int64"),
        "movieID": movie_id[is_entry].astype("int64"),
    })
    # Renumber rows from 0 now that the header lines are gone.
    return data.reset_index(drop=True)

# One-off conversion of the raw txt files to csv (kept commented out after the first run)
# combined_data_1_csv=get_train_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/combined_data_1.txt")
# combined_data_1_csv.to_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/combined_data_1.csv",index = False)  # omit the row-index column when saving
# combined_data_2_csv=get_train_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/combined_data_2.txt")
# combined_data_2_csv.to_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/combined_data_2.csv",index = False)
# combined_data_3_csv=get_train_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/combined_data_3.txt")
# combined_data_3_csv.to_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/combined_data_3.csv",index = False)
# combined_data_4_csv=get_train_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/combined_data_4.txt")
# combined_data_4_csv.to_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/combined_data_4.csv",index = False)

# probe_csv=get_probe_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/probe.txt")
# probe_csv.to_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/probe.csv",index = False)

# qualifying_csv=get_qualifying_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/qualifying.txt")
# qualifying_csv.to_csv("/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data/qualifying.csv",index = False)

# Marker so the cell produces visible output when it finishes.
print("ok")

ok


In [16]:
# combined_data_1_csv=get_train_csv("combined_data_1.txt")
# combined_data_1_csv.to_csv("combined_data_1.csv",index = False)
# combined_data_2_csv=get_train_csv("combined_data_2.txt")
# combined_data_2_csv.to_csv("combined_data_2.csv",index = False)

# probe_csv=get_probe_csv("probe.txt")
# probe_csv.to_csv("probe.csv",index = False) # omit the row-index column when saving

# qualifying_csv=get_qualifying_csv("qualifying.txt")
# qualifying_csv.to_csv("qualifying.csv",index = False)

    userID  movieID  rating    datetime
0  1488844        1     3.0  2020-01-01
1    30878        1     0.0  2020-01-01
2  1227322        2     4.0  2020-01-01
3  1009622        3     2.0  2020-01-01


## 数据处理：在训练集中查找probe集的评分及日期

In [None]:
# Directory holding the converted Netflix Prize CSV files.
DATA_DIR = "/Users/wangdonghui/Desktop/ZGZ/RS/dataset/netflix-prize-data"

# Load the probe pairs. Only the key columns are kept: probe.csv is
# overwritten at the end of this cell with rating/datetime added, so a second
# run would otherwise merge those columns again as rating_x / rating_y.
probe_csv = pd.read_csv(DATA_DIR + "/probe.csv")[["userID", "movieID"]]
print(probe_csv.head())

# Load the four converted training files.
combined_data_1_csv = pd.read_csv(DATA_DIR + "/combined_data_1.csv")
combined_data_2_csv = pd.read_csv(DATA_DIR + "/combined_data_2.csv")
combined_data_3_csv = pd.read_csv(DATA_DIR + "/combined_data_3.csv")
combined_data_4_csv = pd.read_csv(DATA_DIR + "/combined_data_4.csv")

# Stack all training parts in ONE concat (chained concats would re-copy the
# growing frame each time); ignore_index=True rebuilds a 0..n-1 index.
train_data = pd.concat(
    [combined_data_1_csv, combined_data_2_csv, combined_data_3_csv, combined_data_4_csv],
    ignore_index=True,
)
print(train_data.head())
train_data.to_csv(DATA_DIR + "/train_data.csv", index=False)  # saved as train_data.csv

# Left-join probe onto the training ratings to recover each probe pair's
# rating and date.
# NOTE(review): train_data.csv still contains the probe pairs; a model trained
# on it has seen the probe ratings, so probe RMSE would be optimistic — confirm
# whether they should be removed before training.
probe_csv = pd.merge(probe_csv, train_data, on=['userID', 'movieID'], how='left')
print(probe_csv.head())

# Overwrite probe.csv with the enriched table.
probe_csv.to_csv(DATA_DIR + "/probe.csv", index=False)


## 数据集切分

In [13]:
# NOTE: author's marker — adjust this cell as needed, then run it.

# Load the merged training ratings.
train_data = pd.read_csv("train_data.csv")

# Draw a 1% random sample (frac=0.01) without replacement; the fixed
# random_state makes the draw repeatable. The index is renumbered from 0.
train_data_small = train_data.sample(
    frac=0.01,
    replace=False,
    random_state=1,
).reset_index(drop=True)
print(train_data_small.head(10))

# Persist the small sample without the row index.
train_data_small.to_csv("train_data_small.csv", index=False)

    userID  movieID  rating    datetime
0  1481961        3       0  2020-01-01
1    30878        1       0  2020-01-01
2  8221093        1       4  2020-01-01
3  1009622        3       2  2020-01-01
4  1488844        1       3  2020-01-01
5  1227322        2       4  2020-01-01


## 评分预测（使用SVD算法）

In [61]:
# Load the data. skip_lines=1 skips the CSV header row.
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)

# # Full data set
# train_data = Dataset.load_from_file('train_data.csv', reader=reader)
# Small sample
train_data = Dataset.load_from_file('train_data_small.csv', reader=reader)
probe_data = Dataset.load_from_file('probe.csv', reader=reader)

# Use every loaded rating for training (no split / folds).
trainset = train_data.build_full_trainset()
# Turn the probe ratings into a list of (user, item, true_rating) tuples.
probeset = probe_data.build_full_trainset().build_testset()
print(type(probeset[0]))

# FunkSVD: plain matrix factorisation without bias terms.
algo = SVD(biased=False)

# Train the model.
algo.fit(trainset)

# RMSE on the probe set.
# NOTE(review): the probe ratings were looked up from the training data, so if
# the training file still contains the probe pairs this RMSE is optimistic —
# confirm they are excluded before reporting it.
predictions = algo.test(probeset)
print(predictions[:5])  # preview only; the full list can hold millions of entries
accuracy.rmse(predictions, verbose=True)  # verbose=True prints the RMSE value

# Predict the qualifying set. The third tuple element is a dummy "true" rating
# that algo.test requires but which does not affect the estimate.
results = pd.read_csv("qualifying.csv")
testset = [
    (str(user_id), str(movie_id), 0)
    for user_id, movie_id in zip(results["userID"], results["movieID"])
]
print(testset[:5])  # preview only
results["rating"] = [pred.est for pred in algo.test(testset)]
results = results.reindex(columns=['userID', 'movieID', 'rating', 'datetime'])
print(results.head(10))

# Save the predictions as csv.
results.to_csv("results.csv", index=False)

<class 'tuple'>
[Prediction(uid='1488844', iid='1', r_ui=3.0, est=1, details={'was_impossible': False}), Prediction(uid='30878', iid='1', r_ui=0.0, est=1, details={'was_impossible': False}), Prediction(uid='1227322', iid='2', r_ui=4.0, est=1, details={'was_impossible': False}), Prediction(uid='1009622', iid='3', r_ui=2.0, est=1, details={'was_impossible': False})]
RMSE: 1.9365
[('1181550', '2', 0), ('1227322', '2', 0), ('885014', '2', 0), ('1009622', '3', 0), ('1481963', '3', 0)]
    userID  movieID    rating    datetime
0  1181550        2  2.166667  2020-01-01
1  1227322        2  1.000000  2020-01-01
2   885014        2  2.166667  2020-01-01
3  1009622        3  1.000000  2020-01-01
4  1481963        3  2.166667  2020-01-01


## 问题

* 若预测集中出现训练集中没有的新用户或新电影，矩阵分解(SVD)是如何给出预测评分的呢？