In [1]:
# =================================================
#
# 论题：基于矩阵分解的推荐算法研究
#    1、研究环境：Anaconda3 + Python3.7 + IPython7.12 
#                + JupyterLab + Jupyter Notebook
#    2、数据集：MovieLens(ml-25m)，
#              Kaggle(amazon-ratings)，
#              Book-Crossing Dataset(BX-CSV-Dump)
#    3、研究算法：Funk SVD(LFM)、Bias SVD
#    4、求解方法：随机梯度下降法 SGD
#    5、目标函数：平方损失函数
#    6、评测指标：RMSE、MAE
#
# 思路：数据集探索和模型探索
#    1、初步探索数据集
#    2、评分数据处理，划分数据集和得出稀疏评分矩阵
#    3、LFM 模型探索与模型参数调优
#    4、Bias SVD 模型探索与模型参数调优
#
# 备注：本代码只适用于 MovieLens 数据集，
#       其他两个数据集研究思路和方法一致，
#       只不过是加载不同的数据集而已
# =================================================

In [1]:
# 模块和包都是在逐步的探索中所需要的，全部汇总到这里，
#    并不是一开始就知道了 ^_^ ^_^ ^_^ 
# 不熟悉的模块和包，强烈建议查看官方文档说明以及例子
# 1、导入模块和包
import pandas as pd    # 加载并处理csv文件
import datetime        # 利用datetime处理时间戳
# cPickle 数据以二进制进行高效的储存到文件
import _pickle as cPickle 
# defaultdict 设置稀疏矩阵的 NULL 位置的默认值
from collections import defaultdict 
# 利用scipy sparse 构建稀疏矩阵
import scipy.sparse as ss     
import scipy.io as sio    # 利用scipy储存评分矩阵
# 利用numpy创建指定长度或形状的矩阵以及矩阵运算
import numpy as np 
# numpy.random中的random函数生成[0, 1)区间内均匀分布的随机数据
from numpy.random import random    
import time    # 计算训练时迭代的时间
import json    # 将模型参数保存和加载 json文件
import scipy    # 将储存加载的稀疏评分矩阵转换为numpy矩阵

In [3]:
# 2. Load the dataset
# Directory that holds the data files
data_path = "./../dataset/ml-25m/"
# Read ratings.csv with pandas (the shape shown further down reports ~25 million rows)
csv_file = pd.read_csv(f"{data_path}ratings.csv", sep=",")

# Turn every raw timestamp value into a datetime object.
# NOTE(review): datetime.fromtimestamp interprets the value in the machine's
# local time zone, so the displayed times depend on where the notebook runs.
csv_file["timestamp"] = csv_file["timestamp"].apply(datetime.datetime.fromtimestamp)
csv_file.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,2006-05-17 23:34:04
1,1,306,3.5,2006-05-17 20:26:57
2,1,307,5.0,2006-05-17 20:27:08
3,1,665,5.0,2006-05-17 23:13:40
4,1,899,3.5,2006-05-17 20:21:50


In [4]:
# Inspect the dimensions of the data as (rows, columns)
csv_file.shape

(25000095, 4)

In [5]:
# Remove the timestamp column from the ratings table
csv_file = csv_file.drop(columns=["timestamp"])
# Inspect the dimensions of the data as (rows, columns)
csv_file.shape

(25000095, 3)

In [6]:
# Display the last rows of the ratings table
csv_file.tail()

Unnamed: 0,userId,movieId,rating
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0
25000094,162541,63876,5.0


In [7]:
# Split the dataset into a training set and a test set
# =================================================
# shuffle=True shuffles the rows before splitting; shuffle=False keeps the original row order
def random_split(df, ratios, shuffle=True, seed=42):
    """Split a DataFrame row-wise into consecutive parts of the given sizes.

    Params:
        df (pd.DataFrame): Pandas data frame to be split. It is not modified.
        ratios (list of floats): fraction of the rows assigned to each part.
            The ratios have to sum to 1.
        shuffle (bool): when truthy, the rows are shuffled before splitting.
        seed (int): random seed passed to the shuffle so that the same
            input always yields the same split.

    Returns:
        list: one pd.DataFrame per entry in ``ratios``, in the same order.
            Each part carries an extra ``split_index`` column holding its
            position in the list.

    Raises:
        ValueError: if ``ratios`` is empty or does not sum to 1.
    """
    ratios = list(ratios)
    if not ratios:
        raise ValueError("ratios must contain at least one value")
    if not np.isclose(sum(ratios), 1.0):
        raise ValueError("ratios must sum to 1, got %r" % (ratios,))

    if shuffle:
        # Seed the shuffle; otherwise every run produces a different split.
        df = df.sample(frac=1, random_state=seed)
    samples = df.shape[0]      # Number of samples

    # Converts [0.7, 0.2, 0.1] to [0.7, 0.9]
    split_ratio = np.cumsum(ratios).tolist()[:-1]

    # Rounded integer row positions at which a new part starts
    split_index = [round(x * samples) for x in split_ratio]
    bounds = [0] + split_index + [samples]

    splits = []
    for i, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:])):
        # Copy the slice so that adding a column neither touches the input
        # frame nor triggers pandas' SettingWithCopy warning.
        part = df.iloc[start:stop].copy()
        # Add split index (this makes splitting by group more efficient).
        part["split_index"] = i
        splits.append(part)

    return splits

# Split the ratings 80% / 20% into a training set and a test set
# (a three-way variant would unpack as: train, validation, test = random_split(...))
train, test = random_split(df=csv_file, ratios=[0.8, 0.2])

# Persist both subsets with the DataFrame.to_csv method
train.to_csv(f"{data_path}train.csv")
test.to_csv(f"{data_path}test.csv")

# Report the shape of the test set first, then of the training set
print(test.shape, train.shape, sep="\n")
train.tail()

(5000019, 4)
(20000076, 4)


Unnamed: 0,userId,movieId,rating,split_index
14321319,92779,6157,2.5,0
16551821,107398,2018,1.0,0
20372395,132473,2709,4.5,0
6931126,44967,86298,4.0,0
4852327,31661,134853,4.5,0


In [8]:
# Display the first rows of the test split
test.head()

Unnamed: 0,userId,movieId,rating,split_index
2525823,16829,7371,4.0,1
4128727,27171,43679,2.5,1
908682,6114,4973,4.0,1
5250989,34118,8798,4.5,1
9589939,62261,3160,1.0,1
