In [1]:
# 这些模块和包都是在逐步的探索中所需要的，然后全部汇总到这里，
#    并不是一开始就知道了 ^_^ ^_^ ^_^
# 1、导入模块和包
import pandas as pd    # 加载并处理csv文件
import datetime        # 利用datetime处理时间戳
import _pickle as cPickle    # 数据以二进制进行高效的储存到文件
from collections import defaultdict     # 利用Python设置稀疏矩阵的NULL位置的默认值
import scipy.sparse as ss     # 利用scipy构建稀疏矩阵
import scipy.io as sio    # 利用scipy储存评分矩阵
import numpy as np    # 利用numpy创建指定长度或形状的矩阵以及矩阵运算
from numpy.random import random    # numpy.random中的random函数生成[0, 1)区间上均匀分布的随机数
import time    # 利用Python内置模块，计算训练时迭代的时间
import json    # 将模型参数保存为json文件，加载模型参数json文件
import scipy    # 将储存加载的稀疏评分矩阵转换为numpy矩阵形式

In [2]:
# 2. Load the dataset

# Directory where the dataset files are stored.
data_path = "./dataset/ml-25m/"

# Read ratings.csv with pandas. The full file has more than 25 million rows,
# so only the first 10000 rows are loaded here.
csv_file = pd.read_csv(data_path + "ratings.csv", sep=",", nrows=10000)

# Convert the raw epoch timestamps into datetime values.
# datetime.fromtimestamp interprets them in the machine's local time zone.
csv_file = csv_file.assign(
    timestamp=csv_file["timestamp"].map(datetime.datetime.fromtimestamp)
)
csv_file.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,2006-05-17 23:34:04
1,1,306,3.5,2006-05-17 20:26:57
2,1,307,5.0,2006-05-17 20:27:08
3,1,665,5.0,2006-05-17 23:13:40
4,1,899,3.5,2006-05-17 20:21:50


In [3]:
# Inspect the dimensions of the loaded frame as (rows, columns).
csv_file.shape

(10000, 4)

In [4]:
# Remove the timestamp column from the ratings frame.
# errors="ignore" keeps this cell idempotent: running it again after the
# column is already gone is a no-op instead of raising KeyError.
csv_file = csv_file.drop(columns=['timestamp'], errors="ignore")

# Inspect the dimensions (rows, columns) after the drop.
csv_file.shape

(10000, 3)

In [5]:
# Preview the last rows of the frame.
csv_file.tail()

Unnamed: 0,userId,movieId,rating
9995,75,736,4.0
9996,75,778,3.0
9997,75,783,3.0
9998,75,805,3.5
9999,75,832,3.0


In [6]:
# 数据集划分训练集和测试集

# shuffle = True ：True 为先打乱数据集再进行划分，False 为不打乱数据集直接划分
def random_split(df, ratios, shuffle=True, seed=42):
    """Split a pandas DataFrame into consecutive row partitions.

    The rows are optionally shuffled and then cut at the rounded cumulative
    ratio boundaries. Every returned partition is an independent copy that
    carries an extra ``split_index`` column holding its position in the
    returned list.

    Params:
        df (pd.DataFrame): Pandas data frame to be split. It is not modified.
        ratios (list of floats): List of ratios for the split, expected to
            sum to 1 (e.g. [0.7, 0.2, 0.1]). Only the cumulative sums of all
            but the last ratio are used as cut points, so the last partition
            always receives whatever rows remain.
        shuffle (bool): When true, shuffle the rows before splitting;
            otherwise keep the original row order.
        seed (int or None): Random state passed to ``DataFrame.sample`` for
            the shuffle. The default of 42 makes repeated calls return the
            same partitions; pass None to get a different shuffle each call.

    Returns:
        list: List of pd.DataFrame, one per ratio, in the order of ``ratios``.
    """
    if shuffle:
        # Seed the shuffle so the split is reproducible across runs.
        df = df.sample(frac=1, random_state=seed)
    samples = df.shape[0]  # Number of samples

    # Converts [0.7, 0.2, 0.1] to [0.7, 0.9]
    split_ratio = np.cumsum(ratios).tolist()[:-1]

    # Get the rounded integer split index
    split_index = [round(x * samples) for x in split_ratio]

    # Slice positionally instead of calling np.split on the DataFrame: np.split
    # routes through DataFrame.swapaxes, which pandas has deprecated. The cut
    # points are the same ones np.split would use. Copying each slice lets the
    # column assignment below work on an independent frame (no
    # SettingWithCopyWarning, no link back to the caller's data).
    bounds = [0] + split_index + [samples]
    splits = [
        df.iloc[start:stop].copy()
        for start, stop in zip(bounds[:-1], bounds[1:])
    ]

    # Add split index (this makes splitting by group more efficient).
    for i in range(len(ratios)):
        splits[i]["split_index"] = i

    return splits

# Split the ratings into an 80% training frame and a 20% test frame.
train, test = random_split(csv_file, ratios=[0.8, 0.2])

# Save both partitions under data_path as comma-separated files using
# DataFrame.to_csv (with its defaults the row index is written as well).
for file_name, frame in (("train.csv", train), ("test.csv", test)):
    frame.to_csv(data_path + file_name)

# Report the (rows, columns) of the test frame, then of the training frame,
# and preview the last rows of the training frame.
for frame in (test, train):
    print(frame.shape)
train.tail()

(2000, 4)
(8000, 4)


Unnamed: 0,userId,movieId,rating,split_index
8272,62,51084,3.5,0
1254,6,260,5.0,0
479,3,6537,3.5,0
2730,13,48516,5.0,0
7667,59,5146,3.5,0


In [7]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,userId,movieId,rating,split_index
3234,19,1029,2.0,1
9514,72,2080,5.0,1
4904,31,33836,1.5,1
2835,13,78266,1.0,1
1395,8,1022,4.0,1
