In [1]:
# 这些模块和包都是在逐步的探索中所需要的，然后全部汇总到这里，
#    并不是一开始就知道了 ^_^ ^_^ ^_^
# 1、导入模块和包
import pandas as pd    # 加载并处理csv文件
import datetime        # 利用datetime处理时间戳
import _pickle as cPickle    # 数据以二进制进行高效的储存到文件
from collections import defaultdict     # 利用Python设置稀疏矩阵的NULL位置的默认值
import scipy.sparse as ss     # 利用scipy构建稀疏矩阵
import scipy.io as sio    # 利用scipy储存评分矩阵
import numpy as np    # 利用numpy创建指定长度或形状的矩阵以及矩阵运算
from numpy.random import random    # numpy.random.random draws uniformly distributed samples from [0.0, 1.0) (not normally distributed; that would be randn)
import time    # 利用Python内置模块，计算训练时迭代的时间
import json    # 将模型参数保存为json文件，加载模型参数json文件
import scipy    # 将储存加载的稀疏评分矩阵转换为numpy矩阵形式

In [8]:
# 2. Load the dataset

# Directory that holds the ratings data
data_path = "./../dataset/amazon-ratings/"

# Read only the first 2000 rows of the ratings CSV with pandas
ratings_csv = data_path + "ratings_Beauty.csv"
csv_file = pd.read_csv(ratings_csv, sep=",", nrows=2000)

# Turn every Unix timestamp into a datetime object
# (datetime.fromtimestamp yields local time, so the result depends on the machine's timezone)
from_epoch = datetime.datetime.fromtimestamp
csv_file["Timestamp"] = csv_file["Timestamp"].map(from_epoch)
csv_file.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,2013-05-28 08:00:00
1,A3JM6GV9MNOF9X,558925278,3.0,2012-12-14 08:00:00
2,A1Z513UWSAAO0F,558925278,5.0,2014-07-07 08:00:00
3,A1WMRR494NWEWV,733001998,4.0,2013-10-24 08:00:00
4,A3IAAVS479H7M7,737104473,1.0,2010-05-19 08:00:00


In [9]:
# Inspect the frame's dimensions as (rows, columns)
csv_file.shape

(2000, 4)

In [10]:
# Remove the Timestamp column.
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to csv_file, re-running the cell would otherwise raise a KeyError once
# the column has already been dropped.
csv_file = csv_file.drop(columns=["Timestamp"], errors="ignore")

# Inspect the frame's dimensions as (rows, columns)
csv_file.shape

(2000, 3)

In [11]:
# Preview the last rows of the ratings frame
csv_file.tail()

Unnamed: 0,UserId,ProductId,Rating
1995,AFZZT3IKY35IW,9790798326,5.0
1996,A2PH5VIDM6F7W5,9790798326,1.0
1997,ADFW3FGCFI7C7,9790798326,5.0
1998,A86GTS35R5YGG,9790798326,5.0
1999,A27ZTNAOOODGSZ,9790798393,5.0


In [12]:
# Split the dataset into a training set and a test set

# shuffle=True shuffles the rows before splitting; shuffle=False splits them in their original order
def random_split(df, ratios, shuffle=True, seed=42):
    """Split a pandas DataFrame into consecutive parts (e.g. train/validation/test).

    Params:
        df (pd.DataFrame): Data frame to be split. It is never modified.
        ratios (list of floats): Ratios for the split, e.g. [0.8, 0.2].
            The ratios have to sum to 1.
        shuffle (bool): When true, the rows are shuffled before splitting;
            when false, the rows are split in their existing order.
        seed (int or None): Random state used for the shuffle, so repeated
            calls give the same split. Pass None for a different shuffle on
            every call.

    Returns:
        list: One new pd.DataFrame per entry in ``ratios``, each carrying an
        extra ``split_index`` column holding the position of that part.
    """
    if shuffle:
        # Pass the seed through; without it the shuffle is not reproducible.
        df = df.sample(frac=1, random_state=seed)
    samples = df.shape[0]

    # Cumulative ratios without the final 1.0, e.g. [0.7, 0.2, 0.1] -> [0.7, 0.9]
    split_ratio = np.cumsum(ratios).tolist()[:-1]

    # Row positions at which one part ends and the next begins
    split_index = [round(x * samples) for x in split_ratio]
    bounds = [0] + split_index + [samples]

    # Slice by position and attach the part number. assign() returns a new
    # frame, so the caller's data is untouched and no SettingWithCopyWarning
    # is raised (tagging the parts makes later splitting by group cheaper).
    splits = [
        df.iloc[start:stop].assign(split_index=i)
        for i, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:]))
    ]

    return splits

# Split the ratings into 80% training data and 20% test data
train, test = random_split(csv_file, [0.8, 0.2])

# Persist both splits as comma-separated files via DataFrame.to_csv
train_csv = data_path + "train.csv"
test_csv = data_path + "test.csv"
train.to_csv(train_csv)
test.to_csv(test_csv)

# Report the split sizes (test first, then train) and preview the end of the training set
for split_frame in (test, train):
    print(split_frame.shape)
train.tail()

(400, 4)
(1600, 4)


Unnamed: 0,UserId,ProductId,Rating,split_index
1830,A1IYOZVSPKNPFK,9790792557,5.0,0
1370,ACRO15EXJ3BL8,979078127X,5.0,0
792,A2CPYXQY7MN36Q,9788072216,5.0,0
605,A1O2WUVJ9OKUTO,9759091062,5.0,0
1182,A26THXR69AC6VO,9790776381,1.0,0


In [13]:
# Preview the first rows of the test split
test.head()

Unnamed: 0,UserId,ProductId,Rating,split_index
1971,A3DCX3T7X7TXUL,9790798075,5.0,1
940,A12712009FURQ7,9788073883,5.0,1
1443,A3MGHPFOC5P3EM,9790782365,5.0,1
445,ATZVCICY66PXZ,9571044822,5.0,1
1333,A2KKJXOD3PWGIA,9790780001,5.0,1
