## import包

In [1]:
import pandas as pd
import numpy as np
import datetime
import gc
import os

## 处理后的数据包含:
1. 交互表(必须, user-item的交互记录, 包括训练集和测试集, 测试集中所有记录的时间都在训练集之后);
2. item表(可选).

## 读数据
数据地址：https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data

In [2]:
# Directory holding the downloaded dataset files; later cells build their file paths from it.
path = './netflix'
# List the directory contents to confirm which files are available.
os.listdir(path)

['combined_data_3.txt',
 'movie_titles.csv',
 'combined_data_4.txt',
 'combined_data_1.txt',
 'README',
 'probe.txt',
 'combined_data_2.txt',
 'qualifying.txt']

In [3]:
# Column names applied to the header-less rating files.
col_names = ['userId', 'rating', 'time']
# Load the four rating files with one parameterised read instead of four
# copy-pasted calls.
# `userId` is pinned to str: with no explicit dtype the parser infers the type
# chunk by chunk and may return a mix of ints and strings in this column, which
# would break the string handling applied to it in later cells.
ratings1, ratings2, ratings3, ratings4 = (
    pd.read_csv(f'{path}/combined_data_{part}.txt',
                header = None,
                names = col_names,
                dtype = {'userId': str})
    for part in range(1, 5)
)

## 数据处理
### 交互表
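原始数据的格式如下：每个movieId单独占一行（以冒号结尾），其后的各行都是对该movie评分的用户记录（用户ID、评分、日期），直到下一个movieId行为止。处理时先用movieId行向下填充出每条记录的movieId，再删除这些movieId行。另外，本notebook只保留评分为5的记录作为交互记录。

MovieID1:

CustomerID11,Rating11,Date11

CustomerID12,Rating12,Date12

…

MovieID2:

CustomerID21,Rating21,Date21

CustomerID22,Rating22,Date22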

In [4]:
# Stack the four parts in file order. ignore_index=True gives the result one
# continuous 0..n-1 index instead of repeating each part's own 0-based labels,
# so every row label is unique.
ratings = pd.concat([ratings1, ratings2, ratings3, ratings4], ignore_index=True)

In [5]:
# The per-file frames have already been combined into `ratings`; remove the
# names and run a garbage-collection pass so their memory can be reclaimed.
del ratings1, ratings2, ratings3, ratings4
gc.collect()

11

In [6]:
# Preview the first rows of the combined frame.
ratings.head(5)

Unnamed: 0,userId,rating,time
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26


In [7]:
# Shapes of two subsets: rows that carry a rating, and rows rated exactly 5.
(
    ratings[ratings['rating'].notna()].shape,
    ratings[ratings['rating'].eq(5)].shape,
)

((100480507, 3), (23168232, 3))

In [8]:
# Keep only rows whose rating is missing or equal to 5; every other rating is discarded.
ratings = ratings[ratings['rating'].isna() | ratings['rating'].eq(5)]

In [9]:
# The rating column is no longer needed after the filter; continue without it.
ratings = ratings.drop(columns='rating')

In [10]:
# Rows whose `userId` contains ':' are movie headers ("<movieId>:"). Take the id
# from each header row, forward-fill it onto the rows beneath it, then drop
# every row that still has a missing value.
# Building the column in one expression and rebinding `ratings` avoids the
# chained `ratings['movieId'].fillna(..., inplace=True)` pattern, which is
# deprecated and no longer writes back to the frame under pandas Copy-on-Write.
is_header = ratings['userId'].str.contains(':', regex=False, na=False)
ratings = ratings.assign(
    movieId=ratings['userId'].str.replace(':', '', regex=False).where(is_header).ffill()
).dropna()

In [11]:
# Preview the first rows of the interaction table built so far.
ratings.head(5)

Unnamed: 0,userId,time,movieId
2,822109,2005-05-13,1
12,2207774,2005-06-06,1
20,372233,2005-11-23,1
28,814701,2005-09-29,1
30,662870,2005-08-24,1


In [12]:
# Convert the `time` column to pandas datetimes so it can be compared against a cut-off.
ratings = ratings.assign(time=pd.to_datetime(ratings['time']))

In [13]:
# Earliest and latest timestamp in the full interaction table.
tuple(ratings['time'].agg(['min', 'max']))

(Timestamp('1999-11-11 00:00:00'), Timestamp('2005-12-31 00:00:00'))

In [14]:
# Cut-off timestamp for the chronological split.
data_used_time = datetime.datetime.strptime('2005-12-25 00:00:00', '%Y-%m-%d %H:%M:%S')

# Rows strictly before the cut-off go to train; rows at or after it go to test.
train = ratings[ratings['time'].lt(data_used_time)]
test = ratings[ratings['time'].ge(data_used_time)]

In [15]:
# Earliest and latest timestamp in the training split.
tuple(train['time'].agg(['min', 'max']))

(Timestamp('1999-11-11 00:00:00'), Timestamp('2005-12-24 00:00:00'))

In [16]:
# Earliest and latest timestamp in the test split.
tuple(test['time'].agg(['min', 'max']))

(Timestamp('2005-12-25 00:00:00'), Timestamp('2005-12-31 00:00:00'))

### item表

In [17]:
# Build the item table (movieId, releaseYear, title), all columns as strings.
# NOTE(review): in the Netflix Prize distribution movie_titles.csv is unquoted
# and some titles contain commas — confirm against your copy. A plain
# three-column CSV parse cannot keep such titles intact, so each line is split
# on the first two commas only and the whole remainder is kept as the title.
with open(f'{path}/movie_titles.csv', encoding = 'ISO-8859-1') as titles_file:
    title_rows = [line.rstrip('\r\n').split(',', 2) for line in titles_file if line.strip()]

movies = pd.DataFrame(title_rows, columns = ['movieId', 'releaseYear', 'title'])
# A literal "NULL" release year is turned into a missing value, matching what
# the CSV reader's default NA handling would produce.
movies['releaseYear'] = movies['releaseYear'].mask(movies['releaseYear'].eq('NULL'))

In [18]:
# Preview the first rows of the item table.
movies.head(5)

Unnamed: 0,movieId,releaseYear,title
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


## 存档

In [19]:
# Output directory for the processed tables; the trailing slash matters because
# file names are appended to this string directly.
output_path = './Netflix_AutoX/'
# Create the directory if needed; exist_ok avoids an error when it already exists.
os.makedirs(output_path, exist_ok = True)

In [20]:
# Write the training interactions, the test interactions and the item table
# as CSV files without the index column.
for frame, filename in ((train, 'inter_df.csv'),
                        (test, 'test.csv'),
                        (movies, 'item_df.csv')):
    frame.to_csv(output_path + filename, index = False)