## import包

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm



## 处理后的数据包含: 
```
1. 交互表(必须, user-item的交互记录, 包括训练集和测试集, 测试集中所有记录的时间都在训练集之后);
2. user表(可选);
3. item表(可选).
```

## 读数据

In [2]:
# Directory holding the extracted ml-25m CSV files, relative to the notebook's working directory.
path = './ml-25m'

In [3]:
# List the files in the dataset directory to see which tables are available to load.
os.listdir(path)

['tags.csv',
 'links.csv',
 'README.txt',
 'ratings.csv',
 'genome-tags.csv',
 'genome-scores.csv',
 'movies.csv']

In [4]:
# Load every CSV table of the dataset, in the same order as before.
# Each file stem maps to the variable on the matching position of the left-hand side.
tags, links, ratings, genometags, genomescores, movies = (
    pd.read_csv(f'{path}/{stem}.csv')
    for stem in ('tags', 'links', 'ratings', 'genome-tags', 'genome-scores', 'movies')
)

## 数据处理

### 交互表

In [5]:
# Preview the first rows of the raw ratings table (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [6]:
# Compare the size of the full table with the subset of rows rated exactly 5.
is_five_star = ratings['rating'] == 5
ratings.shape, ratings.loc[is_five_star].shape

((25000095, 4), (3612474, 4))

In [7]:
# Keep only the 5-star ratings as interactions.
# .copy() makes the result an independent frame: later cells add and drop
# columns on it, which would otherwise raise SettingWithCopyWarning because
# the frame is derived from a boolean slice of the original table.
ratings = ratings.loc[ratings['rating'] == 5].copy()

In [8]:
# Every remaining row has rating == 5, so the column carries no information.
# Rebind to a new frame instead of using inplace=True: mutating a frame that
# was produced by slicing is what triggers SettingWithCopyWarning.
ratings = ratings.drop(columns='rating')

In [11]:
import datetime  # no longer used in this cell; kept because a later cell calls datetime.datetime.strptime

# Render the epoch-second `timestamp` column as 'YYYY-MM-DD HH:MM:SS' strings (UTC).
# This is a vectorised replacement for the per-row datetime.utcfromtimestamp() call:
# it produces the same strings, avoids a Python-level lambda for every row, and
# does not rely on utcfromtimestamp(), which is deprecated since Python 3.12.
ratings['time'] = (
    pd.to_datetime(ratings['timestamp'], unit='s')
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)

In [12]:
# The raw epoch column is redundant once `time` exists.
# Rebind to a new frame instead of using inplace=True, so the frame derived
# from the earlier slice is never mutated in place.
ratings = ratings.drop(columns='timestamp')

In [13]:
# Preview the interaction table after processing; it now holds userId, movieId and time.
ratings.head()

Unnamed: 0,userId,movieId,time
0,1,296,2006-05-17 15:34:04
2,1,307,2006-05-17 12:27:08
3,1,665,2006-05-17 15:13:40
8,1,1237,2006-05-17 12:27:19
18,1,2632,2006-05-17 15:04:08


In [14]:
# Convert the `time` column to pandas datetimes so it can be compared against a cut-off.
parsed_time = pd.to_datetime(ratings['time'])
ratings['time'] = parsed_time

In [15]:
# Time span covered by the kept interactions.
time_col = ratings['time']
time_col.min(), time_col.max()

(Timestamp('1995-01-09 11:46:49'), Timestamp('2019-11-21 09:06:53'))

In [16]:
import datetime

# Cut-off instant: interactions strictly before it go to train, the rest go to test.
data_used_time = datetime.datetime.strptime('2019-11-15 00:00:00', '%Y-%m-%d %H:%M:%S')

is_before_cutoff = ratings['time'] < data_used_time
is_from_cutoff = ratings['time'] >= data_used_time

train = ratings.loc[is_before_cutoff]
test = ratings.loc[is_from_cutoff]

In [17]:
# Time span of the training interactions (everything before the cut-off).
train_time = train['time']
train_time.min(), train_time.max()

(Timestamp('1995-01-09 11:46:49'), Timestamp('2019-11-14 23:20:55'))

In [18]:
# Time span of the test interactions (cut-off and later).
test_time = test['time']
test_time.min(), test_time.max()

(Timestamp('2019-11-15 00:08:42'), Timestamp('2019-11-21 09:06:53'))

### item表

In [19]:
# Reshape the long (movieId, tagId, relevance) table into one row per movie
# with one `tag_<tagId>` column per tag.
# Column names are derived from the tag ids actually present after the pivot,
# instead of assuming exactly 1128 contiguous ids: a different tag count would
# raise a length mismatch, and non-contiguous ids would be mislabelled.
temp = (
    genomescores
    .pivot(index='movieId', columns='tagId', values='relevance')
    .add_prefix('tag_')
    .rename_axis(columns=None)  # drop the leftover 'tagId' label on the column index
    .reset_index()
)

In [20]:
# Attach the per-movie tag relevance columns to the movie metadata.
# A left join keeps every movie, including those without genome scores.
movies = pd.merge(movies, temp, how='left', on='movieId')

In [21]:
# Display the merged item table: title and genres plus the tag_* relevance columns.
# Movies without genome scores keep NaN in every tag column (result of the left merge).
movies

Unnamed: 0,movieId,title,genres,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,...,tag_1119,tag_1120,tag_1121,tag_1122,tag_1123,tag_1124,tag_1125,tag_1126,tag_1127,tag_1128
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.02875,0.02375,0.06250,0.07575,0.14075,0.14675,0.06350,...,0.04050,0.01425,0.03050,0.03500,0.14125,0.05775,0.03900,0.02975,0.08475,0.02200
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.04125,0.04050,0.06275,0.08275,0.09100,0.06125,0.06925,...,0.05250,0.01575,0.01250,0.02000,0.12225,0.03275,0.02100,0.01100,0.10525,0.01975
2,3,Grumpier Old Men (1995),Comedy|Romance,0.04675,0.05550,0.02925,0.08700,0.04750,0.04775,0.04600,...,0.06275,0.01950,0.02225,0.02300,0.12200,0.03475,0.01700,0.01800,0.09100,0.01775
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.03425,0.03800,0.04050,0.03100,0.06500,0.03575,0.02900,...,0.05325,0.02800,0.01675,0.03875,0.18200,0.07050,0.01625,0.01425,0.08850,0.01500
4,5,Father of the Bride Part II (1995),Comedy,0.04300,0.05325,0.03800,0.04100,0.05400,0.06725,0.02775,...,0.05350,0.02050,0.01425,0.02550,0.19225,0.02675,0.01625,0.01300,0.08700,0.01600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,209157,We (2018),Drama,,,,,,,,...,,,,,,,,,,
62419,209159,Window of the Soul (2001),Documentary,,,,,,,,...,,,,,,,,,,
62420,209163,Bad Poems (2018),Comedy|Drama,,,,,,,,...,,,,,,,,,,
62421,209169,A Girl Thing (2001),(no genres listed),,,,,,,,...,,,,,,,,,,


## 存档

In [22]:
# Folder that receives the processed tables; it is created when missing
# and left untouched when it already exists.
output_path = './MovieLens_AutoX/'
os.makedirs(name=output_path, exist_ok=True)

In [23]:
# Write the training interactions, the test interactions and the item table
# as CSV files without the pandas index column.
for frame, filename in (
    (train, 'inter_df.csv'),
    (test, 'test.csv'),
    (movies, 'item_df.csv'),
):
    frame.to_csv(output_path + filename, index=False)