## import包

In [1]:
import datetime
import os

import numpy as np
import pandas as pd
from tqdm import tqdm



## 读数据

In [2]:
# List the working directory to confirm which raw CSV files are available.
os.listdir(path='./')

['checkins.csv',
 'ratings.csv',
 'users.csv',
 'venues.csv',
 'zip',
 'data_process.ipynb',
 '.ipynb_checkpoints']

In [3]:
# Load the four raw tables; the unpacking order matches the path order below.
csv_paths = ['./checkins.csv', './ratings.csv', './users.csv', './venues.csv']
checkins, ratings, users, venues = [pd.read_csv(csv_path) for csv_path in csv_paths]

In [4]:
# Peek at the first two rows of the raw check-ins table.
checkins.head(n=2)

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,16,539270,1206,41.878114,-87.629798,2011-12-08 05:08:42
1,17,1330941,1206,0.0,0.0,2011-12-08 04:32:19


In [5]:
# Peek at the first two rows of the raw ratings table.
ratings.head(n=2)

Unnamed: 0,user_id,venue_id,rating
0,1,1,5
1,1,51,4


In [6]:
# Peek at the first two rows of the raw users table.
users.head(n=2)

Unnamed: 0,id,latitude,longitude
0,1,45.072464,-93.455788
1,2,30.669682,-81.462592


In [7]:
# Peek at the first two rows of the raw venues table.
venues.head(n=2)

Unnamed: 0,id,latitude,longitude
0,1,44.882011,-93.212364
1,2,44.883169,-93.213687


## 数据处理

### 交互表

In [8]:
# Inspect the check-ins columns before building the interaction table.
checkins.head(n=5)

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,16,539270,1206,41.878114,-87.629798,2011-12-08 05:08:42
1,17,1330941,1206,0.0,0.0,2011-12-08 04:32:19
2,18,1330942,1206,0.0,0.0,2011-12-08 04:29:38
3,19,282798,1206,41.878114,-87.629798,2011-12-08 04:26:06
4,20,376793,1206,41.878114,-87.629798,2011-12-08 04:17:50


In [9]:
# Remove the record id and the check-in coordinates, which the interaction
# table does not use.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
checkins = checkins.drop(columns=['id', 'latitude', 'longitude'], errors='ignore')

In [11]:
# Show the check-ins frame again to verify which columns remain.
checkins.head(n=5)

Unnamed: 0,user_id,venue_id,created_at
0,539270,1206,2011-12-08 05:08:42
1,1330941,1206,2011-12-08 04:32:19
2,1330942,1206,2011-12-08 04:29:38
3,282798,1206,2011-12-08 04:26:06
4,376793,1206,2011-12-08 04:17:50


In [15]:
# Convert created_at to pandas datetimes so it can be compared chronologically.
checkins = checkins.assign(created_at=pd.to_datetime(checkins['created_at']))

In [16]:
# Overall time span of the check-ins: (earliest, latest).
checkin_times = checkins['created_at']
(checkin_times.min(), checkin_times.max())

(Timestamp('2011-12-08 03:23:59'), Timestamp('2012-04-23 01:53:50'))

将最后七个自然日（2012-04-17 至 2012-04-23）的签到记录作为测试集，其余记录作为训练集。

In [17]:
# Hold out the last TEST_DAYS calendar days of check-ins as the test set.
# The boundary is derived from the data rather than typed in as a literal date;
# for this dataset (latest check-in on 2012-04-23) it evaluates to
# 2012-04-17 00:00:00.
# Requires checkins['created_at'] to already be a datetime column.
TEST_DAYS = 7
last_day = checkins['created_at'].max().normalize()  # midnight of the final day
split_time = last_day - pd.Timedelta(days=TEST_DAYS - 1)

# .copy() makes train/test independent frames, so later edits to either one
# cannot raise SettingWithCopyWarning or be mistaken for views of `checkins`.
train = checkins.loc[checkins['created_at'] < split_time].copy()
test = checkins.loc[checkins['created_at'] >= split_time].copy()

In [19]:
# Time span covered by the training split: (earliest, latest).
train_times = train['created_at']
(train_times.min(), train_times.max())

(Timestamp('2011-12-08 03:23:59'), Timestamp('2012-04-16 13:58:00'))

In [20]:
# Time span covered by the test split: (earliest, latest).
test_times = test['created_at']
(test_times.min(), test_times.max())

(Timestamp('2012-04-17 03:21:19'), Timestamp('2012-04-23 01:53:50'))

In [21]:
# First rows of the training interactions.
train.head(n=5)

Unnamed: 0,user_id,venue_id,created_at
0,539270,1206,2011-12-08 05:08:42
1,1330941,1206,2011-12-08 04:32:19
2,1330942,1206,2011-12-08 04:29:38
3,282798,1206,2011-12-08 04:26:06
4,376793,1206,2011-12-08 04:17:50


In [22]:
# First rows of the held-out test interactions.
test.head(n=5)

Unnamed: 0,user_id,venue_id,created_at
958365,1477919,103165,2012-04-17 06:04:46
958366,2081561,103165,2012-04-17 04:52:48
958367,2086971,103165,2012-04-17 04:25:57
958368,494829,103165,2012-04-17 03:34:39
958369,1226379,103165,2012-04-17 03:33:52


### user表

In [23]:
# Inspect the users table before renaming its key column.
users.head(n=5)

Unnamed: 0,id,latitude,longitude
0,1,45.072464,-93.455788
1,2,30.669682,-81.462592
2,3,43.549975,-96.700327
3,4,44.840798,-93.29828
4,5,27.949436,-82.465144


In [25]:
# Name the key column user_id so it matches the column used in the check-ins.
users = users.rename(columns={'id': 'user_id'})

In [28]:
# Show the users table again to check its column names.
users.head(n=5)

Unnamed: 0,user_id,latitude,longitude
0,1,45.072464,-93.455788
1,2,30.669682,-81.462592
2,3,43.549975,-96.700327
3,4,44.840798,-93.29828
4,5,27.949436,-82.465144


### item表

In [26]:
# Inspect the venues table before renaming its key column.
venues.head(n=5)

Unnamed: 0,id,latitude,longitude
0,1,44.882011,-93.212364
1,2,44.883169,-93.213687
2,3,44.883455,-93.214316
3,4,44.881387,-93.213801
4,5,44.882129,-93.214012


In [29]:
# Name the key column venue_id so it matches the column used in the check-ins.
venues = venues.rename(columns={'id': 'venue_id'})

In [30]:
# Show the venues table again to check its column names.
venues.head(n=5)

Unnamed: 0,venue_id,latitude,longitude
0,1,44.882011,-93.212364
1,2,44.883169,-93.213687
2,3,44.883455,-93.214316
3,4,44.881387,-93.213801
4,5,44.882129,-93.214012


## 存档

In [31]:
# Destination folder for the exported tables; created if it does not exist yet.
# The trailing slash matters: file names are appended to this string directly.
output_path = './Restaurant_AutoX/'
os.makedirs(name=output_path, exist_ok=True)

In [32]:
# Write each prepared table to the output folder without the index column.
# Order: training interactions, test interactions, user table, item table.
exports = [
    ('inter_df.csv', train),
    ('test.csv', test),
    ('user_df.csv', users),
    ('item_df.csv', venues),
]
for file_name, frame in exports:
    frame.to_csv(output_path + file_name, index=False)