In [None]:
## 数据工具包
import numpy as np
np.random.seed(42)
import pandas as pd
from tqdm import tqdm

## 字符串处理工具包
import string
import re
import gensim
from collections import Counter
import pickle
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.preprocessing import text, sequence 

import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
import lightgbm as lgb
from functools import partial

import os 
import gc
from scipy.sparse import vstack  
import time
import datetime

import joblib

import multiprocessing as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline
import seaborn as sns 
%matplotlib inline

In [None]:
from tqdm import tqdm
from tqdm import tqdm_notebook

# Root folder that holds the input CSV files.
path = './input/'

# Read the 25 daily training logs (2019-01-01 .. 2019-01-25). The frames are
# collected in a list and stacked once at the end: concatenating inside the loop
# would copy the whole accumulated frame again on every iteration.
daily_frames = []
for i in tqdm(range(1, 26)):
    train_tmp = pd.read_csv(path + 'Metro_train/record_2019-01-{:02d}.csv'.format(i))
    daily_frames.append(train_tmp)

Metro_roadMap = pd.read_csv(path + 'Metro_roadMap.csv')

test_A_record = pd.read_csv(path + 'Metro_testA/testA_record_2019-01-28.csv')
test_A_submit = pd.read_csv(path + 'Metro_testA/testA_submit_2019-01-29.csv')

# Training days first, then the 2019-01-28 test records, under one fresh index.
data = pd.concat(daily_frames + [test_A_record], axis=0, ignore_index=True)
del daily_frames

In [3]:
def trans_time_10_minutes(x):
    """Truncate a colon-separated timestamp string to the start of its 10-minute slot.

    Everything before the first ':' is kept verbatim, the field between the
    first and second ':' (the minutes) is rounded down to a multiple of ten,
    and the seconds are replaced by '00'.

    Example: '2019-01-01 05:37:12' -> '2019-01-01 05:30:00'.
    """
    fields = x.split(':')
    minute_floor = (int(fields[1]) // 10) * 10
    return '{}:{:02d}:00'.format(fields[0], minute_floor)

In [4]:
# Parse the `time` column, then label every record with the start of its
# 10-minute window as 'YYYY-MM-DD HH:MM:SS' text. Flooring with the vectorized
# .dt accessor yields the same strings as a per-row Python apply, without
# calling a Python function once per record.
data['time'] = pd.to_datetime(data['time'])
data['time_10_minutes'] = data['time'].dt.floor('10min').dt.strftime('%Y-%m-%d %H:%M:%S')

In [5]:
# Rows with status == 1 are tallied per (stationID, 10-minute slot) as `inNums`.
data_inNums = (
    data.loc[data['status'] == 1]
    .groupby(['stationID', 'time_10_minutes'])
    .size()
    .reset_index(name='inNums')
)
data_inNums.head()

Unnamed: 0,stationID,time_10_minutes,inNums
0,0,2019-01-01 05:30:00,1
1,0,2019-01-01 05:40:00,4
2,0,2019-01-01 05:50:00,3
3,0,2019-01-01 06:00:00,17
4,0,2019-01-01 06:10:00,21


In [6]:
# Rows with status == 0 are tallied per (stationID, 10-minute slot) as `outNums`.
data_outNums = (
    data.loc[data['status'] == 0]
    .groupby(['stationID', 'time_10_minutes'])
    .size()
    .reset_index(name='outNums')
)
data_outNums.head()

Unnamed: 0,stationID,time_10_minutes,outNums
0,0,2019-01-01 05:50:00,1
1,0,2019-01-01 06:20:00,1
2,0,2019-01-01 06:30:00,49
3,0,2019-01-01 06:40:00,48
4,0,2019-01-01 06:50:00,25


In [7]:
# Stations to build the grid for, taken from the submission template.
stationIDs = test_A_submit['stationID'].unique()

# Days of January 2019 covered by the grid: 1..25 plus 28 and 29.
days = [i for i in range(1, 26)] + [28, 29]

# Start of every 10-minute slot (24 * 6 per day) on each of those days,
# formatted as 'YYYY-MM-DD HH:MM:SS' text.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
times = []
for day in days:
    day_slots = pd.date_range('2019-01-{:02d}'.format(day), periods=24 * 6, freq='10min')
    times.extend(day_slots.strftime(TIME_FORMAT))

# Cartesian product: one row per (station, slot start).
from itertools import product
stationids_by_times = list(product(stationIDs, times))

# Build the new data set.
df_data = pd.DataFrame(stationids_by_times, columns=['stationID', 'startTime'])
# Station ids are held as text at this point, so the sort below orders them
# lexicographically ('10' before '9'); a later cell casts them back to int.
df_data['stationID'] = df_data['stationID'].astype(str)
df_data = df_data.sort_values(['stationID', 'startTime'])

# Every slot ends exactly 10 minutes after it starts. This is computed directly
# rather than by shifting startTime within each station, because the day list
# has a gap (25 -> 28) and each station's final slot has no successor row.
slot_start = pd.to_datetime(df_data['startTime'])
df_data['endTime'] = (slot_start + pd.Timedelta(minutes=10)).dt.strftime(TIME_FORMAT)

In [8]:
# Preview the station x slot grid (pandas truncates the display of large frames).
df_data

Unnamed: 0,stationID,startTime,endTime
0,0,2019-01-01 00:00:00,2019-01-01 00:10:00
1,0,2019-01-01 00:10:00,2019-01-01 00:20:00
2,0,2019-01-01 00:20:00,2019-01-01 00:30:00
3,0,2019-01-01 00:30:00,2019-01-01 00:40:00
4,0,2019-01-01 00:40:00,2019-01-01 00:50:00
...,...,...,...
38875,9,2019-01-29 23:10:00,2019-01-29 23:20:00
38876,9,2019-01-29 23:20:00,2019-01-29 23:30:00
38877,9,2019-01-29 23:30:00,2019-01-29 23:40:00
38878,9,2019-01-29 23:40:00,2019-01-29 23:50:00


In [9]:
def filltime(x):
    """Return midnight of the day after the date contained in ``x``.

    Parameters
    ----------
    x : str
        Timestamp text whose first space-separated token is a
        ``YYYY-MM-DD`` date, e.g. ``'2019-01-29 23:50:00'``.

    Returns
    -------
    str
        ``'YYYY-MM-DD 00:00:00'`` for the following calendar day.

    Raises
    ------
    ValueError
        If the date token cannot be parsed as ``YYYY-MM-DD``.
    """
    date_token = x.split(' ')[0]
    current_day = datetime.datetime.strptime(date_token, '%Y-%m-%d')
    # Real calendar arithmetic, so month and year ends roll over correctly
    # (2019-01-31 -> 2019-02-01) instead of producing a day number like 32.
    next_day = current_day + datetime.timedelta(days=1)
    return next_day.strftime('%Y-%m-%d') + ' 00:00:00'
# Fill any missing endTime with midnight of the day after the slot's startTime.
needs_end = df_data['endTime'].isnull()
df_data.loc[needs_end, 'endTime'] = df_data.loc[needs_end, 'startTime'].apply(filltime)

# Store station ids as integers from here on.
df_data['stationID'] = df_data['stationID'].astype(int)

In [10]:
# Preview df_data again after the endTime fill and the integer cast of stationID.
df_data

Unnamed: 0,stationID,startTime,endTime
0,0,2019-01-01 00:00:00,2019-01-01 00:10:00
1,0,2019-01-01 00:10:00,2019-01-01 00:20:00
2,0,2019-01-01 00:20:00,2019-01-01 00:30:00
3,0,2019-01-01 00:30:00,2019-01-01 00:40:00
4,0,2019-01-01 00:40:00,2019-01-01 00:50:00
...,...,...,...
38875,9,2019-01-29 23:10:00,2019-01-29 23:20:00
38876,9,2019-01-29 23:20:00,2019-01-29 23:30:00
38877,9,2019-01-29 23:30:00,2019-01-29 23:40:00
38878,9,2019-01-29 23:40:00,2019-01-29 23:50:00


In [11]:
# Give both count tables the same slot key name as the grid.
slot_key = {'time_10_minutes': 'startTime'}
data_inNums = data_inNums.rename(columns=slot_key)
data_outNums = data_outNums.rename(columns=slot_key)

# Left-join the counts onto the full grid, in first and out second.
for counts in (data_inNums, data_outNums):
    df_data = df_data.merge(counts, on=['stationID', 'startTime'], how='left')

# Grid slots with no matching count row get 0.
for count_col in ('inNums', 'outNums'):
    df_data[count_col] = df_data[count_col].fillna(0)

In [12]:
# Time-related features, all derived from the slot start timestamp.
slot_ts = pd.to_datetime(df_data['startTime'])
df_data['day'] = slot_ts.dt.day
df_data['hours_in_day'] = slot_ts.dt.hour
df_data['day_of_week'] = slot_ts.dt.dayofweek
# Index of the 10-minute slot within its day (hour * 6 + minute // 10).
df_data['ten_minutes_in_day'] = df_data['hours_in_day'] * 6 + slot_ts.dt.minute // 10

In [13]:
# Historical shift features: for every day in `days`, copy the inNums/outNums
# of the preceding entry in `days` into bf_inNums/bf_outNums. The first day has
# no predecessor and keeps 0. Because `days` jumps from 25 to 28, day 28 takes
# its values from day 25.
#
# The columns start as floats so that assigning the copied counts does not need
# a dtype upcast (the counts were passed through fillna and are presumably
# float - verify if the merge ever produces no gaps).
df_data['bf_inNums'] = 0.0
df_data['bf_outNums'] = 0.0
for prev_day, cur_day in zip(days[:-1], days[1:]):
    cur_rows = df_data['day'] == cur_day
    prev_rows = df_data['day'] == prev_day
    # Positional copy via .values: relies on both days having the same number
    # of rows in the same (station, slot) order.
    df_data.loc[cur_rows, 'bf_inNums'] = df_data.loc[prev_rows, 'inNums'].values
    df_data.loc[cur_rows, 'bf_outNums'] = df_data.loc[prev_rows, 'outNums'].values

In [16]:
# Feature columns: everything except the slot labels and the two targets.
non_features = ['startTime', 'endTime', 'inNums', 'outNums']
cols = [f for f in df_data.columns if f not in non_features]

# Split by day of month: before the 28th -> train, the 28th -> validation,
# the 29th -> test.
day_of_month = df_data['day']
df_train = df_data[day_of_month < 28]
df_valid = df_data[day_of_month == 28]
df_test = df_data[day_of_month == 29]

# Feature matrices as plain numpy arrays.
X_train, X_valid, X_test = (part[cols].values for part in (df_train, df_valid, df_test))

# Targets for the two models (no targets are extracted for the test day).
y_train_inNums, y_valid_inNums = df_train['inNums'].values, df_valid['inNums'].values
y_train_outNums, y_valid_outNums = df_train['outNums'].values, df_valid['outNums'].values

In [17]:
# LightGBM settings shared by both models: L1 objective evaluated with MAE.
params = {
    'num_leaves': 63,
    'objective': 'regression_l1',
    'max_depth': 5,
    'learning_rate': 0.01,
    'boosting': 'gbdt',
    'metric': 'mae',
    'lambda_l1': 0.1,
}

# NOTE(review): nthread and n_jobs are both passed and look like two names for
# the same thread-count setting - confirm which one LightGBM honours.
model = lgb.LGBMRegressor(**params, n_estimators=20000, nthread=4, n_jobs=-1)

# Train on inNums; the second eval set (the validation day) drives early stopping
# per the log output. NOTE(review): newer LightGBM releases are understood to
# replace the `verbose` / `early_stopping_rounds` fit keywords with callbacks -
# confirm against the installed version.
eval_pairs_in = [(X_train, y_train_inNums), (X_valid, y_valid_inNums)]
model.fit(
    X_train,
    y_train_inNums,
    eval_set=eval_pairs_in,
    eval_metric='mae',
    verbose=100,
    early_stopping_rounds=200,
)

# Predict the test day using the best iteration found by early stopping.
y_pred_inNums = model.predict(X_test, num_iteration=model.best_iteration_)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's l1: 46.1669	valid_1's l1: 43.7863
[200]	valid_0's l1: 31.0834	valid_1's l1: 27.7974
[300]	valid_0's l1: 24.9484	valid_1's l1: 22.0936
[400]	valid_0's l1: 22.7842	valid_1's l1: 19.991
[500]	valid_0's l1: 22.1243	valid_1's l1: 19.6803
[600]	valid_0's l1: 21.8249	valid_1's l1: 19.6336
[700]	valid_0's l1: 21.603	valid_1's l1: 19.7941
Early stopping, best iteration is:
[596]	valid_0's l1: 21.8379	valid_1's l1: 19.6167


In [18]:
# Same configuration as the inNums model, fitted on the outNums target instead.
# `model` is rebound here, so the inNums estimator is no longer reachable.
model = lgb.LGBMRegressor(**params, n_estimators=20000, nthread=4, n_jobs=-1)

eval_pairs_out = [(X_train, y_train_outNums), (X_valid, y_valid_outNums)]
model.fit(
    X_train,
    y_train_outNums,
    eval_set=eval_pairs_out,
    eval_metric='mae',
    verbose=100,
    early_stopping_rounds=200,
)

# Predict the test day using the best iteration found by early stopping.
y_pred_outNums = model.predict(X_test, num_iteration=model.best_iteration_)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's l1: 46.839	valid_1's l1: 46.5593
[200]	valid_0's l1: 31.9827	valid_1's l1: 30.0168
[300]	valid_0's l1: 25.983	valid_1's l1: 22.9962
[400]	valid_0's l1: 23.7817	valid_1's l1: 20.3164
[500]	valid_0's l1: 23.0539	valid_1's l1: 19.5043
[600]	valid_0's l1: 22.6812	valid_1's l1: 19.2693
[700]	valid_0's l1: 22.4438	valid_1's l1: 19.0596
[800]	valid_0's l1: 22.1715	valid_1's l1: 19.1902
[900]	valid_0's l1: 22.0047	valid_1's l1: 19.282
Early stopping, best iteration is:
[757]	valid_0's l1: 22.3068	valid_1's l1: 19.0041
