In [168]:
import pandas as pd
pd.options.display.max_columns = None
import os
import warnings
warnings.filterwarnings('ignore')
import datetime
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

In [169]:
import matplotlib.pyplot as plt

In [170]:
#提取base_feature
def get_base_feature(df):
    df['time'] = df.START_DATE.apply(lambda x:x[:19])
    df['time'] = pd.to_datetime(df['time'],format='%Y-%m-%d-%H.%M.%S')
    timedelta = df['time'] - pd.datetime(df['time'].dt.year[0],df['time'].dt.month[0],df['time'].dt.day[0],0,0,0)
    df['30_min'] = timedelta.dt.seconds/1800
    df['30_min'] = df['30_min'].astype(int)
    df['20_min'] = timedelta.dt.seconds/1200
    df['20_min'] = df['20_min'].astype(int)
    df['10_min'] = timedelta.dt.seconds/600
    df['10_min'] = df['10_min'].astype(int)
    df['5_min'] = timedelta.dt.seconds/300
    df['5_min'] = df['5_min'].astype(int)
#     df['3_min'] = timedelta.dt.seconds/300
#     df['3_min'] = df['3_min'].astype(int)

    df['hour'] = df.START_DATE.apply(lambda x:int(x[11:13]))
    
    
    df.TRADE_TYPE[df.TRADE_TYPE==21] = 1
    df.TRADE_TYPE[df.TRADE_TYPE==22] = 0
    
    result = df.groupby(['date','TRADE_ADDRESS','INDUSTRY_CODE','CARD_TYPE_EX','CARD_TYPE','SAM_ID','TERMINAL_ID','RECORD_ROW','30_min','20_min','10_min','5_min']).TRADE_TYPE.agg(['count','sum']).reset_index()
    result['inNums'] = result['sum']
    result['outNums'] = result['count'] - result['sum']
    del result['count'],result['sum']
    
    result.date = pd.to_datetime(result.date)
    result['week'] = result.date.dt.dayofweek+1
    del df
    return result

In [171]:
#进一步提取特征
def more_feature(result):
    tmp = result.copy()

    tmp = result.groupby(['TRADE_ADDRESS'], as_index=False)['inNums'].agg({
                                                                        'inNums_ID_w_max'    : 'max',
                                                                        'inNums_ID_w_min'    : 'min', 
                                                                        'inNums_ID_w_mean'   : 'mean',
                                                                        'inNums_ID_w_sum'   : 'sum'
                                                                        })
    result = result.merge(tmp, on=['TRADE_ADDRESS'], how='left')

    tmp = result.groupby(['TRADE_ADDRESS','week','30_min'], as_index=False)['inNums'].agg({
                                                                        'inNums_ID_w_h_3_max'    : 'max',
                                                                        'inNums_ID_w_h_3_min'    : 'min', 
                                                                        'inNums_ID_w_h_3_mean'   : 'mean',
                                                                        'inNums_ID_w_h_3_sum'   : 'sum'
                                                                        })
    result = result.merge(tmp,on=['TRADE_ADDRESS','week','30_min'],how = 'left')

    tmp = result.groupby(['TRADE_ADDRESS','20_min'], as_index=False)['inNums'].agg({
                                                                        'inNums_ID_w_h_2_max'    : 'max',
                                                                        'inNums_ID_w_h_2_min'    : 'min', 
                                                                        'inNums_ID_w_h_2_mean'   : 'mean',
                                                                        'inNums_ID_w_h_2_sum'   : 'sum',      
                                                                
                                                                        })
    result = result.merge(tmp,on=['TRADE_ADDRESS','20_min'],how = 'left')
    
    tmp = result.groupby(['TRADE_ADDRESS','5_min'], as_index=False)['inNums'].agg({
                                                                        'inNums_ID_w_h_5_max'    : 'max',
                                                                        'inNums_ID_w_h_5_min'    : 'min', 
                                                                        'inNums_ID_w_h_5_mean'    : 'mean',
                                                                        'inNums_ID_w_h_5_sum'   : 'sum'
                                                                        })
    result = result.merge(tmp,on=['TRADE_ADDRESS','5_min'],how = 'left')
    
    
    
    ###出站与进站类似

    
    
    tmp = result.groupby(['TRADE_ADDRESS'], as_index=False)['outNums'].agg({
                                                                        'outNums_ID_max'    : 'max',
                                                                        'outNums_ID_min'    : 'min',
                                                                        'outNums_ID_mean'   : 'mean',
                                                                        'outNums_ID_sum'   : 'sum'
                                                                        })
    result = result.merge(tmp, on=['TRADE_ADDRESS'], how='left')
    


    ####按照week计算每个站口每小时的进站人数
    tmp = result.groupby(['TRADE_ADDRESS','30_min'], as_index=False)['outNums'].agg({
                                                                       
                                                                        'outNums_ID_w_h_3_max'    : 'max', 
                                                                        'outNums_ID_w_h_3_min'    : 'min', 
                                                                        'outNums_ID_w_h_3_mean'    : 'mean', 
                                                                        'outNums_ID_w_h_3_sum'    : 'sum', 
                                                           
                                                                        })
    result = result.merge(tmp,on=['TRADE_ADDRESS','30_min'],how = 'left')
    #20
    tmp = result.groupby(['TRADE_ADDRESS','20_min'], as_index=False)['outNums'].agg({
                                                              
                                                                        'outNums_ID_w_h_2_max'    : 'max', 
                                                                        'outNums_ID_w_h_2_min'    : 'min', 
                                                                        'outNums_ID_w_h_2_mean'    : 'mean', 
                                                                        'outNums_ID_w_h_2_sum'    : 'sum', 
                                                    
                                         
                                                                        })
    result = result.merge(tmp,on=['TRADE_ADDRESS','20_min'],how = 'left')

    tmp = result.groupby(['TRADE_ADDRESS','5_min'], as_index=False)['outNums'].agg({
                                                  
                                                                        'outNums_ID_w_h_5_max'   : 'max',
                                                                        'outNums_ID_w_h_5_min'   : 'min',
                                                                        'outNums_ID_w_h_5_mean'   : 'mean',
                                                                        'outNums_ID_w_h_5_sum'   : 'sum',
                                               
                                                                        })
    result = result.merge(tmp,on=['TRADE_ADDRESS','5_min'],how = 'left')
    
    return result

In [172]:
path = '../dataofweek/'

In [173]:
data6 = pd.read_csv(path+'6_week.csv')

In [174]:
data13 = pd.read_csv(path+'13_week.csv')

In [175]:
data15 = pd.read_csv(path+'15_week.csv')

In [176]:
data16 = pd.read_csv(path+'16_week.csv')

In [181]:
data6.date = pd.to_datetime(data6.date)
data6['week'] = data6.date.dt.dayofweek+1

In [182]:
df66 = data6[data6.week==6]
df136 = data13[data13.week==6]
df156 = data15[data15.week==6]
df166 = data16[data16.week==6]

In [183]:
del data15,data16,data13,data6

In [184]:
data = pd.concat([df66,df136],axis=0,ignore_index=True)
data = pd.concat([data,df156],axis=0,ignore_index=True)
data = pd.concat([data,df166],axis=0,ignore_index=True)

In [185]:
le = LabelEncoder()

In [186]:
data.SAM_ID = le.fit_transform(data.SAM_ID.values)

In [187]:
data.TERMINAL_ID = le.fit_transform(data.TERMINAL_ID.values)

In [188]:
data = get_base_feature(data)

In [189]:
data = more_feature(data)

In [190]:
data.shape

(2363490, 47)

In [191]:
prevent = data.copy()

In [192]:
#删除类别列别超过90%的列
cols = list(data.columns)
for col in cols:
    rate = data[col].value_counts(normalize = True,dropna = False).values[0]
    if (rate>0.9):
        cols.remove(col)
        print(col,rate)
data = data[cols]

INDUSTRY_CODE 0.9642917888376934
week 1.0
inNums_ID_w_min 1.0
inNums_ID_w_h_3_max 0.9285459214974465
inNums_ID_w_h_2_max 0.9354721196197149
inNums_ID_w_h_5_min 0.9993822694405308
outNums_ID_min 1.0
outNums_ID_w_h_3_min 0.9998976090442524
outNums_ID_w_h_2_min 0.9998959166317607


In [193]:
columns = [ 'date',
    'TRADE_ADDRESS',
 'CARD_TYPE_EX',
 'CARD_TYPE',
 'SAM_ID',
 'TERMINAL_ID',
 'RECORD_ROW',
 '30_min',
 '20_min',
 '10_min',
 '5_min',
 'inNums',
 'outNums',
 'inNums_ID_w_mean',
 'inNums_ID_w_sum',

 'inNums_ID_w_h_3_mean',
 'inNums_ID_w_h_3_sum',

 'inNums_ID_w_h_2_mean',
 'inNums_ID_w_h_2_sum',
 'inNums_ID_w_h_5_max',
 'inNums_ID_w_h_5_mean',
 'inNums_ID_w_h_5_sum',
 'outNums_ID_max',
 'outNums_ID_mean',
 'outNums_ID_sum',
 'outNums_ID_w_h_3_mean',
 'outNums_ID_w_h_3_sum',
 'outNums_ID_w_h_2_max',
 'outNums_ID_w_h_2_mean',
 'outNums_ID_w_h_2_sum',
 'outNums_ID_w_h_5_max',
 'outNums_ID_w_h_5_min',
 'outNums_ID_w_h_5_mean',
 'outNums_ID_w_h_5_sum']

In [194]:
data = data[columns]

In [195]:
data.date.unique()

array(['2015-09-12T00:00:00.000000000', '2015-10-31T00:00:00.000000000',
       '2015-11-14T00:00:00.000000000', '2015-11-21T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [196]:
sub = data[data.date=='2015-11-21']

In [197]:
#lgb参数
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 63,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'bagging_seed':0,
    'bagging_freq': 1,
    'verbose': 1,
    'reg_alpha':1,
    'reg_lambda':2,
    'min_child_weight':6
}

In [243]:
train_in = data[data.date<='2015-10-31']
y_train = train_in['inNums']
del train_in['inNums'],train_in['outNums']
del train_in['date']

val_in = data[data.date=='2015-11-14']
y_val = val_in['inNums']
del val_in['inNums'],val_in['outNums']
del val_in['date']


test_in = data[data.date=='2015-11-21']
y_test = test_in['inNums']
del test_in['inNums'],test_in['outNums']
del test_in['date']

lgb_train = lgb.Dataset(train_in, y_train)
lgb_evals = lgb.Dataset(val_in, y_val , reference=lgb_train)

In [282]:
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=5000,
                valid_sets=[lgb_train,lgb_evals],
                valid_names=['train','valid'],
                early_stopping_rounds=50,
                verbose_eval=1000,
                )

Training until validation scores don't improve for 50 rounds.
[1000]	train's l1: 0.0180515	valid's l1: 0.0190681
[2000]	train's l1: 0.0104323	valid's l1: 0.0120635
[3000]	train's l1: 0.00775735	valid's l1: 0.00947538
[4000]	train's l1: 0.00637581	valid's l1: 0.00812739
[5000]	train's l1: 0.00557602	valid's l1: 0.00734084
Did not meet early stopping. Best iteration is:
[5000]	train's l1: 0.00557602	valid's l1: 0.00734084


In [283]:
pre_in = gbm.predict(test_in,num_iteration=gbm.best_iteration)

In [284]:
error_in = mean_absolute_error(pre_in,y_test)
error_in

0.008148315750950421

In [285]:
sub['pre_in'] = pre_in

In [234]:
train_out = data[data.date<='2015-10-31']
y_train = train_out['outNums']
del train_out['inNums'],train_out['outNums']
del train_out['date']

val_out = data[data.date=='2015-11-14']
y_val = val_out['outNums']
del val_out['inNums'],val_out['outNums']
del val_out['date']

test_out = data[data.date=='2015-11-21']
y_test = test_out['outNums']
del test_out['inNums'],test_out['outNums']
del test_out['date']

lgb_train = lgb.Dataset(train_out, y_train)
lgb_evals = lgb.Dataset(val_out, y_val , reference=lgb_train)

In [235]:
gbm1 = lgb.train(params,
                lgb_train,
                num_boost_round=16000,
                valid_sets=[lgb_train,lgb_evals],
                valid_names=['train','valid'],
                early_stopping_rounds=50,
                verbose_eval=1000,
                )

Training until validation scores don't improve for 50 rounds.
[1000]	train's l1: 0.0195589	valid's l1: 0.0204507
[2000]	train's l1: 0.012208	valid's l1: 0.0135466
[3000]	train's l1: 0.0099156	valid's l1: 0.0112983
[4000]	train's l1: 0.0087007	valid's l1: 0.0100562
[5000]	train's l1: 0.00803796	valid's l1: 0.00936787
[6000]	train's l1: 0.00764037	valid's l1: 0.00897312
[7000]	train's l1: 0.00733804	valid's l1: 0.00868276
[8000]	train's l1: 0.00713545	valid's l1: 0.00849589
[9000]	train's l1: 0.0069782	valid's l1: 0.00835322
[10000]	train's l1: 0.00685365	valid's l1: 0.00824813
[11000]	train's l1: 0.00675162	valid's l1: 0.00815915
[12000]	train's l1: 0.00666295	valid's l1: 0.00808559
[13000]	train's l1: 0.00659721	valid's l1: 0.00803265
[14000]	train's l1: 0.00653625	valid's l1: 0.00798969
[15000]	train's l1: 0.00648104	valid's l1: 0.00794984
[16000]	train's l1: 0.00643332	valid's l1: 0.0079151
Early stopping, best iteration is:
[16610]	train's l1: 0.00640732	valid's l1: 0.00789572


In [236]:
pre_out = gbm1.predict(test_out,num_iteration=gbm1.best_iteration)

In [237]:
mean_absolute_error(pre_out,y_test)

0.008952359846960956

In [238]:
sub['pre_out'] = pre_out

In [286]:
sub1 = sub.groupby(['date','TRADE_ADDRESS']).pre_in.agg({'inNums':'sum'}).reset_index()
sub2 = sub.groupby(['date','TRADE_ADDRESS']).pre_out.agg({'outNums':'sum'}).reset_index()
sub_result = sub1.merge(sub2,on = ['date','TRADE_ADDRESS'],how='left')
sub_result['flow'] = sub_result.inNums + sub_result.outNums
sub_result['round_pre'] = np.round(sub_result.flow)
sub_result['round_in'] = np.round(sub_result.inNums)
sub_result['round_out'] = np.round(sub_result.outNums)

In [287]:
real1 = sub.groupby(['date','TRADE_ADDRESS']).inNums.agg({'inNums':'sum'}).reset_index()
real2 = sub.groupby(['date','TRADE_ADDRESS']).outNums.agg({'outNums':'sum'}).reset_index()
real = real1.merge(real2,on = ['date','TRADE_ADDRESS'],how='left')
real['flow'] = real.inNums + real.outNums

In [288]:
error=mean_absolute_error(sub_result.round_pre,real.flow)

In [289]:
error

20.65

In [279]:
sub_result.head(8)

Unnamed: 0,date,TRADE_ADDRESS,inNums,outNums,flow,round_pre,round_in,round_out
0,2015-11-21,121,16703.641621,13613.42432,30317.065941,30317.0,16704.0,13613.0
1,2015-11-21,123,6412.979949,5841.503991,12254.48394,12254.0,6413.0,5842.0
2,2015-11-21,125,10482.831749,13726.364377,24209.196125,24209.0,10483.0,13726.0
3,2015-11-21,127,12120.961681,12090.22017,24211.181851,24211.0,12121.0,12090.0
4,2015-11-21,129,7645.239523,7681.754874,15326.994398,15327.0,7645.0,7682.0
5,2015-11-21,131,8212.620644,8614.374542,16826.995186,16827.0,8213.0,8614.0
6,2015-11-21,133,11440.484917,11763.455595,23203.940511,23204.0,11440.0,11763.0
7,2015-11-21,135,25964.823539,29896.064015,55860.887553,55861.0,25965.0,29896.0


In [280]:
real.head(8)

Unnamed: 0,date,TRADE_ADDRESS,inNums,outNums,flow
0,2015-11-21,121,16727,13602,30329
1,2015-11-21,123,6411,5836,12247
2,2015-11-21,125,10670,13562,24232
3,2015-11-21,127,12142,12060,24202
4,2015-11-21,129,7696,7636,15332
5,2015-11-21,131,8549,8311,16860
6,2015-11-21,133,11456,11738,23194
7,2015-11-21,135,26001,29903,55904


In [281]:
re_path = '../result1/'
sub_result.to_csv(re_path+'6_flow_pre_'+str('%.2f'%error)+'.csv',encoding='utf-8',index=False)
real.to_csv(re_path+'6_flow_real.csv',encoding='utf-8',index=False)