In [1]:
import time
import warnings
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
import lightgbm as lgb
from lightgbm.plotting import plot_importance
from lightgbm import LGBMRegressor
from scipy import sparse
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
import seaborn as sns

# Apply seaborn's default plot styling to subsequent matplotlib figures.
sns.set()
#warnings.simplefilter(action='ignore', category=FutureWarning)
# Suppress every warning category for the rest of the session; this is broader
# than the FutureWarning-only filter that is commented out above.
warnings.filterwarnings("ignore")
# Display all columns, and up to 200 characters per cell, when rendering frames.
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',200)

In [2]:
# Directory layout, relative to the notebook's working directory: raw CSVs are
# read from <BASE_PATH>/RAW_DATA/..., submissions are written under SUBMIT_PATH.
BASE_PATH = '../input'
RAW_PATH = os.path.join(BASE_PATH, 'RAW_DATA')
TRAIN_PATH, TEST_A_PATH = (
    os.path.join(RAW_PATH, folder) for folder in ('Metro_train', 'Metro_testA')
)
SUBMIT_PATH = '../submit'

In [3]:
def _smallest_int_dtype(mn, mx):
    """Return the narrowest NumPy integer dtype whose range contains [mn, mx], or None."""
    if mn >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        if bounds.min <= mn and mx <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(props, verbose=True):
    """Downcast the numeric columns of ``props`` to smaller dtypes.

    The frame is modified in place and also returned.

    For every numeric column:
      * missing values are replaced by the sentinel ``min - 1``;
      * if all values are whole numbers, the column is cast to the narrowest
        (unsigned, when the minimum is >= 0) integer dtype that holds its range;
      * otherwise it is cast to ``float32`` (this can lose precision).

    Non-numeric columns (strings, datetimes, categoricals) are left untouched.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink.
    verbose : bool, default True
        When True, print the per-column dtype changes and the memory summary.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    log = print if verbose else (lambda *args, **kwargs: None)

    start_mem_usg = props.memory_usage().sum() / 1024**2
    log("Memory usage of properties dataframe is :",start_mem_usg," MB")
    for col in props.columns:
        if not pd.api.types.is_numeric_dtype(props[col]):
            continue

        log("******************************")
        log("Column: ",col)
        log("dtype before: ",props[col].dtype)

        mx = props[col].max()
        mn = props[col].min()

        # Integer dtypes cannot hold NA, so fill with a sentinel below the minimum.
        if props[col].isna().any():
            props[col] = props[col].fillna(mn - 1)
            # The sentinel is the new minimum; without this update a column whose
            # real minimum is 0 would be cast to an unsigned dtype and wrap around.
            mn = mn - 1

        values = props[col]
        if pd.api.types.is_integer_dtype(values) or pd.api.types.is_bool_dtype(values):
            is_int = True
        elif np.isfinite(values).all():
            # Sum absolute residuals so that +0.5 and -0.5 cannot cancel out.
            is_int = (values - np.trunc(values)).abs().sum() < 0.01
        else:
            # inf, or an all-NaN column that could not be filled: keep as float.
            is_int = False

        target = _smallest_int_dtype(mn, mx) if is_int else np.float32
        if target is not None:
            props[col] = values.astype(target)

        log("dtype after: ",props[col].dtype)
        log("******************************")

    log("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    log("Memory usage is: ",mem_usg," MB")
    log("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props
def read_data(name, **params):
    """Read the CSV at ``name`` and return it after dtype downcasting.

    Extra keyword arguments are forwarded unchanged to ``pd.read_csv``; the
    loaded frame is then passed through ``reduce_mem_usage``.
    """
    return reduce_mem_usage(pd.read_csv(name, **params))

## 读取数据 (Load data)

In [4]:
# Disabled alternative, kept as a bare string literal (so it never executes):
# read every file under TRAIN_PATH and concatenate them into one frame.
'''
train = pd.DataFrame()
for file in os.listdir(TRAIN_PATH):
    temp = read_data(os.path.join(TRAIN_PATH, file))
    train = pd.concat([train, temp],ignore_index=True)
    #train = train.drop(columns=['userID', 'deviceID', 'lineID', 'payType'])
    del temp
'''
# Active path: only the single record file named below is used for training.
file = 'record_2019-01-01.csv'
train = read_data(os.path.join(TRAIN_PATH, file))
# Raw record file from the test-A folder, downcast the same way as `train`.
test_name = os.path.join(TEST_A_PATH, 'testA_record_2019-01-28.csv')
test_28 = read_data(test_name)
# The submission file is read with plain pd.read_csv (no downcasting) into `test`.
test_name = os.path.join(TEST_A_PATH, 'testA_submit_2019-01-29.csv')
test = pd.read_csv(test_name)

Memory usage of properties dataframe is : 135.62892150878906  MB
******************************
Column:  stationID
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  deviceID
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  status
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  payType
dtype before:  int64
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  70.23644256591797  MB
This is  51.78574140720163 % of the initial size
Memory usage of properties dataframe is : 128.43527221679688  MB
******************************
Column:  stationID
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  deviceID
dtype before:  int64
dtype after:  uint16
******************************
*****************

# Feature Engineering (FE)

In [None]:
# TODO: day 29 has no ID-level records, so its features need to be shifted
# forward from day 28 (shift1: 28 -> 29). The assignment that was meant to do
# this was left unfinished (`train = `) and made the whole cell a SyntaxError,
# so the remaining ideas are kept below as functions that are defined but not
# called.


def build_station_features(df):
    """Count/sum ``status`` per station and time slot, plus distinct-device counts.

    ``df`` must provide the columns ``stationID``, ``week``, ``weekend``,
    ``day``, ``hour``, ``minute``, ``status`` and ``deviceID``.
    NOTE(review): the original note says grouping on ``minute`` amounts to
    10-minute statistics, i.e. ``minute`` is presumably already bucketed to
    10-minute steps -- confirm against the caller.

    Returns one row per (stationID, week, weekend, day, hour, minute) with
    ``count``, ``sum`` and three ``nuni_deviceID_of_*`` columns giving the number
    of distinct devices per station, per station-hour and per
    station-hour-minute.
    """
    slot_keys = ['stationID', 'week', 'weekend', 'day', 'hour', 'minute']
    result = df.groupby(slot_keys)['status'].agg(['count', 'sum']).reset_index()

    for by in (['stationID'], ['stationID', 'hour'], ['stationID', 'hour', 'minute']):
        feature_name = 'nuni_deviceID_of_' + '_'.join(by)
        distinct = df.groupby(by)['deviceID'].nunique().reset_index(name=feature_name)
        result = result.merge(distinct, on=by, how='left')
    return result


def add_neighbor_weekend_flags(date_info):
    """Return a copy of ``date_info`` with previous/next-row weekend flags.

    ``prev_day_is_weekend`` / ``next_day_is_weekend`` are ``is_weekend`` shifted
    by one row in either direction, with the missing edge value set to 0.
    Assumes rows are ordered by date with one row per day -- TODO confirm.
    """
    return date_info.assign(
        prev_day_is_weekend=date_info['is_weekend'].shift().fillna(0),
        next_day_is_weekend=date_info['is_weekend'].shift(-1).fillna(0),
    )

In [60]:
def get_hour_cut(data):
    """Map an hour-of-day value to a coarse period code.

    Codes: 1 for hours >= 23 or <= 6, 2 for 10-13, 3 for 18-22, 4 for 14-17,
    and 5 for anything else (bounds are inclusive).
    """
    if data >= 23 or data <= 6:
        return 1
    # (low, high, code) bands, checked in the same order as the original chain.
    for low, high, code in ((10, 13, 2), (18, 22, 3), (14, 17, 4)):
        if data >= low and data <= high:
            return code
    return 5
def base_processing(data):
    """Aggregate raw records into per-station, 10-minute counts with time features.

    ``data`` must provide ``stationID``, ``time`` (``YYYY-MM-DD HH:MM:SS`` text)
    and ``status``. The input frame is not modified.

    Returns a new frame, one row per (stationID, 10-minute slot), with columns:
      * ``stationID``
      * ``inNums``   -- sum of ``status`` in the slot
      * ``outNums``  -- number of records in the slot minus ``inNums``
        (assumes ``status`` is a 0/1 flag -- TODO confirm)
      * ``date``     -- day of month of the slot
      * ``weekday``  -- 0 (Monday) .. 6 (Sunday)
      * ``hourCut``  -- ``get_hour_cut`` applied to the slot's hour
      * ``startTime`` -- slot start in minutes since midnight
    """
    # Floor each timestamp to its 10-minute slot: keep 'YYYY-MM-DD HH:M' and
    # append '0:00'. Done once on the whole column instead of row by row.
    slot = (data['time'].astype(str).str[:15] + '0:00').rename('startTime')

    counts = (
        data.groupby([data['stationID'], slot])['status']
        .agg(['count', 'sum'])
        .reset_index()
    )

    # Parse only the aggregated slot labels, which are far fewer than raw rows.
    start = pd.to_datetime(counts['startTime'], format='%Y-%m-%d %H:%M:%S')
    hour = start.dt.hour

    return pd.DataFrame({
        'stationID': counts['stationID'],
        'inNums': counts['sum'],
        'outNums': counts['count'] - counts['sum'],
        'date': start.dt.day,
        'weekday': start.dt.weekday,
        'hourCut': hour.map(get_hour_cut),
        'startTime': hour * 60 + start.dt.minute,
    })

In [61]:
def base_processing_test(data):
    """Derive the model's time features for a frame that already has 10-minute rows.

    ``data`` must provide ``startTime`` as ``YYYY-MM-DD HH:MM:SS`` text. The
    input frame is not modified (the original aliased it and added columns to
    the caller's frame).

    Returns a new frame with every input column except the original
    ``startTime``, followed by:
      * ``date``     -- day of month
      * ``weekday``  -- 0 (Monday) .. 6 (Sunday)
      * ``hourCut``  -- ``get_hour_cut`` applied to the hour
      * ``startTime`` -- minutes since midnight, with the minute floored to a
        multiple of 10
    """
    start = pd.to_datetime(data['startTime'], format='%Y-%m-%d %H:%M:%S')
    hour = start.dt.hour

    # drop() returns a new frame, so the assignments below never touch `data`.
    result = data.drop(columns=['startTime'])
    result['date'] = start.dt.day
    result['weekday'] = start.dt.weekday
    result['hourCut'] = hour.map(get_hour_cut)
    result['startTime'] = hour * 60 + (start.dt.minute // 10) * 10
    return result

In [62]:
# Replace the raw record frames with their per-station, per-slot aggregates.
# Note: this overwrites `train` / `test_28`, so re-running this cell without
# re-running the loading cell will fail.
train = base_processing(train)
test_28 = base_processing(test_28)
# The 2019-01-29 submission file was loaded as `test`; `test_29` is not defined
# anywhere before this line, so build it from `test`.
test_29 = base_processing_test(test)

In [None]:
# Preview the first rows of the processed submission frame.
test_29.head()

In [65]:
FEATURES = ['stationID', 'date', 'startTime', 'weekday', 'hourCut']

X_train = train[FEATURES]
y_train_1 = train['inNums']
y_train_2 = train['outNums']
X_test_28 = test_28[FEATURES]
y_test_28_1 = test_28['inNums']
y_test_28_2 = test_28['outNums']
X_test_29 = test_29[FEATURES]

# The boosting-round cap is deliberately NOT in `params`: the original set it
# both there (10000000) and as a keyword argument (2000000). LightGBM documents
# that an iteration-count alias found in `params` overrides the argument, so
# the effective cap was 10000000; it is now stated once.
params = {
    'bagging_freq': 10,
    'bagging_fraction': 0.3,
    'boost_from_average': 'false',
    'boost': 'gbdt',
    #'feature_fraction': 0.0405,
    'learning_rate': 0.1,
    'max_depth': -1,
    'metric': 'mae',
    'min_data_in_leaf': 80,
    'num_leaves': 13,
    'num_threads': -1,
    'objective': 'regression_l1',
    'verbosity': 1,
}
NFOLD = 15
MAX_BOOST_ROUND = 10000000
EARLY_STOPPING_ROUNDS = 600
LOG_EVERY = 10000


def run_lgb_cv(X, y, X_holdout, X_submit, lgb_params, n_splits=NFOLD, seed=134):
    """Train one LightGBM model per shuffled K-fold split and average predictions.

    Parameters
    ----------
    X, y : training features and target (rows are selected by position).
    X_holdout, X_submit : feature frames to predict with every fold model.
    lgb_params : parameter dict passed to ``lgb.train``.
    n_splits, seed : K-fold configuration.

    Returns
    -------
    (oof, pred_holdout, pred_submit, last_model)
        ``oof`` holds each training row's prediction from the fold in which it
        was held out; the two ``pred_*`` arrays are the mean over fold models;
        ``last_model`` is the booster trained on the final fold.
    """
    folds = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
    oof = np.zeros(len(X))
    pred_holdout = np.zeros(len(X_holdout))
    pred_submit = np.zeros(len(X_submit))
    model = None
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print(f'fold: {n_fold}')
        # KFold yields positional indices, so labels are selected with .iloc
        # just like the features (plain y[idx] is a label lookup).
        trn_data = lgb.Dataset(X.iloc[trn_idx], y.iloc[trn_idx])
        val_data = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases; newer ones expect callbacks
        # instead -- confirm against the installed version.
        model = lgb.train(
            lgb_params,
            trn_data,
            num_boost_round=MAX_BOOST_ROUND,
            valid_sets=[trn_data, val_data],
            verbose_eval=LOG_EVERY,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        )
        best_iteration = model.best_iteration
        oof[val_idx] = model.predict(X.iloc[val_idx], num_iteration=best_iteration)
        pred_submit += model.predict(X_submit, num_iteration=best_iteration) / n_splits
        pred_holdout += model.predict(X_holdout, num_iteration=best_iteration) / n_splits
    return oof, pred_holdout, pred_submit, model


# Inbound counts (inNums).
val_lgb1, pred_28_1, pred_lgb1, reg_lgb1 = run_lgb_cv(
    X_train, y_train_1, X_test_28, X_test_29, params)
print(f'mae error: {mean_absolute_error(y_test_28_1, pred_28_1)}')

# Outbound counts (outNums).
val_lgb2, pred_28_2, pred_lgb2, reg_lgb2 = run_lgb_cv(
    X_train, y_train_2, X_test_28, X_test_29, params)
print(f'mae error: {mean_absolute_error(y_test_28_2, pred_28_2)}')

fold: 0
Training until validation scores don't improve for 600 rounds.
[10000]	training's l1: 17.7855	valid_1's l1: 18.4451
[20000]	training's l1: 16.6919	valid_1's l1: 17.5794
[30000]	training's l1: 16.0998	valid_1's l1: 17.1156
[40000]	training's l1: 15.7	valid_1's l1: 16.79
[50000]	training's l1: 15.3908	valid_1's l1: 16.5462
[60000]	training's l1: 15.1201	valid_1's l1: 16.3432
Early stopping, best iteration is:
[68377]	training's l1: 14.9415	valid_1's l1: 16.2217
fold: 1
Training until validation scores don't improve for 600 rounds.
[10000]	training's l1: 17.7155	valid_1's l1: 18.8802
[20000]	training's l1: 16.67	valid_1's l1: 18.1087
[30000]	training's l1: 16.0754	valid_1's l1: 17.6917
[40000]	training's l1: 15.6662	valid_1's l1: 17.3725
Early stopping, best iteration is:
[48434]	training's l1: 15.3766	valid_1's l1: 17.1783
fold: 2
Training until validation scores don't improve for 600 rounds.
[10000]	training's l1: 17.6973	valid_1's l1: 18.4418
[20000]	training's l1: 16.6015	vali

Early stopping, best iteration is:
[36135]	training's l1: 18.4048	valid_1's l1: 19.2452
fold: 6
Training until validation scores don't improve for 600 rounds.
[10000]	training's l1: 20.4968	valid_1's l1: 21.9708
[20000]	training's l1: 19.3832	valid_1's l1: 21.01
[30000]	training's l1: 18.7661	valid_1's l1: 20.5457
[40000]	training's l1: 18.3312	valid_1's l1: 20.2477
Early stopping, best iteration is:
[44793]	training's l1: 18.1727	valid_1's l1: 20.1434
fold: 7
Training until validation scores don't improve for 600 rounds.
[10000]	training's l1: 20.1345	valid_1's l1: 21.195
[20000]	training's l1: 19.2395	valid_1's l1: 20.5554
[30000]	training's l1: 18.6795	valid_1's l1: 20.1952
Early stopping, best iteration is:
[37141]	training's l1: 18.3883	valid_1's l1: 19.9632
fold: 8
Training until validation scores don't improve for 600 rounds.
[10000]	training's l1: 20.1996	valid_1's l1: 20.6351
[20000]	training's l1: 19.2325	valid_1's l1: 19.9794
[30000]	training's l1: 18.6968	valid_1's l1: 19.6

In [73]:
# Stand-alone re-run of the outbound (outNums) cross-validation; it repeats the
# second loop of the preceding training cell and overwrites its results.
folds = KFold(n_splits=NFOLD, random_state=134, shuffle=True)
val_lgb2 = np.zeros(len(X_train))
pred_lgb2 = np.zeros(len(X_test_29))
pred_28_2 = np.zeros(len(X_test_28))
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train_2)):
    print(f'fold: {n_fold}')
    # KFold yields positional indices, so labels are selected with .iloc just
    # like the features (plain y[idx] is a label lookup).
    trn_data = lgb.Dataset(X_train.iloc[trn_idx], y_train_2.iloc[trn_idx])
    val_data = lgb.Dataset(X_train.iloc[val_idx], y_train_2.iloc[val_idx])

    reg_lgb2 = lgb.train(params, trn_data, num_boost_round=2000000, valid_sets=[trn_data, val_data], verbose_eval=10000, early_stopping_rounds=600)
    # Out-of-fold predictions: each training row is scored by the model that did not see it.
    val_lgb2[val_idx] = reg_lgb2.predict(X_train.iloc[val_idx], num_iteration=reg_lgb2.best_iteration)
    # Average the fold models' predictions for the submission day and the held-out day.
    pred_lgb2 += reg_lgb2.predict(X_test_29, num_iteration=reg_lgb2.best_iteration) / NFOLD
    pred_28_2 += reg_lgb2.predict(X_test_28, num_iteration=reg_lgb2.best_iteration) / NFOLD
# mean_absolute_error takes (y_true, y_pred); the value is the same either way.
print(f'mae error: {mean_absolute_error(y_test_28_2, pred_28_2)}')

fold: 0
Training until validation scores don't improve for 600 rounds.
[10000]	training's l1: 20.3058	valid_1's l1: 21.2697
[20000]	training's l1: 19.2489	valid_1's l1: 20.4083
[30000]	training's l1: 18.678	valid_1's l1: 20.0461
Early stopping, best iteration is:
[36445]	training's l1: 18.4236	valid_1's l1: 19.8645
fold: 1
Training until validation scores don't improve for 600 rounds.
[10000]	training's l1: 20.1299	valid_1's l1: 21.3901
[20000]	training's l1: 19.1647	valid_1's l1: 20.6523
[30000]	training's l1: 18.6169	valid_1's l1: 20.3131
Early stopping, best iteration is:
[30772]	training's l1: 18.5726	valid_1's l1: 20.2946
fold: 2
Training until validation scores don't improve for 600 rounds.
[10000]	training's l1: 20.219	valid_1's l1: 21.1263
[20000]	training's l1: 19.2206	valid_1's l1: 20.4309
[30000]	training's l1: 18.6628	valid_1's l1: 20.0133
[40000]	training's l1: 18.2616	valid_1's l1: 19.7033
Early stopping, best iteration is:
[41430]	training's l1: 18.2166	valid_1's l1: 19.

In [74]:
# Attach the fold-averaged predictions to the submission frame:
# pred_lgb1 comes from the models fitted on inNums, pred_lgb2 from those fitted on outNums.
test_29['inNums'] = pred_lgb1
test_29['outNums'] = pred_lgb2

In [75]:
# Re-read the submission file to recover the original `startTime` strings:
# during preprocessing that column of `test_29` was replaced by minutes since midnight.
submit_name = os.path.join(TEST_A_PATH, 'testA_submit_2019-01-29.csv')
submit = pd.read_csv(submit_name)
test_29['startTime'] = submit['startTime']

# Create the output directory if needed; to_csv does not create it and would fail otherwise.
os.makedirs(SUBMIT_PATH, exist_ok=True)
test_29[['stationID', 'startTime', 'endTime', 'inNums', 'outNums']].to_csv(
    os.path.join(SUBMIT_PATH, 'lgb.csv'), index=False)

In [71]:
# Scratch check: stack two identical single-column frames and renumber the index.
a = pd.DataFrame({'a': [1, 2, 3]})
b = a.copy()
c = pd.concat((a, b), ignore_index=True)

In [72]:
# Display the concatenated frame to inspect its renumbered index.
c

Unnamed: 0,a
0,1
1,2
2,3
3,1
4,2
5,3
