In [1]:
import time
import warnings
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
import lightgbm as lgb
from lightgbm.plotting import plot_importance
from lightgbm import LGBMRegressor
from scipy import sparse
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
import seaborn as sns

sns.set()
#warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',200)


# In[2]:


# Directory layout, relative to the notebook's working directory.
# os.path.join with a single argument returns that argument unchanged.
BASE_PATH = os.path.join('../input')
RAW_PATH = os.path.join(BASE_PATH, 'RAW_DATA')
# Folder that is listed for the daily training record files.
TRAIN_PATH = os.path.join(RAW_PATH, 'Metro_train')
# Folder holding the testA record file and the submission template.
TEST_A_PATH = os.path.join(RAW_PATH, 'Metro_testA')
# Output folder for submissions (later cells spell the same path as a literal).
SUBMIT_PATH = os.path.join('../submit')

In [1]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes.

    The frame is modified in place and also returned. Non-numeric columns
    (strings, datetimes, ...) are left untouched. For every numeric column:

    * missing values are replaced by ``min - 1`` (integer dtypes cannot hold NA);
    * if every value is finite and whole, the column is cast to the smallest
      unsigned/signed integer type that covers its range;
    * otherwise it is cast to ``float32``.

    Progress and the memory footprint before/after are printed.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame whose columns are downcast.

    Returns
    -------
    pandas.DataFrame
        The same ``props`` object.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    for col in props.columns:
        # Only numeric columns can be downcast; anything else would fail in the
        # finiteness / range checks below.
        if not pd.api.types.is_numeric_dtype(props[col]):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",props[col].dtype)

        # Integer does not support NA, therefore NA is replaced by a sentinel
        # one below the observed minimum. Assign the result back instead of a
        # chained in-place fillna, which may operate on a temporary copy.
        if props[col].isna().any():
            fill_value = props[col].min() - 1
            if pd.notna(fill_value):  # an all-NaN column has no minimum to offset
                props[col] = props[col].fillna(fill_value)

        # Take the range AFTER filling so the sentinel is part of it; otherwise
        # a column with minimum 0 would be made unsigned and the -1 would wrap.
        mx = props[col].max()
        mn = props[col].min()

        # A column is integral only if every single value is finite and whole.
        # (Summing the fractional parts lets e.g. +0.5 and -0.5 cancel out.)
        as_float = props[col].astype(np.float64)
        IsInt = bool(np.isfinite(as_float).all() and (as_float == np.floor(as_float)).all())

        # Make Integer/unsigned Integer datatypes
        if IsInt:
            if mn >= 0:
                if mx < 255:
                    props[col] = props[col].astype(np.uint8)
                elif mx < 65535:
                    props[col] = props[col].astype(np.uint16)
                elif mx < 4294967295:
                    props[col] = props[col].astype(np.uint32)
                else:
                    props[col] = props[col].astype(np.uint64)
            else:
                if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                    props[col] = props[col].astype(np.int8)
                elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                    props[col] = props[col].astype(np.int16)
                elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                    props[col] = props[col].astype(np.int32)
                elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                    props[col] = props[col].astype(np.int64)

        # Make float datatypes 32 bit
        else:
            props[col] = props[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props
def read_data(name, **params):
    """Read the CSV at ``name`` and return it after dtype downcasting.

    Any extra keyword arguments are forwarded to ``pd.read_csv``; the loaded
    frame is then passed through ``reduce_mem_usage``.
    """
    return reduce_mem_usage(pd.read_csv(name, **params))


# ## Read data


# Feature ideas:
# payType-most
# deviceID-most two
# userID-count(this stationID)-( nunique(userID) - nunique(userID)[payType==3])
def get_hour_cut(data):
    """Map an hour value to one of five coarse period codes.

    1 -> hour >= 23 or hour <= 6
    2 -> 10..13
    3 -> 18..22
    4 -> 14..17
    5 -> everything else (e.g. 7..9)
    """
    if data >= 23 or data <= 6:
        return 1
    # (lower bound, upper bound, code), checked in the original order.
    for low, high, code in ((10, 13, 2), (18, 22, 3), (14, 17, 4)):
        if low <= data <= high:
            return code
    return 5
def is_weekend(data):
    """Return 1 when the weekday index is greater than 4, otherwise 0.

    Presumably fed with pandas ``dt.weekday`` values (Monday=0), so 5 and 6
    would be Saturday and Sunday -- confirm against the caller.
    """
    return 0 if data <= 4 else 1
def date_processing(data):
    """Aggregate raw gate records into per-station 10-minute flows plus base features.

    ``data`` must provide the columns ``time``, ``stationID``, ``status``,
    ``deviceID`` and ``payType``. The helper columns ``startTime``, ``day``,
    ``hour``, ``minute`` and ``weekday`` are written into ``data`` itself, so
    the caller's frame is modified.

    Returns a new frame with one row per (stationID, startTime) bucket holding

    * ``inNums``  - sum of ``status`` in the bucket,
    * ``outNums`` - number of records minus that sum
      (``status`` is therefore assumed to be 1 for an entry and 0 for an exit),
    * ``nuni_deviceID_*`` - number of distinct devices per station / hour / slot,
    * ``most_deviceID_*`` / ``most_payType_*`` - most frequent device / pay type,
    * ``hourCut`` - period code from ``get_hour_cut``.
    """
    # Floor every timestamp to its 10-minute bucket: keep "YYYY-mm-dd HH:M"
    # and pad the rest with "0:00".
    bucket = data['time'].astype(str).str[:15] + '0:00'
    data['startTime'] = bucket
    data['day'] = bucket.str[8:10].astype(int)
    data['hour'] = bucket.str[11:13].astype(int)
    data['minute'] = (bucket.str[14:15] + '0').astype(int)  # 0, 10, ..., 50
    data['startTime'] = pd.to_datetime(bucket, format='%Y-%m-%d %H:%M:%S')
    data['weekday'] = data['startTime'].dt.weekday

    bucket_keys = ['stationID', 'startTime', 'day', 'hour', 'minute', 'weekday']
    result = data.groupby(bucket_keys)['status'].agg(['count', 'sum']).reset_index()
    result['inNums'] = result['sum']
    result['outNums'] = result['count'] - result['sum']

    # Number of distinct devices seen at increasingly fine granularity.
    nunique_specs = [
        (['stationID'], 'nuni_deviceID_of_stationID'),
        (['stationID', 'hour'], 'nuni_deviceID_of_stationID_hour'),
        (['stationID', 'hour', 'minute'], 'nuni_deviceID_of_stationID_hour_minute'),
    ]
    for keys, name in nunique_specs:
        tmp = data.groupby(keys)['deviceID'].nunique().reset_index(name=name)
        result = result.merge(tmp, on=keys, how='left')

    def get_top(values, n=1):
        """Return the n-th most frequent value (ties resolved towards the smaller value).

        The previous implementation sorted the raw values and therefore
        returned the n-th largest ID instead of the most frequent one.
        """
        counts = values.value_counts().sort_index()
        # Stable sort on the negated counts: frequency descending, value ascending.
        ranked = (-counts).sort_values(kind='mergesort')
        return ranked.index[min(n, len(ranked)) - 1]

    # Most frequent device / pay type per station, station-hour and station-weekday-hour.
    most_specs = [
        ('deviceID', ['stationID'], 'most_deviceID_of_stationID'),
        ('deviceID', ['stationID', 'hour'], 'most_deviceID_of_stationID&hour'),
        ('deviceID', ['stationID', 'weekday', 'hour'], 'most_deviceID_of_stationID&wh'),
        ('payType', ['stationID'], 'most_payType_of_stationID'),
        ('payType', ['stationID', 'hour'], 'most_payType_of_stationID&hour'),
        ('payType', ['stationID', 'weekday', 'hour'], 'most_payType_of_stationID&wh'),
    ]
    for column, keys, name in most_specs:
        tmp = data.groupby(keys)[column].apply(get_top, n=1).reset_index(name=name)
        result = result.merge(tmp, on=keys, how='left')

    result['hourCut'] = result['hour'].map(get_hour_cut)
    result = result.drop(columns=['count', 'sum'])
    return result
def date_processing_test(data):
    """Derive the calendar features for the submission template.

    ``data`` needs ``startTime``, ``endTime``, ``inNums`` and ``outNums``.
    The columns ``day``, ``weekday``, ``hour``, ``minute`` and ``hourCut`` are
    added to ``data`` itself and ``startTime`` is converted to datetime there;
    the returned frame is a new one without ``endTime``, ``inNums`` and
    ``outNums``.
    """
    def _slice_as_int(start, stop, suffix=''):
        # Parser that reads characters [start:stop] of the text form as an int.
        return lambda stamp: int(str(stamp)[start:stop] + suffix)

    frame = data  # alias, not a copy: the caller's frame receives the new columns
    frame['day'] = frame['startTime'].apply(_slice_as_int(8, 10))
    frame['startTime'] = pd.to_datetime(frame['startTime'], format='%Y-%m-%d %H:%M:%S')
    frame['weekday'] = frame['startTime'].dt.weekday
    frame['hour'] = frame['startTime'].apply(_slice_as_int(11, 13))
    # Tens digit of the minute padded with "0", i.e. the 10-minute slot.
    frame['minute'] = frame['startTime'].apply(_slice_as_int(14, 15, '0'))
    frame['hourCut'] = frame['hour'].map(get_hour_cut)
    return frame.drop(columns=['endTime', 'inNums', 'outNums'])

## Read data
# Every daily record file is reduced to 10-minute aggregates on its own and the
# pieces are concatenated once at the end (concatenating inside the loop copies
# the accumulated frame on every iteration). Files are visited in sorted order
# so the row order does not depend on the directory listing.
daily_frames = []
for file in sorted(os.listdir(TRAIN_PATH)):
    if not file.endswith('.csv'):
        continue  # ignore stray non-data entries in the folder
    print(f'the file: {file}')
    daily_frames.append(date_processing(read_data(os.path.join(TRAIN_PATH, file))))
# The records of 2019-01-28 ship with the test set and are appended as history.
test_name = os.path.join(TEST_A_PATH, 'testA_record_2019-01-28.csv')
test_28 = read_data(test_name)
test_28 = date_processing(test_28)
daily_frames.append(test_28)
data = pd.concat(daily_frames, ignore_index=True)
del daily_frames
# Submission template for 2019-01-29.
test_name = os.path.join(TEST_A_PATH, 'testA_submit_2019-01-29.csv')
test = pd.read_csv(test_name)
test = date_processing_test(test)


the file: record_2019-01-01.csv
Memory usage of properties dataframe is : 135.62892150878906  MB
******************************
Column:  stationID
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  deviceID
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  status
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  payType
dtype before:  int64
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  70.23644256591797  MB
This is  51.78574140720163 % of the initial size
the file: record_2019-01-02.csv
Memory usage of properties dataframe is : 126.91683959960938  MB
******************************
Column:  stationID
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  deviceID
dtype before:  int64
dtype

the file: record_2019-01-13.csv
Memory usage of properties dataframe is : 110.20756530761719  MB
******************************
Column:  stationID
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  deviceID
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  status
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  payType
dtype before:  int64
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  57.07181167602539  MB
This is  51.78574766326026 % of the initial size
the file: record_2019-01-14.csv
Memory usage of properties dataframe is : 128.0226058959961  MB
******************************
Column:  stationID
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  deviceID
dtype before:  int64
dtype 

the file: record_2019-01-25.csv
Memory usage of properties dataframe is : 137.58303833007812  MB
******************************
Column:  stationID
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  deviceID
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  status
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  payType
dtype before:  int64
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  71.2483959197998  MB
This is  51.785741021990226 % of the initial size
Memory usage of properties dataframe is : 128.43527221679688  MB
******************************
Column:  stationID
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  deviceID
dtype before:  int64
dtype after:  uint16
****************

In [33]:
# Replace `data` with a cached feature table read from disk.
# NOTE(review): no cell visible here writes this file, and the preview below shows
# columns (e.g. nuni_deviceID_of_stationID_wh) that date_processing does not
# create -- confirm how the cache was produced.
data = pd.read_csv('../input/after_base_features.csv')

In [12]:
# Peek at the first rows whose `day` equals 21.
data[data['day']==21].head()

Unnamed: 0,stationID,startTime,day,hour,minute,weekday,inNums,outNums,nuni_deviceID_of_stationID,nuni_deviceID_of_stationID_hour,nuni_deviceID_of_stationID_hour_minute,nuni_deviceID_of_stationID_wh,nuni_deviceID_of_stationID_whm,most_deviceID_of_stationID,most_deviceID_of_stationID&hour,most_deviceID_of_stationID&wh,most_payType_of_stationID,most_payType_of_stationID&hour,most_payType_of_stationID&wh
2917,0,2019-01-29 00:00:00,21,0,0,1,0.0,0.0,18,12,7,3,3,17.0,17.0,15.0,3.0,3.0,3.0
2918,0,2019-01-29 00:10:00,21,0,10,1,0.0,0.0,18,12,6,3,1,17.0,17.0,15.0,3.0,3.0,3.0
2919,0,2019-01-29 00:20:00,21,0,20,1,0.0,0.0,18,12,6,3,1,17.0,17.0,15.0,3.0,3.0,3.0
2920,0,2019-01-29 00:30:00,21,0,30,1,0.0,0.0,18,12,5,3,1,17.0,17.0,15.0,3.0,3.0,3.0
2921,0,2019-01-29 00:40:00,21,0,40,1,0.0,0.0,18,12,5,3,1,17.0,17.0,15.0,3.0,3.0,3.0


In [34]:
## Feature engineering
# Drop the weekend days; the remaining days are renumbered to a consecutive
# sequence afterwards.
data = data[~data.day.isin([5, 6, 12, 13, 19, 20, 26, 27])]

def fix_day(d):
    """Renumber a day of the month so the kept days form a consecutive sequence.

    1-4 stay unchanged, 7-11 -> 5-9, 14-18 -> 10-14, 21-25 -> 15-19 and
    28-29 -> 20-21. Any other value yields ``None``.
    """
    if d in (1, 2, 3, 4):
        return d
    # (days of one block, number of skipped days before that block)
    shifted_blocks = (
        ((7, 8, 9, 10, 11), 2),
        ((14, 15, 16, 17, 18), 4),
        ((21, 22, 23, 24, 25), 6),
        ((28, 29), 8),
    )
    for days, offset in shifted_blocks:
        if d in days:
            return d - offset
    return None
# Renumber the remaining days to a consecutive sequence.
# NOTE: not idempotent -- applying fix_day to already renumbered values maps
# them again (e.g. 21 -> 15), so run this only once per freshly loaded `data`.
data['day'] = data['day'].apply(fix_day)

In [35]:
# Zero the targets of day 21 (2019-01-29 after renumbering: fix_day(29) == 21),
# the day the training cell below uses as the online prediction day.
data.loc[data['day']==21,['inNums','outNums']] = 0

In [36]:
def _merge_group_stat(df, df_feature, fe, value, stat, name="", stat_args=()):
    """Group ``df_feature`` by ``fe``, aggregate ``value`` with ``stat`` and left-merge onto ``df``.

    Parameters
    ----------
    df : DataFrame receiving the new column.
    df_feature : DataFrame the statistic is computed from.
    fe : column name or sequence of column names to group and merge on.
    value : column of ``df_feature`` that is aggregated.
    stat : name of the groupby method to call (``"mean"``, ``"std"``, ...).
    name : name of the new column; defaults to ``<value>_<keys joined by _>_<stat>``.
    stat_args : positional arguments for the groupby method (e.g. the quantile).

    Returns
    -------
    DataFrame
        ``df`` with the new column appended. Note that ``fillna(0)`` is applied
        to the whole merged frame, so missing values in every column become 0.
    """
    # Accept a single column name or any sequence, not only a list.
    keys = [fe] if isinstance(fe, str) else list(fe)
    stat_frame = getattr(df_feature.groupby(keys)[value], stat)(*stat_args).reset_index()
    stat_frame.columns = keys + [name or value + "_%s_%s" % ("_".join(keys), stat)]
    return df.merge(stat_frame, on=keys, how="left").fillna(0)


def feat_count(df, df_feature, fe,value,name=""):
    """Merge the per-group count of ``value`` onto ``df``."""
    return _merge_group_stat(df, df_feature, fe, value, "count", name)


def feat_nunique(df, df_feature, fe,value,name=""):
    """Merge the per-group number of distinct ``value`` entries onto ``df``."""
    return _merge_group_stat(df, df_feature, fe, value, "nunique", name)


def feat_mean(df, df_feature, fe,value,name=""):
    """Merge the per-group mean of ``value`` onto ``df``."""
    return _merge_group_stat(df, df_feature, fe, value, "mean", name)


def feat_std(df, df_feature, fe,value,name=""):
    """Merge the per-group standard deviation of ``value`` onto ``df``."""
    return _merge_group_stat(df, df_feature, fe, value, "std", name)


def feat_median(df, df_feature, fe,value,name=""):
    """Merge the per-group median of ``value`` onto ``df``."""
    return _merge_group_stat(df, df_feature, fe, value, "median", name)


def feat_max(df, df_feature, fe,value,name=""):
    """Merge the per-group maximum of ``value`` onto ``df``."""
    return _merge_group_stat(df, df_feature, fe, value, "max", name)


def feat_min(df, df_feature, fe,value,name=""):
    """Merge the per-group minimum of ``value`` onto ``df``."""
    return _merge_group_stat(df, df_feature, fe, value, "min", name)


def feat_sum(df, df_feature, fe,value,name=""):
    """Merge the per-group sum of ``value`` onto ``df``."""
    return _merge_group_stat(df, df_feature, fe, value, "sum", name)


def feat_var(df, df_feature, fe,value,name=""):
    """Merge the per-group variance of ``value`` onto ``df``."""
    return _merge_group_stat(df, df_feature, fe, value, "var", name)


def feat_quantile(df, df_feature, fe,value,n,name=""):
    """Merge the per-group ``n``-quantile of ``value`` onto ``df``."""
    return _merge_group_stat(df, df_feature, fe, value, "quantile", name, stat_args=(n,))
#['stationID', 'startTime', 'day', 'hour', 'minute', 'weekday', 'inNums', 'outNums',  'day_gap']
def create_features(df_label, df_train):
    """Attach look-back statistics of inNums/outNums to the label rows.

    ``df_train`` must carry a ``day_gap`` column. For each look-back length
    ``i`` in 1, 3, 5, 10, 15 the rows with ``day_gap >= -i`` are selected (there
    is no upper bound, so the caller decides which rows count as history) and
    statistics are merged onto ``df_label`` under names of the form
    ``<target>_<stat>_<group tag>_<i>``. The loop stops at the first ``i`` that
    reaches further back than the smallest ``day_gap`` available.

    * ``i == 1``: mean only, grouped by station, station+hour and
      station+hour+minute (this is where the previous day's data comes in).
    * ``i > 1``: mean, std, median, max, min and var, grouped by station,
      station+weekday+hour and station+weekday+hour+minute.
    """
    targets = ("inNums", "outNums")
    last_day_groups = (
        (["stationID"], "stationID"),
        (["stationID", 'hour'], "s_h"),
        (["stationID", 'hour', 'minute'], "s_h_m"),
    )
    window_groups = (
        (["stationID"], "stationID"),
        (["stationID", 'weekday', 'hour'], "s_w_h"),
        (["stationID", 'weekday', 'hour', 'minute'], "s_w_hm"),
    )
    all_stats = (
        ("mean", feat_mean),
        ("std", feat_std),
        ("median", feat_median),
        ("max", feat_max),
        ("min", feat_min),
        ("var", feat_var),
    )

    for i in [1,3,5,10,15]:
        if df_train.day_gap.min() > -i:
            break
        df_select = df_train[df_train.day_gap >= -i].copy()
        if i == 1:
            groups, stats = last_day_groups, all_stats[:1]
        else:
            groups, stats = window_groups, all_stats
        # Nesting order (group -> target -> statistic) fixes the column order.
        for keys, tag in groups:
            for target in targets:
                for stat, feat_fn in stats:
                    column = "%s_%s_%s_%s" % (target, stat, tag, i)
                    df_label = feat_fn(df_label, df_select, keys, target, column)
    return df_label


In [10]:
# NOTE(review): `train` is only assigned inside the sliding-window training cell
# further down, so on a fresh kernel this cell raises the NameError shown below.
train.shape

NameError: name 'train' is not defined

In [11]:
# (rows, columns) of the feature table currently bound to `data`.
data.shape

(192586, 19)

In [None]:
df_train = data  # alias: the "day_gap" column written below also lands in `data`
# Compute id & count features over sliding windows.
# (original note: group by entry/exit station and look at the statistics of the
#  entry/exit times between two stations)

folds = 10
seed = 2019


def stacking(reg, train_data, test_data, reg_name, inOrout, params, kf=None):
    """K-fold train a LightGBM-style regressor; return out-of-fold and test predictions.

    Parameters
    ----------
    reg : module exposing ``Dataset`` and ``train`` (``lightgbm`` here).
    train_data, test_data : frames with the feature columns; ``train_data`` also
        holds the targets ``inNums`` and ``outNums``.
    reg_name : label used in the printed score lines.
    inOrout : target column, ``'inNums'`` or ``'outNums'``.
    params : parameter dict passed to ``reg.train``.
    kf : splitter with ``split``/``get_n_splits``; a shuffled ``KFold`` with the
        notebook's ``folds``/``seed`` is used when omitted.

    Returns
    -------
    (train_pre, test_pre, mean_cv_score)
        Out-of-fold predictions and fold-averaged test predictions, both shaped
        ``(n, 1)``, and the mean validation MAE.
    """
    if kf is None:
        kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    n_splits = kf.get_n_splits()

    train_pre = np.zeros(train_data.shape[0])
    test_pre = np.zeros(test_data.shape[0])
    cv_score = []

    all_cols = [col for col in train_data.columns if col not in ['inNums', 'outNums']]
    train_x = train_data[all_cols].values
    # Read the target from the frame that was passed in, not from a global.
    train_y = train_data[inOrout].values
    test_x = test_data[all_cols].values

    num_round = 200000
    early_stopping_rounds = 500
    for trn_index, val_index in kf.split(train_x):
        trn_x, trn_y = train_x[trn_index], train_y[trn_index]
        val_x, val_y = train_x[val_index], train_y[val_index]

        trn_matrix = reg.Dataset(trn_x, label=trn_y)
        val_matrix = reg.Dataset(val_x, label=val_y)
        # NOTE: `early_stopping_rounds` / `verbose_eval` are keyword arguments of
        # older LightGBM releases; newer ones expect callbacks instead.
        model = reg.train(params, trn_matrix, num_round, valid_sets=[trn_matrix, val_matrix],
                          early_stopping_rounds=early_stopping_rounds, verbose_eval=500
                          )
        pre = model.predict(val_x, num_iteration=model.best_iteration)
        train_pre[val_index] = pre
        # Average the test predictions over the folds actually used.
        test_pre += model.predict(test_x, num_iteration=model.best_iteration) / n_splits
        cv_score.append(mean_absolute_error(val_y, pre))

    print("%s_score_list:"%reg_name,cv_score)
    print("%s_score_mean:"%reg_name,np.mean(cv_score))

    return train_pre.reshape(-1,1), test_pre.reshape(-1,1), np.mean(cv_score)


def lgb_reg(train, test, kf=None):
    """Run ``stacking`` with LightGBM once for ``inNums`` and once for ``outNums``."""
    params = {
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'mae',
        'num_leaves': 63,
        'learning_rate': 0.1,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_seed':0,
        'bagging_freq': 1,
        'reg_alpha':1,
        'reg_lambda':2,
        'verbose':500,
        'num_threads': 4
    }
    lgb_train_in, lgb_test_in, cv_scores_in = stacking(lgb,train,test,"lgb", 'inNums', params, kf)
    lgb_train_out, lgb_test_out, cv_scores_out = stacking(lgb,train,test,"lgb", 'outNums', params, kf)
    return lgb_train_in, lgb_test_in, cv_scores_in, lgb_train_out, lgb_test_out, cv_scores_out


label_cols = ['stationID','startTime','weekday','hour','minute','inNums','outNums']
for slip in [2,3,5,7]:
    print(f'the slip is: {slip}')
    t_end = 21
    nday = slip

    # Build the training set: one block of `nday` label days per window, with
    # features computed from the days strictly before the block.
    all_data = []
    for i in range(nday*1, nday*(19//nday+1),nday):
        t_begin = t_end-i
        print(t_begin)
        df_train["day_gap"]=df_train["day"].apply(lambda x:int(x-t_begin))
        df_feature=df_train[df_train.day_gap<0].copy()
        df_label=df_train[(df_train.day_gap>=0)&(df_train.day_gap<nday)][label_cols].copy()
        train_data_tmp=create_features(df_label,df_feature)
        all_data.append(train_data_tmp)
    train=pd.concat(all_data)

    # Build the online test set (day 21).
    t_begin=21
    print(t_begin)
    df_train["day_gap"]=df_train["day"].apply(lambda x:int(x-t_begin))
    df_label=df_train.loc[df_train['day']==21, label_cols].copy()
    # Use only strictly earlier days as history, exactly as for the training
    # windows. Passing the full frame would mix the zeroed placeholder targets
    # of day 21 into every statistic (e.g. halving the 1-day means).
    df_feature=df_train[df_train.day_gap<0].copy()
    test=create_features(df_label,df_feature)

    #save features data for stacking
    #train.to_csv("../stacking/train.csv",index=None)
    #test.to_csv("../stacking/test.csv",index=None)

    # Build the model inputs (stationID is dropped; keeping it is an open idea).
    train_data = train.drop(columns=['stationID','startTime'])
    test_data = test.drop(columns=['stationID','startTime'])

    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    lgb_train_in, lgb_test_in, cv_scores_in, lgb_train_out, lgb_test_out, cv_scores_out=lgb_reg(train_data,test_data,kf)

    # Offline score from the out-of-fold predictions.
    train["inNums_pre"]=np.clip(lgb_train_in,0,5000).ravel()
    train["outNums_pre"]=np.clip(lgb_train_out,0,5000).ravel()
    score_result=mean_absolute_error(train["inNums_pre"], train["inNums"]) + mean_absolute_error(train["outNums_pre"], train["outNums"])
    print(f'slip {slip}: the total mae score is {score_result/2}')

    # Write the submission for this window length.
    # NOTE(review): predictions are assigned by position, which assumes the
    # day-21 rows of `data` are in the same order as the template -- confirm.
    test_name = os.path.join(TEST_A_PATH, 'testA_submit_2019-01-29.csv')
    submit = pd.read_csv(test_name)

    #lgb_test_in = np.round(lgb_test_in)
    #lgb_test_out = np.round(lgb_test_out)
    submit['inNums'] = np.clip(lgb_test_in,0,5000).ravel()
    submit['outNums'] = np.clip(lgb_test_out,0,5000).ravel()
    submit.to_csv(f'../submit/{slip}slips_{folds}folds.csv', index=False)
    
# Average the per-window submissions written by the training loop.
test_name = os.path.join(TEST_A_PATH, 'testA_submit_2019-01-29.csv')
submit = pd.read_csv(test_name)
# Only the "<slip>slips_<folds>folds.csv" model outputs take part. Listing the
# whole folder would also pull in earlier "submit_*.csv" averages (and any
# non-CSV file) while the divisor counted a different set of files.
files = sorted(file for file in os.listdir('../submit') if file.endswith('folds.csv'))
if not files:
    raise FileNotFoundError("no '*folds.csv' submissions found in ../submit")
submit['inNums'] = 0.0
submit['outNums'] = 0.0
for file in files:
    print(f'the file: {file}')
    temp = pd.read_csv(os.path.join('../submit', file))
    submit['inNums'] += temp['inNums']
    submit['outNums'] += temp['outNums']
len_files = len(files)
submit['inNums'] = submit['inNums'] / len_files
submit['outNums'] = submit['outNums'] / len_files
submit.to_csv(f'../submit/submit_{datetime.strftime(datetime.now(), "%Y_%m_%d_%H_%M")}.csv', index=False)

the slip is: 2
19
17
15
13
11
9
7
5
3
21
Training until validation scores don't improve for 500 rounds.
[500]	training's l1: 11.4042	valid_1's l1: 14.0314
[1000]	training's l1: 9.79313	valid_1's l1: 13.7989
[1500]	training's l1: 8.67525	valid_1's l1: 13.713
[2000]	training's l1: 7.80314	valid_1's l1: 13.6752
[2500]	training's l1: 7.0871	valid_1's l1: 13.6671
Early stopping, best iteration is:
[2388]	training's l1: 7.23886	valid_1's l1: 13.6611
Training until validation scores don't improve for 500 rounds.
[500]	training's l1: 11.4225	valid_1's l1: 13.8004
[1000]	training's l1: 9.83096	valid_1's l1: 13.6253


In [None]:
# slip -> score noted from the runs (presumably the offline MAE printed per slip; TODO confirm)
# 1 15.1
# 2 14.2
# 3 14.1
# 5 14.0
# 7 14.9
# 10 15.7

In [2]:
# Average the per-window submissions (stand-alone copy of the step at the end
# of the training cell).
test_name = os.path.join(TEST_A_PATH, 'testA_submit_2019-01-29.csv')
submit = pd.read_csv(test_name)
# Only the "<slip>slips_<folds>folds.csv" model outputs take part. Listing the
# whole folder would also pull in earlier "submit_*.csv" averages (and any
# non-CSV file) while the divisor counted a different set of files.
files = sorted(file for file in os.listdir('../submit') if file.endswith('folds.csv'))
if not files:
    raise FileNotFoundError("no '*folds.csv' submissions found in ../submit")
submit['inNums'] = 0.0
submit['outNums'] = 0.0
for file in files:
    print(f'the file: {file}')
    temp = pd.read_csv(os.path.join('../submit', file))
    submit['inNums'] += temp['inNums']
    submit['outNums'] += temp['outNums']
len_files = len(files)
submit['inNums'] = submit['inNums'] / len_files
submit['outNums'] = submit['outNums'] / len_files
submit.to_csv(f'../submit/submit_{datetime.strftime(datetime.now(), "%Y_%m_%d_%H_%M")}.csv', index=False)

the file: 2slips_10folds.csv
the file: 3slips_10folds.csv
the file: 5slips_10folds.csv
the file: 7slips_10folds.csv


In [38]:
## Handle the night-time data separately
# Reload the cached feature table; this replaces the `data` filtered and
# renumbered by the earlier cells.
data = pd.read_csv('../input/after_base_features.csv')
# Minutes since midnight of each 10-minute slot.
data['hour_minutes'] = data['hour']*60+data['minute']
# Entry window: slots from 23:40 (1420) onwards or up to 05:20 (320).
data_in = data.loc[(data['hour_minutes']>=1420) | (data['hour_minutes']<=320), ['stationID','day','hour_minutes','inNums','outNums']]
# Exit window: slots up to 05:50 (350).
data_out = data.loc[data['hour_minutes']<=350, ['stationID','day','hour_minutes','inNums','outNums']]
# Keep only slots with a non-zero flow, so the statistics computed from these
# frames are conditional on at least one passenger being recorded.
data_in = data_in[data_in['inNums'] != 0]
data_out = data_out[data_out['outNums'] != 0]

def special_time(df_data, df_label, inOrout):
    """Left-merge per-(stationID, hour_minutes) history statistics onto ``df_label``.

    Four columns are appended, each suffixed with ``inOrout``:
    ``count_inNum_days_*`` and ``count_outNum_days_*`` (both are the number of
    ``day`` entries of the group in ``df_data``, hence identical), followed by
    ``mean_inNums_*`` and ``mean_outNums_*``. Groups missing from ``df_data``
    yield NaN.
    """
    keys = ['stationID', 'hour_minutes']
    grouped = df_data.groupby(keys)
    # (statistic, output column) in the order the columns are appended.
    features = (
        (grouped['day'].count(), f'count_inNum_days_{inOrout}'),
        (grouped['day'].count(), f'count_outNum_days_{inOrout}'),
        (grouped['inNums'].mean(), f'mean_inNums_{inOrout}'),
        (grouped['outNums'].mean(), f'mean_outNums_{inOrout}'),
    )
    for stat, column in features:
        df_label = df_label.merge(stat.reset_index(name=column), on=keys, how='left')
    return df_label

In [42]:
# NOTE(review): executed as In [42], i.e. after the merge cells In [39]-[41] that
# appear further down -- the suffixed columns in the output (hour_x, hour_y, ...)
# come from those merges.
submit.head()

Unnamed: 0,stationID,startTime,endTime,inNums,outNums,hour_x,minute_x,hour_minutes,hour_y,minute_y,count_inNum_days_in,count_outNum_days_in,mean_inNums_in,mean_outNums_in,hour,minute,count_inNum_days_out,count_outNum_days_out,mean_inNums_out,mean_outNums_out
0,0,2019-01-29 00:00:00,2019-01-29 00:10:00,0.724791,1.574449,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0,0.0,2.142857
1,0,2019-01-29 00:10:00,2019-01-29 00:20:00,0.799236,1.370402,0,10,10,0.0,10.0,1.0,1.0,2.0,0.0,0.0,10.0,4.0,4.0,0.0,1.75
2,0,2019-01-29 00:20:00,2019-01-29 00:30:00,1.21004,1.30162,0,20,20,0.0,20.0,1.0,1.0,1.0,2.0,0.0,20.0,4.0,4.0,0.25,2.0
3,0,2019-01-29 00:30:00,2019-01-29 00:40:00,1.606611,1.23219,0,30,30,0.0,30.0,3.0,3.0,1.333333,0.333333,0.0,30.0,2.0,2.0,0.5,1.0
4,0,2019-01-29 00:40:00,2019-01-29 00:50:00,1.827594,1.220876,0,40,40,0.0,40.0,1.0,1.0,1.0,1.0,0.0,40.0,4.0,4.0,0.25,1.25


In [39]:
# Load one specific, previously written averaged submission (hard-coded timestamp).
submit = pd.read_csv('../submit/submit_2019_03_28_09_10.csv')
# Hour and 10-minute slot are sliced out of the startTime text.
submit['hour'] = submit['startTime'].apply(lambda x: int(str(x)[11:13]))
submit['minute'] = submit['startTime'].apply(lambda x: int(str(x)[14:15]+'0'))# hour + 10-minute slot; the 10-minute part can be dropped in the end
submit['hour_minutes'] = submit['hour']*60+submit['minute']

# History statistics for the rows inside the entry window and the exit window.
submit_in = special_time(data_in, submit[(submit['hour_minutes']>=1420) | (submit['hour_minutes']<=320)], 'in')
submit_out = special_time(data_out, submit[submit['hour_minutes']<=350], 'out')

In [40]:
# Slots without any matching history row get 0 for every statistic.
submit_in = submit_in.fillna(0)
submit_out = submit_out.fillna(0)

In [41]:
# Attach the night statistics to the full submission. The join keys include the
# float prediction columns, and the non-key `hour`/`minute` columns present on
# both sides come back suffixed (hour_x, hour_y, ...). Rows outside the night
# windows have no match and keep NaN in the statistic columns.
submit = submit.merge(submit_in, on=['stationID','hour_minutes','startTime','endTime','inNums','outNums'], how='left')
submit = submit.merge(submit_out, on=['stationID','hour_minutes','startTime','endTime','inNums','outNums'], how='left')

In [43]:
# Inside the night windows, replace the model predictions by the historical
# means (0 where no history existed, because of the fillna above).
submit.loc[(submit['hour_minutes']>=1420) | (submit['hour_minutes']<=320), 'inNums'] = submit.loc[(submit['hour_minutes']>=1420)| (submit['hour_minutes']<=320), 'mean_inNums_in']
submit.loc[submit['hour_minutes']<=350, 'outNums'] = submit.loc[submit['hour_minutes']<=350, 'mean_outNums_out']

In [44]:
# Write only the five submission columns to a new timestamped file.
submit[['stationID', 'startTime', 'endTime', 'inNums', 'outNums']].to_csv(f'../submit/submit_{datetime.strftime(datetime.now(), "%Y_%m_%d_%H_%M")}.csv', index=False)

In [48]:
# Re-read the earlier averaged submission; this rebinds `submit` and drops the
# merged frame built above from the namespace.
submit = pd.read_csv('../submit/submit_2019_03_28_09_10.csv')

In [49]:
# (rows, columns) of the reloaded submission.
submit.shape

(11664, 5)