In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file below /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/optiver-realized-volatility-prediction/sample_submission.csv
/kaggle/input/optiver-realized-volatility-prediction/train.csv
/kaggle/input/optiver-realized-volatility-prediction/test.csv
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=97/888f813404d8417ca8d6b8aebd5f2951.parquet
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=43/bb0efa57f511470e817880842e3e2afa.parquet
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=21/1d8dc18ebfee47ffbb54b04e6afc0634.parquet
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=72/60f62a03d8854605901dda072c84db39.parquet
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=4/761268d671f9429abb29d9d2895e9bd2.parquet
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=112/cd283097a5b54293ba400a19e811a7f9.parquet
/kaggle/input/optiver-realized-volatility-pr

In [2]:
# Root folder of the competition data. File names are appended with plain
# string concatenation below, so the trailing slash is required.
data_dir = '../input/optiver-realized-volatility-prediction/'

In [3]:
#Preprocessing Functions
def calc_wap(df):
    """Weighted average price computed from the level-1 quotes.

    Each price is weighted by the size resting on the opposite side of the
    book (bid price by ask size, ask price by bid size) and the sum is
    divided by the total level-1 size.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_wap2(df):
    """Weighted average price computed from the level-2 quotes.

    Same formula as the level-1 version: each price is weighted by the size
    on the opposite side and the sum is divided by the total level-2 size.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_wap3(df):
    """Mean of the level-1 and level-2 same-side size-weighted prices.

    Unlike ``calc_wap``/``calc_wap2``, each price is weighted by the size on
    its *own* side (bid price by bid size, ask price by ask size). The two
    per-level averages are then averaged with equal weight.
    """
    bid_sz1, ask_sz1 = df['bid_size1'], df['ask_size1']
    bid_sz2, ask_sz2 = df['bid_size2'], df['ask_size2']

    level1 = (df['bid_price1'] * bid_sz1 + df['ask_price1'] * ask_sz1) / (bid_sz1 + ask_sz1)
    level2 = (df['bid_price2'] * bid_sz2 + df['ask_price2'] * ask_sz2) / (bid_sz2 + ask_sz2)

    return (level1 + level2) / 2

def calc_wap4(df):
    """Same-side size-weighted price pooled over both book levels.

    Sums price * own-side size over level 1 and level 2, then divides by the
    total size of all four quotes.
    """
    level1_notional = df['bid_price1'] * df['bid_size1'] + df['ask_price1'] * df['ask_size1']
    level2_notional = df['bid_price2'] * df['bid_size2'] + df['ask_price2'] * df['ask_size2']
    total_size = df['bid_size1'] + df['ask_size1'] + df['bid_size2'] + df['ask_size2']
    return (level1_notional + level2_notional) / total_size

def log_return(list_stock_prices):
    """Difference of consecutive log prices.

    The input must support ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series):
    """Square root of the sum of squared values of ``series``."""
    squared = series ** 2
    return np.sqrt(np.sum(squared))

def count_unique(series):
    """Number of distinct values in ``series``."""
    distinct_values = np.unique(series)
    return distinct_values.size

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

In [4]:
# Load the order-book data of a single stock (stock_id=6) and preview its first rows
book_train = pd.read_parquet(data_dir + "book_train.parquet/stock_id=6")
book_train.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
0,5,0,1.00081,1.002485,1.000783,1.002512,100,110,1,1
1,5,1,1.00081,1.002485,1.000783,1.002512,4,110,1,1
2,5,5,1.001863,1.002944,1.001647,1.002971,12,1,34,1
3,5,6,1.000864,1.002944,1.000837,1.002971,100,1,70,1
4,5,7,1.00108,1.002944,1.001053,1.002971,100,1,70,1


In [5]:
#Preprocessing functions for book data

def preprocessor_book(file_path):
    """Build one row of order-book features per ``time_id`` for a single stock.

    Reads the parquet data at ``file_path``, derives weighted average prices,
    their log returns within each ``time_id``, spreads and volume measures,
    and aggregates them per ``time_id`` twice: over all rows, and over the
    rows with ``seconds_in_bucket >= 300`` (those columns carry a ``_300``
    suffix).

    Parameters
    ----------
    file_path : str
        Path of the form ``.../stock_id=<id>``. The text following the first
        ``=`` is used as the stock part of ``row_id``.

    Returns
    -------
    pandas.DataFrame
        The aggregated features plus a ``row_id`` column formatted as
        ``"<stock_id>-<time_id>"``. Windowed features are NaN for a
        ``time_id`` that has no rows inside the window (left merge).
    """
    df = pd.read_parquet(file_path)

    # Weighted average prices and their log returns inside each time_id.
    # ``transform`` returns values aligned to df's own index on every pandas
    # version; ``groupby(...).apply`` prepends the group key to the index on
    # pandas >= 2.0, which makes the column assignment fail.
    df['wap'] = calc_wap(df)
    df['log_return'] = df.groupby('time_id')['wap'].transform(log_return)

    df['wap2'] = calc_wap2(df)
    df['log_return2'] = df.groupby('time_id')['wap2'].transform(log_return)

    df['wap3'] = calc_wap3(df)
    df['log_return3'] = df.groupby('time_id')['wap3'].transform(log_return)

    df['wap_balance'] = (df['wap'] - df['wap2']).abs()

    # Level-1 spread relative to the mid price, then level-1 vs level-2 gaps.
    mid_price = (df['ask_price1'] + df['bid_price1']) / 2
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / mid_price
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']

    ask_volume = df['ask_size1'] + df['ask_size2']
    bid_volume = df['bid_size1'] + df['bid_size2']
    df['total_volume'] = ask_volume + bid_volume
    df['volume_imbalance'] = (ask_volume - bid_volume).abs()

    # Aggregations applied per time_id. The string 'mean' yields the same
    # '<column>_mean' names as passing np.mean, without the pandas
    # FutureWarning about numpy callables in agg.
    create_feature_dict = {
        'log_return': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'wap_balance': ['mean'],
        'price_spread': ['mean'],
        'bid_spread': ['mean'],
        'ask_spread': ['mean'],
        'volume_imbalance': ['mean'],
        'total_volume': ['mean'],
        'wap': ['mean'],
    }

    def aggregate_by_time_id(frame):
        # One row per time_id; the (column, function) header is flattened
        # with '_', which also turns 'time_id' into 'time_id_'.
        aggregated = frame.groupby(['time_id']).agg(create_feature_dict).reset_index()
        aggregated.columns = ['_'.join(col) for col in aggregated.columns]
        return aggregated

    # Features over every second of the bucket.
    df_feature = aggregate_by_time_id(df)

    # Features over the last XX seconds of the bucket.
    last_seconds = [300]

    for second in last_seconds:
        # First second of the window; the original code assumes 600-second
        # buckets. This start value (not the window length) is the suffix.
        start_second = 600 - second

        df_feature_sec = aggregate_by_time_id(df.query(f'seconds_in_bucket >= {start_second}'))
        df_feature_sec = df_feature_sec.add_suffix('_' + str(start_second))

        df_feature = pd.merge(
            df_feature,
            df_feature_sec,
            how='left',
            left_on='time_id_',
            right_on=f'time_id__{start_second}',
        )
        df_feature = df_feature.drop([f'time_id__{start_second}'], axis=1)

    # Replace time_id with the "<stock_id>-<time_id>" key.
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(['time_id_'], axis=1)

    return df_feature

In [6]:
# Run the book preprocessor on a single stock (stock_id=100) to inspect its output
file_path = data_dir + "book_train.parquet/stock_id=100"
preprocessor_book(file_path)

Unnamed: 0,log_return_realized_volatility,log_return2_realized_volatility,log_return3_realized_volatility,wap_balance_mean,price_spread_mean,bid_spread_mean,ask_spread_mean,volume_imbalance_mean,total_volume_mean,wap_mean,...,log_return2_realized_volatility_300,log_return3_realized_volatility_300,wap_balance_mean_300,price_spread_mean_300,bid_spread_mean_300,ask_spread_mean_300,volume_imbalance_mean_300,total_volume_mean_300,wap_mean_300,row_id
0,0.004813,0.007202,0.004943,0.000318,0.000606,0.000156,-0.000142,172.591422,369.408578,1.001530,...,0.004852,0.003294,0.000300,0.000578,0.000138,-0.000123,169.042918,381.274678,1.002192,100-5
1,0.002436,0.002899,0.002085,0.000204,0.000451,0.000126,-0.000128,328.706122,638.902041,1.000982,...,0.001435,0.001349,0.000165,0.000414,0.000147,-0.000102,459.238938,818.955752,1.001129,100-11
2,0.004001,0.005286,0.004031,0.000179,0.000380,0.000156,-0.000154,354.151982,776.865639,1.000929,...,0.003914,0.002533,0.000196,0.000375,0.000163,-0.000147,450.362500,874.470833,1.000571,100-16
3,0.002743,0.003732,0.002721,0.000227,0.000464,0.000143,-0.000130,127.825737,370.619303,1.000461,...,0.002635,0.002029,0.000244,0.000485,0.000136,-0.000155,151.966851,431.359116,1.000937,100-31
4,0.002714,0.004039,0.002643,0.000315,0.000501,0.000186,-0.000169,136.755187,292.780083,1.000437,...,0.002917,0.002157,0.000344,0.000507,0.000187,-0.000179,161.162602,347.455285,1.000120,100-62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3824,0.003507,0.005707,0.003609,0.000273,0.000462,0.000123,-0.000142,128.347548,310.671642,1.000331,...,0.004203,0.002473,0.000270,0.000454,0.000126,-0.000131,133.489796,317.457143,1.000641,100-32751
3825,0.002800,0.003495,0.002447,0.000222,0.000385,0.000108,-0.000150,164.515901,406.339223,0.998695,...,0.001964,0.001443,0.000173,0.000343,0.000119,-0.000104,173.540323,364.911290,0.999076,100-32753
3826,0.003194,0.004489,0.003106,0.000269,0.000671,0.000177,-0.000193,220.187500,517.845833,1.000780,...,0.002675,0.002037,0.000228,0.000579,0.000173,-0.000208,203.470588,439.588235,1.000889,100-32758
3827,0.004297,0.005072,0.004265,0.000195,0.000553,0.000201,-0.000247,216.226044,716.643735,1.005620,...,0.003760,0.003123,0.000197,0.000574,0.000194,-0.000246,229.785000,739.045000,1.006400,100-32763


In [7]:
# Load the trade data of a single stock (stock_id=13) and preview its first 10 rows
trade_train = pd.read_parquet(data_dir + "trade_train.parquet/stock_id=13")
trade_train.head(10)

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count
0,5,0,1.000129,415,7
1,5,5,1.000386,17,3
2,5,8,1.000472,598,10
3,5,13,1.000652,517,7
4,5,19,1.000901,100,3
5,5,24,1.000901,484,6
6,5,25,1.001158,100,2
7,5,30,1.001243,301,5
8,5,40,1.001415,1103,7
9,5,42,1.001415,1,1


In [8]:
def preprocessor_trade(file_path):
    """Build one row of trade features per ``time_id`` for a single stock.

    Reads the parquet data at ``file_path``, computes the log return of the
    trade price within each ``time_id`` and aggregates per ``time_id`` twice:
    over all rows, and over the rows with ``seconds_in_bucket >= 300``
    (those columns carry a ``_300`` suffix). Every feature column is
    prefixed with ``trade_``.

    Parameters
    ----------
    file_path : str
        Path of the form ``.../stock_id=<id>``. The text following the first
        ``=`` is used as the stock part of ``row_id``.

    Returns
    -------
    pandas.DataFrame
        The aggregated features plus a ``row_id`` column formatted as
        ``"<stock_id>-<time_id>"``. Windowed features are NaN for a
        ``time_id`` that has no trades inside the window (left merge).
    """
    df = pd.read_parquet(file_path)

    # ``transform`` returns values aligned to df's own index on every pandas
    # version; ``groupby(...).apply`` prepends the group key to the index on
    # pandas >= 2.0, which makes the column assignment fail.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # The strings 'sum'/'mean' yield the same '<column>_sum'/'<column>_mean'
    # names as np.sum/np.mean, without the pandas FutureWarning about numpy
    # callables in agg.
    aggregate_dictionary = {
        'log_return': [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': ['sum'],
        'order_count': ['mean'],
    }

    def aggregate_by_time_id(frame):
        # One row per time_id; the (column, function) header is flattened
        # with '_', which also turns 'time_id' into 'time_id_'.
        aggregated = frame.groupby('time_id').agg(aggregate_dictionary).reset_index()
        aggregated.columns = ['_'.join(col) for col in aggregated.columns]
        return aggregated

    # Features over every second of the bucket.
    df_feature = aggregate_by_time_id(df)

    # Features over the last XX seconds of the bucket.
    last_seconds = [300]

    for second in last_seconds:
        # First second of the window; the original code assumes 600-second
        # buckets. This start value (not the window length) is the suffix.
        start_second = 600 - second

        df_feature_sec = aggregate_by_time_id(df.query(f'seconds_in_bucket >= {start_second}'))
        df_feature_sec = df_feature_sec.add_suffix('_' + str(start_second))

        df_feature = pd.merge(
            df_feature,
            df_feature_sec,
            how='left',
            left_on='time_id_',
            right_on=f'time_id__{start_second}',
        )
        df_feature = df_feature.drop([f'time_id__{start_second}'], axis=1)

    # Namespace the trade features, then replace time_id with the
    # "<stock_id>-<time_id>" key (left unprefixed so it can be merged on).
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(['trade_time_id_'], axis=1)

    return df_feature

In [9]:
# Run the trade preprocessor on a single stock (stock_id=51) to inspect its output
file_path = data_dir + "trade_train.parquet/stock_id=51"
preprocessor_trade(file_path)

Unnamed: 0,trade_log_return_realized_volatility,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300,row_id
0,0.001379,82,17131,3.378049,0.000887,41,9185,3.390244,51-5
1,0.000954,38,4488,3.236842,0.000790,22,3254,3.590909,51-11
2,0.001167,43,12095,3.302326,0.000497,17,5577,3.647059,51-16
3,0.001359,40,7163,2.950000,0.001099,25,3438,2.200000,51-31
4,0.001204,38,5397,3.184211,0.000857,16,2759,3.437500,51-62
...,...,...,...,...,...,...,...,...,...
3825,0.001553,97,21106,3.041237,0.001081,48,10651,2.979167,51-32751
3826,0.001311,54,10960,4.407407,0.000625,20,3559,4.350000,51-32753
3827,0.001207,53,6684,2.226415,0.000842,30,3480,2.166667,51-32758
3828,0.001476,98,24445,3.785714,0.001126,46,11347,3.804348,51-32763


In [10]:
def preprocessor(list_stock_ids, is_train = True):
    """Compute book and trade features for several stocks in parallel.

    For every stock id, the book and trade parquet folders under the
    module-level ``data_dir`` are processed with ``preprocessor_book`` and
    ``preprocessor_trade`` and left-joined on ``row_id`` (book rows are kept
    even when there are no trade features).

    Parameters
    ----------
    list_stock_ids : iterable
        Stock ids to process.
    is_train : bool, default True
        Read ``book_train``/``trade_train`` when True, otherwise
        ``book_test``/``trade_test``.

    Returns
    -------
    pandas.DataFrame
        All per-stock feature frames stacked with a fresh integer index; an
        empty DataFrame when ``list_stock_ids`` is empty.
    """
    from joblib import Parallel, delayed # parallel computing to save time

    split_name = "train" if is_train else "test"

    def for_joblib(stock_id):
        file_path_book = f"{data_dir}book_{split_name}.parquet/stock_id={stock_id}"
        file_path_trade = f"{data_dir}trade_{split_name}.parquet/stock_id={stock_id}"

        # Return the per-stock frame directly: concatenating it with an empty
        # DataFrame inside each worker added nothing.
        return pd.merge(
            preprocessor_book(file_path_book),
            preprocessor_trade(file_path_trade),
            on='row_id',
            how='left',
        )

    stock_ids = list(list_stock_ids)
    if len(stock_ids) == 0:
        # pd.concat raises on an empty list of frames.
        return pd.DataFrame()

    per_stock_frames = Parallel(n_jobs=-1, verbose=1)(
        delayed(for_joblib)(stock_id) for stock_id in stock_ids
        )

    return pd.concat(per_stock_frames, ignore_index = True)


In [11]:
# Try the combined book + trade pipeline on two stocks before the full run
list_stock_ids = [0,1]
preprocessor(list_stock_ids, is_train = True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   17.6s finished


Unnamed: 0,log_return_realized_volatility,log_return2_realized_volatility,log_return3_realized_volatility,wap_balance_mean,price_spread_mean,bid_spread_mean,ask_spread_mean,volume_imbalance_mean,total_volume_mean,wap_mean,...,wap_mean_300,row_id,trade_log_return_realized_volatility,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300
0,0.004499,0.006999,0.003930,0.000388,0.000852,0.000176,-0.000151,134.894040,323.496689,1.003725,...,1.003753,0-5,0.002006,40,3179,2.750000,0.001308,21.0,1587.0,2.571429
1,0.001204,0.002476,0.001502,0.000212,0.000394,0.000142,-0.000135,142.050000,411.450000,1.000239,...,1.000397,0-11,0.000901,30,1289,1.900000,0.000587,16.0,900.0,2.250000
2,0.002369,0.004801,0.002705,0.000331,0.000725,0.000197,-0.000198,141.414894,416.351064,0.999542,...,0.998685,0-16,0.001961,25,2161,2.720000,0.001137,12.0,1189.0,3.166667
3,0.002574,0.003637,0.002104,0.000380,0.000860,0.000190,-0.000108,146.216667,435.266667,0.998832,...,0.998436,0-31,0.001561,15,1962,3.933333,0.001089,9.0,1556.0,5.111111
4,0.001894,0.003257,0.001823,0.000254,0.000397,0.000191,-0.000109,123.846591,343.221591,0.999619,...,0.999488,0-62,0.000871,22,1791,4.045455,0.000453,11.0,1219.0,4.909091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7655,0.003723,0.004996,0.002743,0.000330,0.000597,0.000157,-0.000118,125.013029,296.185668,1.000142,...,1.000130,1-32751,0.001776,49,3249,2.775510,0.001280,23.0,1889.0,3.608696
7656,0.010829,0.012168,0.010240,0.000403,0.000922,0.000159,-0.000125,254.006073,567.840081,1.007503,...,1.012343,1-32753,0.008492,183,75903,7.874317,0.006310,88.0,30858.0,8.136364
7657,0.003135,0.004268,0.002872,0.000243,0.000648,0.000141,-0.000132,163.645367,426.603834,1.000854,...,1.001250,1-32758,0.001927,26,2239,2.615385,0.001567,11.0,980.0,2.727273
7658,0.003750,0.005773,0.003748,0.000199,0.000421,0.000190,-0.000231,138.235023,526.317972,1.003032,...,1.004296,1-32763,0.002856,109,16648,2.935780,0.001919,57.0,8274.0,2.701754


In [12]:
# Competition tables: train.csv provides the target, test.csv the rows to predict.
train = pd.read_csv(data_dir + 'train.csv')
test = pd.read_csv(data_dir + 'test.csv')

# Stock ids present in each table; they select which parquet folders are processed.
train_ids = train.stock_id.unique()
test_ids = test.stock_id.unique()

In [13]:
# Build the features for every stock in train.csv (about 10.6 min on 4 workers in the recorded run)
df_train = preprocessor(list_stock_ids= train_ids, is_train = True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed: 10.6min finished


In [14]:

# Build the "<stock_id>-<time_id>" key used by the feature frames, keep only
# the key and the target, then attach the engineered features to each row.
train = train.assign(
    row_id=train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
)[['row_id', 'target']]
df_train = pd.merge(train, df_train, on=['row_id'], how='left')

In [15]:
# Build the same features from the test parquet folders
df_test = preprocessor(list_stock_ids= test_ids, is_train = False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [16]:
# Left-join onto test.csv so every row_id to predict is kept; rows without computed features stay NaN
df_test = test.merge(df_test, on = ['row_id'], how = 'left')

In [17]:
# Preview the first 10 rows of the merged training frame
df_train.head(10)

Unnamed: 0,row_id,target,log_return_realized_volatility,log_return2_realized_volatility,log_return3_realized_volatility,wap_balance_mean,price_spread_mean,bid_spread_mean,ask_spread_mean,volume_imbalance_mean,...,total_volume_mean_300,wap_mean_300,trade_log_return_realized_volatility,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300
0,0-5,0.004136,0.004499,0.006999,0.00393,0.000388,0.000852,0.000176,-0.000151,134.89404,...,294.928058,1.003753,0.002006,40.0,3179.0,2.75,0.001308,21.0,1587.0,2.571429
1,0-11,0.001445,0.001204,0.002476,0.001502,0.000212,0.000394,0.000142,-0.000135,142.05,...,484.521739,1.000397,0.000901,30.0,1289.0,1.9,0.000587,16.0,900.0,2.25
2,0-16,0.002168,0.002369,0.004801,0.002705,0.000331,0.000725,0.000197,-0.000198,141.414894,...,455.235294,0.998685,0.001961,25.0,2161.0,2.72,0.001137,12.0,1189.0,3.166667
3,0-31,0.002195,0.002574,0.003637,0.002104,0.00038,0.00086,0.00019,-0.000108,146.216667,...,418.169811,0.998436,0.001561,15.0,1962.0,3.933333,0.001089,9.0,1556.0,5.111111
4,0-62,0.001747,0.001894,0.003257,0.001823,0.000254,0.000397,0.000191,-0.000109,123.846591,...,407.58427,0.999488,0.000871,22.0,1791.0,4.045455,0.000453,11.0,1219.0,4.909091
5,0-72,0.004912,0.007902,0.010336,0.005931,0.000649,0.001637,0.000114,-0.000241,140.543726,...,327.571429,0.997851,0.003443,26.0,3395.0,3.730769,0.002419,14.0,1551.0,2.785714
6,0-97,0.009388,0.010034,0.014493,0.00846,0.000666,0.001665,0.000266,-0.000223,96.336957,...,339.10241,0.998384,0.004242,52.0,2279.0,2.192308,0.002647,21.0,957.0,2.0
7,0-103,0.00412,0.005331,0.006557,0.004747,0.000493,0.001039,0.000133,-0.000246,119.826531,...,300.954128,1.002162,0.002102,28.0,1181.0,2.071429,0.00145,10.0,375.0,2.7
8,0-109,0.002182,0.001797,0.003536,0.002494,0.000202,0.000445,0.000123,-0.000154,175.199153,...,336.70339,1.001976,0.001266,45.0,1868.0,1.888889,0.000679,20.0,759.0,1.75
9,0-123,0.002669,0.003273,0.005989,0.003395,0.000243,0.000468,0.000121,-0.00016,133.548165,...,326.246445,1.00035,0.001907,64.0,5135.0,2.90625,0.00112,21.0,1985.0,2.809524


In [18]:
# Show up to the last 9 rows of the merged test frame
df_test.tail(9)

Unnamed: 0,stock_id,time_id,row_id,log_return_realized_volatility,log_return2_realized_volatility,log_return3_realized_volatility,wap_balance_mean,price_spread_mean,bid_spread_mean,ask_spread_mean,...,total_volume_mean_300,wap_mean_300,trade_log_return_realized_volatility,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300
0,0,4,0-4,0.000294,0.000252,0.000126,0.000145,0.000557,0.000393,-0.000115,...,,,0.000295,3.0,201.0,3.666667,,,,
1,0,32,0-32,,,,,,,,...,,,,,,,,,,
2,0,34,0-34,,,,,,,,...,,,,,,,,,,


In [19]:
from sklearn.model_selection import KFold
import lightgbm as lgbm


In [20]:
# stock_id target encoding
# The stock id is the part of row_id in front of the '-'.
df_train['stock_id'] = [row_id.split('-')[0] for row_id in df_train['row_id']]
df_test['stock_id'] = [row_id.split('-')[0] for row_id in df_test['row_id']]

# Test rows are encoded with the per-stock target mean of the full training set.
stock_id_target_mean = df_train.groupby('stock_id')['target'].mean()
df_test['stock_id_target_enc'] = df_test['stock_id'].map(stock_id_target_mean)

# Training rows are encoded out-of-fold: each row gets the per-stock mean
# computed on the other folds, so its own target is never part of its encoding.
kf = KFold(n_splits=20, shuffle=True, random_state=19911109)
out_of_fold_enc = np.full(df_train.shape[0], np.nan)
for fit_rows, holdout_rows in kf.split(df_train):
    fold_means = df_train.iloc[fit_rows].groupby('stock_id')['target'].mean()
    out_of_fold_enc[holdout_rows] = df_train['stock_id'].iloc[holdout_rows].map(fold_means)
df_train['stock_id_target_enc'] = out_of_fold_enc

In [21]:
# Check the test frame now that stock_id_target_enc has been added
df_test.head()

Unnamed: 0,stock_id,time_id,row_id,log_return_realized_volatility,log_return2_realized_volatility,log_return3_realized_volatility,wap_balance_mean,price_spread_mean,bid_spread_mean,ask_spread_mean,...,wap_mean_300,trade_log_return_realized_volatility,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300,stock_id_target_enc
0,0,4,0-4,0.000294,0.000252,0.000126,0.000145,0.000557,0.000393,-0.000115,...,,0.000295,3.0,201.0,3.666667,,,,,0.004028
1,0,32,0-32,,,,,,,,...,,,,,,,,,,0.004028
2,0,34,0-34,,,,,,,,...,,,,,,,,,,0.004028


In [22]:
# Check the training frame now that stock_id and stock_id_target_enc have been added
df_train.head()

Unnamed: 0,row_id,target,log_return_realized_volatility,log_return2_realized_volatility,log_return3_realized_volatility,wap_balance_mean,price_spread_mean,bid_spread_mean,ask_spread_mean,volume_imbalance_mean,...,trade_log_return_realized_volatility,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300,stock_id,stock_id_target_enc
0,0-5,0.004136,0.004499,0.006999,0.00393,0.000388,0.000852,0.000176,-0.000151,134.89404,...,0.002006,40.0,3179.0,2.75,0.001308,21.0,1587.0,2.571429,0,0.004036
1,0-11,0.001445,0.001204,0.002476,0.001502,0.000212,0.000394,0.000142,-0.000135,142.05,...,0.000901,30.0,1289.0,1.9,0.000587,16.0,900.0,2.25,0,0.004041
2,0-16,0.002168,0.002369,0.004801,0.002705,0.000331,0.000725,0.000197,-0.000198,141.414894,...,0.001961,25.0,2161.0,2.72,0.001137,12.0,1189.0,3.166667,0,0.004033
3,0-31,0.002195,0.002574,0.003637,0.002104,0.00038,0.00086,0.00019,-0.000108,146.216667,...,0.001561,15.0,1962.0,3.933333,0.001089,9.0,1556.0,5.111111,0,0.004022
4,0-62,0.001747,0.001894,0.003257,0.001823,0.000254,0.000397,0.000191,-0.000109,123.846591,...,0.000871,22.0,1791.0,4.045455,0.000453,11.0,1219.0,4.909091,0,0.004027


In [23]:
# stock_id was parsed out of row_id as a string; cast it to int before it is
# handed to LightGBM as a categorical feature.
df_train['stock_id'] = df_train['stock_id'].astype(int)
df_test['stock_id'] = df_test['stock_id'].astype(int)

# Features are every column except the key and the label; y is the label.
X = df_train.drop(['row_id','target'],axis=1)
y = df_train['target']

def feval_RMSPE(preds, lgbm_train):
    """Custom LightGBM evaluation function reporting RMSPE.

    Reads the true labels from ``lgbm_train`` via ``get_label()`` and returns
    the tuple ``('RMSPE', value rounded to 5 decimals, False)``; the trailing
    ``False`` is the is-higher-better flag, i.e. lower values are better.
    """
    score = rmspe(y_true = lgbm_train.get_label(), y_pred = preds)
    return 'RMSPE', round(score, 5), False

# LightGBM training parameters, passed as ``params`` to ``lgbm.train``.
# Training stops once the validation scores have not improved for
# 'early_stopping_rounds' (30) rounds.
# NOTE(review): per the LightGBM parameter docs, 'bagging_fraction' only takes
# effect when 'bagging_freq' is set to a non-zero value; it is not set here,
# so row subsampling appears to be inactive -- confirm whether that is intended.
params = {
      "objective": "rmse", 
      "metric": "rmse", 
      "boosting_type": "gbdt",
      'early_stopping_rounds': 30,
      'learning_rate': 0.01,
      'lambda_l1': 1.0,
      'lambda_l2': 1.0,
      'feature_fraction': 0.8,
      'bagging_fraction': 0.8,
  }

In [24]:
# K-fold cross-validated LightGBM training.
kf = KFold(n_splits=25, random_state=42, shuffle=True)
n_folds = kf.get_n_splits()          # fold count, reused when averaging the score
oof = pd.DataFrame()                 # initialised here but not filled in this cell
models = []                          # one fitted booster per fold
scores = 0.0                         # running mean of the per-fold RMSPE

for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):

    print("Fold :", fold+1)

    # KFold yields positional indices, so select rows with .iloc; label-based
    # .loc / [] only gives the same rows while the index is 0..n-1.
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    # RMSPE weight: weighting each squared error by 1/y^2 makes the weighted
    # squared error equal to the squared percentage error.
    train_weights = 1/np.square(y_train)
    lgbm_train = lgbm.Dataset(X_train,y_train,weight = train_weights)

    valid_weights = 1/np.square(y_valid)
    lgbm_valid = lgbm.Dataset(X_valid,y_valid,reference = lgbm_train,weight = valid_weights)

    # model
    model = lgbm.train(params=params,
                      train_set=lgbm_train,
                      valid_sets=[lgbm_train, lgbm_valid],
                      num_boost_round=5000,
                      feval=feval_RMSPE,
                      verbose_eval=100,
                      categorical_feature = ['stock_id']
                     )

    # validation: predict the held-out fold with the best iteration found
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

    RMSPE = round(rmspe(y_true = y_valid, y_pred = y_pred),3)
    print(f'Performance of the　prediction: , RMSPE: {RMSPE}')

    # keep scores and models; divide by the actual fold count rather than a
    # second hard-coded copy of n_splits
    scores += RMSPE / n_folds
    models.append(model)

Fold : 1


New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411774, number of used features: 30
[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659313	training's RMSPE: 0.30514	valid_1's rmse: 0.000660795	valid_1's RMSPE: 0.30571
[200]	training's rmse: 0.000535322	training's RMSPE: 0.24776	valid_1's rmse: 0.0005385	valid_1's RMSPE: 0.24913
[300]	training's rmse: 0.000509561	training's RMSPE: 0.23583	valid_1's rmse: 0.000514055	valid_1's RMSPE: 0.23782
[400]	training's rmse: 0.000502101	training's RMSPE: 0.23238	valid_1's rmse: 0.000508123	valid_1's RMSPE: 0.23508
[500]	training's rmse: 0.000498281	training's RMSPE: 0.23061	valid_1's rmse: 0.000506262	valid_1's RMSPE: 0.23422
[600]	training's rmse: 0.00049569	training's RMSPE: 0.22942	valid_1's rmse: 0.00050507	valid_1's RMSPE: 0.23367
[700]	training's rmse: 0.000493392	training's RMSPE: 0.22835	valid_1's rmse: 0.000503918	valid_1's RMSPE: 0.23313
[800]	training's rmse: 0.000491385	training's RMSPE: 0.22742	valid_1's rmse: 0.000503192	valid_1's RMSPE: 0.2328
[900]	training's rmse: 0.000489586	training's RMSPE: 0.22659	valid_1's rmse: 0.000502622	vali

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411774, number of used features: 30
[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659048	training's RMSPE: 0.30507	valid_1's rmse: 0.000709797	valid_1's RMSPE: 0.32699
[200]	training's rmse: 0.000534966	training's RMSPE: 0.24764	valid_1's rmse: 0.00061398	valid_1's RMSPE: 0.28285
[300]	training's rmse: 0.000509135	training's RMSPE: 0.23568	valid_1's rmse: 0.000594928	valid_1's RMSPE: 0.27408
[400]	training's rmse: 0.000501727	training's RMSPE: 0.23225	valid_1's rmse: 0.000590256	valid_1's RMSPE: 0.27192
Early stopping, best iteration is:
[401]	training's rmse: 0.000501681	training's RMSPE: 0.23223	valid_1's rmse: 0.000590205	valid_1's RMSPE: 0.2719
Performance of the　prediction: , RMSPE: 0.272
Fold : 3


New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7507
[LightGBM] [Info] Number of data points in the train set: 411774, number of used features: 30
[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659576	training's RMSPE: 0.30522	valid_1's rmse: 0.000653244	valid_1's RMSPE: 0.30334
[200]	training's rmse: 0.000535463	training's RMSPE: 0.24778	valid_1's rmse: 0.000530578	valid_1's RMSPE: 0.24638
[300]	training's rmse: 0.000509661	training's RMSPE: 0.23584	valid_1's rmse: 0.000505971	valid_1's RMSPE: 0.23496
[400]	training's rmse: 0.00050222	training's RMSPE: 0.2324	valid_1's rmse: 0.000499605	valid_1's RMSPE: 0.232
[500]	training's rmse: 0.000498411	training's RMSPE: 0.23064	valid_1's rmse: 0.000497035	valid_1's RMSPE: 0.23081
[600]	training's rmse: 0.000495853	training's RMSPE: 0.22945	valid_1's rmse: 0.00049556	valid_1's RMSPE: 0.23012
[700]	training's rmse: 0.000493587	training's RMSPE: 0.22841	valid_1's rmse: 0.000494347	valid_1's RMSPE: 0.22956
[800]	training's rmse: 0.000491567	training's RMSPE: 0.22747	valid_1's rmse: 0.00049343	valid_1's RMSPE: 0.22913
[900]	training's rmse: 0.000489733	training's RMSPE: 0.22662	valid_1's rmse: 0.000492626	valid

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411774, number of used features: 30
[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659464	training's RMSPE: 0.30518	valid_1's rmse: 0.000661822	valid_1's RMSPE: 0.30704
[200]	training's rmse: 0.000535432	training's RMSPE: 0.24778	valid_1's rmse: 0.000538766	valid_1's RMSPE: 0.24995
[300]	training's rmse: 0.000509628	training's RMSPE: 0.23584	valid_1's rmse: 0.000512956	valid_1's RMSPE: 0.23798
[400]	training's rmse: 0.000502183	training's RMSPE: 0.23239	valid_1's rmse: 0.000506546	valid_1's RMSPE: 0.23501
[500]	training's rmse: 0.000498415	training's RMSPE: 0.23065	valid_1's rmse: 0.000504257	valid_1's RMSPE: 0.23394
[600]	training's rmse: 0.000495812	training's RMSPE: 0.22944	valid_1's rmse: 0.000502856	valid_1's RMSPE: 0.23329
[700]	training's rmse: 0.000493537	training's RMSPE: 0.22839	valid_1's rmse: 0.00050148	valid_1's RMSPE: 0.23266
[800]	training's rmse: 0.000491528	training's RMSPE: 0.22746	valid_1's rmse: 0.000500341	valid_1's RMSPE: 0.23213
[900]	training's rmse: 0.00048973	training's RMSPE: 0.22663	valid_1's rmse: 0.000499433	v

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411774, number of used features: 30
[LightGBM] [Info] Start training from score 0.001798
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659403	training's RMSPE: 0.30539	valid_1's rmse: 0.00065221	valid_1's RMSPE: 0.29676
[200]	training's rmse: 0.000535302	training's RMSPE: 0.24792	valid_1's rmse: 0.000532486	valid_1's RMSPE: 0.24229
[300]	training's rmse: 0.00050945	training's RMSPE: 0.23594	valid_1's rmse: 0.000509468	valid_1's RMSPE: 0.23181
[400]	training's rmse: 0.000502062	training's RMSPE: 0.23252	valid_1's rmse: 0.000504094	valid_1's RMSPE: 0.22937
[500]	training's rmse: 0.000498322	training's RMSPE: 0.23079	valid_1's rmse: 0.00050158	valid_1's RMSPE: 0.22822
[600]	training's rmse: 0.000495692	training's RMSPE: 0.22957	valid_1's rmse: 0.000499899	valid_1's RMSPE: 0.22746
[700]	training's rmse: 0.000493473	training's RMSPE: 0.22854	valid_1's rmse: 0.000498451	valid_1's RMSPE: 0.2268
[800]	training's rmse: 0.000491458	training's RMSPE: 0.22761	valid_1's rmse: 0.000497358	valid_1's RMSPE: 0.2263
[900]	training's rmse: 0.000489648	training's RMSPE: 0.22677	valid_1's rmse: 0.000496431	vali

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411774, number of used features: 30
[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.00065927	training's RMSPE: 0.30517	valid_1's rmse: 0.00066505	valid_1's RMSPE: 0.30658
[200]	training's rmse: 0.000535141	training's RMSPE: 0.24771	valid_1's rmse: 0.000541889	valid_1's RMSPE: 0.2498
[300]	training's rmse: 0.000509346	training's RMSPE: 0.23577	valid_1's rmse: 0.000516433	valid_1's RMSPE: 0.23807
[400]	training's rmse: 0.000501943	training's RMSPE: 0.23234	valid_1's rmse: 0.000509497	valid_1's RMSPE: 0.23487
[500]	training's rmse: 0.000498136	training's RMSPE: 0.23058	valid_1's rmse: 0.000506537	valid_1's RMSPE: 0.23351
[600]	training's rmse: 0.000495565	training's RMSPE: 0.22939	valid_1's rmse: 0.000504945	valid_1's RMSPE: 0.23277
[700]	training's rmse: 0.000493316	training's RMSPE: 0.22835	valid_1's rmse: 0.000503726	valid_1's RMSPE: 0.23221
[800]	training's rmse: 0.000491279	training's RMSPE: 0.22741	valid_1's rmse: 0.000502705	valid_1's RMSPE: 0.23174
[900]	training's rmse: 0.00048945	training's RMSPE: 0.22656	valid_1's rmse: 0.000501934	val

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411774, number of used features: 30
[LightGBM] [Info] Start training from score 0.001801
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659681	training's RMSPE: 0.3052	valid_1's rmse: 0.000658237	valid_1's RMSPE: 0.30722
[200]	training's rmse: 0.000535596	training's RMSPE: 0.24779	valid_1's rmse: 0.000533943	valid_1's RMSPE: 0.24921
[300]	training's rmse: 0.000509809	training's RMSPE: 0.23586	valid_1's rmse: 0.000508547	valid_1's RMSPE: 0.23736
[400]	training's rmse: 0.00050235	training's RMSPE: 0.23241	valid_1's rmse: 0.000503249	valid_1's RMSPE: 0.23488
[500]	training's rmse: 0.000498533	training's RMSPE: 0.23065	valid_1's rmse: 0.00050075	valid_1's RMSPE: 0.23372
[600]	training's rmse: 0.000495913	training's RMSPE: 0.22943	valid_1's rmse: 0.000499605	valid_1's RMSPE: 0.23318
[700]	training's rmse: 0.000493648	training's RMSPE: 0.22839	valid_1's rmse: 0.000498954	valid_1's RMSPE: 0.23288
Early stopping, best iteration is:
[720]	training's rmse: 0.000493225	training's RMSPE: 0.22819	valid_1's rmse: 0.000498725	valid_1's RMSPE: 0.23277
Performance of the　prediction: , RMSPE: 0.233
Fold : 8


New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659187	training's RMSPE: 0.30508	valid_1's rmse: 0.000661941	valid_1's RMSPE: 0.30639
[200]	training's rmse: 0.000535193	training's RMSPE: 0.24769	valid_1's rmse: 0.000541067	valid_1's RMSPE: 0.25044
[300]	training's rmse: 0.000509416	training's RMSPE: 0.23576	valid_1's rmse: 0.000517351	valid_1's RMSPE: 0.23947
[400]	training's rmse: 0.000502	training's RMSPE: 0.23233	valid_1's rmse: 0.000511137	valid_1's RMSPE: 0.23659
[500]	training's rmse: 0.000498241	training's RMSPE: 0.23059	valid_1's rmse: 0.000508572	valid_1's RMSPE: 0.2354
[600]	training's rmse: 0.000495627	training's RMSPE: 0.22938	valid_1's rmse: 0.000507582	valid_1's RMSPE: 0.23495
Early stopping, best iteration is:
[575]	training's rmse: 0.000496253	training's RMSPE: 0.22967	valid_1's rmse: 0.000507407	valid_1's RMSPE: 0.23486
Performance of the　prediction: , RMSPE: 0.235
Fold : 9


New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659042	training's RMSPE: 0.30502	valid_1's rmse: 0.000664128	valid_1's RMSPE: 0.30714
[200]	training's rmse: 0.000535008	training's RMSPE: 0.24762	valid_1's rmse: 0.000543288	valid_1's RMSPE: 0.25125
[300]	training's rmse: 0.000509207	training's RMSPE: 0.23567	valid_1's rmse: 0.000520638	valid_1's RMSPE: 0.24078
[400]	training's rmse: 0.000501722	training's RMSPE: 0.23221	valid_1's rmse: 0.000517578	valid_1's RMSPE: 0.23936
Early stopping, best iteration is:
[438]	training's rmse: 0.000500005	training's RMSPE: 0.23142	valid_1's rmse: 0.00051713	valid_1's RMSPE: 0.23915
Performance of the　prediction: , RMSPE: 0.239
Fold : 10


New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001801
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000658943	training's RMSPE: 0.30491	valid_1's rmse: 0.000669593	valid_1's RMSPE: 0.31139
[200]	training's rmse: 0.000534857	training's RMSPE: 0.24749	valid_1's rmse: 0.000551142	valid_1's RMSPE: 0.25631
[300]	training's rmse: 0.000509072	training's RMSPE: 0.23556	valid_1's rmse: 0.000528207	valid_1's RMSPE: 0.24564
[400]	training's rmse: 0.000501694	training's RMSPE: 0.23214	valid_1's rmse: 0.000522252	valid_1's RMSPE: 0.24287
[500]	training's rmse: 0.000497975	training's RMSPE: 0.23042	valid_1's rmse: 0.000520342	valid_1's RMSPE: 0.24198
[600]	training's rmse: 0.000495395	training's RMSPE: 0.22923	valid_1's rmse: 0.000518876	valid_1's RMSPE: 0.2413
[700]	training's rmse: 0.000493129	training's RMSPE: 0.22818	valid_1's rmse: 0.000517945	valid_1's RMSPE: 0.24087
[800]	training's rmse: 0.000491126	training's RMSPE: 0.22725	valid_1's rmse: 0.000517193	valid_1's RMSPE: 0.24052
Early stopping, best iteration is:
[867]	training's rmse: 0.000489896	training's RMSPE: 0

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7507
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001801
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659434	training's RMSPE: 0.30502	valid_1's rmse: 0.000656296	valid_1's RMSPE: 0.30795
[200]	training's rmse: 0.000535346	training's RMSPE: 0.24762	valid_1's rmse: 0.000535994	valid_1's RMSPE: 0.2515
[300]	training's rmse: 0.000509485	training's RMSPE: 0.23566	valid_1's rmse: 0.000512303	valid_1's RMSPE: 0.24039
[400]	training's rmse: 0.000502055	training's RMSPE: 0.23222	valid_1's rmse: 0.000506308	valid_1's RMSPE: 0.23757
[500]	training's rmse: 0.000498318	training's RMSPE: 0.23049	valid_1's rmse: 0.000503979	valid_1's RMSPE: 0.23648
[600]	training's rmse: 0.000495717	training's RMSPE: 0.22929	valid_1's rmse: 0.000502483	valid_1's RMSPE: 0.23578
[700]	training's rmse: 0.000493444	training's RMSPE: 0.22824	valid_1's rmse: 0.000501178	valid_1's RMSPE: 0.23517
[800]	training's rmse: 0.000491469	training's RMSPE: 0.22733	valid_1's rmse: 0.000500153	valid_1's RMSPE: 0.23469
[900]	training's rmse: 0.000489675	training's RMSPE: 0.2265	valid_1's rmse: 0.000499347	v

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001799
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659386	training's RMSPE: 0.30523	valid_1's rmse: 0.000657658	valid_1's RMSPE: 0.30296
[200]	training's rmse: 0.000535269	training's RMSPE: 0.24778	valid_1's rmse: 0.000537408	valid_1's RMSPE: 0.24756
[300]	training's rmse: 0.000509458	training's RMSPE: 0.23583	valid_1's rmse: 0.000513235	valid_1's RMSPE: 0.23643
[400]	training's rmse: 0.000501989	training's RMSPE: 0.23237	valid_1's rmse: 0.000506741	valid_1's RMSPE: 0.23344
[500]	training's rmse: 0.000498176	training's RMSPE: 0.23061	valid_1's rmse: 0.000503914	valid_1's RMSPE: 0.23213
[600]	training's rmse: 0.00049558	training's RMSPE: 0.22941	valid_1's rmse: 0.000502378	valid_1's RMSPE: 0.23143
[700]	training's rmse: 0.0004933	training's RMSPE: 0.22835	valid_1's rmse: 0.000500999	valid_1's RMSPE: 0.23079
[800]	training's rmse: 0.00049129	training's RMSPE: 0.22742	valid_1's rmse: 0.000499995	valid_1's RMSPE: 0.23033
[900]	training's rmse: 0.000489504	training's RMSPE: 0.22659	valid_1's rmse: 0.000499146	val

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001801
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659016	training's RMSPE: 0.30498	valid_1's rmse: 0.000675735	valid_1's RMSPE: 0.31332
[200]	training's rmse: 0.000535283	training's RMSPE: 0.24772	valid_1's rmse: 0.000553386	valid_1's RMSPE: 0.25659
[300]	training's rmse: 0.000509578	training's RMSPE: 0.23582	valid_1's rmse: 0.00052641	valid_1's RMSPE: 0.24408
[400]	training's rmse: 0.000502145	training's RMSPE: 0.23238	valid_1's rmse: 0.000519654	valid_1's RMSPE: 0.24095
[500]	training's rmse: 0.00049837	training's RMSPE: 0.23063	valid_1's rmse: 0.000516887	valid_1's RMSPE: 0.23967
[600]	training's rmse: 0.000495757	training's RMSPE: 0.22942	valid_1's rmse: 0.000515445	valid_1's RMSPE: 0.239
[700]	training's rmse: 0.000493506	training's RMSPE: 0.22838	valid_1's rmse: 0.000514731	valid_1's RMSPE: 0.23867
Early stopping, best iteration is:
[756]	training's rmse: 0.00049239	training's RMSPE: 0.22787	valid_1's rmse: 0.000514377	valid_1's RMSPE: 0.2385
Performance of the　prediction: , RMSPE: 0.239
Fold : 14


New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7507
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001802
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659217	training's RMSPE: 0.30491	valid_1's rmse: 0.000670848	valid_1's RMSPE: 0.31505
[200]	training's rmse: 0.000535221	training's RMSPE: 0.24755	valid_1's rmse: 0.000546911	valid_1's RMSPE: 0.25684
[300]	training's rmse: 0.00050958	training's RMSPE: 0.23569	valid_1's rmse: 0.000520724	valid_1's RMSPE: 0.24455
[400]	training's rmse: 0.000502133	training's RMSPE: 0.23225	valid_1's rmse: 0.000513288	valid_1's RMSPE: 0.24105
[500]	training's rmse: 0.000498401	training's RMSPE: 0.23052	valid_1's rmse: 0.000509834	valid_1's RMSPE: 0.23943
[600]	training's rmse: 0.000495809	training's RMSPE: 0.22932	valid_1's rmse: 0.000508321	valid_1's RMSPE: 0.23872
[700]	training's rmse: 0.000493574	training's RMSPE: 0.22829	valid_1's rmse: 0.000506593	valid_1's RMSPE: 0.23791
[800]	training's rmse: 0.000491587	training's RMSPE: 0.22737	valid_1's rmse: 0.000505936	valid_1's RMSPE: 0.2376
[900]	training's rmse: 0.000489802	training's RMSPE: 0.22655	valid_1's rmse: 0.000505079	v

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7507
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001801
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659244	training's RMSPE: 0.30503	valid_1's rmse: 0.000659143	valid_1's RMSPE: 0.30684
[200]	training's rmse: 0.000535141	training's RMSPE: 0.24761	valid_1's rmse: 0.00053999	valid_1's RMSPE: 0.25137
[300]	training's rmse: 0.000509385	training's RMSPE: 0.23569	valid_1's rmse: 0.000516609	valid_1's RMSPE: 0.24049
[400]	training's rmse: 0.000501915	training's RMSPE: 0.23224	valid_1's rmse: 0.000510157	valid_1's RMSPE: 0.23748
[500]	training's rmse: 0.00049817	training's RMSPE: 0.2305	valid_1's rmse: 0.000507364	valid_1's RMSPE: 0.23618
[600]	training's rmse: 0.000495578	training's RMSPE: 0.2293	valid_1's rmse: 0.000505927	valid_1's RMSPE: 0.23551
[700]	training's rmse: 0.00049333	training's RMSPE: 0.22826	valid_1's rmse: 0.000505269	valid_1's RMSPE: 0.23521
[800]	training's rmse: 0.000491317	training's RMSPE: 0.22733	valid_1's rmse: 0.000504444	valid_1's RMSPE: 0.23482
[900]	training's rmse: 0.000489484	training's RMSPE: 0.22648	valid_1's rmse: 0.000504212	vali

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001802
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659214	training's RMSPE: 0.3049	valid_1's rmse: 0.000666086	valid_1's RMSPE: 0.31301
[200]	training's rmse: 0.000535175	training's RMSPE: 0.24753	valid_1's rmse: 0.000542738	valid_1's RMSPE: 0.25504
[300]	training's rmse: 0.000509376	training's RMSPE: 0.23559	valid_1's rmse: 0.000516825	valid_1's RMSPE: 0.24287
[400]	training's rmse: 0.000501946	training's RMSPE: 0.23216	valid_1's rmse: 0.000509608	valid_1's RMSPE: 0.23948
[500]	training's rmse: 0.000498183	training's RMSPE: 0.23042	valid_1's rmse: 0.000506405	valid_1's RMSPE: 0.23797
[600]	training's rmse: 0.00049563	training's RMSPE: 0.22924	valid_1's rmse: 0.000504926	valid_1's RMSPE: 0.23728
[700]	training's rmse: 0.000493355	training's RMSPE: 0.22818	valid_1's rmse: 0.000503617	valid_1's RMSPE: 0.23666
[800]	training's rmse: 0.00049135	training's RMSPE: 0.22726	valid_1's rmse: 0.000502957	valid_1's RMSPE: 0.23635
[900]	training's rmse: 0.000489555	training's RMSPE: 0.22643	valid_1's rmse: 0.000502496	va

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001799
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659276	training's RMSPE: 0.30516	valid_1's rmse: 0.000658095	valid_1's RMSPE: 0.30363
[200]	training's rmse: 0.000535	training's RMSPE: 0.24764	valid_1's rmse: 0.00054279	valid_1's RMSPE: 0.25043
[300]	training's rmse: 0.000509172	training's RMSPE: 0.23568	valid_1's rmse: 0.000520659	valid_1's RMSPE: 0.24022
[400]	training's rmse: 0.000501751	training's RMSPE: 0.23225	valid_1's rmse: 0.000514878	valid_1's RMSPE: 0.23755
[500]	training's rmse: 0.000498045	training's RMSPE: 0.23053	valid_1's rmse: 0.000512041	valid_1's RMSPE: 0.23625
[600]	training's rmse: 0.000495457	training's RMSPE: 0.22933	valid_1's rmse: 0.00051019	valid_1's RMSPE: 0.23539
[700]	training's rmse: 0.0004932	training's RMSPE: 0.22829	valid_1's rmse: 0.000508616	valid_1's RMSPE: 0.23467
[800]	training's rmse: 0.000491225	training's RMSPE: 0.22737	valid_1's rmse: 0.000507369	valid_1's RMSPE: 0.23409
[900]	training's rmse: 0.000489432	training's RMSPE: 0.22654	valid_1's rmse: 0.00050636	valid_1

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001798
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000658988	training's RMSPE: 0.3052	valid_1's rmse: 0.000665171	valid_1's RMSPE: 0.30274
[200]	training's rmse: 0.000534925	training's RMSPE: 0.24774	valid_1's rmse: 0.000548268	valid_1's RMSPE: 0.24954
[300]	training's rmse: 0.000509214	training's RMSPE: 0.23583	valid_1's rmse: 0.000525083	valid_1's RMSPE: 0.23899
[400]	training's rmse: 0.000501765	training's RMSPE: 0.23238	valid_1's rmse: 0.000518931	valid_1's RMSPE: 0.23619
[500]	training's rmse: 0.000497967	training's RMSPE: 0.23062	valid_1's rmse: 0.000516186	valid_1's RMSPE: 0.23494
[600]	training's rmse: 0.000495358	training's RMSPE: 0.22941	valid_1's rmse: 0.000514518	valid_1's RMSPE: 0.23418
[700]	training's rmse: 0.000493089	training's RMSPE: 0.22836	valid_1's rmse: 0.000513068	valid_1's RMSPE: 0.23352
[800]	training's rmse: 0.000491093	training's RMSPE: 0.22744	valid_1's rmse: 0.000511848	valid_1's RMSPE: 0.23296
[900]	training's rmse: 0.000489253	training's RMSPE: 0.22659	valid_1's rmse: 0.000510692	

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000658984	training's RMSPE: 0.305	valid_1's rmse: 0.000669504	valid_1's RMSPE: 0.30958
[200]	training's rmse: 0.000534979	training's RMSPE: 0.2476	valid_1's rmse: 0.000545378	valid_1's RMSPE: 0.25218
[300]	training's rmse: 0.000509199	training's RMSPE: 0.23567	valid_1's rmse: 0.000520731	valid_1's RMSPE: 0.24078
[400]	training's rmse: 0.00050174	training's RMSPE: 0.23222	valid_1's rmse: 0.000514449	valid_1's RMSPE: 0.23788
Early stopping, best iteration is:
[404]	training's rmse: 0.000501553	training's RMSPE: 0.23213	valid_1's rmse: 0.00051431	valid_1's RMSPE: 0.23781
Performance of the　prediction: , RMSPE: 0.238
Fold : 20


New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001799
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659498	training's RMSPE: 0.30537	valid_1's rmse: 0.000655491	valid_1's RMSPE: 0.29986
[200]	training's rmse: 0.000535408	training's RMSPE: 0.24791	valid_1's rmse: 0.000533791	valid_1's RMSPE: 0.24419
[300]	training's rmse: 0.000509612	training's RMSPE: 0.23597	valid_1's rmse: 0.000509653	valid_1's RMSPE: 0.23315
[400]	training's rmse: 0.000502165	training's RMSPE: 0.23252	valid_1's rmse: 0.000503515	valid_1's RMSPE: 0.23034
[500]	training's rmse: 0.000498422	training's RMSPE: 0.23079	valid_1's rmse: 0.000500819	valid_1's RMSPE: 0.22911
[600]	training's rmse: 0.000495815	training's RMSPE: 0.22958	valid_1's rmse: 0.000499053	valid_1's RMSPE: 0.2283
[700]	training's rmse: 0.00049355	training's RMSPE: 0.22853	valid_1's rmse: 0.00049758	valid_1's RMSPE: 0.22762
[800]	training's rmse: 0.000491508	training's RMSPE: 0.22759	valid_1's rmse: 0.000496401	valid_1's RMSPE: 0.22708
[900]	training's rmse: 0.000489691	training's RMSPE: 0.22674	valid_1's rmse: 0.000495447	va

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659218	training's RMSPE: 0.30514	valid_1's rmse: 0.000662345	valid_1's RMSPE: 0.30547
[200]	training's rmse: 0.000535054	training's RMSPE: 0.24767	valid_1's rmse: 0.000542071	valid_1's RMSPE: 0.25
[300]	training's rmse: 0.00050928	training's RMSPE: 0.23574	valid_1's rmse: 0.000518527	valid_1's RMSPE: 0.23914
[400]	training's rmse: 0.000501827	training's RMSPE: 0.23229	valid_1's rmse: 0.000512866	valid_1's RMSPE: 0.23653
[500]	training's rmse: 0.000498066	training's RMSPE: 0.23054	valid_1's rmse: 0.000510834	valid_1's RMSPE: 0.23559
[600]	training's rmse: 0.000495479	training's RMSPE: 0.22935	valid_1's rmse: 0.000509414	valid_1's RMSPE: 0.23494
[700]	training's rmse: 0.000493223	training's RMSPE: 0.2283	valid_1's rmse: 0.000508254	valid_1's RMSPE: 0.23441
[800]	training's rmse: 0.000491191	training's RMSPE: 0.22736	valid_1's rmse: 0.000507341	valid_1's RMSPE: 0.23398
[900]	training's rmse: 0.000489403	training's RMSPE: 0.22653	valid_1's rmse: 0.000506578	vali

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001801
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659436	training's RMSPE: 0.30512	valid_1's rmse: 0.000657471	valid_1's RMSPE: 0.30598
[200]	training's rmse: 0.000535504	training's RMSPE: 0.24778	valid_1's rmse: 0.000531657	valid_1's RMSPE: 0.24742
[300]	training's rmse: 0.000509701	training's RMSPE: 0.23584	valid_1's rmse: 0.000506916	valid_1's RMSPE: 0.23591
[400]	training's rmse: 0.000502214	training's RMSPE: 0.23238	valid_1's rmse: 0.000501007	valid_1's RMSPE: 0.23316
[500]	training's rmse: 0.000498472	training's RMSPE: 0.23065	valid_1's rmse: 0.000498621	valid_1's RMSPE: 0.23205
[600]	training's rmse: 0.000495855	training's RMSPE: 0.22943	valid_1's rmse: 0.000497114	valid_1's RMSPE: 0.23135
[700]	training's rmse: 0.000493597	training's RMSPE: 0.22839	valid_1's rmse: 0.000495941	valid_1's RMSPE: 0.2308
[800]	training's rmse: 0.000491596	training's RMSPE: 0.22746	valid_1's rmse: 0.000494997	valid_1's RMSPE: 0.23036
[900]	training's rmse: 0.00048977	training's RMSPE: 0.22662	valid_1's rmse: 0.000494129	v

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001799
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659417	training's RMSPE: 0.30526	valid_1's rmse: 0.000654469	valid_1's RMSPE: 0.30108
[200]	training's rmse: 0.000535285	training's RMSPE: 0.2478	valid_1's rmse: 0.000535732	valid_1's RMSPE: 0.24646
[300]	training's rmse: 0.00050949	training's RMSPE: 0.23586	valid_1's rmse: 0.000512825	valid_1's RMSPE: 0.23592
[400]	training's rmse: 0.00050203	training's RMSPE: 0.2324	valid_1's rmse: 0.000507185	valid_1's RMSPE: 0.23333
[500]	training's rmse: 0.000498231	training's RMSPE: 0.23064	valid_1's rmse: 0.000504684	valid_1's RMSPE: 0.23218
[600]	training's rmse: 0.000495578	training's RMSPE: 0.22942	valid_1's rmse: 0.000503305	valid_1's RMSPE: 0.23154
[700]	training's rmse: 0.00049334	training's RMSPE: 0.22838	valid_1's rmse: 0.000502039	valid_1's RMSPE: 0.23096
[800]	training's rmse: 0.000491324	training's RMSPE: 0.22745	valid_1's rmse: 0.00050091	valid_1's RMSPE: 0.23044
[900]	training's rmse: 0.000489516	training's RMSPE: 0.22661	valid_1's rmse: 0.000499959	valid

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001800
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000659031	training's RMSPE: 0.30505	valid_1's rmse: 0.0006637	valid_1's RMSPE: 0.3062
[200]	training's rmse: 0.000534908	training's RMSPE: 0.24759	valid_1's rmse: 0.000545513	valid_1's RMSPE: 0.25168
[300]	training's rmse: 0.000509157	training's RMSPE: 0.23568	valid_1's rmse: 0.000523239	valid_1's RMSPE: 0.2414
[400]	training's rmse: 0.000501736	training's RMSPE: 0.23224	valid_1's rmse: 0.000517779	valid_1's RMSPE: 0.23888
[500]	training's rmse: 0.000498028	training's RMSPE: 0.23052	valid_1's rmse: 0.000516054	valid_1's RMSPE: 0.23809
[600]	training's rmse: 0.000495426	training's RMSPE: 0.22932	valid_1's rmse: 0.00051471	valid_1's RMSPE: 0.23747
[700]	training's rmse: 0.000493208	training's RMSPE: 0.22829	valid_1's rmse: 0.000513559	valid_1's RMSPE: 0.23693
[800]	training's rmse: 0.000491203	training's RMSPE: 0.22736	valid_1's rmse: 0.00051306	valid_1's RMSPE: 0.2367
[900]	training's rmse: 0.00048942	training's RMSPE: 0.22654	valid_1's rmse: 0.000512437	valid_1

New categorical_feature is ['stock_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7506
[LightGBM] [Info] Number of data points in the train set: 411775, number of used features: 30
[LightGBM] [Info] Start training from score 0.001801
Training until validation scores don't improve for 30 rounds




[100]	training's rmse: 0.000658956	training's RMSPE: 0.30499	valid_1's rmse: 0.0006727	valid_1's RMSPE: 0.31093
[200]	training's rmse: 0.000535156	training's RMSPE: 0.24769	valid_1's rmse: 0.000542861	valid_1's RMSPE: 0.25092
[300]	training's rmse: 0.000509356	training's RMSPE: 0.23575	valid_1's rmse: 0.000517902	valid_1's RMSPE: 0.23938
[400]	training's rmse: 0.000501869	training's RMSPE: 0.23228	valid_1's rmse: 0.000512521	valid_1's RMSPE: 0.23689
[500]	training's rmse: 0.000498141	training's RMSPE: 0.23056	valid_1's rmse: 0.000510031	valid_1's RMSPE: 0.23574
[600]	training's rmse: 0.000495538	training's RMSPE: 0.22935	valid_1's rmse: 0.000509116	valid_1's RMSPE: 0.23532
Early stopping, best iteration is:
[665]	training's rmse: 0.000494058	training's RMSPE: 0.22867	valid_1's rmse: 0.000508602	valid_1's RMSPE: 0.23508
Performance of the　prediction: , RMSPE: 0.235


In [25]:
# Split the engineered test frame into model inputs and the submission skeleton.
id_columns = ['time_id', 'row_id']

# Model inputs: every column of `df_test` except the identifier columns.
X_test = df_test.drop(columns=id_columns)
# Submission skeleton: a one-column frame holding only `row_id`.
y_pred = df_test[['row_id']]

X_test

Unnamed: 0,stock_id,log_return_realized_volatility,log_return2_realized_volatility,log_return3_realized_volatility,wap_balance_mean,price_spread_mean,bid_spread_mean,ask_spread_mean,volume_imbalance_mean,total_volume_mean,...,wap_mean_300,trade_log_return_realized_volatility,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300,stock_id_target_enc
0,0,0.000294,0.000252,0.000126,0.000145,0.000557,0.000393,-0.000115,164.666667,350.666667,...,,0.000295,3.0,201.0,3.666667,,,,,0.004028
1,0,,,,,,,,,,...,,,,,,,,,,0.004028
2,0,,,,,,,,,,...,,,,,,,,,,0.004028


In [26]:
# Ensemble inference: equal-weight average of the predictions of every model in `models`.
# With an empty `models` the loop body would never run and `target` would stay all
# zeros, silently producing a meaningless submission - fail loudly instead.
n_models = len(models)
assert n_models > 0, "no trained models available - run the training cell first"

# Restrict the test frame to the columns of `X_valid`, in that order. The selection is
# identical for every model, so it is done once instead of inside the loop.
# NOTE(review): `X_valid` is a leftover variable from an earlier cell; presumably it
# carries the feature layout the models were trained on - confirm.
X_test_features = X_test[X_valid.columns]

target = np.zeros(len(X_test))

# LightGBM models
for model in models:
    # Predict using the iteration count stored on the model as `best_iteration`.
    pred = model.predict(X_test_features, num_iteration=model.best_iteration)
    # Add this model's equal-weight share to the running average.
    target += pred / n_models

In [27]:
# Attach the averaged predictions to the submission frame as the `target` column;
# the result of `assign` is bound back to `y_pred`.
y_pred = y_pred.assign(target=target)
y_pred

Unnamed: 0,row_id,target
0,0-4,0.00102
1,0-32,0.000923
2,0-34,0.000923


In [28]:
# Write the submission file to the working directory; `index=False` keeps the
# DataFrame's row index out of the CSV.
y_pred.to_csv(path_or_buf='submission.csv', index=False)