In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/optiver-realized-volatility-prediction/sample_submission.csv
/kaggle/input/optiver-realized-volatility-prediction/train.csv
/kaggle/input/optiver-realized-volatility-prediction/test.csv
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=97/888f813404d8417ca8d6b8aebd5f2951.parquet
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=43/bb0efa57f511470e817880842e3e2afa.parquet
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=21/1d8dc18ebfee47ffbb54b04e6afc0634.parquet
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=72/60f62a03d8854605901dda072c84db39.parquet
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=4/761268d671f9429abb29d9d2895e9bd2.parquet
/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=112/cd283097a5b54293ba400a19e811a7f9.parquet
/kaggle/input/optiver-realized-volatility-pr

In [2]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_columns', 300)
from sklearn.model_selection import GroupKFold

In [3]:
# data directory
data_dir = '../input/optiver-realized-volatility-prediction/'

# Function to calculate first WAP
def calc_wap1(df):
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap

# Function to calculate second WAP
def calc_wap2(df):
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap

# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    return np.log(series).diff()

# Calculate the realized volatility
def realized_volatility(series):
    return np.sqrt(np.sum(series**2))

# Function to count unique elements of a series
def count_unique(series):
    return len(np.unique(series))

In [4]:
# Function to read our base train and test set
def read_train_test():
    test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
    # Create a key to merge with book and trade data
    test['row_id'] = test['stock_id'].astype(str) + '-' + test['time_id'].astype(str)
    return test

In [5]:
# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    # Calculate log returns
    df['log_return1'] = df.groupby(['time_id'])['wap1'].apply(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].apply(log_return)
    
    df['wap_mean'] = (df['wap1'] + df['wap2']) / 2
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    
    df['bas'] = (df[['ask_price1', 'ask_price2']].min(axis = 1)/ df[['bid_price1', 'bid_price2']].max(axis = 1) - 1) 
    df['h_spread_l1'] = df['ask_price1'] - df['bid_price1']
    df['h_spread_l2'] = df['ask_price2'] - df['bid_price2']
    df['log_return_bid_price1'] = np.log(df['bid_price1'].pct_change() + 1)
    df['log_return_ask_price1'] = np.log(df['ask_price1'].pct_change() + 1)
    df['log_return_bid_size1'] = np.log(df['bid_size1'].pct_change() + 1)
    df['log_return_ask_size1'] = np.log(df['ask_size1'].pct_change() + 1)
    df['log_ask_1_div_bid_1'] = np.log(df['ask_price1'] / df['bid_price1'])
    df['log_ask_1_div_bid_1_size'] = np.log(df['ask_size1'] / df['bid_size1'])
    df['log_return_bid_price2'] = np.log(df['bid_price2'].pct_change() + 1)
    df['log_return_ask_price2'] = np.log(df['ask_price2'].pct_change() + 1)
    df['log_return_bid_size2'] = np.log(df['bid_size2'].pct_change() + 1)
    df['log_return_ask_size2'] = np.log(df['ask_size2'].pct_change() + 1)
    df['log_ask_2_div_bid_2'] = np.log(df['ask_price2'] / df['bid_price2'])
    df['log_ask_2_div_bid_2_size'] = np.log(df['ask_size2'] / df['bid_size2'])
    
    
    # Dict for aggregations
    create_feature_dict = {
        # 测试1
        'wap1': [np.mean,np.max,np.std],#np.sum, 上次测试的结果
        'wap2': [np.mean,np.max,np.std],#np.sum, 上次测试的结果
        # np.mean, np.max 加入 sum
        #测试2
        'log_return1': [realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_return2': [realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_return_bid_price1':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_return_ask_price1':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_return_bid_size1':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_return_ask_size1':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_ask_1_div_bid_1':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_ask_1_div_bid_1_size':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_return_bid_price2':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_return_ask_price2':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_return_bid_size2':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_return_ask_size2':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_ask_2_div_bid_2':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        'log_ask_2_div_bid_2_size':[realized_volatility,np.sum], # np.sum, np.mean,np.std
        #测试3
        'wap_balance': [np.mean,np.max,np.std,np.sum], #上次测试的结果, np.sum
        'wap_mean': [np.mean,np.max,np.std,np.sum],#上次测试的结果, np.sum
        'price_spread':[np.mean,np.max,np.std,np.sum],#上次测试的结果, np.sum
        'bid_spread':[np.mean,np.max,np.std,np.sum],#上次测试的结果, np.sum
        'ask_spread':[np.mean,np.max,np.std,np.sum],#上次测试的结果, np.sum
        'total_volume':[np.mean,np.max,np.std,np.sum],#上次测试的结果, np.sum
        'volume_imbalance':[np.mean,np.max,np.std,np.sum],#上次测试的结果, np.sum
        'bas':[np.mean,np.max,np.std,np.sum],#上次测试的结果, np.sum
        'h_spread_l1':[np.mean,np.max,np.std,np.sum],#上次测试的结果, np.sum
        'h_spread_l2':[np.mean,np.max,np.std,np.sum],#上次测试的结果, np.sum
         #np.mean, np.max 加入 sum
                       }
    
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature
    
    # Get the stats for different windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    df_feature_450 = get_stats_window(seconds_in_bucket = 450, add_suffix = True)
    df_feature_300 = get_stats_window(seconds_in_bucket = 300, add_suffix = True)
    df_feature_150 = get_stats_window(seconds_in_bucket = 150, add_suffix = True)
    
    # Merge all
    df_feature = df_feature.merge(df_feature_450, how = 'left', left_on = 'time_id_', right_on = 'time_id__450')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_150, how = 'left', left_on = 'time_id_', right_on = 'time_id__150')
    # Drop unnecesary time_ids
    df_feature.drop(['time_id__450', 'time_id__300', 'time_id__150'], axis = 1, inplace = True)
    
    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    return df_feature

In [6]:
# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    df = pd.read_parquet(file_path)
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    
    # Dict for aggregations
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.mean],
    }
    
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature
    
    # Get the stats for different windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    df_feature_450 = get_stats_window(seconds_in_bucket = 450, add_suffix = True)
    df_feature_300 = get_stats_window(seconds_in_bucket = 300, add_suffix = True)
    df_feature_150 = get_stats_window(seconds_in_bucket = 150, add_suffix = True)

    # Merge all
    df_feature = df_feature.merge(df_feature_450, how = 'left', left_on = 'time_id_', right_on = 'time_id__450')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_150, how = 'left', left_on = 'time_id_', right_on = 'time_id__150')
    # Drop unnecesary time_ids
    df_feature.drop(['time_id__450', 'time_id__300', 'time_id__150'], axis = 1, inplace = True)
    
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature


In [7]:
# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    # Get realized volatility columns
    vol_cols = ['log_return1_realized_volatility', 'log_return2_realized_volatility', 'log_return1_realized_volatility_450', 'log_return2_realized_volatility_450', 
                'log_return1_realized_volatility_300', 'log_return2_realized_volatility_300', 'log_return1_realized_volatility_150', 'log_return2_realized_volatility_150', 
                'trade_log_return_realized_volatility', 'trade_log_return_realized_volatility_450', 'trade_log_return_realized_volatility_300', 'trade_log_return_realized_volatility_150']

    # Group by the stock id
    df_stock_id = df.groupby(['stock_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
    # Rename columns joining suffix
    df_stock_id.columns = ['_'.join(col) for col in df_stock_id.columns]
    df_stock_id = df_stock_id.add_suffix('_' + 'stock')

    # Group by the stock id
    df_time_id = df.groupby(['time_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
    # Rename columns joining suffix
    df_time_id.columns = ['_'.join(col) for col in df_time_id.columns]
    df_time_id = df_time_id.add_suffix('_' + 'time')
    
    # Merge with original dataframe
    df = df.merge(df_stock_id, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    df = df.merge(df_time_id, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    df.drop(['stock_id__stock', 'time_id__time'], axis = 1, inplace = True)
    return df

In [8]:
# Funtion to make preprocessing function in parallel (for each stock id)
def preprocessor(list_stock_ids, is_train = True):
    
    # Parrallel for loop
    def for_joblib(stock_id):
        # Train
        if is_train:
            file_path_book = data_dir + "book_train.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_train.parquet/stock_id=" + str(stock_id)
        # Test
        else:
            file_path_book = data_dir + "book_test.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_test.parquet/stock_id=" + str(stock_id)
    
        # Preprocess book and trade data and merge them
        df_tmp = pd.merge(book_preprocessor(file_path_book), trade_preprocessor(file_path_trade), on = 'row_id', how = 'left')
        
        # Return the merge dataframe
        return df_tmp
    
    # Use parallel api to call paralle for loop
    df = Parallel(n_jobs = -1, verbose = 1)(delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    # Concatenate all the dataframes that return from Parallel
    df = pd.concat(df, ignore_index = True)
    return df

In [9]:
# Read train and test
test = read_train_test()


# Get unique stock ids 
test_stock_ids = test['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train = False)
test = test.merge(test_, on = ['row_id'], how = 'left')

# Get group stats of time_id and stock_id
test = get_time_stock(test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.3s finished


In [10]:
train = pd.read_pickle('../input/train-data/train.pkl') #/kaggle/input/train-data/train.pkl

In [11]:
from sklearn import datasets
from sklearn import model_selection

def create_folds(data, num_splits,target):
    # we create a new column called kfold and fill it with -1
    data["kfold"] = -1
    
    # the next step is to randomize the rows of the data
    data = data.sample(frac=1).reset_index(drop=True)

    # calculate number of bins by Sturge's rule
    # I take the floor of the value, you can also
    # just round it
    num_bins = int(np.floor(1 + np.log2(len(data))))
    
    # bin targets
    data.loc[:, "bins"] = pd.cut(
        data[target], bins=num_bins, labels=False
    )
    
    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=num_splits)
    
    # fill the new kfold column
    # note that, instead of targets, we use bins!
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[v_, 'kfold'] = f
    
    # drop the bins column
    data = data.drop("bins", axis=1)

    # return dataframe with folds
    return data

train = create_folds(train, 10,"target")

In [12]:
train.head()

Unnamed: 0,stock_id,time_id,target,row_id,wap1_mean,wap1_amax,wap1_std,wap2_mean,wap2_amax,wap2_std,log_return1_realized_volatility,log_return1_sum,log_return2_realized_volatility,log_return2_sum,log_return_bid_price1_realized_volatility,log_return_bid_price1_sum,log_return_ask_price1_realized_volatility,log_return_ask_price1_sum,log_return_bid_size1_realized_volatility,log_return_bid_size1_sum,log_return_ask_size1_realized_volatility,log_return_ask_size1_sum,log_ask_1_div_bid_1_realized_volatility,log_ask_1_div_bid_1_sum,log_ask_1_div_bid_1_size_realized_volatility,log_ask_1_div_bid_1_size_sum,log_return_bid_price2_realized_volatility,log_return_bid_price2_sum,log_return_ask_price2_realized_volatility,log_return_ask_price2_sum,log_return_bid_size2_realized_volatility,log_return_bid_size2_sum,log_return_ask_size2_realized_volatility,log_return_ask_size2_sum,log_ask_2_div_bid_2_realized_volatility,log_ask_2_div_bid_2_sum,log_ask_2_div_bid_2_size_realized_volatility,log_ask_2_div_bid_2_size_sum,wap_balance_mean,wap_balance_amax,wap_balance_std,wap_balance_sum,wap_mean_mean,wap_mean_amax,wap_mean_std,wap_mean_sum,price_spread_mean,price_spread_amax,price_spread_std,price_spread_sum,bid_spread_mean,bid_spread_amax,bid_spread_std,bid_spread_sum,ask_spread_mean,ask_spread_amax,ask_spread_std,ask_spread_sum,total_volume_mean,total_volume_amax,total_volume_std,total_volume_sum,volume_imbalance_mean,volume_imbalance_amax,volume_imbalance_std,volume_imbalance_sum,bas_mean,bas_amax,bas_std,bas_sum,h_spread_l1_mean,h_spread_l1_amax,h_spread_l1_std,h_spread_l1_sum,h_spread_l2_mean,h_spread_l2_amax,h_spread_l2_std,h_spread_l2_sum,wap1_mean_450,wap1_amax_450,wap1_std_450,wap2_mean_450,wap2_amax_450,wap2_std_450,log_return1_realized_volatility_450,log_return1_sum_450,log_return2_realized_volatility_450,log_return2_sum_450,log_return_bid_price1_realized_volatility_450,log_return_bid_price1_sum_450,log_return_ask_price1_realized_volatility_450,log_return_ask_price1_sum_450,log_return_bid_size1_realized_volatility_450,log_return_bid_size1_sum_450,log_return_ask_size1_realized_volatility_450,log_return_ask_size1_sum_450,log_ask_1_div_bid_1_realized_volatility_450,log_ask_1_div_bid_1_sum_450,log_ask_1_div_bid_1_size_realized_volatility_450,log_ask_1_div_bid_1_size_sum_450,log_return_bid_price2_realized_volatility_450,log_return_bid_price2_sum_450,log_return_ask_price2_realized_volatility_450,log_return_ask_price2_sum_450,log_return_bid_size2_realized_volatility_450,log_return_bid_size2_sum_450,log_return_ask_size2_realized_volatility_450,log_return_ask_size2_sum_450,log_ask_2_div_bid_2_realized_volatility_450,log_ask_2_div_bid_2_sum_450,log_ask_2_div_bid_2_size_realized_volatility_450,log_ask_2_div_bid_2_size_sum_450,wap_balance_mean_450,wap_balance_amax_450,wap_balance_std_450,wap_balance_sum_450,wap_mean_mean_450,wap_mean_amax_450,wap_mean_std_450,wap_mean_sum_450,price_spread_mean_450,price_spread_amax_450,price_spread_std_450,price_spread_sum_450,bid_spread_mean_450,bid_spread_amax_450,bid_spread_std_450,bid_spread_sum_450,ask_spread_mean_450,ask_spread_amax_450,ask_spread_std_450,ask_spread_sum_450,total_volume_mean_450,total_volume_amax_450,total_volume_std_450,total_volume_sum_450,volume_imbalance_mean_450,volume_imbalance_amax_450,volume_imbalance_std_450,volume_imbalance_sum_450,bas_mean_450,bas_amax_450,bas_std_450,bas_sum_450,h_spread_l1_mean_450,h_spread_l1_amax_450,h_spread_l1_std_450,h_spread_l1_sum_450,h_spread_l2_mean_450,h_spread_l2_amax_450,...,wap_balance_sum_150,wap_mean_mean_150,wap_mean_amax_150,wap_mean_std_150,wap_mean_sum_150,price_spread_mean_150,price_spread_amax_150,price_spread_std_150,price_spread_sum_150,bid_spread_mean_150,bid_spread_amax_150,bid_spread_std_150,bid_spread_sum_150,ask_spread_mean_150,ask_spread_amax_150,ask_spread_std_150,ask_spread_sum_150,total_volume_mean_150,total_volume_amax_150,total_volume_std_150,total_volume_sum_150,volume_imbalance_mean_150,volume_imbalance_amax_150,volume_imbalance_std_150,volume_imbalance_sum_150,bas_mean_150,bas_amax_150,bas_std_150,bas_sum_150,h_spread_l1_mean_150,h_spread_l1_amax_150,h_spread_l1_std_150,h_spread_l1_sum_150,h_spread_l2_mean_150,h_spread_l2_amax_150,h_spread_l2_std_150,h_spread_l2_sum_150,trade_log_return_realized_volatility,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_log_return_realized_volatility_450,trade_seconds_in_bucket_count_unique_450,trade_size_sum_450,trade_order_count_mean_450,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300,trade_log_return_realized_volatility_150,trade_seconds_in_bucket_count_unique_150,trade_size_sum_150,trade_order_count_mean_150,log_return1_realized_volatility_mean_stock,log_return1_realized_volatility_std_stock,log_return1_realized_volatility_max_stock,log_return1_realized_volatility_min_stock,log_return2_realized_volatility_mean_stock,log_return2_realized_volatility_std_stock,log_return2_realized_volatility_max_stock,log_return2_realized_volatility_min_stock,log_return1_realized_volatility_450_mean_stock,log_return1_realized_volatility_450_std_stock,log_return1_realized_volatility_450_max_stock,log_return1_realized_volatility_450_min_stock,log_return2_realized_volatility_450_mean_stock,log_return2_realized_volatility_450_std_stock,log_return2_realized_volatility_450_max_stock,log_return2_realized_volatility_450_min_stock,log_return1_realized_volatility_300_mean_stock,log_return1_realized_volatility_300_std_stock,log_return1_realized_volatility_300_max_stock,log_return1_realized_volatility_300_min_stock,log_return2_realized_volatility_300_mean_stock,log_return2_realized_volatility_300_std_stock,log_return2_realized_volatility_300_max_stock,log_return2_realized_volatility_300_min_stock,log_return1_realized_volatility_150_mean_stock,log_return1_realized_volatility_150_std_stock,log_return1_realized_volatility_150_max_stock,log_return1_realized_volatility_150_min_stock,log_return2_realized_volatility_150_mean_stock,log_return2_realized_volatility_150_std_stock,log_return2_realized_volatility_150_max_stock,log_return2_realized_volatility_150_min_stock,trade_log_return_realized_volatility_mean_stock,trade_log_return_realized_volatility_std_stock,trade_log_return_realized_volatility_max_stock,trade_log_return_realized_volatility_min_stock,trade_log_return_realized_volatility_450_mean_stock,trade_log_return_realized_volatility_450_std_stock,trade_log_return_realized_volatility_450_max_stock,trade_log_return_realized_volatility_450_min_stock,trade_log_return_realized_volatility_300_mean_stock,trade_log_return_realized_volatility_300_std_stock,trade_log_return_realized_volatility_300_max_stock,trade_log_return_realized_volatility_300_min_stock,trade_log_return_realized_volatility_150_mean_stock,trade_log_return_realized_volatility_150_std_stock,trade_log_return_realized_volatility_150_max_stock,trade_log_return_realized_volatility_150_min_stock,log_return1_realized_volatility_mean_time,log_return1_realized_volatility_std_time,log_return1_realized_volatility_max_time,log_return1_realized_volatility_min_time,log_return2_realized_volatility_mean_time,log_return2_realized_volatility_std_time,log_return2_realized_volatility_max_time,log_return2_realized_volatility_min_time,log_return1_realized_volatility_450_mean_time,log_return1_realized_volatility_450_std_time,log_return1_realized_volatility_450_max_time,log_return1_realized_volatility_450_min_time,log_return2_realized_volatility_450_mean_time,log_return2_realized_volatility_450_std_time,log_return2_realized_volatility_450_max_time,log_return2_realized_volatility_450_min_time,log_return1_realized_volatility_300_mean_time,log_return1_realized_volatility_300_std_time,log_return1_realized_volatility_300_max_time,log_return1_realized_volatility_300_min_time,log_return2_realized_volatility_300_mean_time,log_return2_realized_volatility_300_std_time,log_return2_realized_volatility_300_max_time,log_return2_realized_volatility_300_min_time,log_return1_realized_volatility_150_mean_time,log_return1_realized_volatility_150_std_time,log_return1_realized_volatility_150_max_time,log_return1_realized_volatility_150_min_time,log_return2_realized_volatility_150_mean_time,log_return2_realized_volatility_150_std_time,log_return2_realized_volatility_150_max_time,log_return2_realized_volatility_150_min_time,trade_log_return_realized_volatility_mean_time,trade_log_return_realized_volatility_std_time,trade_log_return_realized_volatility_max_time,trade_log_return_realized_volatility_min_time,trade_log_return_realized_volatility_450_mean_time,trade_log_return_realized_volatility_450_std_time,trade_log_return_realized_volatility_450_max_time,trade_log_return_realized_volatility_450_min_time,trade_log_return_realized_volatility_300_mean_time,trade_log_return_realized_volatility_300_std_time,trade_log_return_realized_volatility_300_max_time,trade_log_return_realized_volatility_300_min_time,trade_log_return_realized_volatility_150_mean_time,trade_log_return_realized_volatility_150_std_time,trade_log_return_realized_volatility_150_max_time,trade_log_return_realized_volatility_150_min_time,kfold
0,55,10105,0.004934,55-10105,0.99681,1.000645,0.00189,0.99678,1.001112,0.001911,0.00826,-0.003539,0.010189,-0.003463,0.005511,-0.003578,0.004547,-0.003344,25.977785,1.504077,25.792689,-2.484907,0.029625,0.498474,43.606586,241.745081,0.005715,-0.003831,0.004512,-0.003339,30.814197,4.60517,16.429456,-2.995732,0.040357,0.697499,41.774736,315.409966,0.000572,0.00252,0.000466,0.18774,0.996795,1.00087,0.001864,326.948883,0.00152,0.002699,0.000606,0.498475,0.000253,0.001332,0.000269,0.083036,-0.000352,-3.2e-05,0.000325,-0.11537,402.140244,1040,186.755327,131902,152.463415,617,102.767984,50008,0.001521,0.002703,0.000607,0.498913,0.001515,0.002697,0.000604,0.49688,0.00212,0.00377,0.000664,0.695286,0.997553,0.998267,0.00036,0.997453,0.998516,0.000388,0.002442,0.000453,0.003793,0.001355,0.001667,0.000358,0.00254,-3.272907e-05,8.908759,-0.798508,7.756543,-2.079442,0.008119,0.067557,18.034304,101.838209,0.002214,0.000293,0.00195,-3.3e-05,13.540811,2.120264,7.782024,-3.688879,0.013538,0.115304,18.297361,102.798149,0.000352,0.00095,0.000229,0.027825,0.997503,0.998372,0.000313,78.802734,0.000855,0.001693,0.000323,0.067557,0.000245,0.000942,0.00021,0.019337,-0.000358,-3.2e-05,0.000338,-0.028306,383.151899,715,164.775145,30269,140.518987,350,89.612679,11101,0.000856,0.001694,0.000324,0.06759,0.000853,0.00169,0.000323,0.067404,0.001456,0.002665,...,0.149666,0.996281,0.999859,0.001564,271.98465,0.00146,0.002582,0.000586,0.398704,0.000257,0.001332,0.000274,0.070036,-0.000378,-3.2e-05,0.000337,-0.10328,408.40293,1040,191.195847,111494,150.0,617,102.16797,40950,0.001462,0.002585,0.000587,0.399041,0.001455,0.002567,0.000583,0.397173,0.00209,0.00377,0.000668,0.570489,0.003032,43.0,6897.0,3.162791,0.001508,23.0,4164.0,3.0,0.002094,27.0,4507.0,3.148148,0.002958,39.0,6281.0,3.102564,0.005528,0.003578,0.038087,0.000791,0.00794,0.005075,0.046336,0.001154,0.002578,0.001701,0.017997,0.000166,0.003701,0.002354,0.017858,0.000268,0.003749,0.002372,0.021259,0.000408,0.005383,0.003334,0.029147,0.000581,0.004696,0.003,0.026241,0.000609,0.006738,0.004248,0.038296,0.00078,0.003003,0.001982,0.02245,0.000387,0.00141,0.000933,0.008348,3.3e-05,0.002066,0.001313,0.012356,0.000243,0.002591,0.001677,0.017211,0.000332,0.005487,0.002794,0.018482,0.001598,0.007138,0.003489,0.021038,0.001711,0.002491,0.001271,0.006481,0.000646,0.003324,0.001701,0.008728,0.000737,0.003676,0.0018,0.009244,0.000972,0.00491,0.002502,0.01256,0.001059,0.004691,0.002386,0.01508,0.001244,0.006119,0.003036,0.017761,0.001343,0.003419,0.001624,0.013788,0.000968,0.00157,0.000703,0.004046,0.000465,0.002366,0.001083,0.007387,0.000627,0.002958,0.001405,0.011105,0.000899,0
1,19,4432,0.00334,19-4432,0.998089,0.99936,0.000542,0.997979,0.999516,0.000619,0.003081,0.000696,0.005384,0.000855,0.005559,-0.004066,0.005986,-0.004199,18.880583,-3.401197,21.456871,-0.693147,0.011404,0.209218,35.038524,-37.623478,0.005613,-0.00422,0.00583,-0.004194,26.059932,0.8020016,14.753601,0.693147,0.019193,0.362554,33.946834,257.432567,0.00023,0.000916,0.000183,0.086112,0.998034,0.999429,0.000566,374.262756,0.000558,0.001189,0.000189,0.209217,0.000182,0.000445,6.4e-05,0.068368,-0.000226,-0.000148,8.4e-05,-0.084679,753.205333,1465,224.976465,282452,174.384,870,137.144082,65394,0.000558,0.001189,0.000189,0.209283,0.000557,0.001186,0.000188,0.208813,0.000965,0.001483,0.000218,0.36186,0.998763,0.99936,0.00028,0.998733,0.999516,0.000421,0.001558,0.000324,0.003239,0.000198,0.001385,0.000743,0.001866,0.0002969974,12.855745,-3.401197,13.580043,-0.693147,0.00552,0.050784,19.907298,37.863304,0.001579,0.000743,0.001889,0.000297,14.495867,-0.234735,4.890114,0.967584,0.009229,0.087755,12.218014,33.959765,0.000175,0.000838,0.000173,0.01665,0.998748,0.999429,0.000336,94.881058,0.000535,0.001188,0.000188,0.050783,0.000181,0.000297,6.2e-05,0.017203,-0.000208,-0.000148,7.6e-05,-0.019724,714.484211,1201,177.984153,67876,135.305263,570,112.531516,12854,0.000535,0.001189,0.000188,0.050799,0.000534,0.001186,0.000188,0.050719,0.000923,0.001483,...,0.057396,0.998171,0.999429,0.000567,281.484314,0.000526,0.001188,0.000176,0.148421,0.000178,0.000445,6.2e-05,0.050275,-0.000229,-0.000148,7.6e-05,-0.064659,712.648936,1465,193.543491,200967,167.939716,689,135.041999,47359,0.000526,0.001189,0.000176,0.148465,0.000525,0.001186,0.000176,0.148156,0.000933,0.001483,0.000207,0.263089,0.002786,61.0,8524.0,2.52459,0.001278,17.0,2387.0,2.470588,0.001735,35.0,4202.0,2.257143,0.002323,48.0,6420.0,2.5,0.00321,0.002908,0.047665,0.000327,0.004421,0.004049,0.060296,0.000685,0.00147,0.001287,0.026701,9.6e-05,0.002039,0.001739,0.031643,7.7e-05,0.002139,0.001828,0.035488,0.000251,0.002954,0.00248,0.043936,0.000433,0.002695,0.002327,0.044948,0.000322,0.00371,0.003171,0.048635,0.000643,0.002155,0.001739,0.022963,0.000358,0.000998,0.000807,0.015295,0.0,0.001464,0.001163,0.018839,0.000118,0.001845,0.001473,0.021937,0.000289,0.004994,0.001359,0.010184,0.002476,0.006619,0.002205,0.013201,0.002974,0.002354,0.000725,0.005514,0.000988,0.003254,0.001213,0.007647,0.001157,0.003494,0.000984,0.00804,0.00176,0.004699,0.001676,0.01087,0.002088,0.004331,0.001186,0.009263,0.002146,0.005752,0.001949,0.011944,0.002551,0.003333,0.000661,0.00501,0.001908,0.001576,0.000436,0.002966,0.000418,0.00233,0.000504,0.00416,0.001383,0.00289,0.000568,0.004611,0.001679,0
2,17,31984,0.001286,17-31984,1.000571,1.001499,0.000399,1.000517,1.00155,0.000404,0.001431,0.001224,0.001751,0.001272,0.001678,0.00056,0.001233,0.000586,28.573571,0.555526,15.971306,3.610918,0.003899,0.060325,34.943022,30.598956,0.001597,0.000523,0.001274,0.000484,25.865727,-8.812395e-16,11.211118,0.17185,0.006715,0.108905,34.092915,277.364858,0.000102,0.00029,7.3e-05,0.029025,1.000544,1.001525,0.000397,285.154968,0.000212,0.00053,9.3e-05,0.060322,9e-05,0.000265,5.6e-05,0.025567,-8.1e-05,-3.3e-05,4.7e-05,-0.023042,398.007018,866,132.911942,113432,119.691228,382,88.347083,34112,0.000212,0.000531,9.3e-05,0.060332,0.000212,0.000531,9.3e-05,0.060357,0.000382,0.000696,0.000111,0.108966,1.001223,1.001499,0.000113,1.001138,1.00155,0.000151,0.000815,0.00055,0.001117,0.000486,0.000672,0.000464,0.000396,0.0002981832,17.266463,4.110874,2.870419,-0.301105,0.001777,0.012918,14.455523,40.135922,0.000631,0.000497,0.000294,0.000365,17.292876,0.248461,4.353447,-0.274437,0.002883,0.02166,19.262572,93.570483,0.000136,0.00029,8.3e-05,0.007864,1.001181,1.001525,0.000115,58.068478,0.000223,0.000397,7e-05,0.012918,8.9e-05,0.000265,4.9e-05,0.005172,-6.2e-05,-3.3e-05,4.9e-05,-0.00358,368.931034,608,91.654742,21398,109.344828,374,77.104065,6342,0.000223,0.000398,7e-05,0.012919,0.000223,0.000398,7e-05,0.012934,0.000374,0.000497,...,0.02163,1.000642,1.001525,0.000402,215.138016,0.000208,0.00053,9.3e-05,0.044772,8.9e-05,0.000265,5.1e-05,0.019035,-7.1e-05,-3.3e-05,4.6e-05,-0.015318,384.483721,866,125.932367,82664,121.432558,382,82.813459,26108,0.000208,0.000531,9.3e-05,0.044778,0.000208,0.000531,9.3e-05,0.044803,0.000368,0.000696,0.000106,0.079156,0.000676,34.0,3275.0,2.294118,0.000349,9.0,854.0,2.444444,0.000556,21.0,1710.0,2.047619,0.00061,29.0,2524.0,2.103448,0.004493,0.003532,0.032445,0.000769,0.006224,0.005293,0.054145,0.001142,0.002105,0.001671,0.020164,0.000273,0.002919,0.002462,0.024709,0.000387,0.003038,0.002338,0.023141,0.000456,0.004201,0.003451,0.032561,0.000711,0.0038,0.002925,0.028206,0.00065,0.005242,0.004337,0.040619,0.000973,0.002423,0.001765,0.022305,0.00038,0.001142,0.000835,0.007606,3.3e-05,0.001663,0.001178,0.010893,0.00023,0.002077,0.001472,0.013902,0.000339,0.001653,0.000831,0.005977,0.000453,0.00232,0.001402,0.012102,0.000538,0.000798,0.000477,0.003521,0.000235,0.001155,0.000939,0.008721,6.7e-05,0.001153,0.000609,0.004672,0.000303,0.001611,0.001134,0.010506,0.000131,0.00142,0.000678,0.004812,0.000354,0.001986,0.001242,0.011141,0.000464,0.001026,0.000483,0.003442,0.0,0.000494,0.000274,0.001625,0.000148,0.000727,0.000363,0.002423,0.0,0.000894,0.000415,0.002438,0.0,0
3,88,19489,0.001476,88-19489,1.000915,1.002148,0.000604,1.000942,1.002753,0.000661,0.003145,0.001269,0.005811,0.000478,0.002824,0.001823,0.002681,0.002644,14.844352,1.94591,10.531244,-1.098612,0.017395,0.186248,33.709368,127.302036,0.001674,0.001768,0.003074,0.002456,16.806466,0.4054651,15.007727,1.94591,0.021635,0.24371,22.975783,106.536429,0.000628,0.001557,0.0004,0.084773,1.000928,1.002152,0.000511,135.125336,0.00138,0.002419,0.000584,0.186248,0.000212,0.001083,0.000264,0.028638,-0.000214,-2.8e-05,0.000143,-0.028891,207.222222,520,106.274614,27975,110.037037,371,77.746854,14855,0.001381,0.002422,0.000584,0.186399,0.001381,0.002422,0.000585,0.186478,0.001807,0.002793,0.000459,0.244007,1.001434,1.002148,0.000633,1.001235,1.002753,0.000736,0.001903,0.001875,0.004231,0.001924,0.001967,0.000969,0.001235,0.001280379,9.645485,2.639057,7.75607,-0.693147,0.007984,0.05435,18.664766,27.346347,0.001244,0.001083,0.00146,0.001223,13.841581,2.197225,8.72805,-3.381966,0.011368,0.078596,16.808671,33.101562,0.000697,0.001385,0.000413,0.03486,1.001335,1.002152,0.000561,50.066738,0.001087,0.001708,0.000308,0.05435,0.00031,0.001083,0.000342,0.015501,-0.000176,-2.8e-05,9.8e-05,-0.008776,184.04,520,101.54178,9202,98.92,282,65.682551,4946,0.001088,0.001709,0.000309,0.054382,0.001089,0.00171,0.000309,0.054428,0.001574,0.002536,...,0.07558,1.000969,1.002152,0.000529,118.114326,0.001288,0.00222,0.00052,0.152043,0.00021,0.001083,0.000255,0.024763,-0.000209,-2.8e-05,0.000112,-0.024702,205.923729,520,107.293762,24299,98.618644,351,67.212499,11637,0.001289,0.002222,0.000521,0.152157,0.00129,0.002223,0.000521,0.152226,0.001709,0.002536,0.000403,0.201691,0.001333,11.0,305.0,2.363636,0.000946,4.0,283.0,4.5,0.001049,7.0,289.0,3.0,0.001333,9.0,302.0,2.666667,0.005494,0.004136,0.052881,0.000785,0.008146,0.006079,0.068164,0.00107,0.002471,0.001921,0.019909,2e-06,0.003662,0.002781,0.030274,1e-05,0.00364,0.00276,0.029981,0.000121,0.005394,0.00391,0.044831,0.000166,0.004575,0.003396,0.044789,0.000448,0.006778,0.004856,0.060811,0.000804,0.002538,0.001867,0.021077,0.0,0.0012,0.000982,0.012844,0.0,0.001763,0.001325,0.018762,2.8e-05,0.002213,0.001631,0.020614,0.0,0.003284,0.001903,0.009077,0.000585,0.004426,0.002418,0.011184,0.000699,0.001463,0.000903,0.005991,0.00026,0.002108,0.001265,0.006981,0.00029,0.002171,0.001293,0.006974,0.000383,0.00306,0.001762,0.008833,0.000436,0.002749,0.001596,0.008095,0.000484,0.003738,0.002088,0.010227,0.000576,0.00178,0.00105,0.006438,0.000532,0.000829,0.000606,0.00449,0.000196,0.001197,0.000795,0.005181,0.000225,0.001533,0.000924,0.005945,0.000408,0
4,89,4219,0.002721,89-4219,1.001622,1.004503,0.001092,1.00161,1.004263,0.001035,0.002991,0.001376,0.003855,0.001757,0.00354,0.000293,0.003634,0.000488,10.505058,-0.024278,12.929481,-1.389622,0.008098,0.106095,16.333608,14.425005,0.003509,0.000357,0.00368,0.000942,6.809691,-0.8109302,8.285711,-1.660009,0.016135,0.232184,16.817982,20.903209,0.000215,0.000877,0.000201,0.047896,1.001616,1.004309,0.001053,223.360397,0.000476,0.001296,0.000261,0.106096,0.000279,0.00052,6.7e-05,0.06211,-0.000288,-0.00026,9.5e-05,-0.064189,1330.076233,2964,549.661426,296607,411.878924,2164,408.548693,91849,0.000476,0.001297,0.000261,0.106128,0.000477,0.001299,0.000262,0.106289,0.001043,0.001819,0.00029,0.232589,1.001365,1.002236,0.000466,1.001405,1.002391,0.000477,0.0011,6.1e-05,0.001675,0.000176,0.001445,-0.000259,0.001468,1.151639e-07,5.507447,1.250403,8.078491,-0.741937,0.003511,0.024139,9.568171,-33.220746,0.001446,-0.00026,0.001421,0.000519,3.353394,0.287682,2.769465,0.119121,0.00779,0.057875,7.227618,-18.657636,0.000138,0.00042,9.9e-05,0.008113,1.001385,1.002288,0.000464,59.081718,0.000409,0.001038,0.000206,0.024138,0.000264,0.00052,3.4e-05,0.015593,-0.000308,-0.00026,0.00014,-0.018191,1295.254237,2132,471.495734,76420,404.711864,1188,366.497968,23878,0.000409,0.001038,0.000206,0.024145,0.00041,0.00104,0.000206,0.024168,0.000982,0.001559,...,0.031753,1.001814,1.004309,0.001067,162.293854,0.000483,0.001038,0.000243,0.078327,0.000274,0.00052,6e-05,0.044439,-0.000298,-0.00026,0.000109,-0.048337,1240.302469,2457,503.75026,200929,343.141975,1188,278.526823,55589,0.000484,0.001038,0.000243,0.078351,0.000484,0.00104,0.000243,0.078483,0.001057,0.001819,0.000275,0.171258,0.002946,65.0,21939.0,4.0,0.001086,17.0,3720.0,3.235294,0.001524,33.0,7934.0,3.30303,0.002629,48.0,14486.0,3.729167,0.003729,0.003015,0.040193,0.000396,0.005048,0.003989,0.057986,0.000173,0.001709,0.001309,0.022073,2e-06,0.002316,0.001783,0.026552,2.3e-05,0.002497,0.001862,0.026069,0.000234,0.003385,0.002522,0.033769,6.2e-05,0.003128,0.002369,0.033052,0.000305,0.004242,0.003195,0.044269,0.000155,0.00292,0.002263,0.026693,0.0,0.001347,0.001012,0.009938,0.0,0.001976,0.001434,0.016816,0.0,0.002484,0.00182,0.022633,0.0,0.00348,0.001663,0.011198,0.000546,0.00477,0.002358,0.01685,0.000755,0.001635,0.000806,0.004356,0.00025,0.002223,0.001073,0.006064,0.000354,0.002352,0.001157,0.007353,0.000389,0.003247,0.001609,0.010439,0.00059,0.002938,0.001418,0.009902,0.000467,0.004046,0.002019,0.015017,0.000664,0.002097,0.000788,0.005809,0.0005,0.001006,0.000464,0.003334,2.4e-05,0.001477,0.000603,0.004866,0.000346,0.001835,0.000696,0.005497,0.000429,0


In [13]:
# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'RMSPE', rmspe(y_true, y_pred), False


In [14]:
# Split features and target
x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
y = train['target']
x_test = test.drop(['row_id', 'time_id'], axis = 1)
    # Transform stock id to a numeric value
x['stock_id'] = x['stock_id'].astype(int)
x_test['stock_id'] = x_test['stock_id'].astype(int)
    
 # Create out of folds array
oof_predictions = np.zeros(x.shape[0])
# Create test array to store predictions
test_predictions = np.zeros(x_test.shape[0])
# # Create a groupFold object
# groupkfold = GroupKFold(n_splits = 10, shuffle=True, random_state=42)
# # Iterate through each fold
# for fold, (trn_ind, val_ind) in enumerate(groupkfold.split(train,train['target'],'time_id')):
#     x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
#     y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
for fold in range(10):
        traindf = train[train["kfold"]!=fold].reset_index(drop=True)
        validdf = train[train["kfold"]==fold].reset_index(drop=True)
        x_train = traindf.drop(['row_id', 'target', 'time_id',"kfold"], axis = 1)
        y_train = traindf['target']
        x_val = validdf.drop(['row_id', 'target', 'time_id',"kfold"], axis = 1)
        y_val =  validdf['target']

In [15]:
x_train.head()

Unnamed: 0,stock_id,wap1_mean,wap1_amax,wap1_std,wap2_mean,wap2_amax,wap2_std,log_return1_realized_volatility,log_return1_sum,log_return2_realized_volatility,log_return2_sum,log_return_bid_price1_realized_volatility,log_return_bid_price1_sum,log_return_ask_price1_realized_volatility,log_return_ask_price1_sum,log_return_bid_size1_realized_volatility,log_return_bid_size1_sum,log_return_ask_size1_realized_volatility,log_return_ask_size1_sum,log_ask_1_div_bid_1_realized_volatility,log_ask_1_div_bid_1_sum,log_ask_1_div_bid_1_size_realized_volatility,log_ask_1_div_bid_1_size_sum,log_return_bid_price2_realized_volatility,log_return_bid_price2_sum,log_return_ask_price2_realized_volatility,log_return_ask_price2_sum,log_return_bid_size2_realized_volatility,log_return_bid_size2_sum,log_return_ask_size2_realized_volatility,log_return_ask_size2_sum,log_ask_2_div_bid_2_realized_volatility,log_ask_2_div_bid_2_sum,log_ask_2_div_bid_2_size_realized_volatility,log_ask_2_div_bid_2_size_sum,wap_balance_mean,wap_balance_amax,wap_balance_std,wap_balance_sum,wap_mean_mean,wap_mean_amax,wap_mean_std,wap_mean_sum,price_spread_mean,price_spread_amax,price_spread_std,price_spread_sum,bid_spread_mean,bid_spread_amax,bid_spread_std,bid_spread_sum,ask_spread_mean,ask_spread_amax,ask_spread_std,ask_spread_sum,total_volume_mean,total_volume_amax,total_volume_std,total_volume_sum,volume_imbalance_mean,volume_imbalance_amax,volume_imbalance_std,volume_imbalance_sum,bas_mean,bas_amax,bas_std,bas_sum,h_spread_l1_mean,h_spread_l1_amax,h_spread_l1_std,h_spread_l1_sum,h_spread_l2_mean,h_spread_l2_amax,h_spread_l2_std,h_spread_l2_sum,wap1_mean_450,wap1_amax_450,wap1_std_450,wap2_mean_450,wap2_amax_450,wap2_std_450,log_return1_realized_volatility_450,log_return1_sum_450,log_return2_realized_volatility_450,log_return2_sum_450,log_return_bid_price1_realized_volatility_450,log_return_bid_price1_sum_450,log_return_ask_price1_realized_volatility_450,log_return_ask_price1_sum_450,log_return_bid_size1_realized_volatility_450,log_return_bid_size1_sum_450,log_return_ask_size1_realized_volatility_450,log_return_ask_size1_sum_450,log_ask_1_div_bid_1_realized_volatility_450,log_ask_1_div_bid_1_sum_450,log_ask_1_div_bid_1_size_realized_volatility_450,log_ask_1_div_bid_1_size_sum_450,log_return_bid_price2_realized_volatility_450,log_return_bid_price2_sum_450,log_return_ask_price2_realized_volatility_450,log_return_ask_price2_sum_450,log_return_bid_size2_realized_volatility_450,log_return_bid_size2_sum_450,log_return_ask_size2_realized_volatility_450,log_return_ask_size2_sum_450,log_ask_2_div_bid_2_realized_volatility_450,log_ask_2_div_bid_2_sum_450,log_ask_2_div_bid_2_size_realized_volatility_450,log_ask_2_div_bid_2_size_sum_450,wap_balance_mean_450,wap_balance_amax_450,wap_balance_std_450,wap_balance_sum_450,wap_mean_mean_450,wap_mean_amax_450,wap_mean_std_450,wap_mean_sum_450,price_spread_mean_450,price_spread_amax_450,price_spread_std_450,price_spread_sum_450,bid_spread_mean_450,bid_spread_amax_450,bid_spread_std_450,bid_spread_sum_450,ask_spread_mean_450,ask_spread_amax_450,ask_spread_std_450,ask_spread_sum_450,total_volume_mean_450,total_volume_amax_450,total_volume_std_450,total_volume_sum_450,volume_imbalance_mean_450,volume_imbalance_amax_450,volume_imbalance_std_450,volume_imbalance_sum_450,bas_mean_450,bas_amax_450,bas_std_450,bas_sum_450,h_spread_l1_mean_450,h_spread_l1_amax_450,h_spread_l1_std_450,h_spread_l1_sum_450,h_spread_l2_mean_450,h_spread_l2_amax_450,h_spread_l2_std_450,h_spread_l2_sum_450,wap1_mean_300,...,wap_balance_std_150,wap_balance_sum_150,wap_mean_mean_150,wap_mean_amax_150,wap_mean_std_150,wap_mean_sum_150,price_spread_mean_150,price_spread_amax_150,price_spread_std_150,price_spread_sum_150,bid_spread_mean_150,bid_spread_amax_150,bid_spread_std_150,bid_spread_sum_150,ask_spread_mean_150,ask_spread_amax_150,ask_spread_std_150,ask_spread_sum_150,total_volume_mean_150,total_volume_amax_150,total_volume_std_150,total_volume_sum_150,volume_imbalance_mean_150,volume_imbalance_amax_150,volume_imbalance_std_150,volume_imbalance_sum_150,bas_mean_150,bas_amax_150,bas_std_150,bas_sum_150,h_spread_l1_mean_150,h_spread_l1_amax_150,h_spread_l1_std_150,h_spread_l1_sum_150,h_spread_l2_mean_150,h_spread_l2_amax_150,h_spread_l2_std_150,h_spread_l2_sum_150,trade_log_return_realized_volatility,trade_seconds_in_bucket_count_unique,trade_size_sum,trade_order_count_mean,trade_log_return_realized_volatility_450,trade_seconds_in_bucket_count_unique_450,trade_size_sum_450,trade_order_count_mean_450,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300,trade_log_return_realized_volatility_150,trade_seconds_in_bucket_count_unique_150,trade_size_sum_150,trade_order_count_mean_150,log_return1_realized_volatility_mean_stock,log_return1_realized_volatility_std_stock,log_return1_realized_volatility_max_stock,log_return1_realized_volatility_min_stock,log_return2_realized_volatility_mean_stock,log_return2_realized_volatility_std_stock,log_return2_realized_volatility_max_stock,log_return2_realized_volatility_min_stock,log_return1_realized_volatility_450_mean_stock,log_return1_realized_volatility_450_std_stock,log_return1_realized_volatility_450_max_stock,log_return1_realized_volatility_450_min_stock,log_return2_realized_volatility_450_mean_stock,log_return2_realized_volatility_450_std_stock,log_return2_realized_volatility_450_max_stock,log_return2_realized_volatility_450_min_stock,log_return1_realized_volatility_300_mean_stock,log_return1_realized_volatility_300_std_stock,log_return1_realized_volatility_300_max_stock,log_return1_realized_volatility_300_min_stock,log_return2_realized_volatility_300_mean_stock,log_return2_realized_volatility_300_std_stock,log_return2_realized_volatility_300_max_stock,log_return2_realized_volatility_300_min_stock,log_return1_realized_volatility_150_mean_stock,log_return1_realized_volatility_150_std_stock,log_return1_realized_volatility_150_max_stock,log_return1_realized_volatility_150_min_stock,log_return2_realized_volatility_150_mean_stock,log_return2_realized_volatility_150_std_stock,log_return2_realized_volatility_150_max_stock,log_return2_realized_volatility_150_min_stock,trade_log_return_realized_volatility_mean_stock,trade_log_return_realized_volatility_std_stock,trade_log_return_realized_volatility_max_stock,trade_log_return_realized_volatility_min_stock,trade_log_return_realized_volatility_450_mean_stock,trade_log_return_realized_volatility_450_std_stock,trade_log_return_realized_volatility_450_max_stock,trade_log_return_realized_volatility_450_min_stock,trade_log_return_realized_volatility_300_mean_stock,trade_log_return_realized_volatility_300_std_stock,trade_log_return_realized_volatility_300_max_stock,trade_log_return_realized_volatility_300_min_stock,trade_log_return_realized_volatility_150_mean_stock,trade_log_return_realized_volatility_150_std_stock,trade_log_return_realized_volatility_150_max_stock,trade_log_return_realized_volatility_150_min_stock,log_return1_realized_volatility_mean_time,log_return1_realized_volatility_std_time,log_return1_realized_volatility_max_time,log_return1_realized_volatility_min_time,log_return2_realized_volatility_mean_time,log_return2_realized_volatility_std_time,log_return2_realized_volatility_max_time,log_return2_realized_volatility_min_time,log_return1_realized_volatility_450_mean_time,log_return1_realized_volatility_450_std_time,log_return1_realized_volatility_450_max_time,log_return1_realized_volatility_450_min_time,log_return2_realized_volatility_450_mean_time,log_return2_realized_volatility_450_std_time,log_return2_realized_volatility_450_max_time,log_return2_realized_volatility_450_min_time,log_return1_realized_volatility_300_mean_time,log_return1_realized_volatility_300_std_time,log_return1_realized_volatility_300_max_time,log_return1_realized_volatility_300_min_time,log_return2_realized_volatility_300_mean_time,log_return2_realized_volatility_300_std_time,log_return2_realized_volatility_300_max_time,log_return2_realized_volatility_300_min_time,log_return1_realized_volatility_150_mean_time,log_return1_realized_volatility_150_std_time,log_return1_realized_volatility_150_max_time,log_return1_realized_volatility_150_min_time,log_return2_realized_volatility_150_mean_time,log_return2_realized_volatility_150_std_time,log_return2_realized_volatility_150_max_time,log_return2_realized_volatility_150_min_time,trade_log_return_realized_volatility_mean_time,trade_log_return_realized_volatility_std_time,trade_log_return_realized_volatility_max_time,trade_log_return_realized_volatility_min_time,trade_log_return_realized_volatility_450_mean_time,trade_log_return_realized_volatility_450_std_time,trade_log_return_realized_volatility_450_max_time,trade_log_return_realized_volatility_450_min_time,trade_log_return_realized_volatility_300_mean_time,trade_log_return_realized_volatility_300_std_time,trade_log_return_realized_volatility_300_max_time,trade_log_return_realized_volatility_300_min_time,trade_log_return_realized_volatility_150_mean_time,trade_log_return_realized_volatility_150_std_time,trade_log_return_realized_volatility_150_max_time,trade_log_return_realized_volatility_150_min_time
0,55,0.99681,1.000645,0.00189,0.99678,1.001112,0.001911,0.00826,-0.003539,0.010189,-0.003463,0.005511,-0.003578,0.004547,-0.003344,25.977785,1.504077,25.792689,-2.484907,0.029625,0.498474,43.606586,241.745081,0.005715,-0.003831,0.004512,-0.003339,30.814197,4.60517,16.429456,-2.995732,0.040357,0.697499,41.774736,315.409966,0.000572,0.00252,0.000466,0.18774,0.996795,1.00087,0.001864,326.948883,0.00152,0.002699,0.000606,0.498475,0.000253,0.001332,0.000269,0.083036,-0.000352,-3.2e-05,0.000325,-0.11537,402.140244,1040,186.755327,131902,152.463415,617,102.767984,50008,0.001521,0.002703,0.000607,0.498913,0.001515,0.002697,0.000604,0.49688,0.00212,0.00377,0.000664,0.695286,0.997553,0.998267,0.00036,0.997453,0.998516,0.000388,0.002442,0.000453,0.003793,0.001355,0.001667,0.000358,0.00254,-3.272907e-05,8.908759,-0.798508,7.756543,-2.079442,0.008119,0.067557,18.034304,101.838209,0.002214,0.000293,0.00195,-3.3e-05,13.540811,2.120264,7.782024,-3.688879,0.013538,0.115304,18.297361,102.798149,0.000352,0.00095,0.000229,0.027825,0.997503,0.998372,0.000313,78.802734,0.000855,0.001693,0.000323,0.067557,0.000245,0.000942,0.00021,0.019337,-0.000358,-3.2e-05,0.000338,-0.028306,383.151899,715,164.775145,30269,140.518987,350,89.612679,11101,0.000856,0.001694,0.000324,0.06759,0.000853,0.00169,0.000323,0.067404,0.001456,0.002665,0.000437,0.115047,0.99611,...,0.00043,0.149666,0.996281,0.999859,0.001564,271.98465,0.00146,0.002582,0.000586,0.398704,0.000257,0.001332,0.000274,0.070036,-0.000378,-3.2e-05,0.000337,-0.10328,408.40293,1040,191.195847,111494,150.0,617,102.16797,40950,0.001462,0.002585,0.000587,0.399041,0.001455,0.002567,0.000583,0.397173,0.00209,0.00377,0.000668,0.570489,0.003032,43.0,6897.0,3.162791,0.001508,23.0,4164.0,3.0,0.002094,27.0,4507.0,3.148148,0.002958,39.0,6281.0,3.102564,0.005528,0.003578,0.038087,0.000791,0.00794,0.005075,0.046336,0.001154,0.002578,0.001701,0.017997,0.000166,0.003701,0.002354,0.017858,0.000268,0.003749,0.002372,0.021259,0.000408,0.005383,0.003334,0.029147,0.000581,0.004696,0.003,0.026241,0.000609,0.006738,0.004248,0.038296,0.00078,0.003003,0.001982,0.02245,0.000387,0.00141,0.000933,0.008348,3.3e-05,0.002066,0.001313,0.012356,0.000243,0.002591,0.001677,0.017211,0.000332,0.005487,0.002794,0.018482,0.001598,0.007138,0.003489,0.021038,0.001711,0.002491,0.001271,0.006481,0.000646,0.003324,0.001701,0.008728,0.000737,0.003676,0.0018,0.009244,0.000972,0.00491,0.002502,0.01256,0.001059,0.004691,0.002386,0.01508,0.001244,0.006119,0.003036,0.017761,0.001343,0.003419,0.001624,0.013788,0.000968,0.00157,0.000703,0.004046,0.000465,0.002366,0.001083,0.007387,0.000627,0.002958,0.001405,0.011105,0.000899
1,19,0.998089,0.99936,0.000542,0.997979,0.999516,0.000619,0.003081,0.000696,0.005384,0.000855,0.005559,-0.004066,0.005986,-0.004199,18.880583,-3.401197,21.456871,-0.693147,0.011404,0.209218,35.038524,-37.623478,0.005613,-0.00422,0.00583,-0.004194,26.059932,0.8020016,14.753601,0.693147,0.019193,0.362554,33.946834,257.432567,0.00023,0.000916,0.000183,0.086112,0.998034,0.999429,0.000566,374.262756,0.000558,0.001189,0.000189,0.209217,0.000182,0.000445,6.4e-05,0.068368,-0.000226,-0.000148,8.4e-05,-0.084679,753.205333,1465,224.976465,282452,174.384,870,137.144082,65394,0.000558,0.001189,0.000189,0.209283,0.000557,0.001186,0.000188,0.208813,0.000965,0.001483,0.000218,0.36186,0.998763,0.99936,0.00028,0.998733,0.999516,0.000421,0.001558,0.000324,0.003239,0.000198,0.001385,0.000743,0.001866,0.0002969974,12.855745,-3.401197,13.580043,-0.693147,0.00552,0.050784,19.907298,37.863304,0.001579,0.000743,0.001889,0.000297,14.495867,-0.234735,4.890114,0.967584,0.009229,0.087755,12.218014,33.959765,0.000175,0.000838,0.000173,0.01665,0.998748,0.999429,0.000336,94.881058,0.000535,0.001188,0.000188,0.050783,0.000181,0.000297,6.2e-05,0.017203,-0.000208,-0.000148,7.6e-05,-0.019724,714.484211,1201,177.984153,67876,135.305263,570,112.531516,12854,0.000535,0.001189,0.000188,0.050799,0.000534,0.001186,0.000188,0.050719,0.000923,0.001483,0.000209,0.087647,0.99828,...,0.000151,0.057396,0.998171,0.999429,0.000567,281.484314,0.000526,0.001188,0.000176,0.148421,0.000178,0.000445,6.2e-05,0.050275,-0.000229,-0.000148,7.6e-05,-0.064659,712.648936,1465,193.543491,200967,167.939716,689,135.041999,47359,0.000526,0.001189,0.000176,0.148465,0.000525,0.001186,0.000176,0.148156,0.000933,0.001483,0.000207,0.263089,0.002786,61.0,8524.0,2.52459,0.001278,17.0,2387.0,2.470588,0.001735,35.0,4202.0,2.257143,0.002323,48.0,6420.0,2.5,0.00321,0.002908,0.047665,0.000327,0.004421,0.004049,0.060296,0.000685,0.00147,0.001287,0.026701,9.6e-05,0.002039,0.001739,0.031643,7.7e-05,0.002139,0.001828,0.035488,0.000251,0.002954,0.00248,0.043936,0.000433,0.002695,0.002327,0.044948,0.000322,0.00371,0.003171,0.048635,0.000643,0.002155,0.001739,0.022963,0.000358,0.000998,0.000807,0.015295,0.0,0.001464,0.001163,0.018839,0.000118,0.001845,0.001473,0.021937,0.000289,0.004994,0.001359,0.010184,0.002476,0.006619,0.002205,0.013201,0.002974,0.002354,0.000725,0.005514,0.000988,0.003254,0.001213,0.007647,0.001157,0.003494,0.000984,0.00804,0.00176,0.004699,0.001676,0.01087,0.002088,0.004331,0.001186,0.009263,0.002146,0.005752,0.001949,0.011944,0.002551,0.003333,0.000661,0.00501,0.001908,0.001576,0.000436,0.002966,0.000418,0.00233,0.000504,0.00416,0.001383,0.00289,0.000568,0.004611,0.001679
2,17,1.000571,1.001499,0.000399,1.000517,1.00155,0.000404,0.001431,0.001224,0.001751,0.001272,0.001678,0.00056,0.001233,0.000586,28.573571,0.555526,15.971306,3.610918,0.003899,0.060325,34.943022,30.598956,0.001597,0.000523,0.001274,0.000484,25.865727,-8.812395e-16,11.211118,0.17185,0.006715,0.108905,34.092915,277.364858,0.000102,0.00029,7.3e-05,0.029025,1.000544,1.001525,0.000397,285.154968,0.000212,0.00053,9.3e-05,0.060322,9e-05,0.000265,5.6e-05,0.025567,-8.1e-05,-3.3e-05,4.7e-05,-0.023042,398.007018,866,132.911942,113432,119.691228,382,88.347083,34112,0.000212,0.000531,9.3e-05,0.060332,0.000212,0.000531,9.3e-05,0.060357,0.000382,0.000696,0.000111,0.108966,1.001223,1.001499,0.000113,1.001138,1.00155,0.000151,0.000815,0.00055,0.001117,0.000486,0.000672,0.000464,0.000396,0.0002981832,17.266463,4.110874,2.870419,-0.301105,0.001777,0.012918,14.455523,40.135922,0.000631,0.000497,0.000294,0.000365,17.292876,0.248461,4.353447,-0.274437,0.002883,0.02166,19.262572,93.570483,0.000136,0.00029,8.3e-05,0.007864,1.001181,1.001525,0.000115,58.068478,0.000223,0.000397,7e-05,0.012918,8.9e-05,0.000265,4.9e-05,0.005172,-6.2e-05,-3.3e-05,4.9e-05,-0.00358,368.931034,608,91.654742,21398,109.344828,374,77.104065,6342,0.000223,0.000398,7e-05,0.012919,0.000223,0.000398,7e-05,0.012934,0.000374,0.000497,6.2e-05,0.021686,1.00087,...,7.2e-05,0.02163,1.000642,1.001525,0.000402,215.138016,0.000208,0.00053,9.3e-05,0.044772,8.9e-05,0.000265,5.1e-05,0.019035,-7.1e-05,-3.3e-05,4.6e-05,-0.015318,384.483721,866,125.932367,82664,121.432558,382,82.813459,26108,0.000208,0.000531,9.3e-05,0.044778,0.000208,0.000531,9.3e-05,0.044803,0.000368,0.000696,0.000106,0.079156,0.000676,34.0,3275.0,2.294118,0.000349,9.0,854.0,2.444444,0.000556,21.0,1710.0,2.047619,0.00061,29.0,2524.0,2.103448,0.004493,0.003532,0.032445,0.000769,0.006224,0.005293,0.054145,0.001142,0.002105,0.001671,0.020164,0.000273,0.002919,0.002462,0.024709,0.000387,0.003038,0.002338,0.023141,0.000456,0.004201,0.003451,0.032561,0.000711,0.0038,0.002925,0.028206,0.00065,0.005242,0.004337,0.040619,0.000973,0.002423,0.001765,0.022305,0.00038,0.001142,0.000835,0.007606,3.3e-05,0.001663,0.001178,0.010893,0.00023,0.002077,0.001472,0.013902,0.000339,0.001653,0.000831,0.005977,0.000453,0.00232,0.001402,0.012102,0.000538,0.000798,0.000477,0.003521,0.000235,0.001155,0.000939,0.008721,6.7e-05,0.001153,0.000609,0.004672,0.000303,0.001611,0.001134,0.010506,0.000131,0.00142,0.000678,0.004812,0.000354,0.001986,0.001242,0.011141,0.000464,0.001026,0.000483,0.003442,0.0,0.000494,0.000274,0.001625,0.000148,0.000727,0.000363,0.002423,0.0,0.000894,0.000415,0.002438,0.0
3,88,1.000915,1.002148,0.000604,1.000942,1.002753,0.000661,0.003145,0.001269,0.005811,0.000478,0.002824,0.001823,0.002681,0.002644,14.844352,1.94591,10.531244,-1.098612,0.017395,0.186248,33.709368,127.302036,0.001674,0.001768,0.003074,0.002456,16.806466,0.4054651,15.007727,1.94591,0.021635,0.24371,22.975783,106.536429,0.000628,0.001557,0.0004,0.084773,1.000928,1.002152,0.000511,135.125336,0.00138,0.002419,0.000584,0.186248,0.000212,0.001083,0.000264,0.028638,-0.000214,-2.8e-05,0.000143,-0.028891,207.222222,520,106.274614,27975,110.037037,371,77.746854,14855,0.001381,0.002422,0.000584,0.186399,0.001381,0.002422,0.000585,0.186478,0.001807,0.002793,0.000459,0.244007,1.001434,1.002148,0.000633,1.001235,1.002753,0.000736,0.001903,0.001875,0.004231,0.001924,0.001967,0.000969,0.001235,0.001280379,9.645485,2.639057,7.75607,-0.693147,0.007984,0.05435,18.664766,27.346347,0.001244,0.001083,0.00146,0.001223,13.841581,2.197225,8.72805,-3.381966,0.011368,0.078596,16.808671,33.101562,0.000697,0.001385,0.000413,0.03486,1.001335,1.002152,0.000561,50.066738,0.001087,0.001708,0.000308,0.05435,0.00031,0.001083,0.000342,0.015501,-0.000176,-2.8e-05,9.8e-05,-0.008776,184.04,520,101.54178,9202,98.92,282,65.682551,4946,0.001088,0.001709,0.000309,0.054382,0.001089,0.00171,0.000309,0.054428,0.001574,0.002536,0.000341,0.078705,1.00119,...,0.000419,0.07558,1.000969,1.002152,0.000529,118.114326,0.001288,0.00222,0.00052,0.152043,0.00021,0.001083,0.000255,0.024763,-0.000209,-2.8e-05,0.000112,-0.024702,205.923729,520,107.293762,24299,98.618644,351,67.212499,11637,0.001289,0.002222,0.000521,0.152157,0.00129,0.002223,0.000521,0.152226,0.001709,0.002536,0.000403,0.201691,0.001333,11.0,305.0,2.363636,0.000946,4.0,283.0,4.5,0.001049,7.0,289.0,3.0,0.001333,9.0,302.0,2.666667,0.005494,0.004136,0.052881,0.000785,0.008146,0.006079,0.068164,0.00107,0.002471,0.001921,0.019909,2e-06,0.003662,0.002781,0.030274,1e-05,0.00364,0.00276,0.029981,0.000121,0.005394,0.00391,0.044831,0.000166,0.004575,0.003396,0.044789,0.000448,0.006778,0.004856,0.060811,0.000804,0.002538,0.001867,0.021077,0.0,0.0012,0.000982,0.012844,0.0,0.001763,0.001325,0.018762,2.8e-05,0.002213,0.001631,0.020614,0.0,0.003284,0.001903,0.009077,0.000585,0.004426,0.002418,0.011184,0.000699,0.001463,0.000903,0.005991,0.00026,0.002108,0.001265,0.006981,0.00029,0.002171,0.001293,0.006974,0.000383,0.00306,0.001762,0.008833,0.000436,0.002749,0.001596,0.008095,0.000484,0.003738,0.002088,0.010227,0.000576,0.00178,0.00105,0.006438,0.000532,0.000829,0.000606,0.00449,0.000196,0.001197,0.000795,0.005181,0.000225,0.001533,0.000924,0.005945,0.000408
4,89,1.001622,1.004503,0.001092,1.00161,1.004263,0.001035,0.002991,0.001376,0.003855,0.001757,0.00354,0.000293,0.003634,0.000488,10.505058,-0.024278,12.929481,-1.389622,0.008098,0.106095,16.333608,14.425005,0.003509,0.000357,0.00368,0.000942,6.809691,-0.8109302,8.285711,-1.660009,0.016135,0.232184,16.817982,20.903209,0.000215,0.000877,0.000201,0.047896,1.001616,1.004309,0.001053,223.360397,0.000476,0.001296,0.000261,0.106096,0.000279,0.00052,6.7e-05,0.06211,-0.000288,-0.00026,9.5e-05,-0.064189,1330.076233,2964,549.661426,296607,411.878924,2164,408.548693,91849,0.000476,0.001297,0.000261,0.106128,0.000477,0.001299,0.000262,0.106289,0.001043,0.001819,0.00029,0.232589,1.001365,1.002236,0.000466,1.001405,1.002391,0.000477,0.0011,6.1e-05,0.001675,0.000176,0.001445,-0.000259,0.001468,1.151639e-07,5.507447,1.250403,8.078491,-0.741937,0.003511,0.024139,9.568171,-33.220746,0.001446,-0.00026,0.001421,0.000519,3.353394,0.287682,2.769465,0.119121,0.00779,0.057875,7.227618,-18.657636,0.000138,0.00042,9.9e-05,0.008113,1.001385,1.002288,0.000464,59.081718,0.000409,0.001038,0.000206,0.024138,0.000264,0.00052,3.4e-05,0.015593,-0.000308,-0.00026,0.00014,-0.018191,1295.254237,2132,471.495734,76420,404.711864,1188,366.497968,23878,0.000409,0.001038,0.000206,0.024145,0.00041,0.00104,0.000206,0.024168,0.000982,0.001559,0.00026,0.057952,1.001178,...,0.000174,0.031753,1.001814,1.004309,0.001067,162.293854,0.000483,0.001038,0.000243,0.078327,0.000274,0.00052,6e-05,0.044439,-0.000298,-0.00026,0.000109,-0.048337,1240.302469,2457,503.75026,200929,343.141975,1188,278.526823,55589,0.000484,0.001038,0.000243,0.078351,0.000484,0.00104,0.000243,0.078483,0.001057,0.001819,0.000275,0.171258,0.002946,65.0,21939.0,4.0,0.001086,17.0,3720.0,3.235294,0.001524,33.0,7934.0,3.30303,0.002629,48.0,14486.0,3.729167,0.003729,0.003015,0.040193,0.000396,0.005048,0.003989,0.057986,0.000173,0.001709,0.001309,0.022073,2e-06,0.002316,0.001783,0.026552,2.3e-05,0.002497,0.001862,0.026069,0.000234,0.003385,0.002522,0.033769,6.2e-05,0.003128,0.002369,0.033052,0.000305,0.004242,0.003195,0.044269,0.000155,0.00292,0.002263,0.026693,0.0,0.001347,0.001012,0.009938,0.0,0.001976,0.001434,0.016816,0.0,0.002484,0.00182,0.022633,0.0,0.00348,0.001663,0.011198,0.000546,0.00477,0.002358,0.01685,0.000755,0.001635,0.000806,0.004356,0.00025,0.002223,0.001073,0.006064,0.000354,0.002352,0.001157,0.007353,0.000389,0.003247,0.001609,0.010439,0.00059,0.002938,0.001418,0.009902,0.000467,0.004046,0.002019,0.015017,0.000664,0.002097,0.000788,0.005809,0.0005,0.001006,0.000464,0.003334,2.4e-05,0.001477,0.000603,0.004866,0.000346,0.001835,0.000696,0.005497,0.000429


In [16]:
x_val.shape

(42893, 409)

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import os
import glob
from tqdm import tqdm
from joblib import Parallel, delayed
import gc

from sklearn.model_selection import train_test_split, KFold

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import optuna
from optuna.samplers import TPESampler

In [18]:
def objective(trial):
    
    def rmspe(y_true, y_pred):
        return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
    
    valid = [(x_val, y_val)]
    
    param = {
        "device": "gpu",
        'boosting': 'gbdt',
        "metric": "rmse",
        "verbosity": -1,
        'learning_rate':trial.suggest_loguniform('learning_rate', 0.005, 0.5),
        "max_depth": -1,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "n_estimators": trial.suggest_int("n_estimators", 100, 4000),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 2000),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        'feature_fraction_bynode': trial.suggest_float("feature_fraction_bynode", 0.4, 1.0),
        'min_sum_hessian_in_leaf':trial.suggest_int("min_sum_hessian_in_leaf", 10, 100),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        'seed':1,
        'feature_fraction_seed': 1,
        'bagging_seed': 1,
        'drop_seed': 1,
        'data_random_seed': 1}

    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")
    model = LGBMRegressor(**param)
    
    model.fit(x_train, y_train, eval_set=valid, verbose=False, callbacks=[pruning_callback], early_stopping_rounds=100)

    preds = model.predict(x_val)
    
    rmspe = rmspe(y_val, preds)
    return rmspe

In [19]:
study = optuna.create_study(sampler=TPESampler(), direction='minimize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(objective, n_trials=100, gc_after_trial=True)

[32m[I 2021-08-25 01:20:02,612][0m A new study created in memory with name: no-name-1187a560-6436-47bd-9246-e24ae47ad29c[0m




[32m[I 2021-08-25 01:21:17,787][0m Trial 0 finished with value: 0.24922532256227206 and parameters: {'learning_rate': 0.3370694457912124, 'lambda_l1': 0.057791415235657055, 'lambda_l2': 0.0001064981514788677, 'num_leaves': 908, 'n_estimators': 1965, 'min_data_in_leaf': 410, 'feature_fraction': 0.5983359065460901, 'feature_fraction_bynode': 0.6583153230415957, 'min_sum_hessian_in_leaf': 49, 'bagging_fraction': 0.5462673950736119, 'bagging_freq': 1}. Best is trial 0 with value: 0.24922532256227206.[0m




[32m[I 2021-08-25 01:23:00,647][0m Trial 1 finished with value: 0.24636881081474052 and parameters: {'learning_rate': 0.18174690421424808, 'lambda_l1': 1.3403483541368712e-08, 'lambda_l2': 0.00018335926502107195, 'num_leaves': 42, 'n_estimators': 2574, 'min_data_in_leaf': 1529, 'feature_fraction': 0.9330955525426714, 'feature_fraction_bynode': 0.46215766132983804, 'min_sum_hessian_in_leaf': 10, 'bagging_fraction': 0.6783216538385552, 'bagging_freq': 3}. Best is trial 1 with value: 0.24636881081474052.[0m




[32m[I 2021-08-25 01:24:40,400][0m Trial 2 finished with value: 0.25910242660485144 and parameters: {'learning_rate': 0.36686372665106437, 'lambda_l1': 2.3802212654971494e-08, 'lambda_l2': 7.585547855409162e-06, 'num_leaves': 650, 'n_estimators': 2671, 'min_data_in_leaf': 250, 'feature_fraction': 0.641171831856715, 'feature_fraction_bynode': 0.7385303234577627, 'min_sum_hessian_in_leaf': 67, 'bagging_fraction': 0.43579256375143277, 'bagging_freq': 4}. Best is trial 1 with value: 0.24636881081474052.[0m




[32m[I 2021-08-25 01:36:12,692][0m Trial 3 finished with value: 0.2447950682659941 and parameters: {'learning_rate': 0.011413959842017922, 'lambda_l1': 6.393484937256699e-07, 'lambda_l2': 9.688957258498976e-06, 'num_leaves': 815, 'n_estimators': 3110, 'min_data_in_leaf': 843, 'feature_fraction': 0.4870690801684412, 'feature_fraction_bynode': 0.7646999898853415, 'min_sum_hessian_in_leaf': 46, 'bagging_fraction': 0.4876475698163599, 'bagging_freq': 6}. Best is trial 3 with value: 0.2447950682659941.[0m




[32m[I 2021-08-25 01:37:27,314][0m Trial 4 finished with value: 0.2488074753549808 and parameters: {'learning_rate': 0.3518689862680233, 'lambda_l1': 0.1942829715428632, 'lambda_l2': 4.3991508768546766e-07, 'num_leaves': 221, 'n_estimators': 1302, 'min_data_in_leaf': 1611, 'feature_fraction': 0.42894109311683676, 'feature_fraction_bynode': 0.7503018090094022, 'min_sum_hessian_in_leaf': 59, 'bagging_fraction': 0.6770991189254055, 'bagging_freq': 3}. Best is trial 3 with value: 0.2447950682659941.[0m




[32m[I 2021-08-25 01:37:52,845][0m Trial 5 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 01:38:17,964][0m Trial 6 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 01:38:43,247][0m Trial 7 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 01:39:08,987][0m Trial 8 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 01:39:34,308][0m Trial 9 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 01:40:00,648][0m Trial 10 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 01:40:25,990][0m Trial 11 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 01:40:52,992][0m Trial 12 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 01:41:18,577][0m Trial 13 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 01:41:44,932][0m Trial 14 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 01:42:11,767][0m Trial 15 pruned. Trial was pruned at iteration 5.[0m
[32



[32m[I 2021-08-25 02:16:44,359][0m Trial 74 pruned. Trial was pruned at iteration 257.[0m
[32m[I 2021-08-25 02:17:11,463][0m Trial 75 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:18:15,350][0m Trial 76 pruned. Trial was pruned at iteration 100.[0m
[32m[I 2021-08-25 02:18:41,861][0m Trial 77 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:19:09,236][0m Trial 78 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:19:35,605][0m Trial 79 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:20:03,202][0m Trial 80 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:21:02,982][0m Trial 81 finished with value: 0.24978658658395092 and parameters: {'learning_rate': 0.353291122207088, 'lambda_l1': 0.04054325771950118, 'lambda_l2': 0.00011729030681188265, 'num_leaves': 849, 'n_estimators': 2742, 'min_data_in_leaf': 801, 'feature_fraction': 0.6906301362326259, 'feature_fraction_bynode': 0.7618040074395



[32m[I 2021-08-25 02:21:32,885][0m Trial 82 pruned. Trial was pruned at iteration 22.[0m
[32m[I 2021-08-25 02:21:59,048][0m Trial 83 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:22:26,889][0m Trial 84 pruned. Trial was pruned at iteration 13.[0m
[32m[I 2021-08-25 02:22:55,125][0m Trial 85 pruned. Trial was pruned at iteration 9.[0m
[32m[I 2021-08-25 02:23:21,776][0m Trial 86 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:23:49,086][0m Trial 87 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:24:15,492][0m Trial 88 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:25:11,031][0m Trial 89 pruned. Trial was pruned at iteration 83.[0m
[32m[I 2021-08-25 02:25:38,509][0m Trial 90 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:26:05,070][0m Trial 91 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-25 02:26:37,803][0m Trial 92 pruned. Trial was pruned at iteration 29.

In [20]:
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

Number of finished trials: 100
Best trial: {'learning_rate': 0.011413959842017922, 'lambda_l1': 6.393484937256699e-07, 'lambda_l2': 9.688957258498976e-06, 'num_leaves': 815, 'n_estimators': 3110, 'min_data_in_leaf': 843, 'feature_fraction': 0.4870690801684412, 'feature_fraction_bynode': 0.7646999898853415, 'min_sum_hessian_in_leaf': 46, 'bagging_fraction': 0.4876475698163599, 'bagging_freq': 6}


In [21]:
optuna.visualization.plot_optimization_history(study)