In [1]:
import os
# Walk the Kaggle input directory and echo the full path of every file found,
# so the available competition assets are visible in the cell output.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/optiver-trading-at-the-close/public_timeseries_testing_util.py
/kaggle/input/optiver-trading-at-the-close/train.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv
/kaggle/input/optiver-trading-at-the-close/optiver2023/competition.cpython-310-x86_64-linux-gnu.so
/kaggle/input/optiver-trading-at-the-close/optiver2023/__init__.py


In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import optiver2023
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm
import lightgbm as lgb
from sklearn.linear_model import RANSACRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor

In [3]:
def set_seeds(seed):
    """Seed the process-wide random number generators.

    Seeds both the standard-library ``random`` module and NumPy's legacy
    global generator so that code drawing from either source is repeatable.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed``.
    """
    import random  # local import: the notebook's import cell does not bring it in

    random.seed(seed)
    np.random.seed(seed)
    
class CFG:
    """Notebook configuration plus factories for the base regressors.

    Attributes
    ----------
    work_dir : pathlib.Path
        Directory holding the competition input files.
    params : dict
        A LightGBM parameter set. The factory methods below do not read it;
        they pass their own keyword arguments.
    SEED : int
        Seed used for global RNG initialisation.
    """

    def __init__(self):
        self.work_dir = Path('/kaggle/input/optiver-trading-at-the-close/')
        self.params = dict(
            boosting_type='dart',
            num_leaves=31,
            max_depth=10,
            learning_rate=0.01,
            n_estimators=500,
            objective='regression',
        )
        self.SEED = 42

    def make_lgbm(self, seed=11):
        """Return a pipeline: min-max scaling to [0, 10] -> RANSAC around LightGBM.

        ``seed`` becomes the ``random_state`` of the RANSAC wrapper.
        """
        booster = lgb.LGBMRegressor(
            boosting_type='dart',
            num_leaves=31,
            max_depth=7,
            learning_rate=0.01,
            n_estimators=300,
            objective='regression',
        )
        robust_booster = RANSACRegressor(
            estimator=booster,
            max_trials=10,
            min_samples=100,
            loss='squared_error',
            residual_threshold=5.0,
            random_state=seed,
        )
        scaler = MinMaxScaler(feature_range=(0, 10))
        return make_pipeline(scaler, robust_booster)

    def make_rf(self, seed=11):
        """Return a pipeline: min-max scaling to [0, 10] -> small random forest.

        ``seed`` becomes the ``random_state`` of the forest.
        """
        forest = RandomForestRegressor(
            n_estimators=10,
            max_depth=6,
            min_samples_split=2,
            max_features=0.65,
            random_state=seed,
            criterion="squared_error",
        )
        scaler = MinMaxScaler(feature_range=(0, 10))
        return make_pipeline(scaler, forest)
    
# Rebind the name CFG from the class to one shared instance. After this line the
# class is no longer reachable through the name CFG, so CFG() cannot be called again.
CFG = CFG()
# Initialise random state from the configured seed before any model is built.
set_seeds(CFG.SEED)
# Create the competition environment and the iterator consumed by the inference loop.
env = optiver2023.make_env()
iter_test = env.iter_test()

In [4]:
# Read train.csv from the configured input directory into a DataFrame.
train_data = pd.read_csv(CFG.work_dir/"train.csv")

In [5]:
class EdaXe(object):
    """
    Helper for turning per-stock time series into rolling-window features,
    training one model per stock and forecasting new rows.

    Each sample fed to a model is ``window`` consecutive feature rows of one
    stock flattened into a single vector; the training target is the target
    of the last row in that window.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Source rows. When omitted, the module-level ``train_data`` is looked
        up at construction time.
    features : str or list of str
        Column(s) summed over a rolling window of length ``window``.
    window : int
        Number of consecutive rows per sample (also the rolling-sum length).
    splits : int
        Number of folds for ``TimeSeriesSplit`` / ``rolling_time_series_split``.

    Attributes
    ----------
    pipelines : list
        One fitted model per stock, in the order the stocks were trained
        (ascending ``stock_id``).
    stock_pipelines : dict
        The same fitted models keyed by ``stock_id``; used by ``forecast``.
    pipeline : object
        The estimator passed to ``train`` (in its last fitted state).
    error_list, error :
        Per-stock mean MAE over the folds, and the mean of those values.
    """

    # Raw input columns that are cast to float before feature engineering.
    _FLOAT_COLUMNS = ('seconds_in_bucket', 'imbalance_size',
                      'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
                      'far_price', 'near_price', 'bid_price', 'bid_size',
                      'ask_price', 'ask_size', 'wap')

    def __init__(self, data=None, features='wap', window=3, splits=5):
        # Resolve the default lazily instead of binding a global at class-definition time.
        self.df = train_data if data is None else data
        self.features = features
        self.window = window
        self.splits = splits
        self.error = 0
        self.error_list = []
        self.pipelines = []
        self.stock_pipelines = {}
        self.pipeline = None
        self.tscv = TimeSeriesSplit(n_splits=splits)

    def rolling_time_series_split(self, df):
        """Yield ``(train_idx, test_idx)`` pairs over ``splits`` disjoint blocks.

        ``df`` is cut into ``splits`` consecutive blocks of equal length
        (any remainder rows at the end are unused); within each block the
        first 80% of positions are the train indices and the rest the test
        indices.
        """
        n_samples = len(df)
        fold_size = n_samples // self.splits
        indices = np.arange(n_samples)

        margin = 0  # gap (in rows) between the train and test part of a block
        for fold in range(self.splits):
            start = fold * fold_size
            stop = start + fold_size
            cut = start + int(0.8 * fold_size)
            yield indices[start:cut], indices[cut + margin:stop]

    def roll_stock(self, data):
        """Build the feature frame for the rows of a single stock.

        Returns a new frame holding the rolling sum of ``self.features``
        followed by: ``movement_ratio`` (bid_size / ask_size),
        ``imbalance_buy_sell_flag``, ``wap_diff`` (2-step difference of
        log wap), ``imbalance_diff`` (1-step difference of imbalance_size)
        and ``reference_price`` (replaced by its 2-step difference).
        Leading rows contain NaN from the rolling/diff operations.
        The caller's frame is not modified.
        """
        data = data.copy()  # work on a private copy so the input is never mutated
        data['movement_ratio'] = data['bid_size'] / data['ask_size']
        data['wap_diff'] = np.log(data['wap']).diff(2)
        data['imbalance_diff'] = data['imbalance_size'].diff()
        data['reference_price'] = data['reference_price'].diff(2)
        cols = ['movement_ratio', 'imbalance_buy_sell_flag', 'wap_diff', 'imbalance_diff',
                'reference_price']
        rolled = data[self.features].rolling(self.window).sum()
        return pd.concat([rolled, data[cols]], axis=1)

    def _window_matrix(self, values):
        """Flatten every run of ``window`` consecutive rows into one row.

        ``values`` is a 2-D array of shape (n_rows, n_cols). The result has
        shape (n_rows - window + 1, window * n_cols), or zero rows when
        there are fewer than ``window`` input rows.
        """
        values = np.asarray(values, dtype=float)
        n_rows, n_cols = values.shape
        n_windows = n_rows - self.window + 1
        width = self.window * n_cols
        if n_windows <= 0:
            return np.empty((0, width), dtype=float)
        out = np.empty((n_windows, width), dtype=float)
        for start in range(n_windows):
            out[start] = values[start:start + self.window].reshape(-1)
        return out

    def roll_data(self, stock_data):
        """Return the training matrix for one stock.

        Rows with NaN features or target are dropped first. Each output row
        is a flattened window of features with the target of the window's
        last row appended as the final column.
        """
        data = pd.concat([self.roll_stock(stock_data), stock_data[['target']]], axis=1).dropna()
        features = self._window_matrix(data.values[:, :-1])
        # The target of window [start, start + window) is the one at its last row.
        targets = data['target'].to_numpy(dtype=float)[self.window - 1:]
        return np.column_stack([features, targets])

    def roll_test(self, stock_data):
        """Return the feature matrix (no target column) for one stock."""
        windowed_data = self.roll_stock(stock_data).dropna().values
        return self._window_matrix(windowed_data)

    def train(self, pipeline):
        """Cross-validate and fit ``pipeline`` separately for every stock.

        For each stock the estimator is fitted on each ``TimeSeriesSplit``
        training fold and scored with mean absolute error on the matching
        test fold. The state after the last fold is snapshotted with
        ``copy.deepcopy`` and stored for that stock, so every stock keeps
        its own fitted model.

        Returns
        -------
        float
            Mean over stocks of the per-stock mean fold MAE.
        """
        import copy  # local import so the notebook's import cell stays untouched

        self.df = self.df.fillna(0, axis=0)
        float_columns = list(self._FLOAT_COLUMNS)
        self.df[float_columns] = self.df[float_columns].astype(float)
        # Tiny offset so that wap values filled with 0 do not produce log(0).
        self.df['wap'] = self.df['wap'] + 1e-12
        self.error_list = []
        self.pipelines = []
        self.stock_pipelines = {}
        # Sorted ids give a deterministic training order.
        stock_ids = sorted(self.df['stock_id'].unique())
        for stock in tqdm(stock_ids, desc='training'):
            sample = self.df.loc[self.df.stock_id == stock]
            dt = self.roll_data(sample)
            scores = []
            for train_index, test_index in self.tscv.split(dt):
                X_train, X_test = dt[train_index], dt[test_index]
                pipeline.fit(X_train[:, :-1], X_train[:, -1])
                preds = pipeline.predict(X_test[:, :-1])
                scores.append(mean_absolute_error(y_true=X_test[:, -1], y_pred=preds))
            self.error_list.append(np.mean(scores))
            # Snapshot: appending `pipeline` itself would make every entry alias
            # the single object that is refitted for the next stock.
            fitted = copy.deepcopy(pipeline)
            self.pipelines.append(fitted)
            self.stock_pipelines[stock] = fitted
        self.pipeline = pipeline
        self.error = np.mean(self.error_list)
        return self.error

    def forecast(self, test):
        """Predict the target for every row of ``test``, in ``test`` row order.

        The stored history is trimmed to rows whose ``date_id`` is at most
        two below the first ``date_id`` of ``test``, stripped of ``target`` /
        ``time_id`` and extended with ``test``; that extended frame is kept
        in ``self.df`` for the next call.

        Models are looked up by ``stock_id`` in ``stock_pipelines``. If that
        mapping is empty, the positional entry of ``pipelines`` is used
        instead. Rows of a stock that has no model in a non-empty mapping,
        or that lack enough history for a full window, keep the default
        prediction 0.0.

        Returns
        -------
        numpy.ndarray
            One prediction per row of ``test``.
        """
        date = test['date_id'].iloc[0]
        lagged_time = date - 2
        history = (self.df.loc[self.df['date_id'] >= lagged_time]
                   .drop(columns=['target', 'time_id'], errors='ignore')
                   .copy())  # explicit copy: column assignment below must not hit a slice
        float_columns = list(self._FLOAT_COLUMNS)
        history[float_columns] = history[float_columns].astype(float)
        self.df = pd.concat([history, test])

        # Fill / offset on a working copy so the stored history does not
        # accumulate another 1e-12 on every call.
        working = self.df.fillna(0, axis=0)
        working['wap'] = working['wap'] + 1e-12

        test_stock_ids = test['stock_id'].to_numpy()
        submission = np.zeros(len(test), dtype=float)
        models = getattr(self, 'stock_pipelines', None) or {}
        for i, stock in enumerate(pd.unique(test_stock_ids)):
            if stock in models:
                model = models[stock]
            elif not models:
                model = self.pipelines[i]  # no per-stock mapping available
            else:
                continue  # stock without a trained model
            positions = np.flatnonzero(test_stock_ids == stock)
            sample = working.loc[working['stock_id'] == stock]
            # The test rows are the newest rows of the stock, i.e. the last windows.
            windows = self.roll_test(sample)[-len(positions):]
            if len(windows) == 0:
                continue
            preds = np.asarray(model.predict(windows), dtype=float)
            submission[positions[-len(preds):]] = preds
        return submission

In [6]:
# Build the per-stock trainer on the training frame: rolling sum over
# 'seconds_in_bucket', 20-row windows, 4 cross-validation splits.
preprocess = EdaXe(data=train_data, features=['seconds_in_bucket'], window=20, splits=4)

In [7]:
import time

# Wall-clock start of the ensemble construction + per-stock training run.
fit_started = time.time()

# Weighted voting ensemble: one RANSAC-wrapped LightGBM pipeline and three
# random-forest pipelines that differ only in their seed.
ensemble_members = [
    ('lgbm', CFG.make_lgbm(seed=31)),
    ('rf', CFG.make_rf(31)),
    ('rf2', CFG.make_rf(121)),
    ('rf3', CFG.make_rf(71)),
]
pipeline = VotingRegressor(estimators=ensemble_members, weights=[0.65, 0.65, 0.4, 0.27])

score = preprocess.train(pipeline=pipeline)
ellapsed_time = time.time() - fit_started
print(f'Completed in {ellapsed_time:.2f}')
print(f'Mean Score: {score:.4f}')

training: 100%|██████████| 200/200 [4:39:04<00:00, 83.72s/it]

Completed in 16745.68
Mean Score: 6.4915





In [8]:
# Show the unrounded mean cross-validation error returned by preprocess.train.
score

6.491504879170595

Completed in 8462.62

Rolling Split: ['seconds_in_bucket','movement_ratio','imbalance_buy_sell_flag','wap_diff','imbalance_diff','reference_price'], window: 20, splits: 4 <br>
Mean Score: 6.3912

TimeSeries Split: ['seconds_in_bucket','movement_ratio','imbalance_buy_sell_flag','wap_diff','imbalance_diff','reference_price'], window: 20, splits: 4 <br>
Mean Score: 6.4915

In [9]:
# Disabled snippet: clears the make_env "already called" flag, sets the environment
# state back to INIT and recreates env / iter_test. Presumably kept for re-running
# the inference loop in the same kernel -- confirm before enabling.
# optiver2023.make_env.__called__ = False
# env = optiver2023.make_env()
# type(env)._state = type(type(env)._state).__dict__["INIT"]
# iter_test = env.iter_test()

In [10]:
# Serve predictions batch by batch; `counter` ends up holding the number of
# batches processed.
counter = 0
for batch in iter_test:
    test, revealed_targets, sample_prediction = batch
    if not counter:
        # Peek at the first batch only.
        print(test.head(3))
    sample_prediction['target'] = preprocess.forecast(test)
    env.predict(sample_prediction)
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
   stock_id  date_id  seconds_in_bucket  imbalance_size  \
0         0      478                  0      3753451.43   
1         1      478                  0       985977.11   
2         2      478                  0       599128.74   

   imbalance_buy_sell_flag  reference_price  matched_size  far_price  \
0                       -1         0.999875   11548975.43        NaN   
1                       -1         1.000245    3850033.97        NaN   
2                        1         1.000584    4359198.25        NaN   

   near_price  bid_price  bid_size  ask_price  ask_size  wap   row_id  \
0         NaN   0.999875  22940.00   1.000050   9177.60  1.0  478_0_0   
1         NaN   0.999940   1967.90   1.000601  19692.00  1.0  478_0_1   
2         NaN   0.999918   4488.22   1.000636  34955.12  1.0  478_0_2   

   currently_scored  
0             False  
1           