In [1]:
import gc  
import os  
import time  
import warnings 
from itertools import combinations  
from warnings import simplefilter 
import joblib  
import lightgbm as lgb  
import numpy as np  
import pandas as pd  
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold, TimeSeriesSplit  
import polars as pl
from tqdm import tqdm
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Run-mode switches read by the cells below.
is_offline = False  # True: rows with date_id > split_day are held out as df_valid; False: every row is used for training
LGB = True  # not referenced in the visible cells
NN = False  # not referenced in the visible cells
is_train = True  # build the per-stock aggregates and the training feature frame
is_infer = True  # run the optiver2023 inference loop
max_lookback = np.nan  # not referenced in the visible cells
split_day = 435  # last date_id kept in df_train when is_offline is True



In [2]:
def weighted_average(a):
    """Return one blending weight per element of ``a``.

    With ``n = len(a)``, the weight at 1-based position ``j`` is
    ``1 / 2**(n + 1 - j)``; the first position reuses the exponent of the
    second one. Weights therefore grow towards the end of the sequence and,
    for a non-empty input, add up to 1. Only the length of ``a`` is used.
    """
    n = len(a)
    # max(position, 2) makes position 1 share the weight of position 2.
    return [1 / (2 ** (n + 1 - max(position, 2))) for position in range(1, n + 1)]

In [3]:
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time-ordered cross-validator that splits by group and leaves a gap before each test fold.

    Groups are ordered by first appearance in ``groups``. The last
    ``n_splits`` blocks of groups are used as successive test folds; each
    training set is made of the groups that end ``group_gap`` groups before
    the test block, limited to the most recent ``max_train_group_size`` groups.

    Parameters
    ----------
    n_splits : int, default=5
        Number of train/test folds to generate.
    max_train_group_size : int, default=np.inf
        Maximum number of groups in a training set.
    max_test_group_size : int, default=np.inf
        Maximum number of groups in a test set.
    group_gap : int or None, default=None
        Number of groups left out between the end of the training set and the
        start of the test set. ``None`` is treated as 0.
    verbose : bool or int, default=False
        Stored on the instance; ``split`` does not emit any output.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    @staticmethod
    def _collect_indices(group_dict, selected_groups):
        """Return the sorted sample positions belonging to ``selected_groups`` as an integer array."""
        chunks = [group_dict[group] for group in selected_groups]
        if not chunks:
            return np.empty(0, dtype=np.intp)
        # A single concatenate + unique replaces the per-group re-sorting.
        return np.unique(np.concatenate(chunks))

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` pairs of positional indices.

        Parameters
        ----------
        X : array-like
            Data to split; only its length is used.
        y : array-like, optional
            Ignored apart from the length check done by ``indexable``.
        groups : array-like
            Group label of every sample (for example a date id). Required.

        Yields
        ------
        train : list of int
            Sorted sample positions of the training set.
        test : list of int
            Sorted sample positions of the test set, with the first
            ``group_gap`` samples removed.

        Raises
        ------
        ValueError
            If ``groups`` is None, if there are fewer groups than
            ``n_splits + 1``, or if ``group_gap`` leaves no group available
            for the first training set.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        # Use positions throughout, so a pandas Series with a non-default index
        # is handled the same way as a plain array.
        groups = np.asarray(groups)
        n_splits = self.n_splits
        # The constructor default is None; treat it as "no gap" instead of
        # failing on integer arithmetic further down.
        group_gap = 0 if self.group_gap is None else self.group_gap
        n_folds = n_splits + 1

        # Unique labels in order of first appearance (np.unique sorts by value).
        u, first_seen = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(first_seen)]
        n_groups = _num_samples(unique_groups)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        # Map every group label to the positions of its samples.
        group_dict = {}
        for idx, group in enumerate(groups):
            group_dict.setdefault(group, []).append(idx)

        group_test_size = min(n_groups // n_folds, self.max_test_group_size)
        first_test_start = n_groups - n_splits * group_test_size
        if first_test_start - group_gap <= 0:
            # A non-positive slice end would either give an empty training set
            # or, when negative, wrap around and pull test/future groups into it.
            raise ValueError(
                ("group_gap={0} leaves no training groups before the first"
                 " test fold, which starts at group position {1}").format(
                     group_gap, first_test_start))

        for group_test_start in range(first_test_start, n_groups, group_test_size):
            train_stop = group_test_start - group_gap
            train_start = max(0, train_stop - self.max_train_group_size)
            train_idx = self._collect_indices(
                group_dict, unique_groups[train_start:train_stop])
            test_idx = self._collect_indices(
                group_dict,
                unique_groups[group_test_start:group_test_start + group_test_size])
            # Kept from the original implementation: the first ``group_gap``
            # test samples (samples, not groups) are dropped.
            test_idx = test_idx[group_gap:]

            yield train_idx.tolist(), test_idx.tolist()

In [4]:
def reduce_mem_usage(df, verbose=0):
    """Downcast the numeric columns of ``df`` in place and return the same frame.

    Integer columns (dtype name starting with ``"int"``) are cast to the
    smallest of int8/int16/int32/int64 whose open range contains the column's
    min and max. Every other numeric column is cast to float32. Non-numeric
    columns (object, datetime, category, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : int or bool, default 0
        When truthy, print the memory usage before and after.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if not pd.api.types.is_numeric_dtype(col_type):
            continue

        if str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            # Strict comparisons are kept from the original thresholds.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float16 is deliberately never used: every branch of the original
            # cast to float32 regardless of the value range.
            df[col] = df[col].astype(np.float32)

    if verbose:
        # No logger is defined anywhere in the notebook, so report with print().
        end_mem = df.memory_usage().sum() / 1024**2
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        print(f"Decreased by {decrease:.2f}%")
    return df

In [5]:
# Load the training set, discard rows that lack the target or either
# top-of-book price, and renumber the remaining rows from 0.
df = (
    pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
    .dropna(subset=["target", "ask_price", "bid_price"])
    .reset_index(drop=True)
)
df_shape = df.shape

In [6]:
from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """Compute ``(max - mid) / (mid - min)`` for column triplets, row by row.

    Parameters:
    - df_values (numpy.ndarray): 2-D array of values, one column per variable.
    - comb_indices: sequence of ``(a, b, c)`` column-index triplets.

    Returns:
    - numpy.ndarray of shape ``(rows, len(comb_indices))``; an entry is NaN
      when the middle value equals the minimum (the ratio is undefined).
    """
    n_rows = df_values.shape[0]
    n_triplets = len(comb_indices)
    out = np.empty((n_rows, n_triplets))
    # Triplets are independent of one another, so they are spread over threads.
    for k in prange(n_triplets):
        a, b, c = comb_indices[k]
        for r in range(n_rows):
            x = df_values[r, a]
            y = df_values[r, b]
            z = df_values[r, c]
            hi = max(x, y, z)
            lo = min(x, y, z)
            mid = x + y + z - lo - hi

            if mid != lo:
                out[r, k] = (hi - mid) / (mid - lo)
            else:
                out[r, k] = np.nan

    return out

def calculate_triplet_imbalance_numba(price, df):
    """Build the triplet-imbalance features for every 3-combination of ``price``.

    ``price`` is a list of column names of ``df``. The result is a new
    DataFrame with one column per combination, named ``"{a}_{b}_{c}_imb2"``,
    and a fresh default index.
    """
    triplets = list(combinations(price, 3))
    # Translate names into positions within df[price] for the compiled kernel.
    comb_indices = [
        (price.index(a), price.index(b), price.index(c)) for a, b, c in triplets
    ]
    features_array = compute_triplet_imbalance(df[price].values, comb_indices)
    column_names = [f"{a}_{b}_{c}_imb2" for a, b, c in triplets]
    return pd.DataFrame(features_array, columns=column_names)

@njit(parallel=True)
def get_ema(data, window=14):
    """
    Calculate the Exponential Moving Average (EMA) of each column of a 2-D array.

    The EMA of a column is seeded at row ``window - 1`` with the mean of the
    first ``window`` values and then updated with ``alpha = 2 / (window + 1)``.
    Rows before ``window - 1`` stay at 0.

    Parameters:
    - data (numpy.ndarray): 2-D array, one series per column.
    - window (int): EMA calculation window.

    Returns:
    - ema_values (numpy.ndarray): array of the same shape as ``data``. It is
      all zeros when ``data`` has fewer than ``window`` rows.
    """
    rows, cols = data.shape
    ema_values = np.zeros((rows, cols))
    if rows < window:
        # Not enough rows to seed the average; writing row ``window - 1``
        # would index past the end of the array.
        return ema_values
    alpha = 2 / (window + 1)

    # Columns are independent and can run in parallel.
    for col in prange(cols):
        ema_values[window - 1, col] = np.mean(data[:window, col])

        # Each step depends on the previous one, so this must be a plain range.
        for i in range(window, rows):
            ema_values[i, col] = (data[i, col] - ema_values[i - 1, col]) * alpha + ema_values[i - 1, col]

    return ema_values

@njit(parallel=True)
def get_rsi(data, period=14):
    """
    Calculate the Relative Strength Index (RSI) of each column of a 2-D array.

    Average gain and loss are initialised over the first ``period`` rows with
    the plain mean of the first ``period`` gains/losses, then smoothed with
    ``avg[i] = (avg[i - 1] * (period - 1) + value[i]) / period``.

    Parameters:
    - data (numpy.ndarray): 2-D array, one series per column.
    - period (int): RSI calculation period.

    Returns:
    - rsi_values (numpy.ndarray): RSI values, same shape as ``data``.
    """
    rows, cols = data.shape
    rsi_values = np.zeros((rows, cols))

    # Columns are independent and can run in parallel.
    for col in prange(cols):
        # Row-to-row change; the first row has no predecessor and stays 0.
        delta = np.zeros(rows)
        delta[1:] = data[1:, col] - data[:-1, col]

        gain = np.where(delta > 0, delta, 0)
        loss = -np.where(delta < 0, delta, 0)

        avg_gain = np.zeros(rows)
        avg_loss = np.zeros(rows)

        avg_gain[:period] = np.mean(gain[:period])
        avg_loss[:period] = np.mean(loss[:period])

        # Each step depends on the previous one, so this must be a plain range.
        for i in range(period, rows):
            avg_gain[i] = (avg_gain[i - 1] * (period - 1) + gain[i]) / period
            avg_loss[i] = (avg_loss[i - 1] * (period - 1) + loss[i]) / period

        # NOTE(review): avg_loss can be 0, which makes this array division
        # produce inf or NaN; no special handling is applied here.
        rs = avg_gain / avg_loss
        rsi_values[:, col] = 100 - (100 / (1 + rs))

    return rsi_values

@njit(parallel=True)
def get_macd(data, short_window=12, long_window=26, signal_window=9):
    """
    Calculate Moving Average Convergence Divergence (MACD) for each column of a 2-D array.

    Both EMAs and the signal line start from 0 at row 0 and are updated from
    row 1 onwards with ``alpha = 2 / (window + 1)``.

    Parameters:
    - data (numpy.ndarray): 2-D array, one series per column.
    - short_window (int): Short-term EMA window for MACD calculation.
    - long_window (int): Long-term EMA window for MACD calculation.
    - signal_window (int): Signal line window for MACD calculation.

    Returns:
    - macd_values (numpy.ndarray): short EMA minus long EMA, same shape as ``data``.
    - signal_line_values (numpy.ndarray): EMA of the MACD values, same shape as ``data``.
    - histogram_values (numpy.ndarray): MACD minus signal line, same shape as ``data``.
    """
    rows, cols = data.shape
    macd_values = np.zeros((rows, cols))
    signal_line_values = np.zeros((rows, cols))
    histogram_values = np.zeros((rows, cols))

    short_alpha = 2 / (short_window + 1)
    long_alpha = 2 / (long_window + 1)
    signal_alpha = 2 / (signal_window + 1)

    # Columns are independent and can run in parallel.
    for col in prange(cols):
        # NOTE(review): the recurrences are seeded with 0 at row 0 rather than
        # with the first observation. That is what the original code computed
        # (its pre-fill of rows 1.. was overwritten before being read), so it
        # is kept to leave the feature values unchanged.
        short_ema = np.zeros(rows)
        long_ema = np.zeros(rows)
        signal_line = np.zeros(rows)

        # Each step depends on the previous one, so this must be a plain range.
        for i in range(1, rows):
            short_ema[i] = (data[i, col] - short_ema[i - 1]) * short_alpha + short_ema[i - 1]
            long_ema[i] = (data[i, col] - long_ema[i - 1]) * long_alpha + long_ema[i - 1]

        macd_values[:, col] = short_ema - long_ema

        for i in range(1, rows):
            signal_line[i] = (macd_values[i, col] - signal_line[i - 1]) * signal_alpha + signal_line[i - 1]

        signal_line_values[:, col] = signal_line
        histogram_values[:, col] = macd_values[:, col] - signal_line

    return macd_values, signal_line_values, histogram_values
'''
@njit(parallel=True)
def get_bband(data, window=20, num_std_dev=2):
    """
    Calculate Bollinger Bands for each column in the input DataFrame.

    Parameters:
    - data (numpy.ndarray): Input DataFrame containing price data.
    - window (int): Rolling window for Bollinger Bands calculation.
    - num_std_dev (int): Number of standard deviations for upper and lower bands.

    Returns:
    - upper_bands (numpy.ndarray): Upper Bollinger Bands values for each element in the input DataFrame.
    - mid_bands (numpy.ndarray): Middle Bollinger Bands (moving average) values for each element in the input DataFrame.
    - lower_bands (numpy.ndarray): Lower Bollinger Bands values for each element in the input DataFrame.
    """
    num_rows, num_cols = data.shape
    upper_bands = np.zeros_like(data)
    lower_bands = np.zeros_like(data)
    mid_bands = np.zeros_like(data)

    for col in prange(num_cols):
        for i in prange(window - 1, num_rows):
            window_slice = data[i - window + 1 : i + 1, col]
            mid_bands[i, col] = np.mean(window_slice)
            std_dev = np.std(window_slice)
            upper_bands[i, col] = mid_bands[i, col] + num_std_dev * std_dev
            lower_bands[i, col] = mid_bands[i, col] - num_std_dev * std_dev

    return upper_bands, mid_bands, lower_bands
'''

'\n@njit(parallel=True)\ndef get_bband(data, window=20, num_std_dev=2):\n    """\n    Calculate Bollinger Bands for each column in the input DataFrame.\n\n    Parameters:\n    - data (numpy.ndarray): Input DataFrame containing price data.\n    - window (int): Rolling window for Bollinger Bands calculation.\n    - num_std_dev (int): Number of standard deviations for upper and lower bands.\n\n    Returns:\n    - upper_bands (numpy.ndarray): Upper Bollinger Bands values for each element in the input DataFrame.\n    - mid_bands (numpy.ndarray): Middle Bollinger Bands (moving average) values for each element in the input DataFrame.\n    - lower_bands (numpy.ndarray): Lower Bollinger Bands values for each element in the input DataFrame.\n    """\n    num_rows, num_cols = data.shape\n    upper_bands = np.zeros_like(data)\n    lower_bands = np.zeros_like(data)\n    mid_bands = np.zeros_like(data)\n\n    for col in prange(num_cols):\n        for i in prange(window - 1, num_rows):\n            w

In [7]:
def imbalance_features(df):
    """Add order-book imbalance, indicator, lag and rolling features to ``df``.

    Columns are first added to the frame that was passed in; the frame is then
    round-tripped through polars for the rolling statistics, so the returned
    object is a new pandas DataFrame.

    Relies on the module-level ``weights`` and ``group_id_mapping`` dicts and
    on ``calculate_triplet_imbalance_numba``, ``get_rsi`` and ``get_macd``.
    The order in which columns are created determines the column order of the
    result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id``, ``imbalance_buy_sell_flag`` and the price
        and size columns listed in ``prices`` and ``sizes`` below.

    Returns
    -------
    pandas.DataFrame
        The input columns plus all generated features, with +/-inf replaced by 0.
    """
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    
    # Row-wise book features.
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    # Pairwise normalised differences between the four price columns.
    for c in combinations(["reference_price", "ask_price", "bid_price", "wap"], 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")
        
    # Triplet imbalance for one price triplet and for every 3-combination of the sizes.
    # ``.values`` is assigned, so the features are aligned by position, not by index.
    for c in [['mid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
        
    # RSI and MACD per stock, computed over that stock's rows in frame order
    # and written back through the group's index labels.
    for _, single_stock_prices_df in tqdm(df.groupby('stock_id')[["reference_price", "ask_price", "bid_price", "wap"]]):
        
        # RSI
        col_rsi = [f'rsi_{col}' for col in single_stock_prices_df.columns]
        rsi_values = get_rsi(single_stock_prices_df.values)
        df.loc[single_stock_prices_df.index, col_rsi] = rsi_values
        
        # MACD
        macd_values, signal_line_values, histogram_values = get_macd(single_stock_prices_df.values)
        col_macd = [f'macd_{col}' for col in single_stock_prices_df.columns]
        col_signal = [f'macd_sig_{col}' for col in single_stock_prices_df.columns]
        col_hist = [f'macd_hist_{col}' for col in single_stock_prices_df.columns]

        df.loc[single_stock_prices_df.index, col_macd] = macd_values
        df.loc[single_stock_prices_df.index, col_signal] = signal_line_values
        df.loc[single_stock_prices_df.index, col_hist] = histogram_values

    # Per-stock constants looked up from the module-level dicts.
    df["stock_weights"] = df["stock_id"].map(weights)
    df["group_id"] = df["stock_id"].map(group_id_mapping)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
   
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    # Sign (-1, 0, 1) of the 5-row change in mid_price.
    # NOTE(review): this diff runs over the whole frame, not per stock_id,
    # unlike the grouped diffs around it - confirm this is intended.
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    
    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']
    
    
    # Row-wise summary statistics across all price columns and all size columns.
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        

    # Per-stock lagged values and percentage changes.
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1,3,5,10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    # Per-stock differences over several horizons.
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'weighted_wap','price_spread']:
        for window in [1,3,5,10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
    
    # Bid-minus-ask differences of the price and size changes computed above.
    for window in [3,5,10]:
        df[f'price_change_diff_{window}'] = df[f'bid_price_diff_{window}'] - df[f'ask_price_diff_{window}']
        df[f'size_change_diff_{window}'] = df[f'bid_size_diff_{window}'] - df[f'ask_size_diff_{window}']

    # The rolling statistics are computed in polars.
    pl_df = pl.from_pandas(df)

    windows = [3, 5, 10]
    columns = ['ask_price', 'bid_price', 'ask_size', 'bid_size']

    group = ["stock_id"]
    expressions = []

    # Rolling mean and std of each ``*_diff_{window}`` column, per stock,
    # using the same window length as the diff itself.
    for window in windows:
        for col in columns:
            rolling_mean_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_mean(window)
                .over(group)
                .alias(f'rolling_diff_{col}_{window}')
            )

            rolling_std_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_std(window)
                .over(group)
                .alias(f'rolling_std_diff_{col}_{window}')
            )

            expressions.append(rolling_mean_expr)
            expressions.append(rolling_std_expr)

    lazy_df = pl_df.lazy().with_columns(expressions)

    pl_df = lazy_df.collect()

    # From here on ``df`` is a new pandas frame, no longer the caller's object.
    df = pl_df.to_pandas()
    gc.collect()
    
    df['mid_price*volume'] = df['mid_price_movement'] * df['volume']
    df['harmonic_imbalance'] = df.eval('2 / ((1 / bid_size) + (1 / ask_size))')
    
    # Divisions above can yield +/-inf; replace those with 0 in every column.
    for col in df.columns:
        df[col] = df[col].replace([np.inf, -np.inf], 0)

    return df

def other_features(df):
    """Add time-of-auction columns and the per-stock global aggregates to ``df``.

    Adds ``seconds``, ``minute``, ``stage`` (0 while ``minute`` is below 5,
    otherwise 1) and ``time_to_market_close``, followed by one ``global_{key}``
    column for each entry of the module-level ``global_stock_id_feats``,
    looked up by ``stock_id``. The frame is modified in place and returned.
    """
    elapsed = df["seconds_in_bucket"]
    df["seconds"] = elapsed % 60
    df["minute"] = elapsed // 60
    df['stage'] = df['minute'].apply(lambda minute: 0 if minute < 5 else 1)
    df['time_to_market_close'] = 540 - elapsed

    for name, per_stock_stat in global_stock_id_feats.items():
        df[f"global_{name}"] = df["stock_id"].map(per_stock_stat.to_dict())

    return df

def generate_all_features(df):
    """Run the full feature pipeline and return only the model input columns.

    ``row_id``, ``time_id`` and ``target`` are removed before feature
    generation; those three plus ``date_id`` are excluded from the result.
    """
    dropped_inputs = ("row_id", "time_id", "target")
    df = df[[name for name in df.columns if name not in dropped_inputs]]

    df = imbalance_features(df)
    gc.collect()
    df = other_features(df)
    gc.collect()

    dropped_outputs = ("row_id", "target", "time_id", "date_id")
    return df[[name for name in df.columns if name not in dropped_outputs]]

In [8]:
# Per-stock weight, keyed by stock_id: the position in the literal below is the stock_id.
weights = dict(enumerate([
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]))

In [9]:
# Group label of each stock, keyed by stock_id: the position in the literal below is the stock_id.
group_id_mapping = dict(enumerate([
    0, 1, 3, 0, 0, 5, 1, 4, 1, 0, 0, 1, 2, 4, 4, 5, 2, 4, 1, 6, 
    2, 2, 4, 4, 0, 2, 0, 4, 2, 1, 2, 6, 0, 5, 3, 0, 2, 0, 3, 2, 
    5, 4, 3, 1, 2, 4, 1, 2, 5, 0, 5, 5, 2, 0, 0, 0, 5, 2, 0, 1, 
    5, 1, 3, 0, 3, 2, 1, 1, 0, 1, 3, 3, 4, 1, 5, 4, 2, 2, 3, 3, 
    3, 6, 6, 4, 4, 3, 3, 4, 0, 5, 2, 5, 3, 1, 1, 4, 4, 3, 1, 2, 
    3, 6, 3, 5, 0, 0, 4, 3, 5, 0, 2, 5, 4, 5, 1, 1, 0, 3, 1, 1, 
    2, 2, 2, 2, 1, 6, 4, 5, 0, 3, 0, 1, 2, 0, 6, 5, 1, 1, 3, 5, 
    4, 4, 5, 3, 0, 2, 1, 2, 2, 2, 5, 0, 1, 6, 5, 3, 1, 0, 6, 2, 
    0, 1, 5, 0, 0, 2, 2, 2, 4, 0, 2, 0, 3, 1, 1, 4, 5, 3, 5, 4, 
    3, 2, 0, 4, 5, 2, 0, 2, 1, 5, 5, 5, 5, 4, 5, 2, 2, 5, 0, 1,
]))

In [10]:
# Online mode trains on every row; offline mode holds out the days after split_day.
if not is_offline:
    df_train = df
    print("Online mode")
else:
    df_train = df[df["date_id"] <= split_day]
    df_valid = df[df["date_id"] > split_day]
    print("Offline mode")
    print(f"train : {df_train.shape}, valid : {df_valid.shape}")

Online mode


In [11]:
if is_train:
    # Per-stock aggregates over the training rows; other_features() later
    # attaches them to every row as ``global_{key}`` columns. The key order
    # below fixes the order of those columns.
    by_stock = df_train.groupby("stock_id")
    global_stock_id_feats = {
        "median_size": by_stock["bid_size"].median() + by_stock["ask_size"].median(),
        "std_size": by_stock["bid_size"].std() + by_stock["ask_size"].std(),
        "ptp_size": by_stock["bid_size"].max() - by_stock["bid_size"].min(),
        "median_price": by_stock["bid_price"].median() + by_stock["ask_price"].median(),
        "std_price": by_stock["bid_price"].std() + by_stock["ask_price"].std(),
        "ptp_price": by_stock["bid_price"].max() - by_stock["ask_price"].min(),
    }

    # The training features are built first in both modes.
    df_train_feats = generate_all_features(df_train)
    if is_offline:
        print("Build Train Feats Finished.")
        df_valid_feats = generate_all_features(df_valid)
        print("Build Valid Feats Finished.")
        df_valid_feats = reduce_mem_usage(df_valid_feats)
    else:
        print("Build Online Train Feats Finished.")

    df_train_feats = reduce_mem_usage(df_train_feats)

100%|██████████| 200/200 [00:20<00:00,  9.76it/s]


Build Online Train Feats Finished.


In [12]:
import optiver2023
import time
import pandas as pd
import numpy as np
import os
import lightgbm as lgb

# Column names of the training feature frame, in order; the inference loop
# below uses this list to select and order the columns passed to each model.
feature_columns = list(df_train_feats.columns)

def zero_sum(prices, volumes):
    """Shift ``prices`` so that they sum to zero, spreading the shift by ``sqrt(volumes)``.

    Each element is reduced by its share of ``sum(prices)``, where the shares
    are proportional to the square root of the matching volume.
    """
    root_volumes = np.sqrt(volumes)
    scale = np.sum(prices) / np.sum(root_volumes)
    return prices - root_volumes * scale

if is_infer:
    env = optiver2023.make_env()
    iter_test = env.iter_test()
    counter = 0
    y_min, y_max = -64, 64  # clipping range applied to the submitted targets
    qps = []  # wall-clock seconds spent on each test batch
    cache = pd.DataFrame()  # recent raw rows, kept so lag/rolling features have history

    # Load the ten saved LightGBM boosters.
    models = []
    model_save_path = '/kaggle/input/rsi-and-macd/modelitos_para_despues'
    for i in range(1, 11):
        model_path = os.path.join(model_save_path, f'doblez_{i}.txt')
        model = lgb.Booster(model_file=model_path)
        models.append(model)

    # The blend weights depend only on the number of models, so they are
    # computed once instead of on every test batch.
    lgb_model_weights = weighted_average(models)

    for (test, revealed_targets, sample_prediction) in iter_test:
        now_time = time.time()
        cache = pd.concat([cache, test], ignore_index=True, axis=0)
        if counter > 0:
            # Keep only the 21 most recent rows per stock, in chronological
            # order, so the new batch ends up as the last rows of the cache.
            cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
        # Features are built over the whole cache; only the rows of the
        # current batch (the last len(test) rows) are scored.
        feat = generate_all_features(cache)[-len(test):]

        lgb_predictions = np.zeros(len(test))
        for model, weight in zip(models, lgb_model_weights):
            lgb_predictions += weight * model.predict(feat[feature_columns])

        predictions = lgb_predictions
        # Centre the predictions of the batch on zero, then clip.
        final_predictions = predictions - np.mean(predictions)
        clipped_predictions = np.clip(final_predictions, y_min, y_max)
        sample_prediction['target'] = clipped_predictions
        env.predict(sample_prediction)
        counter += 1
        qps.append(time.time() - now_time)
        if counter % 10 == 0:
            print(counter, 'qps:', np.mean(qps))

    # NOTE(review): 1.146 presumably converts mean seconds per batch into
    # total hours for the hidden test set - confirm the batch count it assumes.
    time_cost = 1.146 * np.mean(qps)
    print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")



This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


100%|██████████| 200/200 [00:09<00:00, 21.12it/s] 
100%|██████████| 200/200 [00:09<00:00, 21.80it/s] 
100%|██████████| 200/200 [00:01<00:00, 146.62it/s]
100%|██████████| 200/200 [00:01<00:00, 149.00it/s]
100%|██████████| 200/200 [00:01<00:00, 149.04it/s]
100%|██████████| 200/200 [00:01<00:00, 144.82it/s]
100%|██████████| 200/200 [00:01<00:00, 144.16it/s]
100%|██████████| 200/200 [00:01<00:00, 152.46it/s]
100%|██████████| 200/200 [00:01<00:00, 153.99it/s]
100%|██████████| 200/200 [00:01<00:00, 146.60it/s]


10 qps: 5.064247941970825


100%|██████████| 200/200 [00:01<00:00, 153.51it/s]
100%|██████████| 200/200 [00:01<00:00, 148.03it/s]
100%|██████████| 200/200 [00:01<00:00, 149.82it/s]
100%|██████████| 200/200 [00:01<00:00, 149.25it/s]
100%|██████████| 200/200 [00:01<00:00, 147.38it/s]
100%|██████████| 200/200 [00:01<00:00, 146.39it/s]
100%|██████████| 200/200 [00:01<00:00, 156.84it/s]
100%|██████████| 200/200 [00:01<00:00, 147.36it/s]
100%|██████████| 200/200 [00:01<00:00, 155.26it/s]
100%|██████████| 200/200 [00:01<00:00, 154.26it/s]


20 qps: 4.213846349716187


100%|██████████| 200/200 [00:01<00:00, 148.49it/s]
100%|██████████| 200/200 [00:01<00:00, 149.16it/s]
100%|██████████| 200/200 [00:01<00:00, 149.51it/s]
100%|██████████| 200/200 [00:01<00:00, 150.12it/s]
100%|██████████| 200/200 [00:01<00:00, 143.02it/s]
100%|██████████| 200/200 [00:01<00:00, 149.85it/s]
100%|██████████| 200/200 [00:01<00:00, 155.35it/s]
100%|██████████| 200/200 [00:01<00:00, 148.93it/s]
100%|██████████| 200/200 [00:01<00:00, 155.29it/s]
100%|██████████| 200/200 [00:01<00:00, 151.02it/s]


30 qps: 3.9524752298990884


100%|██████████| 200/200 [00:01<00:00, 141.55it/s]
100%|██████████| 200/200 [00:01<00:00, 154.81it/s]
100%|██████████| 200/200 [00:01<00:00, 150.24it/s]
100%|██████████| 200/200 [00:01<00:00, 146.15it/s]
100%|██████████| 200/200 [00:01<00:00, 154.13it/s]
100%|██████████| 200/200 [00:01<00:00, 148.53it/s]
100%|██████████| 200/200 [00:01<00:00, 150.97it/s]
100%|██████████| 200/200 [00:01<00:00, 156.71it/s]
100%|██████████| 200/200 [00:01<00:00, 146.64it/s]
100%|██████████| 200/200 [00:01<00:00, 143.56it/s]


40 qps: 3.801205426454544


100%|██████████| 200/200 [00:01<00:00, 153.70it/s]
100%|██████████| 200/200 [00:01<00:00, 151.72it/s]
100%|██████████| 200/200 [00:01<00:00, 146.11it/s]
100%|██████████| 200/200 [00:01<00:00, 151.60it/s]
100%|██████████| 200/200 [00:01<00:00, 158.08it/s]
100%|██████████| 200/200 [00:01<00:00, 153.08it/s]
100%|██████████| 200/200 [00:01<00:00, 146.56it/s]
100%|██████████| 200/200 [00:01<00:00, 156.70it/s]
100%|██████████| 200/200 [00:01<00:00, 148.33it/s]
100%|██████████| 200/200 [00:01<00:00, 152.18it/s]


50 qps: 3.7112180042266845


100%|██████████| 200/200 [00:01<00:00, 153.42it/s]
100%|██████████| 200/200 [00:01<00:00, 141.24it/s]
100%|██████████| 200/200 [00:01<00:00, 153.39it/s]
100%|██████████| 200/200 [00:01<00:00, 148.13it/s]
100%|██████████| 200/200 [00:01<00:00, 147.43it/s]
100%|██████████| 200/200 [00:01<00:00, 153.99it/s]
100%|██████████| 200/200 [00:01<00:00, 153.48it/s]
100%|██████████| 200/200 [00:01<00:00, 155.30it/s]
100%|██████████| 200/200 [00:01<00:00, 155.46it/s]
100%|██████████| 200/200 [00:01<00:00, 149.25it/s]


60 qps: 3.662972100575765


100%|██████████| 200/200 [00:01<00:00, 148.05it/s]
100%|██████████| 200/200 [00:01<00:00, 152.12it/s]
100%|██████████| 200/200 [00:01<00:00, 149.85it/s]
100%|██████████| 200/200 [00:01<00:00, 154.87it/s]
100%|██████████| 200/200 [00:01<00:00, 138.91it/s]
100%|██████████| 200/200 [00:01<00:00, 158.32it/s]
100%|██████████| 200/200 [00:01<00:00, 153.73it/s]
100%|██████████| 200/200 [00:01<00:00, 145.68it/s]
100%|██████████| 200/200 [00:01<00:00, 156.86it/s]
100%|██████████| 200/200 [00:01<00:00, 148.99it/s]


70 qps: 3.621771080153329


100%|██████████| 200/200 [00:01<00:00, 155.09it/s]
100%|██████████| 200/200 [00:01<00:00, 150.67it/s]
100%|██████████| 200/200 [00:01<00:00, 150.70it/s]
100%|██████████| 200/200 [00:01<00:00, 153.19it/s]
100%|██████████| 200/200 [00:01<00:00, 151.95it/s]
100%|██████████| 200/200 [00:01<00:00, 149.60it/s]
100%|██████████| 200/200 [00:01<00:00, 148.90it/s]
100%|██████████| 200/200 [00:01<00:00, 153.07it/s]
100%|██████████| 200/200 [00:01<00:00, 147.27it/s]
100%|██████████| 200/200 [00:01<00:00, 156.91it/s]


80 qps: 3.5852551251649856


100%|██████████| 200/200 [00:01<00:00, 147.47it/s]
100%|██████████| 200/200 [00:01<00:00, 157.53it/s]
100%|██████████| 200/200 [00:01<00:00, 156.05it/s]
100%|██████████| 200/200 [00:01<00:00, 146.71it/s]
100%|██████████| 200/200 [00:01<00:00, 155.99it/s]
100%|██████████| 200/200 [00:01<00:00, 154.43it/s]
100%|██████████| 200/200 [00:01<00:00, 153.26it/s]
100%|██████████| 200/200 [00:01<00:00, 151.20it/s]
100%|██████████| 200/200 [00:01<00:00, 153.12it/s]
100%|██████████| 200/200 [00:01<00:00, 154.31it/s]


90 qps: 3.5624162753423056


100%|██████████| 200/200 [00:01<00:00, 155.02it/s]
100%|██████████| 200/200 [00:01<00:00, 146.36it/s]
100%|██████████| 200/200 [00:01<00:00, 152.13it/s]
100%|██████████| 200/200 [00:01<00:00, 149.37it/s]
100%|██████████| 200/200 [00:01<00:00, 155.46it/s]
100%|██████████| 200/200 [00:01<00:00, 153.08it/s]
100%|██████████| 200/200 [00:01<00:00, 142.99it/s]
100%|██████████| 200/200 [00:01<00:00, 157.03it/s]
100%|██████████| 200/200 [00:01<00:00, 153.53it/s]
100%|██████████| 200/200 [00:01<00:00, 149.07it/s]


100 qps: 3.5444720149040223


100%|██████████| 200/200 [00:01<00:00, 155.72it/s]
100%|██████████| 200/200 [00:01<00:00, 148.34it/s]
100%|██████████| 200/200 [00:01<00:00, 154.33it/s]
100%|██████████| 200/200 [00:01<00:00, 151.85it/s]
100%|██████████| 200/200 [00:01<00:00, 134.97it/s]
100%|██████████| 200/200 [00:01<00:00, 152.23it/s]
100%|██████████| 200/200 [00:01<00:00, 154.84it/s]
100%|██████████| 200/200 [00:01<00:00, 144.95it/s]
100%|██████████| 200/200 [00:01<00:00, 158.63it/s]
100%|██████████| 200/200 [00:01<00:00, 149.60it/s]


110 qps: 3.528876839984547


100%|██████████| 200/200 [00:01<00:00, 150.66it/s]
100%|██████████| 200/200 [00:01<00:00, 154.80it/s]
100%|██████████| 200/200 [00:01<00:00, 148.46it/s]
100%|██████████| 200/200 [00:01<00:00, 154.09it/s]
100%|██████████| 200/200 [00:01<00:00, 143.34it/s]
100%|██████████| 200/200 [00:01<00:00, 146.65it/s]
100%|██████████| 200/200 [00:01<00:00, 155.61it/s]
100%|██████████| 200/200 [00:01<00:00, 146.75it/s]
100%|██████████| 200/200 [00:01<00:00, 154.71it/s]
100%|██████████| 200/200 [00:01<00:00, 150.65it/s]


120 qps: 3.5158303479353585


100%|██████████| 200/200 [00:01<00:00, 150.52it/s]
100%|██████████| 200/200 [00:01<00:00, 153.77it/s]
100%|██████████| 200/200 [00:01<00:00, 151.02it/s]
100%|██████████| 200/200 [00:01<00:00, 146.49it/s]
100%|██████████| 200/200 [00:01<00:00, 154.13it/s]
100%|██████████| 200/200 [00:01<00:00, 143.97it/s]
100%|██████████| 200/200 [00:01<00:00, 156.77it/s]
100%|██████████| 200/200 [00:01<00:00, 154.87it/s]
100%|██████████| 200/200 [00:01<00:00, 150.18it/s]
100%|██████████| 200/200 [00:01<00:00, 156.58it/s]


130 qps: 3.504638860775874


100%|██████████| 200/200 [00:01<00:00, 152.45it/s]
100%|██████████| 200/200 [00:01<00:00, 150.02it/s]
100%|██████████| 200/200 [00:01<00:00, 142.81it/s]
100%|██████████| 200/200 [00:01<00:00, 152.35it/s]
100%|██████████| 200/200 [00:01<00:00, 159.83it/s]
100%|██████████| 200/200 [00:01<00:00, 153.89it/s]
100%|██████████| 200/200 [00:01<00:00, 151.60it/s]
100%|██████████| 200/200 [00:01<00:00, 161.14it/s]
100%|██████████| 200/200 [00:01<00:00, 156.07it/s]
100%|██████████| 200/200 [00:01<00:00, 152.59it/s]


140 qps: 3.4942992499896457


100%|██████████| 200/200 [00:01<00:00, 154.18it/s]
100%|██████████| 200/200 [00:01<00:00, 138.60it/s]
100%|██████████| 200/200 [00:01<00:00, 157.18it/s]
100%|██████████| 200/200 [00:01<00:00, 156.47it/s]
100%|██████████| 200/200 [00:01<00:00, 143.99it/s]
100%|██████████| 200/200 [00:01<00:00, 155.37it/s]
100%|██████████| 200/200 [00:01<00:00, 154.76it/s]
100%|██████████| 200/200 [00:01<00:00, 154.15it/s]
100%|██████████| 200/200 [00:01<00:00, 151.21it/s]
100%|██████████| 200/200 [00:01<00:00, 146.68it/s]


150 qps: 3.485888498624166


100%|██████████| 200/200 [00:01<00:00, 146.10it/s]
100%|██████████| 200/200 [00:01<00:00, 152.15it/s]
100%|██████████| 200/200 [00:01<00:00, 150.86it/s]
100%|██████████| 200/200 [00:01<00:00, 154.40it/s]
100%|██████████| 200/200 [00:01<00:00, 154.69it/s]
100%|██████████| 200/200 [00:01<00:00, 153.52it/s]
100%|██████████| 200/200 [00:01<00:00, 154.04it/s]
100%|██████████| 200/200 [00:01<00:00, 148.75it/s]
100%|██████████| 200/200 [00:01<00:00, 158.14it/s]
100%|██████████| 200/200 [00:01<00:00, 155.06it/s]


160 qps: 3.480410860478878


100%|██████████| 200/200 [00:01<00:00, 151.00it/s]
100%|██████████| 200/200 [00:01<00:00, 154.66it/s]
100%|██████████| 200/200 [00:01<00:00, 148.71it/s]
100%|██████████| 200/200 [00:01<00:00, 157.15it/s]
100%|██████████| 200/200 [00:01<00:00, 151.32it/s]


The code will take approximately 3.9844 hours to reason about
