In [31]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

In [32]:
# Silence DeprecationWarning noise for the rest of the session.
warnings.simplefilter("ignore", DeprecationWarning)

## **Loading Data**

In [33]:
# Root folder of the Optiver book data. Set the OPTIVER_DATA_DIR environment
# variable to point somewhere else; the fallback is the original local checkout.
DATA_DIR = Path(os.environ.get(
    "OPTIVER_DATA_DIR",
    "/Users/ayush/Documents/University/Year 03/Sem 01/DATA3888/Optiver-07/Data",
))

# Raw order-book snapshots for stock 1; kept untouched so later cells can re-derive from it.
df = pd.read_csv(DATA_DIR / "individual_book_train" / "stock_1.csv")
df.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,5,0,1.000754,1.001542,1.000689,1.001607,1,25,25,100,1
1,5,1,1.000754,1.001673,1.000689,1.001739,26,60,25,100,1
2,5,2,1.000754,1.001411,1.000623,1.001476,1,25,25,125,1
3,5,3,1.000754,1.001542,1.000689,1.001607,125,25,126,36,1
4,5,4,1.000754,1.001476,1.000623,1.001542,100,100,25,25,1


In [34]:
# Work on an independent deep copy so the raw frame `df` stays pristine.
stock1 = df.copy(deep=True)

## **Features**

In [35]:
def make_features(df: pd.DataFrame, window: int = 60) -> pd.DataFrame:
    """Derive order-book and rolling-volatility features for every book snapshot.

    Parameters
    ----------
    df : pd.DataFrame
        Book snapshots with the columns ``time_id``, ``bid_price1``,
        ``ask_price1``, ``bid_size1``, ``ask_size1``, ``bid_size2`` and
        ``ask_size2``. Rows are used in their existing order within each
        ``time_id``; the function does not sort.
    window : int, default 60
        Number of consecutive rows (not seconds) per ``time_id`` covered by the
        rolling statistics ``realized_volatility``, ``bipower_var`` and
        ``rolling_integrated_variance``.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the feature columns appended and every row that
        contains a NaN removed (this always drops the first two rows of each
        ``time_id``, which have no return / no pair of returns yet). The input
        frame is not modified.
    """
    out = df.copy()  # never write feature columns into the caller's frame

    bid_px, ask_px = out['bid_price1'], out['ask_price1']
    bid_sz1, ask_sz1 = out['bid_size1'], out['ask_size1']
    bid_depth = bid_sz1 + out['bid_size2']
    ask_depth = ask_sz1 + out['ask_size2']
    top_size = bid_sz1 + ask_sz1
    total_depth = bid_depth + ask_sz1 + out['ask_size2']

    out['mid_price'] = (bid_px + ask_px) / 2
    out['spread'] = ask_px - bid_px
    out['rel_spread'] = out['spread'] / out['mid_price']
    out['imbalance'] = (bid_sz1 - ask_sz1) / top_size
    out['book_pressure'] = (bid_depth - ask_depth) / total_depth
    out['microprice'] = (ask_px * bid_sz1 + bid_px * ask_sz1) / top_size
    # Same quantity as rel_spread; the column is kept because downstream code may select it.
    out['normalized_spread'] = out['rel_spread']
    out['OBI_L2'] = bid_depth / total_depth

    # Shannon entropy of how the quoted size is spread over the four book levels.
    sizes = out[['bid_size1', 'bid_size2', 'ask_size1', 'ask_size2']].astype(float).values
    total = sizes.sum(axis=1, keepdims=True)
    # `where=` without `out=` leaves the masked entries uninitialised, so give
    # both ufuncs a zero-filled output buffer.
    p = np.divide(sizes, total, out=np.zeros_like(sizes), where=total != 0)
    log_p = np.log(p, out=np.zeros_like(p), where=p > 0)
    entropy = -np.nansum(p * log_p, axis=1)
    out['LOB_entropy'] = entropy
    out['LOB_entropy_normalized'] = entropy / np.log(4)  # log(4) = maximum entropy over 4 levels

    bucket = out['time_id']

    def _rolling(series: pd.Series, how: str) -> pd.Series:
        """Apply a trailing `window`-row rolling aggregation inside each time_id."""
        return series.groupby(bucket).transform(
            lambda s: getattr(s.rolling(window=window, min_periods=1), how)()
        )

    # Log return of the mid price against the previous row of the same time_id.
    out['log_return'] = np.log(out['mid_price'] / out.groupby('time_id')['mid_price'].shift(1))

    # Rolling sum of squared returns; clipped at zero so floating-point round-off
    # in the rolling sum can never feed a negative value into the square root.
    integrated_var = _rolling(out['log_return'].pow(2), 'sum').clip(lower=0)
    out['realized_volatility'] = np.sqrt(integrated_var)

    # Product of adjacent absolute returns, averaged over the window. The shifted
    # product equals rolling(2).apply(r[0] * r[1]) without a Python call per row.
    abs_ret = out['log_return'].abs()
    adjacent_product = abs_ret * abs_ret.groupby(bucket).shift(1)
    out['bipower_var'] = _rolling(adjacent_product, 'mean')

    out['rolling_integrated_variance'] = integrated_var

    return out.dropna()

In [36]:
# Always derive from a fresh copy of the raw frame so re-running this cell gives the
# same result (feeding the already-trimmed `stock1` back in would drop more rows).
stock1 = make_features(df.copy())

In [37]:
# Display the feature frame returned by make_features.
stock1

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,...,book_pressure,microprice,normalized_spread,OBI_L2,LOB_entropy,LOB_entropy_normalized,log_return,realized_volatility,bipower_var,rolling_integrated_variance
2,5,2,1.000754,1.001411,1.000623,1.001476,1,25,25,125,...,-0.704545,1.000780,0.000655,0.147727,0.826831,0.596432,-0.000131,0.000147,8.589981e-09,2.148029e-08
3,5,3,1.000754,1.001542,1.000689,1.001607,125,25,126,36,...,0.608974,1.001411,0.000786,0.804487,1.184063,0.854121,0.000066,0.000161,8.593536e-09,2.578061e-08
4,5,4,1.000754,1.001476,1.000623,1.001542,100,100,25,25,...,0.000000,1.001115,0.000721,0.500000,1.193550,0.860964,-0.000033,0.000164,6.445187e-09,2.685402e-08
5,5,5,1.000754,1.001542,1.000623,1.001673,100,25,25,60,...,0.190476,1.001384,0.000786,0.595238,1.217958,0.878571,0.000033,0.000167,5.102243e-09,2.792743e-08
6,5,6,1.000820,1.001542,1.000754,1.001673,100,125,125,60,...,0.097561,1.001141,0.000721,0.548780,1.349679,0.973587,0.000033,0.000170,4.296796e-09,2.900404e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1507527,32767,588,0.998911,0.999109,0.998812,0.999208,126,42,101,100,...,0.230352,0.999060,0.000198,0.615176,1.322727,0.954146,-0.000050,0.000693,1.552235e-09,4.802919e-07
1507528,32767,589,0.998911,0.999109,0.998812,0.999208,126,126,101,200,...,-0.179024,0.999010,0.000198,0.410488,1.352368,0.975527,0.000000,0.000693,1.552235e-09,4.802919e-07
1507529,32767,591,0.998911,0.999109,0.998812,0.999208,126,226,101,200,...,-0.304747,0.998982,0.000198,0.347626,1.335784,0.963564,0.000000,0.000693,1.552235e-09,4.802919e-07
1507530,32767,592,0.998911,0.999109,0.998812,0.999208,226,225,101,100,...,0.003067,0.999011,0.000198,0.501534,1.310861,0.945586,0.000000,0.000693,1.552235e-09,4.802919e-07


## **Preprocessing**

In [38]:
SECONDS_PER_BUCKET = 600  # the grid built below covers seconds 0..599 of every time_id

# Build the dense (time_id, second) grid so every bucket has one row per second.
time_ids = stock1['time_id'].unique()
full_index = pd.MultiIndex.from_product(
    [time_ids, range(SECONDS_PER_BUCKET)],
    names=['time_id', 'seconds_in_bucket'],
)

stock1 = (
    stock1
    .set_index(['time_id', 'seconds_in_bucket'])
    .reindex(full_index)
)

# Carry the last observed snapshot forward within each time_id only, never across buckets.
columns_to_fill = list(stock1.columns)
stock1[columns_to_fill] = stock1.groupby(level=0)[columns_to_fill].ffill()

# stock_id is dropped from the frame, so there is no point back-filling it first.
# Seconds that precede the first observation of a bucket have nothing to
# forward-fill from and are removed by dropna. `errors='ignore'` plus the
# unconditional dropna keep the cell correct when it is re-run.
stock1 = (
    stock1
    .reset_index()
    .drop(columns='stock_id', errors='ignore')
    .dropna()
)

In [39]:
# Renumber the time_id values to consecutive integers 1..N, preserving their sorted order.
unique_time_ids = sorted(stock1['time_id'].unique())
time_id_map = dict(zip(unique_time_ids, range(1, len(unique_time_ids) + 1)))
stock1['time_id'] = stock1['time_id'].map(time_id_map)

In [40]:
# Display stock1 after the per-second reindex / forward-fill and the time_id renumbering.
stock1

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,...,book_pressure,microprice,normalized_spread,OBI_L2,LOB_entropy,LOB_entropy_normalized,log_return,realized_volatility,bipower_var,rolling_integrated_variance
2,1,2,1.000754,1.001411,1.000623,1.001476,1.0,25.0,25.0,125.0,...,-0.704545,1.000780,0.000655,0.147727,0.826831,0.596432,-0.000131,0.000147,8.589981e-09,2.148029e-08
3,1,3,1.000754,1.001542,1.000689,1.001607,125.0,25.0,126.0,36.0,...,0.608974,1.001411,0.000786,0.804487,1.184063,0.854121,0.000066,0.000161,8.593536e-09,2.578061e-08
4,1,4,1.000754,1.001476,1.000623,1.001542,100.0,100.0,25.0,25.0,...,0.000000,1.001115,0.000721,0.500000,1.193550,0.860964,-0.000033,0.000164,6.445187e-09,2.685402e-08
5,1,5,1.000754,1.001542,1.000623,1.001673,100.0,25.0,25.0,60.0,...,0.190476,1.001384,0.000786,0.595238,1.217958,0.878571,0.000033,0.000167,5.102243e-09,2.792743e-08
6,1,6,1.000820,1.001542,1.000754,1.001673,100.0,125.0,125.0,60.0,...,0.097561,1.001141,0.000721,0.548780,1.349679,0.973587,0.000033,0.000170,4.296796e-09,2.900404e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2297995,3830,595,0.998911,0.999109,0.998812,0.999208,125.0,225.0,101.0,100.0,...,-0.179673,0.998982,0.000198,0.410163,1.322976,0.954326,0.000000,0.000693,1.552235e-09,4.802919e-07
2297996,3830,596,0.998911,0.999109,0.998812,0.999208,125.0,225.0,101.0,100.0,...,-0.179673,0.998982,0.000198,0.410163,1.322976,0.954326,0.000000,0.000693,1.552235e-09,4.802919e-07
2297997,3830,597,0.998911,0.999109,0.998812,0.999208,125.0,225.0,101.0,100.0,...,-0.179673,0.998982,0.000198,0.410163,1.322976,0.954326,0.000000,0.000693,1.552235e-09,4.802919e-07
2297998,3830,598,0.998911,0.999109,0.998812,0.999208,125.0,225.0,101.0,100.0,...,-0.179673,0.998982,0.000198,0.410163,1.322976,0.954326,0.000000,0.000693,1.552235e-09,4.802919e-07


## **Train-Test Split**

In [41]:
# Chronological split on the renumbered time_id: 1-2681 train, 2682-3255 validation, rest test.
bucket_id = stock1['time_id']
train = stock1.loc[bucket_id.between(1, 2681)]
validation = stock1.loc[bucket_id.between(2682, 3255)]
test = stock1.loc[bucket_id.ge(3256)]

In [42]:
# Display the training partition (time_id 1 through 2681).
train

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,...,book_pressure,microprice,normalized_spread,OBI_L2,LOB_entropy,LOB_entropy_normalized,log_return,realized_volatility,bipower_var,rolling_integrated_variance
2,1,2,1.000754,1.001411,1.000623,1.001476,1.0,25.0,25.0,125.0,...,-0.704545,1.000780,0.000655,0.147727,0.826831,0.596432,-0.000131,0.000147,8.589981e-09,2.148029e-08
3,1,3,1.000754,1.001542,1.000689,1.001607,125.0,25.0,126.0,36.0,...,0.608974,1.001411,0.000786,0.804487,1.184063,0.854121,0.000066,0.000161,8.593536e-09,2.578061e-08
4,1,4,1.000754,1.001476,1.000623,1.001542,100.0,100.0,25.0,25.0,...,0.000000,1.001115,0.000721,0.500000,1.193550,0.860964,-0.000033,0.000164,6.445187e-09,2.685402e-08
5,1,5,1.000754,1.001542,1.000623,1.001673,100.0,25.0,25.0,60.0,...,0.190476,1.001384,0.000786,0.595238,1.217958,0.878571,0.000033,0.000167,5.102243e-09,2.792743e-08
6,1,6,1.000820,1.001542,1.000754,1.001673,100.0,125.0,125.0,60.0,...,0.097561,1.001141,0.000721,0.548780,1.349679,0.973587,0.000033,0.000170,4.296796e-09,2.900404e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1608595,2681,595,0.999575,1.000142,0.999292,1.000425,126.0,125.0,94.0,201.0,...,-0.194139,0.999860,0.000566,0.402930,1.346676,0.971422,0.000000,0.001068,9.013895e-09,1.139710e-06
1608596,2681,596,0.999953,1.000330,0.999858,1.000425,100.0,97.0,200.0,1.0,...,0.507538,1.000144,0.000378,0.753769,1.051961,0.758829,0.000283,0.001103,8.717892e-09,1.217692e-06
1608597,2681,597,0.999953,1.000614,0.999764,1.000802,100.0,100.0,100.0,100.0,...,0.000000,1.000283,0.000661,0.500000,1.386294,1.000000,0.000142,0.001113,9.386102e-09,1.237734e-06
1608598,2681,598,1.000142,1.000614,0.999953,1.000991,1.0,100.0,100.0,248.0,...,-0.550111,1.000146,0.000472,0.224944,1.010443,0.728881,0.000094,0.001080,9.608765e-09,1.166687e-06


In [43]:
# Display the validation partition (time_id 2682 through 3255).
validation

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,...,book_pressure,microprice,normalized_spread,OBI_L2,LOB_entropy,LOB_entropy_normalized,log_return,realized_volatility,bipower_var,rolling_integrated_variance
1608602,2682,2,1.002811,1.00346,1.002703,1.003568,100.0,100.0,102.0,100.0,...,0.004975,1.003135,0.000647,0.502488,1.386257,0.999973,0.000162,0.000162,0.000000e+00,2.613274e-08
1608603,2682,3,1.002811,1.00346,1.002703,1.003568,200.0,100.0,2.0,100.0,...,0.004975,1.003244,0.000647,0.502488,1.065895,0.768881,0.000000,0.000162,0.000000e+00,2.613274e-08
1608604,2682,4,1.002811,1.00346,1.002703,1.003568,200.0,100.0,3.0,100.0,...,0.007444,1.003244,0.000647,0.503722,1.075876,0.776081,0.000000,0.000162,0.000000e+00,2.613274e-08
1608605,2682,5,1.002811,1.00346,1.002703,1.003568,200.0,100.0,3.0,100.0,...,0.007444,1.003244,0.000647,0.503722,1.075876,0.776081,0.000000,0.000162,0.000000e+00,2.613274e-08
1608606,2682,6,1.003135,1.00346,1.003027,1.003568,100.0,100.0,100.0,100.0,...,0.000000,1.003298,0.000323,0.500000,1.386294,1.000000,0.000162,0.000229,0.000000e+00,5.225703e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1952995,3255,595,0.999154,1.00000,0.999084,1.000071,55.0,200.0,101.0,200.0,...,-0.438849,0.999337,0.000846,0.280576,1.274262,0.919186,0.000000,0.001422,1.082488e-08,2.020915e-06
1952996,3255,596,0.999084,1.00000,0.999013,1.000071,201.0,300.0,14.0,100.0,...,-0.300813,0.999451,0.000917,0.349593,1.097130,0.791412,-0.000035,0.001422,1.082488e-08,2.022159e-06
1952997,3255,597,0.999084,1.00000,0.999013,1.000071,356.0,300.0,14.0,100.0,...,-0.038961,0.999581,0.000917,0.480519,1.061879,0.765984,0.000000,0.001422,1.082488e-08,2.020912e-06
1952998,3255,598,0.999084,1.00000,0.999013,1.000071,356.0,300.0,14.0,100.0,...,-0.038961,0.999581,0.000917,0.480519,1.061879,0.765984,0.000000,0.001422,1.082488e-08,2.020912e-06


In [44]:
# Display the test partition (time_id 3256 and above).
test

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,...,book_pressure,microprice,normalized_spread,OBI_L2,LOB_entropy,LOB_entropy_normalized,log_return,realized_volatility,bipower_var,rolling_integrated_variance
1953002,3256,2,0.998771,0.999244,0.998676,0.999433,125.0,125.0,51.0,4.0,...,0.154098,0.999007,0.000473,0.577049,1.087044,0.784136,0.000000,0.000000,0.000000e+00,0.000000e+00
1953003,3256,3,0.998771,0.999244,0.998676,0.999433,125.0,125.0,51.0,4.0,...,0.154098,0.999007,0.000473,0.577049,1.087044,0.784136,0.000000,0.000000,0.000000e+00,0.000000e+00
1953004,3256,4,0.998960,0.999433,0.998865,0.999527,125.0,4.0,100.0,168.0,...,0.133501,0.999418,0.000473,0.566751,1.121399,0.808918,0.000189,0.000189,0.000000e+00,3.583072e-08
1953005,3256,5,0.998960,0.999433,0.998865,0.999527,125.0,4.0,100.0,168.0,...,0.133501,0.999418,0.000473,0.566751,1.121399,0.808918,0.000189,0.000189,0.000000e+00,3.583072e-08
1953006,3256,6,0.998960,0.999433,0.998865,0.999527,125.0,209.0,100.0,168.0,...,-0.252492,0.999137,0.000473,0.373754,1.348048,0.972411,0.000000,0.000189,0.000000e+00,3.583072e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2297995,3830,595,0.998911,0.999109,0.998812,0.999208,125.0,225.0,101.0,100.0,...,-0.179673,0.998982,0.000198,0.410163,1.322976,0.954326,0.000000,0.000693,1.552235e-09,4.802919e-07
2297996,3830,596,0.998911,0.999109,0.998812,0.999208,125.0,225.0,101.0,100.0,...,-0.179673,0.998982,0.000198,0.410163,1.322976,0.954326,0.000000,0.000693,1.552235e-09,4.802919e-07
2297997,3830,597,0.998911,0.999109,0.998812,0.999208,125.0,225.0,101.0,100.0,...,-0.179673,0.998982,0.000198,0.410163,1.322976,0.954326,0.000000,0.000693,1.552235e-09,4.802919e-07
2297998,3830,598,0.998911,0.999109,0.998812,0.999208,125.0,225.0,101.0,100.0,...,-0.179673,0.998982,0.000198,0.410163,1.322976,0.954326,0.000000,0.000693,1.552235e-09,4.802919e-07


## **Model Building**

In [45]:
TARGET = 'realized_volatility'

# The target must be excluded by its real column name. The previous exclusion list
# named 'realized_vol', which matches no column, so the target leaked into the features.
# NOTE(review): 'rolling_integrated_variance' is the square of the target (both come from
# the same rolling sum in make_features) and is still a feature here - confirm whether it
# should be excluded as well.
feature_cols = [c for c in stock1.columns if c not in ('time_id', TARGET)]
X_train, y_train = train[feature_cols], train[TARGET]
X_val,   y_val   = validation[feature_cols], validation[TARGET]
X_test,  y_test  = test[feature_cols], test[TARGET]

In [46]:
def winsorize(X, lower=1, upper=99):
    """Clip every column of ``X`` to its own ``lower``/``upper`` percentiles.

    Stateless helper: the bounds are recomputed from whatever ``X`` is passed in,
    so it must not be used to transform validation or test data in a model pipeline.
    """
    lows  = np.percentile(X, lower, axis=0)
    highs = np.percentile(X, upper, axis=0)
    return np.clip(X, lows, highs)


class Winsorizer(BaseEstimator, TransformerMixin):
    """Clip each feature to percentile bounds learned from the data given to ``fit``.

    Unlike ``FunctionTransformer(winsorize)``, the bounds are estimated once on the
    training data and then reused, so validation and test sets are clipped with the
    training thresholds instead of their own.

    Parameters
    ----------
    lower, upper : float
        Percentiles (0-100) used as the lower and upper clipping bounds.
    """

    def __init__(self, lower=1, upper=99):
        self.lower = lower
        self.upper = upper

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds from ``X``."""
        values = np.asarray(X, dtype=float)
        self.lows_ = np.percentile(values, self.lower, axis=0)
        self.highs_ = np.percentile(values, self.upper, axis=0)
        return self

    def transform(self, X):
        """Return ``X`` as a float array clipped to the bounds learned in ``fit``."""
        return np.clip(np.asarray(X, dtype=float), self.lows_, self.highs_)


winsorizer = Winsorizer()
scaler = StandardScaler()

preproc = Pipeline([
    ('winsorize', winsorizer),
    ('scale',      scaler),
])

In [47]:
# Fit the preprocessing steps on the training features only, then push all three
# partitions through the fitted pipeline.
X_train_proc = preproc.fit(X_train).transform(X_train)
X_val_proc, X_test_proc = (preproc.transform(part) for part in (X_val, X_test))

In [48]:
# Hyper-parameters of the random-forest regressor, gathered in one place.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'max_features': 'sqrt',
    'n_jobs': -1,
    'random_state': 0,
}
model = RandomForestRegressor(**rf_params)
model.fit(X_train_proc, y_train)

In [49]:
# Score the fitted forest on the validation partition.
y_val_pred = model.predict(X_val_proc)
rmse_val = root_mean_squared_error(y_true=y_val, y_pred=y_val_pred)
print(f'Validation RMSE: {rmse_val:.4f}')

Validation RMSE: 0.0002


In [50]:
# Score the fitted forest on the held-out test partition.
y_test_pred = model.predict(X_test_proc)
rmse_test = root_mean_squared_error(y_true=y_test, y_pred=y_test_pred)
print(f'Test RMSE: {rmse_test:.4f}')

Test RMSE: 0.0002


## **Pipeline**

In [54]:
class MakeFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends order-book and rolling-volatility features.

    Parameters
    ----------
    window : int, default 30
        Number of consecutive rows (not seconds) per ``time_id`` covered by the
        rolling statistics ``realized_volatility``, ``bipower_var`` and
        ``rolling_integrated_variance``.
    """

    def __init__(self, window=30):
        self.window = window

    def fit(self, X, y=None):
        """Nothing is learned; present for scikit-learn pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with feature columns added and NaN rows dropped.

        ``X`` needs the columns ``time_id``, ``bid_price1``, ``ask_price1``,
        ``bid_size1``, ``ask_size1``, ``bid_size2`` and ``ask_size2``. Rows are
        used in their existing order within each ``time_id``. The first two rows
        of every ``time_id`` are always dropped because they have no return /
        no pair of returns yet.
        """
        df = X.copy()
        window = self.window

        bid_px, ask_px = df['bid_price1'], df['ask_price1']
        bid_sz1, ask_sz1 = df['bid_size1'], df['ask_size1']
        bid_depth = bid_sz1 + df['bid_size2']
        ask_depth = ask_sz1 + df['ask_size2']
        top_size = bid_sz1 + ask_sz1
        total_depth = bid_depth + ask_sz1 + df['ask_size2']

        df['mid_price'] = (bid_px + ask_px) / 2
        df['spread'] = ask_px - bid_px
        df['rel_spread'] = df['spread'] / df['mid_price']
        df['imbalance'] = (bid_sz1 - ask_sz1) / top_size
        df['book_pressure'] = (bid_depth - ask_depth) / total_depth
        df['microprice'] = (ask_px * bid_sz1 + bid_px * ask_sz1) / top_size
        # Same quantity as rel_spread; kept because downstream code may select it.
        df['normalized_spread'] = df['rel_spread']
        df['OBI_L2'] = bid_depth / total_depth

        # Shannon entropy of how the quoted size is spread over the four book levels.
        sizes = df[['bid_size1', 'bid_size2', 'ask_size1', 'ask_size2']].astype(float).values
        total = sizes.sum(axis=1, keepdims=True)
        # `where=` without `out=` leaves masked entries uninitialised, so supply
        # zero-filled output buffers.
        p = np.divide(sizes, total, out=np.zeros_like(sizes), where=total != 0)
        log_p = np.log(p, out=np.zeros_like(p), where=p > 0)
        entropy = -np.nansum(p * log_p, axis=1)
        df['LOB_entropy'] = entropy
        df['LOB_entropy_normalized'] = entropy / np.log(4)  # log(4) = maximum entropy over 4 levels

        bucket = df['time_id']

        def rolling_within_bucket(series, how):
            # Trailing `window`-row aggregation computed separately for each time_id.
            return series.groupby(bucket).transform(
                lambda s: getattr(s.rolling(window=window, min_periods=1), how)()
            )

        # Log return of the mid price against the previous row of the same time_id.
        df['log_return'] = np.log(df['mid_price'] / df.groupby('time_id')['mid_price'].shift(1))

        # Rolling sum of squared returns; clipped at zero so round-off in the rolling
        # sum can never feed a negative value into the square root.
        integrated_var = rolling_within_bucket(df['log_return'].pow(2), 'sum').clip(lower=0)
        df['realized_volatility'] = np.sqrt(integrated_var)

        # Product of adjacent absolute returns, averaged over the window. The shifted
        # product equals rolling(2).apply(r[0] * r[1]) without a Python call per row.
        abs_ret = df['log_return'].abs()
        adjacent_product = abs_ret * abs_ret.groupby(bucket).shift(1)
        df['bipower_var'] = rolling_within_bucket(adjacent_product, 'mean')

        df['rolling_integrated_variance'] = integrated_var

        return df.dropna()


class ReindexFill(BaseEstimator, TransformerMixin):
    """Expand every ``time_id`` to one row per second and forward-fill the gaps.

    Parameters
    ----------
    n_seconds : int, default 600
        Length of the per-bucket grid; seconds ``0 .. n_seconds - 1`` are created.
    """

    def __init__(self, n_seconds=600):
        self.n_seconds = n_seconds

    def fit(self, X, y=None):
        """Nothing is learned; present for scikit-learn pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a new frame on the dense (time_id, seconds_in_bucket) grid.

        Values are carried forward inside each ``time_id`` only, so seconds that
        precede a bucket's first observation stay NaN.
        """
        keys = ['time_id', 'seconds_in_bucket']
        grid = pd.MultiIndex.from_product(
            [X['time_id'].unique(), range(self.n_seconds)],
            names=keys,
        )
        dense = X.set_index(keys).reindex(grid)
        value_cols = [name for name in dense.columns if name not in keys]
        dense[value_cols] = dense.groupby(level=0)[value_cols].ffill()
        return dense.reset_index()


class MapTimeID(BaseEstimator, TransformerMixin):
    """Drop ``stock_id`` (and incomplete rows) and renumber ``time_id`` to 1..N."""

    def fit(self, X, y=None):
        """Nothing is learned; present for scikit-learn pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``time_id`` replaced by its dense rank.

        When a ``stock_id`` column is present it is removed and every row that
        still contains a NaN is dropped; without that column no rows are dropped.
        The remaining ``time_id`` values are then mapped, in sorted order, to
        consecutive integers starting at 1.
        """
        df = X.copy()
        if 'stock_id' in df.columns:
            # The column is discarded, so back-filling it first would be wasted work.
            df = df.drop(columns='stock_id').dropna()
        unique_time_ids = sorted(df['time_id'].unique())
        time_id_map = {old: new for new, old in enumerate(unique_time_ids, start=1)}
        df['time_id'] = df['time_id'].map(time_id_map)
        return df


# Chain the three frame-level stages in execution order: feature engineering,
# per-second reindex / forward-fill, then stock_id removal and time_id renumbering.
feature_steps = [
    ('make_features', MakeFeatures()),
    ('reindex_fill', ReindexFill()),
    ('map_time_id', MapTimeID()),
]
pipeline = Pipeline(steps=feature_steps)


In [55]:
# Run the raw frame `df` through MakeFeatures -> ReindexFill -> MapTimeID.
# Each stage works on a copy of its input, so `df` itself is left unchanged.
transformed_stock1 = pipeline.fit_transform(df)

In [56]:
# 1) Winsorizer
class Winsorizer(BaseEstimator, TransformerMixin):
    """Clip each feature to percentile bounds learned from the data given to ``fit``.

    A ``FunctionTransformer`` around a percentile-clipping function is stateless and
    would recompute the bounds on every ``transform``/``predict`` call, so validation
    and test rows would be clipped by their own distribution. This transformer
    stores the training bounds and reuses them.

    Parameters
    ----------
    lower, upper : float
        Percentiles (0-100) used as the lower and upper clipping bounds.
    """

    def __init__(self, lower=1, upper=99):
        self.lower = lower
        self.upper = upper

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds from ``X``."""
        values = np.asarray(X, dtype=float)
        self.lows_ = np.percentile(values, self.lower, axis=0)
        self.highs_ = np.percentile(values, self.upper, axis=0)
        return self

    def transform(self, X):
        """Return ``X`` as a float array clipped to the bounds learned in ``fit``."""
        return np.clip(np.asarray(X, dtype=float), self.lows_, self.highs_)


winsorizer = Winsorizer()

# 2) Assemble full pipeline
model_pipeline = Pipeline([
    ('winsorize', winsorizer),
    ('scale',      StandardScaler()),
    ('rf',         RandomForestRegressor(
                      n_estimators=100,
                      max_depth=10,
                      max_features='sqrt',
                      n_jobs=-1,
                      random_state=0
                  ))
])

# 3) Features / target taken from the pipeline output, not from earlier cells.
# NOTE(review): 'rolling_integrated_variance' is the square of the target (same rolling
# sum in MakeFeatures) and is still a feature here - confirm whether it should be excluded.
feature_cols = [c for c in transformed_stock1.columns
                if c not in ('time_id','realized_volatility')]

X = transformed_stock1[feature_cols].values
y = transformed_stock1['realized_volatility'].values

# 4) Chronological split on the renumbered time_id, using the same boundaries as the
# manual train / validation / test split earlier in the notebook.
TRAIN_END, VAL_END = 2681, 3255
bucket_id = transformed_stock1['time_id'].values
train_mask = bucket_id <= TRAIN_END
val_mask = (bucket_id > TRAIN_END) & (bucket_id <= VAL_END)
test_mask = bucket_id > VAL_END

X_train, y_train = X[train_mask], y[train_mask]
X_val,   y_val   = X[val_mask],   y[val_mask]
X_test,  y_test  = X[test_mask],  y[test_mask]

model_pipeline.fit(X_train, y_train)

# Validation
y_val_pred = model_pipeline.predict(X_val)
rmse_val = root_mean_squared_error(y_val, y_val_pred)
print(f'Validation RMSE: {rmse_val:.4f}')

# Test
y_test_pred = model_pipeline.predict(X_test)
rmse_test = root_mean_squared_error(y_test, y_test_pred)
print(f'Test RMSE:       {rmse_test:.4f}')


Validation RMSE: 0.0002
Test RMSE:       0.0002
