In [None]:
import pandas as pd
import numpy as np

from itertools import groupby
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
from itertools import combinations
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [None]:
# Full training set.
train = pd.read_csv(
    '/kaggle/input/optiver-trading-at-the-close/train.csv'
)

# Example files shipped alongside the training data.
revealed_targets = pd.read_csv(
    '/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv'
)
test = pd.read_csv(
    '/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv'
)
sample_submission = pd.read_csv(
    '/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv'
)

In [None]:
# Per-stock reference volume: median bid size plus median ask size.
# Consumed by feat_eng() to derive the `high_volume` flag.
median_vol = (
    train.groupby('stock_id')[['bid_size', 'ask_size']]
    .median()
    .pipe(lambda sizes: sizes['bid_size'] + sizes['ask_size'])
)

In [None]:
# Per-stock median of every numeric column.
# `numeric_only=True` keeps this cell working on pandas >= 2.0, where
# GroupBy.median raises on non-numeric columns instead of silently dropping
# them (train is assumed to carry a string `row_id` column - verify against
# the CSV schema). On older pandas the result is unchanged.
median_sizes = train.groupby('stock_id').median(numeric_only=True)

In [None]:
# Display the per-stock medians computed in the previous cell.
median_sizes

In [None]:
def feat_eng(df, stock_median_vol=None):
    """Build the model feature matrix from a raw order-book frame.

    The caller's frame is never modified; all columns are added to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Must contain ``stock_id``, ``date_id``,
        ``imbalance_buy_sell_flag``, ``imbalance_size``, ``matched_size``,
        ``bid_size``, ``ask_size`` and the six price columns listed in
        ``prices`` below. ``row_id`` and ``time_id`` are discarded if present.
    stock_median_vol : dict or pandas.Series, optional
        Reference volume per ``stock_id`` used for the ``high_volume`` flag.
        When omitted, the notebook-level ``median_vol`` series is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df`` holding the retained raw
        columns plus the engineered features. ``date_id``,
        ``imbalance_buy_sell_flag`` and three engineered columns are dropped
        before returning.
    """
    if stock_median_vol is None:
        # Fall back to the per-stock medians computed earlier in the notebook.
        stock_median_vol = median_vol
    if hasattr(stock_median_vol, 'to_dict'):
        stock_median_vol = stock_median_vol.to_dict()

    # Explicit copy: the assignments below must not write into (or warn
    # about) a slice of the caller's frame.
    keep = [c for c in df.columns if c not in ('row_id', 'time_id')]
    df = df[keep].copy()

    # One-hot split of the signed imbalance flag.
    df['imbalance_buy_flag'] = np.where(df['imbalance_buy_sell_flag'] == 1, 1, 0)
    df['imbalance_sell_flag'] = np.where(df['imbalance_buy_sell_flag'] == -1, 1, 0)

    # Both sizes must come from *this* frame. Reading ask_size from the global
    # training frame would index-align test rows with unrelated training rows.
    df['bid_plus_ask_sizes'] = df['bid_size'] + df['ask_size']
    df['median_vol'] = df['stock_id'].map(stock_median_vol)
    df['high_volume'] = np.where(df['bid_plus_ask_sizes'] > df['median_vol'], 1, 0)
    df['imbalance_ratio'] = df['imbalance_size'] / df['matched_size']

    # Normalised size imbalances.
    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    df['ask_x_size'] = df.eval('ask_size*ask_price')
    df['bid_x_size'] = df.eval('bid_size*bid_price')

    df['bid_size_over_ask_size'] = df['bid_size'].div(df['ask_size'])
    df['bid_price_over_ask_price'] = df['bid_price'].div(df['ask_price'])

    prices = ['reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap']

    # Pairwise price features: difference, product and normalised imbalance.
    for c in combinations(prices, 2):
        df[f'{c[0]}_minus_{c[1]}'] = (df[f'{c[0]}'] - df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_times_{c[1]}'] = (df[f'{c[0]}'] * df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_{c[1]}_imb'] = df.eval(f'({c[0]}-{c[1]})/({c[0]}+{c[1]})')

    # Triplet features: ratio of the upper gap to the lower gap between the
    # row-wise max, middle and min of three prices. The result is inf/NaN
    # when the middle and minimum values coincide.
    for c in combinations(prices, 3):
        trio = df[list(c)]
        max_ = trio.max(axis=1)
        min_ = trio.min(axis=1)
        mid_ = trio.sum(axis=1) - min_ - max_

        df[f'{c[0]}_{c[1]}_{c[2]}_imb2'] = (max_ - mid_) / (mid_ - min_)

    df = df.drop(columns=[
        'date_id',
        'imbalance_buy_sell_flag',
        'far_price_near_price_imb',
        'reference_price_far_price_wap_imb2',
        'reference_price_far_price_imb',
    ])

    gc.collect()

    return df

In [None]:
# Feature matrix: every training column except the label, run through feat_eng.
X = feat_eng(train.drop(columns=['target']))
# Label vector as a plain NumPy array, in the same row order as X.
y = train['target'].to_numpy()

In [None]:
# Display the engineered training feature frame.
X

In [None]:
# (rows, feature columns) of the training matrix.
X.shape

In [None]:
# Gradient-boosted regressor with an L1 objective, fitted on the whole
# training matrix (no hold-out split is made in this notebook).
m = lgb.LGBMRegressor(
    objective='regression_l1',
    n_estimators=600,
    random_state=51,
)
m.fit(X=X, y=y)

In [None]:
# Importance of each feature as reported by the fitted model, ascending.
feat_imp = pd.Series(data=m.feature_importances_, index=X.columns).sort_values()

# Features whose importance falls below 10 are listed as candidates to drop.
print('Columns with poor contribution', feat_imp.loc[feat_imp.lt(10)].index)

# Horizontal bar chart, one bar per feature.
fig = px.bar(
    x=feat_imp,
    y=feat_imp.index,
    orientation='h',
)
fig.show()

In [None]:
# Display importances in ascending order (feat_imp was already sorted when it was built).
feat_imp.sort_values()

In [None]:
# Replace the raw example test frame with its engineered features.
# NOTE(review): this rebinds `test`, so re-running the cell feeds already
# transformed data back into feat_eng - restart the kernel before re-running.
test = feat_eng(test)

In [None]:
# (rows, feature columns) of the transformed example test frame.
test.shape

In [None]:
# Smoke test: predictions of the fitted model on the transformed example test frame.
m.predict(test)

In [None]:
# Competition inference API.
# NOTE(review): presumably only importable inside the Kaggle runtime - confirm.
import optiver2023
env = optiver2023.make_env()
# Iterator that the prediction loop unpacks into
# (test, revealed_targets, sample_prediction) tuples.
iter_test = env.iter_test()

In [None]:
# Walk the test iterator batch by batch; `counter` ends up holding the number
# of batches served (0 if the iterator yields nothing).
counter = 0
for counter, (test, revealed_targets, sample_prediction) in enumerate(iter_test, start=1):
    # Same feature pipeline as training, then fill in the prediction column.
    feat = feat_eng(test)
    sample_prediction['target'] = m.predict(feat)
    # Hand the filled-in frame back to the environment.
    env.predict(sample_prediction)

In [None]:
# Display the prediction frame from the last iteration of the inference loop.
sample_prediction