In [1]:
import joblib
from tqdm import tqdm

import pandas as pd
import numpy as np

from optiver_trading_at_the_close.feature_engineering import FE
from optiver_trading_at_the_close.nearest_neightbors_features import NearestNeighborsFeatures
from optiver_trading_at_the_close.column_selector import ColumnSelector
from optiver_trading_at_the_close.memory_reduction import MemoryReduction
from optiver_trading_at_the_close.mean_regressor_ensemble import MeanRegressorEnsemble

from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline

ModuleNotFoundError: No module named 'optiver_trading_at_the_close'

## Hard Variables

In [None]:
# CSV that is read into `df`; the path is relative to the notebook's working directory.
DATA_PATH = './../data/train.csv'


# joblib files of the fitted estimators that are loaded and handed to
# MeanRegressorEnsemble. Entries that are commented out are not loaded.
MODEL_PATHS = [
    # './../output/models/V3/lightgbm/lightgbm-89ab1659-547a-4845-94b8-793690fbcee0.joblib',
    # './../output/models/V3/lightgbm/lightgbm-910699bf-8877-4b40-b01a-2e887d23616d.joblib',
    # './../output/models/V3/lightgbm/lightgbm-b931c388-7be7-4fd7-a1ad-6002a444fe2c.joblib',
    './../output/models/V3/lightgbm/lightgbm-ec04e2b2-6681-4436-b694-ebbd62880006.joblib',
]

## Read Data

In [None]:
# Read the whole CSV at DATA_PATH into a single DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

## Feature Engineering

In [None]:
# df = df.dropna(subset=['target'], axis=0)

# y = df['target']
# X = df.drop(columns='target')

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [None]:
# Rows without a target can be neither fitted on nor scored.
df = df.dropna(axis=0, subset=['target'])

# Split on date_id: everything up to and including 420 is used for fitting,
# everything after it is held out.
train_mask = df['date_id'] <= 420
test_mask = df['date_id'] > 420
X_train, X_test = df.loc[train_mask], df.loc[test_mask]

# The 'target' column is intentionally kept inside X_train / X_test (the explicit
# drops are disabled); the pipeline's ColumnSelector is configured to drop it.
# X_train = X_train.drop(columns='target')
# X_test = X_test.drop(columns='target')
y_train, y_test = X_train['target'], X_test['target']

## Fit Model

In [None]:
# Load every serialized estimator listed in MODEL_PATHS, preserving the list order.
# NOTE(review): joblib.load unpickles the file, so only load model files you trust.
estimators = list(map(joblib.load, MODEL_PATHS))

# Preprocessing steps followed by the averaging ensemble as the final step.
pipeline = Pipeline([
    ('fe', FE()),
    # Nearest-neighbour features are currently disabled:
    # ('nn_features', NearestNeighborsFeatures(
    #     features_to_use_for_distance_computation=['seconds_in_bucket', 'wap', 'bid_plus_ask_sizes', 'bid_ask_size_imb'],
    #     get_target=True,
    #     features_get=['wap', 'bid_ask_size_imb'],
    #     n_neighbors=[40],
    #     metrics=['l1'],
    #     n_jobs=-1
    # )),
    (
        'column_selector',
        ColumnSelector(cols_to_drop=['time_id', 'row_id', 'date_id', 'target']),
    ),
    ('memore_reduction', MemoryReduction()),
    ('mean_regressor_ensemble', MeanRegressorEnsemble(estimators)),
])

# Alternative call that forwards fit_estimators=False to the ensemble step:
# pipeline.fit(X_train, y_train, mean_regressor_ensemble__fit_estimators=False)
pipeline.fit(X_train, y_train)

## Weights Tuning

### Get each model predictions

In [None]:
def get_each_model_predictions(pipeline, X):
    """
    Predict with every member of the pipeline's final ensemble step separately.

    A copy of ``X`` is pushed through ``transform`` of every step except the
    last one, in order. The transformed data is then passed to ``predict`` of
    each object in the last step's ``estimators`` attribute, together with
    ``nn_features__exclude_self=False``.

    Parameters
    ----------
    pipeline : object exposing ``steps`` as a sequence of (name, object) pairs
        All but the last object must implement ``transform``; the last must
        expose ``estimators``.
    X : object with a ``copy`` method
        Input data; the caller's object is not passed to the transformers.

    Returns
    -------
    list
        One prediction result per estimator, in ``estimators`` order.
    """
    features = X.copy()

    transformers = [step[1] for step in pipeline.steps[:-1]]
    for transformer in transformers:
        features = transformer.transform(features)

    ensemble = pipeline.steps[-1][1]

    preds = []
    for estimator in ensemble.estimators:
        preds.append(estimator.predict(features, nn_features__exclude_self=False))

    return preds

# One prediction result per ensemble member, computed on the date_id > 420 hold-out rows.
preds = get_each_model_predictions(pipeline, X_test)

### Tuning

In [None]:
def get_all_possible_weights(n, grid=np.arange(0, 1.01, 0.01)):
    """
    Enumerate every length-``n`` weight vector built from ``grid`` that sums to 1.

    Position ``k`` of a vector is the weight of model ``k``, so ordered vectors
    are produced: ``[0.3, 0.7]`` and ``[0.7, 0.3]`` are both returned.

    Parameters
    ----------
    n : int
        Number of weights per vector (one per model).
    grid : array-like of float
        Candidate values for a single weight. Values are assumed to be
        non-negative; this is what allows a branch to be abandoned as soon as
        one value alone exceeds the weight that is still left to distribute.

    Returns
    -------
    list of list
        Every weight vector whose entries sum to 1 (within a 1e-9 absolute
        tolerance). Empty when ``n`` is not positive.
    """
    # Grid values produced by np.arange are not exact in binary floating point
    # (e.g. 1 - 0.3 - 0.7000000000000001 != 0), so an exact `== 0` test silently
    # drops valid vectors. The tolerance is far below any sensible grid step.
    tolerance = 1e-9
    grid = np.asarray(grid, dtype=float)
    results = []

    # Nothing to enumerate; also prevents unbounded recursion for negative n.
    if n <= 0:
        return results

    def find_combinations(remaining, current_combination):
        if len(current_combination) == n - 1:
            # The last weight is fully determined: it has to equal what is left.
            for value in grid[np.abs(grid - remaining) <= tolerance]:
                results.append(current_combination + [value])
            return

        for value in grid:
            # This value alone would push the sum above 1.
            if value - remaining > tolerance:
                continue
            find_combinations(remaining - value, current_combination + [value])

    find_combinations(1.0, [])

    return results


# Candidate weight vectors, each with one weight per entry of `preds`.
weights = get_all_possible_weights(len(preds))

In [None]:
def zero_sum(listOfPrices, listOfVolumes):
    """
    Shift the values so that they sum to zero, moving high-volume entries more.

    Source - https://www.kaggle.com/code/kaito510/goto-conversion-optiver-baseline-models

    Each entry gets a pseudo standard error ``sqrt(volume)``. The total of
    ``listOfPrices`` is then removed from the entries in proportion to those
    standard errors, so the returned values add up to (numerically) zero.

    Parameters
    ----------
    listOfPrices : array-like
        Values to adjust.
    listOfVolumes : array-like
        Non-negative volumes, one per value.

    Returns
    -------
    Same container type the element-wise arithmetic yields for the inputs
    (e.g. a pandas Series when ``listOfVolumes`` is a Series).
    """
    # compute standard errors assuming standard deviation is same for all stocks
    listOfSe = np.sqrt(listOfVolumes)

    # Vectorized totals over the raw values: the builtin sum() walks the data
    # element by element in Python. np.asarray keeps NaN propagation (a NaN
    # anywhere still makes the step NaN). Totals may differ from the builtin
    # sum in the last floating-point digits.
    step = np.sum(np.asarray(listOfPrices)) / np.sum(np.asarray(listOfSe))

    outputListOfPrices = listOfPrices - listOfSe * step
    return outputListOfPrices

In [None]:
# Blended predictions are clipped to the target range seen in the training rows.
y_min = np.min(y_train)
y_max = np.max(y_train)

# The volume proxy is the same for every weight candidate, so build it once
# instead of once per loop iteration.
test_volumes = X_test.loc[:, 'bid_size'] + X_test.loc[:, 'ask_size']

# Hold-out MAE for every candidate weight vector.
# NOTE(review): the weights are selected on the same rows they are scored on,
# so the best MAE reported here is an optimistic estimate.
mae_by_weight = dict()

for weight in tqdm(weights):
    pred = np.average(preds, axis=0, weights=weight)
    pred = zero_sum(pred, test_volumes)
    pred = np.clip(pred, y_min, y_max)

    mae_by_weight[tuple(weight)] = mean_absolute_error(y_test, pred)

# (weights, MAE) pairs, best candidate first.
results = sorted(mae_by_weight.items(), key=lambda x: x[1])
results # 5.692617867330589

In [None]:
# Predict with the fitted pipeline on both sides of the date split.
train_pred = pipeline.predict(X_train)
test_pred = pipeline.predict(X_test)

# Optional post-processing, currently disabled:
# train_pred = zero_sum(train_pred, X_train.loc[:,'bid_size'] + X_train.loc[:,'ask_size'])
# train_pred = np.clip(train_pred, y_min, y_max)
# test_pred = zero_sum(test_pred, X_test.loc[:,'bid_size'] + X_test.loc[:,'ask_size'])
# test_pred = np.clip(test_pred, y_min, y_max)

train_mae = mean_absolute_error(y_train, train_pred)
test_mae = mean_absolute_error(y_test, test_pred)

print(f'Train = {train_mae:.3f}')
print(f'Test = {test_mae:.3f}')

# Values recorded from an earlier run:
# Train = 6.092
# Test = 5.692

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

def get_booster_importance(lightgbm):
    """
    Rank the features of a fitted model by 'gain' and by 'split' importance.

    For each importance type the features are ordered from most to least
    important and numbered 1..n_features. Features whose importance is exactly
    0 all receive rank ``n_features + 1``. The two ranks are then averaged.

    Parameters
    ----------
    lightgbm : object exposing ``booster_``
        The booster must provide ``feature_name()`` and
        ``feature_importance(importance_type=...)``.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name with columns ``gain_importance``,
        ``split_importance`` and ``average_importance``, sorted so the
        lowest (best) average rank comes first.
    """
    booster = lightgbm.booster_
    feature_names = booster.feature_name()
    n_features = len(feature_names)

    results = pd.DataFrame(index=feature_names)

    for importance_type in ('gain', 'split'):
        rank_column = f'{importance_type}_importance'

        ranked = pd.DataFrame(
            {importance_type: booster.feature_importance(importance_type=importance_type)},
            index=feature_names,
        ).sort_values(by=importance_type, ascending=False)

        # Position in the descending ordering is the rank (1 = most important).
        ranked[rank_column] = np.arange(1, n_features + 1)
        # Features the model never benefited from share one rank past the end.
        unused = ranked[importance_type] == 0
        ranked.loc[unused, rank_column] = n_features + 1

        # Write the ranks back in the original feature order.
        results.loc[results.index, rank_column] = ranked.loc[results.index, rank_column]

    results['average_importance'] = results.mean(axis=1)

    return results.sort_values(by='average_importance', ascending=True)

# Rank table for the first estimator of the ensemble step.
first_estimator = pipeline['mean_regressor_ensemble'].estimators[0]
aa = get_booster_importance(first_estimator)

# Tall figure so every feature gets its own readable bar.
fig, ax = plt.subplots(figsize=(7, 37))

# Horizontal bars of the averaged gain/split rank (smaller = more important).
sns.barplot(aa['average_importance'], orient="y", ax=ax)

In [None]:
import lightgbm as lgb
import seaborn as sns
sns.set()

# LightGBM's built-in importance plot for the first ensemble member,
# using importance_type="gain".
lgb.plot_importance(
    pipeline['mean_regressor_ensemble'].estimators[0],
    importance_type="gain",
    figsize=(7,37),
    title="LightGBM Feature Importance (Gain)",
)

In [None]:
import lightgbm as lgb
import seaborn as sns
sns.set()

# LightGBM's built-in importance plot for the first ensemble member,
# using importance_type="split"; the title names the plotted importance type.
lgb.plot_importance(
    pipeline['mean_regressor_ensemble'].estimators[0],
    importance_type="split",
    figsize=(7,37),
    title="LightGBM Feature Importance (Split)",
)

In [None]:
# import os
# import joblib
# import uuid

# DATA_PATH = './../data/train.csv'

# VERSION = 'V4'

# MODEL = 'lightgbm'

# SAVE_MODEL_PATH_FOLDER = f'./../output/models/{VERSION}/lightgbm/'
# SAVE_MODEL_BASE_NAME = 'lightgbm'

In [None]:

# for estimator in pipeline['mean_regressor_ensemble'].estimators:
#     os.makedirs(SAVE_MODEL_PATH_FOLDER, exist_ok=True)
#     joblib.dump(estimator, os.path.join(SAVE_MODEL_PATH_FOLDER, f'{SAVE_MODEL_BASE_NAME}-{str(uuid.uuid4())}.joblib'))

In [None]:
# import warnings
# warnings.simplefilter(action='ignore')

# expected_preds = pd.DataFrame()

# expected_preds[['date_id', 'seconds_in_bucket']] = df.loc[df['date_id'] >= 475, ['date_id', 'seconds_in_bucket']]
# expected_preds['expected_preds'] = pipeline.predict(df.loc[df['date_id'] >= 475])
# expected_preds = expected_preds.loc[expected_preds['date_id'] >= 478]
# expected_preds.loc[:, 'submited_preds'] = np.array(preds).flatten()
# expected_preds.loc[:, 'submited_preds_goto'] = pd.concat(preds_goto)['target'].values
# expected_preds

In [None]:
# from pandas.testing import assert_frame_equal

# df_origin = df.tail(11000).reset_index(drop=True)
# pred_current_df = current_day_df.reset_index(drop=True)

# for step in pipeline.steps[:-1]:
#     df_origin = step[1].transform(df_origin)
#     pred_current_df = step[1].transform(pred_current_df)

# assert_frame_equal(
#     df_origin,
#     pred_current_df,
#     check_dtype=True,
#     check_exact=True
# #     atol=1e-6
# )