In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from   IPython.display import display, HTML
from collections import deque
import random
import os
import sys
import pathlib
from collections import deque
from tqdm.notebook import tqdm, trange
import datetime
from scipy import interpolate
import math

import matplotlib.ticker as mtick
import matplotlib.dates as md

from IPython.display import clear_output

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import torch
import gpytorch
import matplotlib.dates as md

In [3]:
# Put the parent of the current working directory on the import path
# (presumably so the project-local `AOE` package imported further down resolves).
parent_module_path = os.path.abspath(os.pardir)
if sys.path.count(parent_module_path) == 0:
    sys.path.append(parent_module_path)

In [4]:
# pandas options: silence chained-assignment warnings and widen the text repr.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.width', 1000)
# Inject CSS so the notebook's `.container` element uses the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [19]:
def ema(series, periods, fillna=False):
    """Exponentially weighted moving average of ``series`` with span ``periods``.

    Args:
        series(pandas.Series): values to smooth.
        periods(int): span of the exponential window.
        fillna(bool): if True, use ``min_periods=0`` so values are produced
            from the first observation; otherwise the result stays NaN until
            ``periods`` observations are available.

    Returns:
        pandas.Series: the smoothed series.
    """
    required_obs = 0 if fillna else periods
    return series.ewm(span=periods, min_periods=required_obs).mean()

def macd_signal(close, n_fast=12, n_slow=26, n_sign=9, fillna=False):
    """Moving Average Convergence Divergence histogram (MACD minus its signal line).

    Despite the function name, the value returned is the difference between
    the MACD line (fast EMA - slow EMA) and the signal line (EMA of the MACD
    line), not the signal line itself.

    Args:
        close(pandas.Series): dataset 'Close' column.
        n_fast(int): n period short-term.
        n_slow(int): n period long-term.
        n_sign(int): n period to signal.
        fillna(bool): if True, fill nan values.

    Returns:
        pandas.Series: MACD minus signal line, named 'MACD_diff'.
    """
    emafast = ema(close, n_fast, fillna)
    emaslow = ema(close, n_slow, fillna)
    macd = emafast - emaslow
    signal_line = ema(macd, n_sign, fillna)

    macd_diff = macd - signal_line
    if fillna:
        # Clean the series that is actually returned. Previously the signal
        # line was cleaned instead, so NaN/inf values (e.g. from leading NaNs
        # in `close`) still reached the caller even with fillna=True.
        macd_diff = macd_diff.replace([np.inf, -np.inf], np.nan).fillna(0)
    return pd.Series(macd_diff, name='MACD_diff')


def macd_signal_2(close, n_fast=12, n_slow=26, n_sign=9, fillna=False):
    """MACD histogram divided by a lagged value of ``close``.

    The MACD line (fast EMA minus slow EMA) has its own ``n_sign``-period EMA
    subtracted, and the result is divided by ``close`` shifted by
    ``n_slow + n_sign`` rows.

    Args:
        close(pandas.Series): price series.
        n_fast(int): span of the fast EMA.
        n_slow(int): span of the slow EMA.
        n_sign(int): span of the EMA applied to the MACD line.
        fillna(bool): if True, +/-inf and NaN in the result are replaced by 0.

    Returns:
        pandas.Series: the scaled histogram, named 'MACD'.
    """
    fast_avg = ema(close, n_fast, fillna)
    slow_avg = ema(close, n_slow, fillna)
    macd_line = fast_avg - slow_avg
    histogram = macd_line - ema(macd_line, n_sign, fillna)
    lagged_close = close.shift(n_slow + n_sign)
    scaled = histogram / lagged_close

    if not fillna:
        return pd.Series(scaled, name='MACD')

    cleaned = scaled.replace([np.inf, -np.inf], np.nan).fillna(0)
    return pd.Series(cleaned, name='MACD')

def get_LOB_features(LOB_data, trade_date, tick_size):
    """Build the feature frame from top-of-book quotes for one trading day.

    Args:
        LOB_data(pandas.DataFrame): must contain 'ask_1', 'bid_1',
            'ask_volume_1' and 'bid_volume_1'. Its index values are used as
            seconds elapsed since midnight of ``trade_date``. NOTE: a
            'mid_price' column is added to this frame in place.
        trade_date(str): trading day formatted as '%Y-%m-%d'.
        tick_size: currently unused; only referenced by the disabled
            predictor code kept below.

    Returns:
        pandas.DataFrame: same index as ``LOB_data`` with columns
        'mid_price' (raw mid price divided by 10000), 'imbalance_1',
        'macd_120_260_90', 'macd_1200_2600_900', 'timestamp' and
        'time_of_day' (hour of day plus the minute fraction rounded to 0.1).
    """
    # create features
    LOB_data['mid_price']       = (LOB_data['ask_1'] + LOB_data['bid_1'])/2
    LOB_features                = LOB_data[['mid_price']]/ 10000
    LOB_features['imbalance_1'] = (LOB_data['bid_volume_1'] - LOB_data['ask_volume_1'])/(LOB_data['bid_volume_1'] + LOB_data['ask_volume_1'])
    LOB_features['macd_120_260_90'] = macd_signal(LOB_features['mid_price'], n_fast=120, n_slow=260, n_sign=90, fillna=True)
    LOB_features['macd_1200_2600_900'] = macd_signal(LOB_features['mid_price'], n_fast=1200, n_slow=2600, n_sign=900, fillna=True)

    # Parse the date once instead of once per row; float() lets integer
    # (e.g. numpy int64) index values be used as timedelta seconds as well.
    session_midnight = datetime.datetime.strptime(trade_date, '%Y-%m-%d')
    LOB_features['timestamp']   = [session_midnight + datetime.timedelta(seconds=float(i)) for i in LOB_features.index]

#     # create predictors
#     unique_index_values = LOB_features['macd_120_260_90'].round(4).unique()
#     for unique_value in unique_index_values:
#         if unique_value not in macd_model.index:
#             macd_model.loc[unique_value,:] = np.nan
#     macd_model2 = macd_model.sort_index().fillna(method='ffill').fillna(method='bfill')  

#     nb_trades_ahead, fwd_price_move_w = 100, 'fwd_price_move_100'
#     LOB_features['imbalance_signal'] = (imb_model.loc[LOB_features['imbalance_1'].round(2), fwd_price_move_w].values*tick_size/nb_trades_ahead)*len(LOB_features)

#     nb_trades_ahead, fwd_price_move_w = 20000, 'fwd_price_move_20000'
#     LOB_features['macd_1_signal'] = (macd_model2.loc[LOB_features['macd_120_260_90'].round(4), fwd_price_move_w].values*tick_size/nb_trades_ahead)*len(LOB_features)

#     nb_trades_ahead, fwd_price_move_w = 500, 'fwd_price_move_500'
#     LOB_features['macd_2_signal'] = (macd_model2.loc[LOB_features['macd_120_260_90'].round(4), fwd_price_move_w].values*tick_size/nb_trades_ahead)*len(LOB_features)

    # time_of_day = hour + round(minute / 60, 1). The datetime accessors are
    # used instead of casting to 'datetime64[h]' / 'datetime64[D]', because
    # pandas >= 2.0 rejects those units in Series.astype with a TypeError.
    stamp_parts = pd.to_datetime(LOB_features['timestamp']).dt
    timeofday = (stamp_parts.hour + (stamp_parts.minute / 60).round(1)).to_numpy(dtype=float)
    LOB_features['time_of_day'] = timeofday

    return LOB_features

In [20]:
def get_LOB_data(data_path, asset_name, trade_date):
    """Load the level-1 quotes for one asset and trading day.

    Reads the first four columns of
    ``<data_path>/<asset_name>/<asset_name>_<trade_date>_34200000_57600000_orderbook_5.csv``
    and the first column of the matching ``..._message_5.csv`` file. The
    message column becomes the row index (named 'time'), and rows sharing the
    same time are collapsed to the last non-null value of each column.

    Args:
        data_path: directory containing one sub-directory per asset.
        asset_name(str): asset sub-directory and file-name prefix.
        trade_date(str): date token exactly as it appears in the file names.

    Returns:
        pandas.DataFrame: indexed by 'time' with columns 'ask_1',
        'ask_volume_1', 'bid_1' and 'bid_volume_1'.
    """
    # Build both file paths the same way from a shared directory and stem.
    asset_dir = os.path.join(data_path, asset_name)
    file_stem = f'{asset_name}_{trade_date}_34200000_57600000'

    # `infer_datetime_format` was dropped: no column is parsed as a date here,
    # so it had no effect, and the argument is deprecated in pandas 2.x.
    LOB_data      = pd.read_csv(os.path.join(asset_dir, f'{file_stem}_orderbook_5.csv'),
                                usecols   = [0, 1, 2, 3],
                                engine    = 'python',
                                index_col = None,
                                header    = None)

    LOB_messages = pd.read_csv(os.path.join(asset_dir, f'{file_stem}_message_5.csv'),
                               usecols   = [0],
                               engine    = 'python',
                               index_col = 0,
                               header    = None)

    # Positional assignment: assumes row k of the message file corresponds to
    # row k of the orderbook file (pandas raises if the lengths differ).
    LOB_data.index = LOB_messages.index
    LOB_data.columns = ['ask_1', 'ask_volume_1', 'bid_1', 'bid_volume_1']
    LOB_data.index.name = 'time'
    # Several rows can share one time value; keep one row per time.
    LOB_data = LOB_data.reset_index(drop=False).groupby('time').last()

    return LOB_data

In [9]:
# LOB_data

In [10]:
# imb_model  = pd.read_pickle('../nb_trading/imb_model.pkl')
# macd_model = pd.read_pickle('../nb_trading/macd_model.pkl')

# Load Data & Signals

In [21]:
from AOE.utils import get_meta_order_df, verbose_print
from AOE.plots import rescale_plot
from AOE.plots import hit_ratio_analysis, reward_distribution_analysis, regret_plots, analyze_meta_order

In [26]:
# Data locations, all resolved under "<parent of working directory>/data".
data_path     = pathlib.Path(os.path.abspath(os.pardir)) / "data"
reward_path   = data_path / "Rewards"
feature_path  = data_path / "Features"

# Asset ticker and tick size used by the cells below.
asset_name, tick_size = "BIDU", 0.01

In [28]:
all_data = {}
i        = 0

# List the asset directory once (sorted, so days are processed in a
# reproducible order) instead of re-listing it on every iteration.
asset_dir       = os.path.join(data_path, asset_name)
asset_filenames = sorted(os.listdir(asset_dir))

# Create the output directory up front so a missing folder is not discovered
# only after the first day's features have been computed.
os.makedirs(feature_path, exist_ok=True)

for i, filename in enumerate(asset_filenames, start=1):
    f = os.path.join(asset_dir, filename)
    # Test the file name, not the full path: a parent directory containing
    # 'orderbook' or '_' must not affect file selection or date parsing.
    if 'orderbook' not in filename or not os.path.isfile(f):
        continue

    # File names look like <asset>_<date>_34200000_57600000_orderbook_5.csv
    trade_date = filename.split('_')[1]
    print('%: ', 100*(round(i/len(asset_filenames), 2)) , "%")
    print('File name: ', filename)
    # if not os.path.isfile(f'../data/Features/LOB_features_{trade_date}.pkl'):
    LOB_data      = get_LOB_data(data_path, asset_name, trade_date)
    print('got the data')
    # (disabled) keep every 10th row only:
    # LOB_data      = LOB_data.iloc[::10,:]
    LOB_features  = get_LOB_features(LOB_data, trade_date, tick_size)

    # Overwrites the 'macd_120_260_90' column produced by get_LOB_features
    # with the price-scaled variant, and adds a shorter-horizon one.
    LOB_features['macd_120_260_90'] = macd_signal_2(LOB_features['mid_price'], n_fast=120, n_slow=260, n_sign=90, fillna=True)*1e5
    LOB_features['macd_12_26_9'] = macd_signal_2(LOB_features['mid_price'], n_fast=12, n_slow=26, n_sign=9, fillna=True)*1e5

    LOB_features.to_pickle(f'{str(feature_path)}/{asset_name}_{trade_date}_LOB_features.pkl')
    # else:
    #     LOB_features = pd.read_pickle(f'../data/Features/LOB_features_{trade_date}.pkl')

    # Index by timestamp. This happens after pickling, so the saved frame
    # keeps 'timestamp' as a regular column; only the in-memory frame left
    # over from the last processed file is timestamp-indexed.
    LOB_features = LOB_features.set_index('timestamp')
    # all_data[trade_date] = LOB_features

    clear_output(wait=True)

%:  3.0 %
File name:  BIDU_2022-12-05_34200000_57600000_orderbook_5.csv


KeyboardInterrupt: 