In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pingouin as pg
import datetime
import seaborn as sns 
import tensorflow as tf
import time
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from arch.unitroot import engle_granger
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from metric_clf_hft import metric
from sklearn import feature_selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM


pd.options.mode.chained_assignment = None 
pd.set_option('display.max_rows', 100)

Functions for features calculations

In [22]:
def Order_book_spread(data: pd.DataFrame):
    """book"""
    #1
    data['Book_spread'] = (data['asks[0].price'] -data['bids[0].price'])

def Trade_price_spread(data: pd.DataFrame, delta_t: str='1000ms'):
    """"trades"""
    #checked
    if 'timestamp' not in data.columns:
        data['timestamp'] = pd.to_datetime(data.index, unit='ns')
    High = data[['price','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).max()
    Low = data[['price','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).min()
    data['price_spread_'+delta_t] = High.price - Low.price
    del High, Low
    
def Trade_price_speed(data: pd.DataFrame, delta_t: str='1000ms'):
    """"trades"""
    #checked
    if 'price_spread_' + delta_t not in data.columns:
        Trade_price_spread(data, delta_t)
    delta_t_int = int(delta_t[:-2])
    data['price_speed_'+delta_t] = data['price_spread_'+delta_t]/delta_t_int

def Trades_frequency(data: pd.DataFrame, delta_t: str='500ms'):
    """"trades"""
    #checked
    if 'timestamp' not in data.columns:
        data['timestamp'] = pd.to_datetime(data.index, unit='ns')
    delta_t_int = int(delta_t[:-2])
    data['Trades_frequency'] = data[['price','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).count().price/delta_t_int   

def Trades_volume_speed(data: pd.DataFrame, delta_t: str='500ms'):
    """"trades"""
    #checked    
    if 'timestamp' not in data.columns:
        data['timestamp'] = pd.to_datetime(data.index, unit='ns')
    delta_t_int = int(delta_t[:-2])
    data['Trades_volume_speed'] = data[['amount','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).sum().amount/delta_t_int

    data['sells_amount'] = np.where(data.side == 'S', data['amount'], 0)
    data['buys_amount'] = np.where(data.side == 'B', data['amount'], 0)

    data['Sells_volume_speed'] = data[['sells_amount','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).sum().sells_amount/delta_t_int
    data['Buys_volume_speed'] = data[['buys_amount','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).sum().buys_amount/delta_t_int

    del data['buys_amount'], data['sells_amount']


def Big_trades_frequency(data: pd.DataFrame, delta_t: str='5000ms', delta_t_block: str='50ms', times_mean: int=3):
    """"trades"""
    #checked 
    if 'timestamp' not in data.columns:
        data['timestamp'] = pd.to_datetime(data.index, unit='ns')
    delta_t_int = int(delta_t[:-2])
    data['block_amount'] =  data[['amount','timestamp']].rolling(delta_t_block, on='timestamp', min_periods=1).sum().amount
    data['C_amount'] = data['block_amount'].cumsum()
    data['one'] = 1
    data['C_count'] = data.one.cumsum()
    data['block_amount_mean'] = data['C_amount']/data['C_count']
    del data['C_amount'], data['one'], data['C_count']
    data['big_trade_flg'] = np.where(data['block_amount'] > data['block_amount_mean'] * times_mean, 1, 0)
    data['Big_trades_frequency'] = data[['big_trade_flg','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).sum().big_trade_flg/delta_t_int

    data['big_sells_amount'] = np.where((data['block_amount'] > data['block_amount_mean'] * times_mean) & (data.side == 'S'), data['amount'], 0)
    data['big_buys_amount'] = np.where((data['block_amount'] > data['block_amount_mean'] * times_mean) & (data.side == 'B'), data['amount'], 0)

    data['Big_sells_amount_speed'] = data[['big_sells_amount','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).sum().big_sells_amount/delta_t_int
    data['Big_buys_amount_speed'] = data[['big_buys_amount','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).sum().big_buys_amount/delta_t_int

    del data['big_trade_flg'], data['big_sells_amount'], data['big_buys_amount'], data['block_amount_mean']


def Book_sum_volume_speed(data: pd.DataFrame, delta_t: str='1000ms', n_levels: int=10) -> np.float64:
    """"book"""
    #checked 
    if 'timestamp' not in data.columns:
        data['timestamp'] = pd.to_datetime(data.index, unit='ns')
    data['asks_volume'] = 0
    data['bids_volume'] = 0
    for i in range(n_levels):
        data['asks_volume'] = data['asks['+str(i)+'].amount'] + data['asks_volume'] 
        data['bids_volume'] = data['bids['+str(i)+'].amount'] + data['bids_volume']
    delta_t_int = int(delta_t[:-2])
    data_prev = data[['timestamp','asks_volume','bids_volume']]
    data_prev.timestamp = data_prev.timestamp + pd.to_timedelta(delta_t) 
    data_delta = pd.merge_asof(data[['timestamp','asks_volume','bids_volume']], data_prev[['timestamp','asks_volume','bids_volume']], suffixes=[None, '_prev'] , direction='backward', left_on='timestamp' , right_on='timestamp')
    data['book_ask_volume_speed'] = (data_delta['asks_volume'].values - data_delta['asks_volume_prev'].values)/delta_t_int
    data['book_bid_volume_speed'] = (data_delta['bids_volume'].values - data_delta['bids_volume_prev'].values)/delta_t_int
    data['book_ask_volume_speed'] = data['book_ask_volume_speed'].fillna(0)
    data['book_bid_volume_speed'] = data['book_bid_volume_speed'].fillna(0)


def Delta_volume_without_trades(book: pd.DataFrame, trades: pd.DataFrame, n_levels: int=9) -> np.float64:
    """"book and trades"""
    #checked 
    if 'timestamp' not in book.columns:
        book['timestamp'] = pd.to_datetime(book.index, unit='ns')
    if 'timestamp' not in trades.columns:
        trades['timestamp'] = pd.to_datetime(trades.index, unit='ns')
    if 'Sells_volume_speed'not in trades.columns:
        Trades_volume_speed(trades)
    if 'book_ask_volume_speed' not in book.columns:
        Book_sum_volume_speed(book)  
    data = pd.merge_asof(book[['timestamp', 'book_ask_volume_speed','book_bid_volume_speed']], trades[['timestamp', 'Sells_volume_speed', 'Buys_volume_speed']], suffixes=[None, '_2'] , direction='backward', left_on='timestamp' , right_on='timestamp')
    data = data.set_index('timestamp')
    data = data.sort_values(by=['timestamp'])
    book['Delta_ask_without_trades'] = data['book_ask_volume_speed'].values - data['Buys_volume_speed'].values
    book['Delta_ask_without_trades'] = book['Delta_ask_without_trades'].fillna(0) 
    book['Delta_bid_without_trades'] = data['book_bid_volume_speed'].values - data['Sells_volume_speed'].values
    book['Delta_bid_without_trades'] = book['Delta_bid_without_trades'].fillna(0) 
    del data


def Naive_delays(book: pd.DataFrame, delta_t: str='10000ms'):
    """book"""
    #checked
    if 'timestamp' not in book.columns:
        book['timestamp'] = pd.to_datetime(book.index, unit='ns')
    delta_t_int = int(delta_t[:-2])
    book_prev = book[['timestamp','asks[0].amount','bids[0].amount']]
    book_prev['timestamp'] = book_prev['timestamp'] + pd.to_timedelta(delta_t) 

    data_delta = pd.merge_asof(book[['timestamp','asks[0].amount','bids[0].amount']], 
                               book_prev[['timestamp','asks[0].amount','bids[0].amount']], 
                               suffixes=[None, '_prev'] , direction='backward', 
                               left_on='timestamp' , right_on='timestamp')
    data_delta['one'] = 1
    data_delta['delta_ask_amount'] = abs(data_delta['asks[0].amount'] - data_delta['asks[0].amount_prev'])
    data_delta['delta_bid_amount'] = abs(data_delta['bids[0].amount'] - data_delta['bids[0].amount_prev'])
    
    data_delta['mean_delta_ask_amount'] = data_delta['delta_ask_amount'].cumsum()/data_delta['one'].cumsum()
    data_delta['mean_delta_bid_amount'] = data_delta['delta_bid_amount'].cumsum()/data_delta['one'].cumsum()

    book['asks_delay'] = book['asks[0].amount'] / (data_delta['mean_delta_ask_amount'].values/delta_t_int)
    book.asks_delay.replace([np.inf, -np.inf], [100000, 0], inplace=True)
    book['asks_delay'] = book['asks_delay'].fillna(10000)
    book['bids_delay'] = book['bids[0].amount'] / (data_delta['mean_delta_bid_amount'].values/delta_t_int)
    book.bids_delay.replace([np.inf, -np.inf], [100000, 0], inplace=True)
    book['bids_delay'] = book['bids_delay'].fillna(10000)

In [23]:
def Average_directional_index(data: pd.DataFrame, delta_n: int=50, period: int=14):
    #checked, no value in predictions
    data['high'] = data.old_price.rolling(delta_n, min_periods=1).max()
    data.high.fillna(data.old_price)
    data['low'] = data.old_price.rolling(delta_n, min_periods=1).min()
    data.low.fillna(data.old_price)
    data['+DM'] = data.high - data.high.shift(delta_n)
    data['+DM'].fillna(0)
    data['-DM'] = data.low.shift(delta_n) - data.low 
    data['-DM'].fillna(0)
    data['H-L'] = data.high - data.low
    data['H-CL'] = np.where(abs(data.high - data.old_price.shift(delta_n)) > 0,
                            abs(data.high - data.old_price.shift(delta_n)), 0.0)
    data['L-CL'] = np.where(abs(data.low - data.old_price.shift(delta_n)) > 0,
                            abs(data.low - data.old_price.shift(delta_n)), 0.0)   
    data['TR'] = data[['H-L', 'H-CL', 'L-CL']].max(axis=1)
    data['TR'].fillna(1)
    del data['H-L'], data['H-CL'], data['L-CL'], data['low'], data['high']
    data['ATR'] = data.TR.ewm(span= delta_n * period).mean()
    data['ATR'].fillna(1)
    del data['TR']  
    data['+DIA'] = 100 * data['+DM'].ewm(span= delta_n * period).mean()/data['ATR']
    data['-DIA'] = 100 * data['-DM'].ewm(span= delta_n * period).mean()/data['ATR']
    data['DIA'] = abs(abs(data['+DIA'])-abs(data['-DIA'])) / abs(abs(data['+DIA']) + abs(data['-DIA'])) * 100
    data['ADX'] = data['DIA'].ewm(alpha=1/period, adjust=False).mean()
    data['ADX'] = data['ADX'].fillna(17)
    data['+DIA'] = data['+DIA'].fillna(17)
    data['-DIA'] = data['-DIA'].fillna(17)
    del data['DIA']

    
def Chande_momentum_oscillator(data: pd.DataFrame, delta_n: int=10, period: int=30):
    #checked but now worth
    roll = pd.concat([data['old_price'].shift(i).rename(i) for i in range(0, delta_n*period, delta_n)], axis=1)
    data['Su'] = roll.lt(data.old_price, axis=0).sum(axis=1)
    data['Sd'] = roll.gt(data.old_price, axis=0).sum(axis=1)
    data['CMO'] = 100 * (data.Su - data.Sd)/(data.Su + data.Sd)
    data['CMO'] = data.CMO.fillna(0)
    del data['Sd'], data['Su'], roll


def Momentum(data: pd.DataFrame, delta_n: int=100, period: int=1):
    # checked.?
    data['MOM'] = data.price - data.price.shift(delta_n*period)
    data['MOM'] = data.MOM.fillna(0)


def Rate_of_change(data: pd.DataFrame, delta_n: int=10, period: int=10):
    #checked, no value
    data['ROC'] = (data.old_price/data.old_price.shift(delta_n*period) - 1) * 100
    data['ROC'] = data.ROC.fillna(0)


def Relative_strength_index(data: pd.DataFrame, delta_n: int=10, period: int=14):
    #checked?
    data['CLd'] = data.old_price - data.old_price.shift(delta_n)
    data['CLd'] = data['CLd'].fillna(0)
    data['CLdd'] = data.CLd - data.CLd.shift(delta_n)
    data['CLdd'] = data['CLdd'].fillna(0)
    data['CL_plus'] = np.select([data.CLdd > 0], [abs(data.CLd)], default=0)
    data['CL_minus'] = np.select([data.CLdd < 0], [abs(data.CLd)], default=0)
    data['CL_plus'] = data.CL_plus.rolling(delta_n*period, min_periods=1).sum()
    data['CL_minus'] = data.CL_minus.rolling(delta_n*period, min_periods=1).sum()
    data['RSI'] = 100 - 100 / (1 + data.CL_plus/data.CL_minus)
    data.RSI.replace([np.inf, -np.inf], [100, 0], inplace=True)
    data['RSI'] = data.RSI.fillna(50)
    del data['CLd'], data['CL_plus'], data['CL_minus']


def Stochast_relative_strength_index(data: pd.DataFrame, delta_n: int=10, period: int=20):
    #checked?
    Relative_strength_index(data, delta_n, period)
    data['HRSI'] = data.RSI.rolling(delta_n*period, min_periods=1).max()
    data['LRSI'] = data.RSI.rolling(delta_n*period, min_periods=1).min()
    data['StochRSI'] = (data.RSI - data.LRSI) / (data.HRSI - data.LRSI) * 100
    data.StochRSI.replace([np.inf, -np.inf], [100, 0], inplace=True)
    data['StochRSI'] = data.StochRSI.fillna(50)
    del data['LRSI'], data['HRSI']


def Linear_regression_line(data: pd.DataFrame, delta_n: int=10, period: int=10):
    #checked?no need
    data['x2'] = data.old_price * data.old_price
    data['y'] = data.old_price.shift(-delta_n)
    data['y'] = data['y'].fillna(data.old_price)
    data['xy'] = data.old_price * data['y'] 
    data['LRLa'] = (delta_n * data['xy'].rolling(delta_n, min_periods=1).sum() - data.old_price.rolling(delta_n, min_periods=1).sum() * data['y'].rolling(delta_n, min_periods=1).sum())\
                / (delta_n * data['x2'].rolling(delta_n, min_periods=1).sum() - data.old_price.rolling(delta_n, min_periods=1).sum() * data.old_price.rolling(delta_n, min_periods=1).sum())
    data['LRLa'].replace([np.inf, -np.inf], [1, 1], inplace=True) 
    data['LRLa'] = data['LRLa'].fillna(1)
    data['b'] = (data['y'].rolling(delta_n, min_periods=1).sum()  - data.LRLa * data.old_price.rolling(delta_n, min_periods=1).sum()) / delta_n
    data['LRL'] = data['LRLa'] * data.old_price + data['b']
    data['LRL'] = data.LRL.fillna(0)
    del data['xy'], data['x2'], data['y']


def Realized_Volatility(data: pd.DataFrame, delta_n: int=10, period: int=10):
    #checked
    data['log_return'] = np.log(data['price']).diff()
    data['real_variance'] = data['log_return'].rolling(delta_n*period, min_periods=1).var() #* annuity_factor
    data['R_Volatility'] = np.sqrt(data['real_variance'])
    data['R_Volatility'] = data.R_Volatility.fillna(0)
    del data['log_return'], data['real_variance']


def Gauss_Kernel(data: pd.DataFrame, delta_n: int=10, std: np.float64=0.02):
    #checked
    data['Kernel'] = data.price.rolling(window=delta_n, win_type='gaussian', min_periods=1).mean(std = std)
    data['Kernel'] = data['Kernel'].fillna(data.price)


def Autocorrelation(data: pd.DataFrame, delta_n: int=30, period: int=4, corr_in_range: bool=True):
    #checked but no value
    if corr_in_range:
        for period_i in range(1, period + 1):
            data['Autocorrelation_lag_' + str(period_i)] = data.price.rolling(delta_n*period_i, min_periods=1).corr(data.price.shift(delta_n*period_i))   
            data['Autocorrelation_lag_' + str(period_i)] = data['Autocorrelation_lag_' + str(period_i)].fillna(0)
            data['Autocorrelation_lag_' + str(period_i)].replace([np.inf, -np.inf], [1, -1], inplace=True)
    else:
        data['Autocorrelation_lag_' + str(period)] = data.price.rolling(delta_n*period, min_periods=1).corr(data.price.shift(delta_n*period))
        data['Autocorrelation_lag_' + str(period)] = data['Autocorrelation_lag_' + str(period)].fillna(0)
        data['Autocorrelation_lag_' + str(period)].replace([np.inf, -np.inf], [1, -1], inplace=True)


def Partial_correlation(data: pd.DataFrame, delta_n: int=10, period: int=2):
    #checked but no value    
    if 'Autocorrelation_lag_1' not in data.columns:
        Autocorrelation(data, delta_n, period=1)
    if 'Autocorrelation_lag_' + str(period) not in data.columns:
        Autocorrelation(data, delta_n, period=period)
    data['r23'] = data.Autocorrelation_lag_1.rolling(delta_n, min_periods=1).corr(data['Autocorrelation_lag_' + str(period)])
    data['r23'] = data['r23'].fillna(0)
    data['r23'].replace([np.inf, -np.inf], [1, -1], inplace=True)
    data['Partial_correlation_'+str(period)] = (data['Autocorrelation_lag_' + str(period)] - data.r23 * data.Autocorrelation_lag_1)\
                                                /np.sqrt((1 - np.square(data['Autocorrelation_lag_' + str(period)])) * (1 - np.square(data.Autocorrelation_lag_1)))
    data['Partial_correlation_'+str(period)] = data['r23'].fillna(0)
    data['Partial_correlation_'+str(period)].replace([np.inf, -np.inf], [1, -1], inplace=True)
    del data['r23']


def Jump_Variation(data: pd.DataFrame, delta_n: int=30):
    #checked
    data['square_delta'] = np.square(abs(data.price - data.price.shift(delta_n)))
    data['bi_delta'] = np.pi*0.5*(abs(data.price - data.price.shift(delta_n))*abs(data.price.shift(delta_n) - data.price.shift(2*delta_n))\
                        + abs(data.price.shift(delta_n) - data.price.shift(2*delta_n)) *abs(data.price.shift(2*delta_n) - data.price.shift(3*delta_n)))
    data['Jump_Variation'] = np.where(data.square_delta - data.bi_delta > 0,
                            data.square_delta - data.bi_delta, 0.0)
    del data['square_delta'] , data['bi_delta']
    data['Jump_Variation'] = data.Jump_Variation.fillna(0)


def Cointegration(data: pd.DataFrame, skip_step: int=100):
    EG = engle_granger(data['asks[0].price'].iloc[::skip_step], data['bids[0].price'].iloc[::skip_step])
    data['Cointegration'] = data['asks[0].price'] + EG.cointegrating_vector['const']  + EG.cointegrating_vector['bids[0].price'] * data['bids[0].price']
    return EG


def Cointegration_from_EG(data: pd.DataFrame, EG):
    data['Cointegration'] = data['asks[0].price'] + EG.cointegrating_vector['const']  + EG.cointegrating_vector['bids[0].price'] * data['bids[0].price']


def Order_book_imbalance_level(data: pd.DataFrame, level: int=0):
    #checked
    data['Book_imb_level_'+ str(level)] = (data['bids['+str(level)+'].amount'] - data['asks['+str(level)+'].amount'])\
                                            /(data['bids['+str(level)+'].amount'] + data['asks['+str(level)+'].amount'])


def Trade_imbalance(data: pd.DataFrame, delta_t: str='500ms') -> list:
    #checked
    if 'timestamp' not in data.columns:
        data['timestamp'] = pd.to_datetime(data.index, unit='ns')
    data['price_volume'] = np.select([data.side == 'S', data.side == 'B'], 
                                     [ -data.old_price * data.amount, data.old_price * data.amount], 
                                     default=0)
    TI = data[['price','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).sum()
    data['Trade_imbalance'] = TI.price
    del data['price_volume']


def past_returns(data: pd.DataFrame, delta_n: int=30, period: int=20, delta_t: str='50ms') -> list:
    #checked
    if 'timestamp' not in data.columns:
        data['timestamp'] = pd.to_datetime(data.index, unit='ns')
    data['PV'] = data.old_price * data.amount
    TI = data[['PV','amount','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).sum() 
    TI['avg_p'] =  TI.PV/TI.amount
    data['past_returns'] = (TI['avg_p']/TI['avg_p'].shift(delta_n*period) - 1)*10000
    data['past_returns'] = data['past_returns'].fillna(0)
    del TI


def Divergence(data_1st_market: pd.DataFrame, data_2nd_market: pd.DataFrame, delta_t: str='10s'): 
    #may be add weighted price?
    if 'timestamp' not in data_1st_market.columns:
        data_1st_market['timestamp'] = pd.to_datetime(data_1st_market.index, unit='ns')
    if 'timestamp' not in data_2nd_market.columns:
        data_2nd_market['timestamp'] = pd.to_datetime(data_2nd_market.index, unit='ns')
    data = pd.merge_asof(data_1st_market[['timestamp', 'old_price' ]], data_2nd_market[['timestamp', 'old_price']], suffixes=[None, '_2'] , direction='backward', left_on='timestamp' , right_on='timestamp')
    data = data.set_index('timestamp')
    data = data.sort_values(by=['timestamp'])
    data['d_p1_p2'] = (data.old_price/data.old_price_2 - 1) * 10000 
    data['DIV'] = data['d_p1_p2'] - data['d_p1_p2'].rolling(delta_t, min_periods=1).mean()
    data_1st_market['DIV'] = data['DIV'].values
    data_1st_market['DIV'] = data_1st_market['DIV'].fillna(0)
    del data
    data = pd.merge_asof(data_2nd_market[['timestamp', 'old_price']], data_1st_market[['timestamp', 'old_price' ]], suffixes=[None, '_2'] , direction='backward', left_on='timestamp' , right_on='timestamp')
    data = data.set_index('timestamp')
    data = data.sort_values(by=['timestamp'])
    data['d_p1_p2'] = (data.old_price/data.old_price_2 - 1) * 10000 
    data['DIV'] = data['d_p1_p2'] - data['d_p1_p2'].rolling(delta_t, min_periods=1).mean()
    data_2nd_market['DIV'] = data['DIV'].values
    data_2nd_market['DIV'] = data_2nd_market['DIV'].fillna(0)
    del data


def Accumulation_Distribution_Line(data: pd.DataFrame, delta_t: str='1s') -> list:
    """On trades dataset; delta_t - length of data's blocks"""
    if 'timestamp' not in data.columns:
        data['timestamp'] = pd.to_datetime(data.index, unit='ns')
    time_data = data[['price','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).min()
    data['low'] = time_data.price
    time_data = data[['price','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).max()
    data['high'] = time_data.price
    data['Money_Flow_Multiplier'] = ((data.price - data.low) - (data.high - data.price))/(data.high - data.low)
    del data['high'], data['low']
    time_data = data[['amount','timestamp']].rolling(delta_t, on='timestamp', min_periods=1).sum()
    data['Money_flow_volume'] = time_data.amount * data['Money_Flow_Multiplier']
    data['Money_flow_volume'] = data['Money_flow_volume'].fillna(0)
    data['ADL'] = data['Money_flow_volume'].cumsum()
    del data['Money_flow_volume'], data['Money_Flow_Multiplier']


def Adaptive_log_regression(data: pd.DataFrame, n_levels: int=1, skip_step: int=100):
    """On merged target with bookdata; n_levels - number of calculated levels of book"""
    #checked
    level_names = []
    for i in range(n_levels):
        level_names.append('asks['+str(i)+'].price') 
        level_names.append('asks['+str(i)+'].amount')
        level_names.append('bids['+str(i)+'].price')
        level_names.append('bids['+str(i)+'].amount')
    logreg = LogisticRegression(solver='newton-cg', multi_class='multinomial')
    logreg.fit(data[level_names].iloc[::skip_step],data.target.iloc[::skip_step])
    return logreg, level_names
    

def IMB(data: pd.DataFrame, n_levels: int=5) -> np.float64:
    #checked
    data['PVa'] = 0
    data['Va'] = 0
    data['PVb'] = 0
    data['Vb'] = 0
    for i in range(n_levels):
        data['PVa'] = data['asks['+str(i)+'].price'] * data['asks['+str(i)+'].amount'] + data['PVa']
        data['Va'] = data['asks['+str(i)+'].amount'] + data['Va'] 
        data['PVb'] = data['bids['+str(i)+'].price'] * data['bids['+str(i)+'].amount'] + data['PVb']
        data['Vb'] = data['bids['+str(i)+'].amount'] + data['Vb']  
    data['IMBa_'+str(n_levels)] = ( (data['PVa']/ data['Va']) /data['asks[0].price'] - 1) * 10000
    data['IMBb_'+str(n_levels)] = ( data['bids[0].price'] / (data['PVb']/ data['Vb'])  - 1) * 10000  
    data['IMB_'+str(n_levels)] = data['IMBa_'+str(n_levels)] - data['IMBb_'+str(n_levels)]
    del data['Va'], data['Vb']


def IMB_bp(data: pd.DataFrame, n_levels: int=5) -> np.float64:
    #checked
    bp = 0.0001
    data['PVa'] = 0
    data['Va'] = 0
    data['PVb'] = 0
    data['Vb'] = 0
    for i in range(n_levels):
        data['PVa'] = data['asks['+str(i)+'].price'] * data['asks['+str(i)+'].amount'] + data['PVa']
        data['Va'] = data['asks['+str(i)+'].amount'] + data['Va']
     
        data['PVb'] = data['bids['+str(i)+'].price'] * data['bids['+str(i)+'].amount'] + data['PVb']
        data['Vb'] = data['bids['+str(i)+'].amount'] + data['Vb']  
    data['IMBa_'+str(n_levels)] = ( (data['PVa']/ data['Va']) /data['asks[0].price'] - 1) * 10000
    data['IMBb_'+str(n_levels)] = ( data['bids[0].price'] / (data['PVb']/ data['Vb'])  - 1) * 10000  
    data['IMB_'+str(n_levels)] = data['IMBa_'+str(n_levels)] - data['IMBb_'+str(n_levels)]
    del data['Va'], data['Vb']

In [24]:
def Create_candles(data_target: pd.DataFrame, data_trades: pd.DataFrame, delta_t: str='15s') -> pd.DataFrame:
    delta_t_int = int(delta_t[:-1]) * int(1e9)
    timestamps = [i for i in range(data_target.index.min() // delta_t_int * delta_t_int + delta_t_int,
                                   data_target.index.max() // delta_t_int * delta_t_int + 2 * delta_t_int, delta_t_int)]
    timestamps = pd.DataFrame({'time_point_ns': timestamps})
    timestamps.set_index('time_point_ns', drop=True, inplace=True)
    timestamps['timestamp'] = pd.to_datetime(timestamps.index, unit='ns')
    resu = pd.merge_asof(timestamps , data_trades[['price', 'amount', 'side']], direction='backward', left_on='time_point_ns' , right_on= 'local_ts').dropna()
    resu['high'] = resu[['price', 'timestamp']].rolling('1ms', on='timestamp', center=True, min_periods=1).max().price
    resu['low'] = resu[['price', 'timestamp']].rolling('1ms', on='timestamp', center=True, min_periods=1).min().price
    resu['spread'] = resu.high - resu.low
    resu['sell_volume'] = np.select([resu.side == 'S', resu.side == 'B'], 
                                 [resu.price * resu.amount, 0], 
                                 default=0)
    resu['buy_volume'] = np.select([resu.side == 'S', resu.side == 'B'], 
                                 [0, resu.price * resu.amount], 
                                 default=0)
    resu['d_volume'] = np.select([resu.side == 'S', resu.side == 'B'], 
                                 [-resu.price * resu.amount, resu.price * resu.amount], 
                                 default=0)
    resu['sell_volume'] = resu[['sell_volume', 'timestamp']].rolling('1ms', on='timestamp', center=True, min_periods=1).sum().sell_volume
    resu['buy_volume'] = resu[['buy_volume', 'timestamp']].rolling('1ms', on='timestamp', center=True, min_periods=1).sum().buy_volume
    resu['delta_volume'] = resu[['d_volume', 'timestamp']].rolling('1ms', on='timestamp', center=True, min_periods=1).sum().d_volume
    resu['volume'] = resu[['amount', 'timestamp']].rolling('1ms', on='timestamp', center=True, min_periods=1).sum().amount
    resu.set_index('timestamp', drop=True, inplace=True)
    resu = resu[~resu.index.duplicated(keep='last')]
    resu['close_price'] = resu['price']
    del resu['d_volume'], resu['price'], resu['amount'], resu['side']
    suf = '_cdl_'+delta_t
    resu = resu.add_suffix(suf)
    return resu
    
    
def ATR_cdl(data_candles: pd.DataFrame, time: str,  period: int=14, mean_period: int=15):
    suffix = '_cdl_'+time
    data_candles['min-close'] = abs(data_candles['low'+suffix] - data_candles['close_price'+suffix])
    data_candles['max-close'] = abs(data_candles['high'+suffix] - data_candles['close_price'+suffix])
    data_candles['TR'] = data_candles[['min-close', 'max-close', 'spread'+suffix]].max(axis=1)
    ATR = [data_candles['TR'].iloc[0]]
    ln = len(data_candles)
    for i in range(1, ln):
        ATR.append((ATR[i-1]*(period - 1) + data_candles['TR'].iloc[i])/period)
    data_candles['ATR'+suffix] = ATR
    del data_candles['TR'], data_candles['max-close'], data_candles['min-close']
    data_candles['ATR_mean'+str(mean_period)+suffix] = data_candles['ATR'+suffix].rolling(mean_period, min_periods=1).mean()
    data_candles['ATR_dev'+suffix] = data_candles['ATR_mean'+str(mean_period)+suffix] - data_candles['ATR'+suffix]
    #data_candles['ATR'] = data.TR.ewm(span= delta_n * period).mean()


    
def EMA_cdl(data_candles: pd.DataFrame, time: str,  period: int=233):
    suffix = str(period)+'_cdl_'+time 
    data_candles['EMA'+suffix] = data_candles['close_price'+'_cdl_'+time ].ewm(span=period).mean()
    data_candles['EMA'+suffix+'dist'] = (data_candles['close_price'+'_cdl_'+time ] - data_candles['EMA'+suffix]) / data_candles['close_price'+'_cdl_'+time ]


def EMA_INTCP_cdl(data_candles: pd.DataFrame, time: str,  period_1st: int=50, period_2nd: int=21):
    suffix = '_cdl_'+time
    EMA_cdl(data_candles, time, period=period_1st)
    EMA_cdl(data_candles, time, period=period_2nd)
    data_candles['EMA_intr'+suffix+str(period_1st)+'_'+str(period_2nd)] = \
        data_candles['EMA'+str(period_1st)+suffix] - data_candles['EMA'+str(period_2nd)+suffix]
    data_candles['EMA_intr_rel'+suffix+str(period_1st)+'_'+str(period_2nd)] = \
        data_candles['EMA_intr'+suffix+str(period_1st)+'_'+str(period_2nd)]/data_candles['close_price'+suffix]
    

def Levels_cdl(data_candles: pd.DataFrame, time: str):
    suffix = '_cdl_'+time
    lvls = pd.DataFrame(data_candles.index)
    lvls.set_index('timestamp', drop=True, inplace=True)
    lvls['lvl_up'] = np.where((data_candles['high'+suffix] > data_candles['high'+suffix].shift(1)) 
                            & (data_candles['high'+suffix] > data_candles['high'+suffix].shift(-1))
                            & (data_candles['high'+suffix].shift(1) > data_candles['high'+suffix].shift(2))
                            & (data_candles['high'+suffix].shift(-1) > data_candles['high'+suffix].shift(-2)),
                            data_candles['high'+suffix], np.NaN)
    lvls['lvl_down'] = np.where((data_candles['low'+suffix] < data_candles['low'+suffix].shift(1)) 
                            & (data_candles['low'+suffix] < data_candles['low'+suffix].shift(-1))
                            & (data_candles['low'+suffix].shift(1) < data_candles['low'+suffix].shift(2))
                            & (data_candles['low'+suffix].shift(-1) < data_candles['low'+suffix].shift(-2)),
                            data_candles['low'+suffix], np.NaN)
    lvls = pd.concat([lvls['lvl_up'], lvls['lvl_down']])
    lvls = lvls.dropna()
    lvls = pd.DataFrame(lvls)
    lvls = lvls.sort_index()
    lvls = lvls.rename(columns={0: 'levels'+suffix})
    data_lvls = pd.merge_asof(data_candles, lvls,  direction='backward', left_on='timestamp' , right_on='timestamp')
    data_lvls['levels'+suffix] = data_lvls['levels'+suffix].fillna(data_lvls['close_price'+suffix])
    data_candles['levels'+suffix] = data_lvls['levels'+suffix].values
    data_candles['levels_dist'+suffix] = (data_candles['levels'+suffix] - data_candles['close_price'+suffix])/data_candles['close_price'+suffix]
    del data_lvls, lvls
    

Data reading

In [25]:
target_dataset = pd.read_csv('target.csv', index_col='local_ts')
target_dataset = target_dataset.sort_index()

In [26]:
target_dataset = target_dataset.iloc[::20]

In [27]:
#r, sf = Create_candles(target_dataset,trades_dataset, '6s' )

In [28]:
book_dataset = pd.read_csv('book.csv.gz', index_col='local_timestamp', usecols=[i for i in range(3,44)]) # , nrows=100000 для тестирования
#ticker_dataset = pd.read_csv('ticker.csv.gz', index_col='local_ts' , nrows=1000000 ) 
trades_dataset = pd.read_csv('trades.csv.gz', index_col='local_ts', usecols=(0,4,5,6))
book_dataset = book_dataset[~book_dataset.index.duplicated(keep='last')]
#ticker_dataset = ticker_dataset[~ticker_dataset.index.duplicated(keep='last')]
trades_dataset = trades_dataset[~trades_dataset.index.duplicated(keep='last')]
book_dataset = book_dataset.sort_index()
#ticker_dataset = ticker_dataset.sort_index()
trades_dataset = trades_dataset.sort_index()

Split data to train, validation, target (5:2:3) parts.

In [29]:
data_len = len(target_dataset)
train_end = int(data_len*0.5)
validation_end = int(data_len*0.7)
train_target = target_dataset.iloc[:train_end]
validation_target = target_dataset.iloc[train_end+1:validation_end]
test_target = target_dataset.iloc[validation_end+1:]

train_time_end = train_target.index.max()
validation_time_end = validation_target.index.max()

del validation_target

In [30]:
train_book_dataset = book_dataset.loc[(book_dataset.index < train_time_end)]
#validation_book_dataset = book_dataset.loc[(book_dataset.index < validation_time_end) & (book_dataset.index >= train_time_end)]

train_trades_dataset = trades_dataset.loc[(trades_dataset.index < train_time_end)]
#validation_trades_dataset = trades_dataset.loc[(trades_dataset.index < validation_time_end) & (trades_dataset.index >= train_time_end)]

del book_dataset, trades_dataset, target_dataset #, validation_book_dataset, validation_trades_dataset, validation_target
#train_book_dataset, train_trades_dataset, train_target

book_spot_dataset = pd.read_csv('book.spot.csv.gz', index_col='local_timestamp', usecols=[i for i in range(3,44)]) # , nrows=100000 для тестирования
#ticker_spot_dataset = pd.read_csv('ticker.spot.csv.gz', index_col='local_ts' , nrows=1000000 ) 
trades_spot_dataset = pd.read_csv('trades.spot.csv.gz', index_col='local_ts')


book_spot_dataset = book_spot_dataset[~book_spot_dataset.index.duplicated(keep='last')]
#ticker_spot_dataset = ticker_spot_dataset[~ticker_spot_dataset.index.duplicated(keep='last')]
trades_spot_dataset = trades_spot_dataset[~trades_spot_dataset.index.duplicated(keep='last')]
book_spot_dataset = book_spot_dataset.sort_index()

#ticker_spot_dataset = ticker_spot_dataset.sort_index()
trades_spot_dataset = trades_spot_dataset.sort_index()

train_book_spot_dataset = book_spot_dataset.loc[(book_spot_dataset.index < train_time_end)]
#validation_book_spot_dataset = book_spot_dataset.loc[(book_spot_dataset.index < validation_time_end) & (book_spot_dataset.index >= train_time_end)]
train_trades_spot_dataset = trades_spot_dataset.loc[(trades_spot_dataset.index < train_time_end)]
#validation_trades_spot_dataset = trades_spot_dataset.loc[(trades_spot_dataset.index < validation_time_end) & (trades_spot_dataset.index >= train_time_end)]

train_trades_dataset['old_price'] = train_trades_dataset.price
train_trades_dataset['price'] = (train_trades_dataset['old_price'] - 10000) / (70000)
#validation_trades_dataset['old_price'] = validation_trades_dataset.price
#validation_trades_dataset['price'] = (validation_trades_dataset['old_price']- 10000) / (70000)

train_trades_spot_dataset['old_price'] = train_trades_spot_dataset.price
train_trades_spot_dataset['price'] = (train_trades_spot_dataset['old_price']- 10000) / (70000)
#validation_trades_spot_dataset['old_price'] = validation_trades_spot_dataset.price
#validation_trades_spot_dataset['price'] = (validation_trades_spot_dataset['old_price']- 10000) / (70000)


Create features

In [31]:
Candle_5  = Create_candles(train_target, train_trades_dataset, '5s')
Candle_50 = Create_candles(train_target, train_trades_dataset, '50s')
ATR_cdl(Candle_5, '5s')
EMA_cdl(Candle_5, '5s')
EMA_INTCP_cdl(Candle_5, '5s')
Levels_cdl(Candle_5, '5s')
ATR_cdl(Candle_50, '50s')
EMA_cdl(Candle_50, '50s')
EMA_INTCP_cdl(Candle_50, '50s')
Levels_cdl(Candle_50, '50s')


In [32]:
Divergence(train_trades_dataset, train_trades_spot_dataset)
train_trades_dataset = pd.concat([train_trades_dataset, train_trades_spot_dataset])
train_trades_dataset = train_trades_dataset.sort_index()
del train_trades_spot_dataset

#Divergence(validation_trades_dataset, validation_trades_spot_dataset)
#validation_trades_dataset = pd.concat([validation_trades_dataset, validation_trades_spot_dataset])
#validation_trades_dataset = validation_trades_dataset.sort_index()
#del validation_trades_spot_dataset

train_book_dataset = pd.concat([train_book_dataset, train_book_spot_dataset])
train_book_dataset = train_book_dataset.sort_index()
del train_book_spot_dataset

#validation_book_dataset = pd.concat([validation_book_dataset, validation_book_spot_dataset])
#validation_book_dataset = validation_book_dataset.sort_index()

In [33]:
Order_book_spread(train_book_dataset)
Book_sum_volume_speed(train_book_dataset)
Naive_delays(train_book_dataset)

EG = Cointegration(train_book_dataset)
Order_book_imbalance_level(train_book_dataset)
IMB(train_book_dataset)

Autocorrelation(train_trades_dataset)
Accumulation_Distribution_Line(train_trades_dataset, '100ms')
Average_directional_index(train_trades_dataset, 100)
Chande_momentum_oscillator(train_trades_dataset, 75)
Momentum(train_trades_dataset, 500)
Rate_of_change(train_trades_dataset, 500)
Stochast_relative_strength_index(train_trades_dataset, 150)
Linear_regression_line(train_trades_dataset, 350)
Realized_Volatility(train_trades_dataset, 150)
Gauss_Kernel(train_trades_dataset, 500)
Autocorrelation(train_trades_dataset)
Partial_correlation(train_trades_dataset)
Jump_Variation(train_trades_dataset, 500)
Trade_imbalance(train_trades_dataset, '500ms')
past_returns(train_trades_dataset, 250)

Trade_price_spread(train_trades_dataset)
Trade_price_speed(train_trades_dataset, '10000ms')
Trades_frequency(train_trades_dataset, '250ms')
Trades_volume_speed(train_trades_dataset)
Big_trades_frequency(train_trades_dataset)

Delta_volume_without_trades(train_book_dataset, train_trades_dataset)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [34]:
train_book_dataset.timestamp = train_book_dataset.index
train_target_plus_book = pd.merge_asof(train_target, train_book_dataset,  direction='backward', left_on='local_ts' , right_on='local_timestamp').dropna()

del train_target, train_book_dataset

Logreg, level_names = Adaptive_log_regression(train_target_plus_book)
train_target_plus_book['Logreg'] = Logreg.predict(train_target_plus_book[level_names])




In [35]:
train_target_plus_book['local_timestamp'] = train_target_plus_book.timestamp.astype(np.int64)
del train_target_plus_book['timestamp']
train_target_plus_book = train_target_plus_book.sort_values(by=['local_timestamp'])
train_target_plus_book = pd.merge_asof(train_target_plus_book, train_trades_dataset,  direction='backward', left_on='local_timestamp' , right_on='local_ts')#.dropna()
del train_trades_dataset
train_target_plus_book = train_target_plus_book.drop_duplicates(keep='first', subset = ['asks[0].price', 'asks[0].amount', 'bids[0].price', 'bids[0].amount'])

In [49]:
del train_target_plus_book['seq'], train_target_plus_book['remote_ts'],  train_target_plus_book['remote_ts2'] 

In [51]:
train_target_plus_book['timestamp']

0        2023-03-22 00:00:06.434887936
1        2023-03-22 00:00:06.561518080
2        2023-03-22 00:00:06.689776128
3        2023-03-22 00:00:06.803023872
4        2023-03-22 00:00:06.900708608
                      ...             
939127   2023-03-23 06:31:00.575541760
939130   2023-03-23 06:31:00.602443459
939131   2023-03-23 06:31:00.624221696
939132   2023-03-23 06:31:00.624584852
939135   2023-03-23 06:31:00.652203008
Name: timestamp, Length: 742937, dtype: datetime64[ns]

In [52]:
train_target_plus_book = pd.merge_asof(train_target_plus_book, Candle_5,  direction='backward', left_on='timestamp' , right_on='timestamp')
train_target_plus_book = pd.merge_asof(train_target_plus_book, Candle_50,  direction='backward', left_on='timestamp' , right_on='timestamp')

In [57]:
train_target_plus_book

Unnamed: 0,target,asks[0].price,asks[0].amount,bids[0].price,bids[0].amount,asks[1].price,asks[1].amount,bids[1].price,bids[1].amount,asks[2].price,...,EMA233_cdl_50s,EMA233_cdl_50sdist,EMA50_cdl_50s,EMA50_cdl_50sdist,EMA21_cdl_50s,EMA21_cdl_50sdist,EMA_intr_cdl_50s50_21,EMA_intr_rel_cdl_50s50_21,levels_cdl_50s,levels_dist_cdl_50s
393,-1,28109.44,0.19034,28109.21,0.05320,28109.65,0.24000,28108.97,0.00068,28109.76,...,0.258490,0.000000,0.258490,0.000000,0.258490,0.000000,0.000000,0.000000,0.258490,0.000000
394,-1,28093.40,10.86000,28093.10,0.00100,28093.50,0.52400,28093.00,0.21300,28093.60,...,0.258490,0.000000,0.258490,0.000000,0.258490,0.000000,0.000000,0.000000,0.258490,0.000000
395,-1,28091.80,3.19500,28091.70,0.00100,28091.90,6.29200,28091.60,0.00100,28092.00,...,0.258490,0.000000,0.258490,0.000000,0.258490,0.000000,0.000000,0.000000,0.258490,0.000000
396,-1,28106.45,0.00564,28106.02,0.39491,28106.98,0.07342,28105.83,0.05320,28107.27,...,0.258490,0.000000,0.258490,0.000000,0.258490,0.000000,0.000000,0.000000,0.258490,0.000000
397,-1,28090.70,11.55400,28090.60,0.10600,28090.80,0.26500,28090.50,0.02000,28090.90,...,0.258490,0.000000,0.258490,0.000000,0.258490,0.000000,0.000000,0.000000,0.258490,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742932,1,27631.80,0.02600,27631.70,13.26900,27631.90,0.02300,27631.60,0.11500,27632.10,...,0.249169,0.010768,0.251279,0.002393,0.251702,0.000712,-0.000423,-0.001681,0.251691,-0.000754
742933,1,27633.50,0.02600,27633.40,2.82800,27633.70,1.28100,27633.30,1.32500,27633.90,...,0.249169,0.010768,0.251279,0.002393,0.251702,0.000712,-0.000423,-0.001681,0.251691,-0.000754
742934,1,27648.82,0.01522,27648.81,7.77154,27649.04,0.05709,27648.73,0.01602,27649.17,...,0.249169,0.010768,0.251279,0.002393,0.251702,0.000712,-0.000423,-0.001681,0.251691,-0.000754
742935,1,27633.50,0.02100,27633.40,19.71700,27633.70,1.28000,27633.20,0.32100,27633.90,...,0.249169,0.010768,0.251279,0.002393,0.251702,0.000712,-0.000423,-0.001681,0.251691,-0.000754


Model training

In [56]:
train_target_plus_book.replace([np.inf, -np.inf], np.nan, inplace=True)
train_target_plus_book = train_target_plus_book.dropna()

In [58]:
features_to_use = [i for i in train_target_plus_book.columns]

In [59]:
for f in ['local_timestamp','side','timestamp', 'target']:
    features_to_use.remove(f)

In [71]:
scaler = StandardScaler()
X_scale = scaler.fit_transform(train_target_plus_book[features_to_use].iloc[::10]) 

In [67]:
train_target_plus_book[features_to_use].iloc[::10]

Unnamed: 0,asks[0].price,asks[0].amount,bids[0].price,bids[0].amount,asks[1].price,asks[1].amount,bids[1].price,bids[1].amount,asks[2].price,asks[2].amount,...,EMA233_cdl_50s,EMA233_cdl_50sdist,EMA50_cdl_50s,EMA50_cdl_50sdist,EMA21_cdl_50s,EMA21_cdl_50sdist,EMA_intr_cdl_50s50_21,EMA_intr_rel_cdl_50s50_21,levels_cdl_50s,levels_dist_cdl_50s
393,28109.44,0.19034,28109.21,0.05320,28109.65,0.24000,28108.97,0.00068,28109.76,0.00070,...,0.258490,0.000000,0.258490,0.000000,0.258490,0.000000,0.000000,0.000000,0.258490,0.000000
396,28106.45,0.00564,28106.02,0.39491,28106.98,0.07342,28105.83,0.05320,28107.27,0.00065,...,0.258490,0.000000,0.258490,0.000000,0.258490,0.000000,0.000000,0.000000,0.258490,0.000000
399,28107.13,0.00266,28106.04,0.01221,28107.14,0.53429,28106.03,0.00300,28107.56,0.00074,...,0.258490,0.000000,0.258490,0.000000,0.258490,0.000000,0.000000,0.000000,0.258490,0.000000
402,28105.12,0.24000,28104.87,0.02000,28105.42,0.19868,28104.84,0.00700,28105.47,0.00001,...,0.258490,0.000000,0.258490,0.000000,0.258490,0.000000,0.000000,0.000000,0.258490,0.000000
405,28104.88,0.19048,28104.57,0.02799,28104.89,0.02866,28104.44,0.10029,28105.13,0.22689,...,0.258490,0.000000,0.258490,0.000000,0.258490,0.000000,0.000000,0.000000,0.258490,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742923,27631.80,6.13600,27631.70,8.43300,27631.90,0.02500,27631.60,0.09000,27632.10,0.01900,...,0.249169,0.010768,0.251279,0.002393,0.251702,0.000712,-0.000423,-0.001681,0.251691,-0.000754
742926,27647.80,0.28481,27647.79,8.53559,27647.81,0.07826,27647.73,0.00322,27648.03,0.00187,...,0.249169,0.010768,0.251279,0.002393,0.251702,0.000712,-0.000423,-0.001681,0.251691,-0.000754
742929,27631.80,3.90300,27631.70,10.85100,27631.90,0.02300,27631.60,0.10200,27632.10,0.01900,...,0.249169,0.010768,0.251279,0.002393,0.251702,0.000712,-0.000423,-0.001681,0.251691,-0.000754
742932,27631.80,0.02600,27631.70,13.26900,27631.90,0.02300,27631.60,0.11500,27632.10,0.01900,...,0.249169,0.010768,0.251279,0.002393,0.251702,0.000712,-0.000423,-0.001681,0.251691,-0.000754


In [72]:
model = LogisticRegression(solver='newton-cg',  multi_class='multinomial')
model.fit(X_scale, train_target_plus_book[['target']].iloc[::10])

  y = column_or_1d(y, warn=True)


In [82]:
del train_target_plus_book

TEST

In [73]:
book_dataset = pd.read_csv('book.csv.gz', index_col='local_timestamp', usecols=[i for i in range(3,44)]) # , nrows=100000 для тестирования
#ticker_dataset = pd.read_csv('ticker.csv.gz', index_col='local_ts' , nrows=1000000 ) 
trades_dataset = pd.read_csv('trades.csv.gz', index_col='local_ts')
book_dataset = book_dataset[~book_dataset.index.duplicated(keep='last')]
#ticker_dataset = ticker_dataset[~ticker_dataset.index.duplicated(keep='last')]
trades_dataset = trades_dataset[~trades_dataset.index.duplicated(keep='last')]
book_dataset = book_dataset.sort_index()
#ticker_dataset = ticker_dataset.sort_index()
trades_dataset = trades_dataset.sort_index()

In [74]:
test_book_dataset = book_dataset.loc[(book_dataset.index >= validation_time_end)]
test_trades_dataset = trades_dataset.loc[(trades_dataset.index >= validation_time_end)]

In [63]:
del book_dataset, trades_dataset

In [75]:
book_spot_dataset = pd.read_csv('book.spot.csv.gz', index_col='local_timestamp', usecols=[i for i in range(3,44)]) # , nrows=100000 для тестирования
#ticker_spot_dataset = pd.read_csv('ticker.spot.csv.gz', index_col='local_ts' , nrows=1000000 ) 
trades_spot_dataset = pd.read_csv('trades.spot.csv.gz', index_col='local_ts')
book_spot_dataset = book_spot_dataset[~book_spot_dataset.index.duplicated(keep='last')]
#ticker_spot_dataset = ticker_spot_dataset[~ticker_spot_dataset.index.duplicated(keep='last')]
trades_spot_dataset = trades_spot_dataset[~trades_spot_dataset.index.duplicated(keep='last')]
book_spot_dataset = book_spot_dataset.sort_index()
#ticker_spot_dataset = ticker_spot_dataset.sort_index()
trades_spot_dataset = trades_spot_dataset.sort_index()

In [76]:
test_book_spot_dataset = book_spot_dataset.loc[(book_spot_dataset.index >= validation_time_end)]
test_trades_spot_dataset = trades_spot_dataset.loc[(trades_spot_dataset.index >= validation_time_end)]

test_trades_spot_dataset['old_price'] = test_trades_spot_dataset.price
test_trades_spot_dataset['price'] = (test_trades_spot_dataset['old_price']- 10000) / (70000)

#del trades_spot_dataset, book_spot_dataset

test_trades_dataset['old_price'] = test_trades_dataset.price
test_trades_dataset['price'] = (test_trades_dataset['old_price'] - 10000) / (70000)

test_book_dataset = pd.concat([test_book_dataset, test_book_spot_dataset])
test_book_dataset = test_book_dataset.sort_index()
#del test_book_spot_dataset

In [77]:
Candle_5_test  = Create_candles(test_target, test_trades_dataset, '5s')
Candle_50_test= Create_candles(test_target, test_trades_dataset, '50s')
ATR_cdl(Candle_5_test, '5s')
EMA_cdl(Candle_5_test, '5s')
EMA_INTCP_cdl(Candle_5_test, '5s')
Levels_cdl(Candle_5_test, '5s')
ATR_cdl(Candle_50_test, '50s')
EMA_cdl(Candle_50_test, '50s')
EMA_INTCP_cdl(Candle_50_test, '50s')
Levels_cdl(Candle_50_test, '50s')

In [78]:
Divergence(test_trades_dataset, test_trades_spot_dataset)
test_trades_dataset = pd.concat([test_trades_dataset, test_trades_spot_dataset])
test_trades_dataset = test_trades_dataset.sort_index()
del test_trades_spot_dataset

In [79]:
Order_book_spread(test_book_dataset)
Book_sum_volume_speed(test_book_dataset)
Naive_delays(test_book_dataset)

Cointegration_from_EG(test_book_dataset, EG)
Order_book_imbalance_level(test_book_dataset)
IMB(test_book_dataset)

Autocorrelation(test_trades_dataset)
Accumulation_Distribution_Line(test_trades_dataset, '100ms')
Average_directional_index(test_trades_dataset, 100)
Chande_momentum_oscillator(test_trades_dataset, 75)
Momentum(test_trades_dataset, 500)
Rate_of_change(test_trades_dataset, 500)
Stochast_relative_strength_index(test_trades_dataset, 150)
Linear_regression_line(test_trades_dataset, 350)
Realized_Volatility(test_trades_dataset, 150)
Gauss_Kernel(test_trades_dataset, 500)
Autocorrelation(test_trades_dataset)
Partial_correlation(test_trades_dataset)
Jump_Variation(test_trades_dataset, 500)
Trade_imbalance(test_trades_dataset, '500ms')
past_returns(test_trades_dataset, 250)

Trade_price_spread(test_trades_dataset)
Trade_price_speed(test_trades_dataset, '10000ms')
Trades_frequency(test_trades_dataset, '250ms')
Trades_volume_speed(test_trades_dataset)
Big_trades_frequency(test_trades_dataset)

Delta_volume_without_trades(test_book_dataset, test_trades_dataset)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [80]:
test_book_dataset['Logreg'] = Logreg.predict(test_book_dataset[level_names])

In [93]:
test_target['timestamp'] = test_target.index.astype(np.int64)
test_target_plus_book = pd.merge_asof(test_target, test_book_dataset,  direction='backward', left_on='local_ts' , right_on='local_timestamp').dropna()
##del  test_book_dataset, #test_target
test_target_plus_book = test_target_plus_book.set_index('timestamp_x')
test_target_plus_book = test_target_plus_book.sort_index()
test_target_features = pd.merge_asof(test_target_plus_book, test_trades_dataset,  direction='backward', left_on='timestamp_x' , right_on='local_ts')#.dropna()
#del test_target_plus_book

In [96]:
test_target_features = pd.merge_asof(test_target_features, Candle_5_test,  direction='backward', left_on='timestamp' , right_on='timestamp')

In [97]:

test_target_features = pd.merge_asof(test_target_features, Candle_50_test,  direction='backward', left_on='timestamp' , right_on='timestamp')

In [103]:
test_target_features = test_target_features.dropna()

In [88]:
a = [i for i in test_target_features.columns]

In [105]:
X_scale_test = scaler.fit_transform(test_target_features[features_to_use]) 

In [106]:
predicted_target_test = model.predict(X_scale_test)

In [107]:
metric(y_true=test_target_features['target'], y_pred=predicted_target_test )

210.71139500408572

In [108]:
from xgboost import XGBClassifier

In [109]:
train_target_plus_book['target_XGBC'] = np.where(train_target_plus_book['target'] == -1, 2,train_target_plus_book['target'])

In [112]:
xgb_clf = XGBClassifier()
xgb_clf.fit(X_scale, train_target_plus_book['target_XGBC'].iloc[::10])

In [44]:
xgb_clf.feature_importances_

array([0.0091322 , 0.01448726, 0.02004259, 0.03243427, 0.02031382,
       0.00606479, 0.02758118, 0.00668239, 0.00349207, 0.02284849,
       0.00881061, 0.00228742, 0.00282956, 0.0339332 , 0.0146613 ,
       0.01197764, 0.01101955, 0.01545475, 0.01679076, 0.00476971,
       0.00470033, 0.00409855, 0.00360795, 0.00256541, 0.        ,
       0.02054603, 0.00092013, 0.        , 0.04036542, 0.004103  ,
       0.00546963, 0.0064386 , 0.01423212, 0.0175676 , 0.02941175,
       0.02950813, 0.0100111 , 0.01168927, 0.00657171, 0.01406173,
       0.04818193, 0.01017771, 0.02938008, 0.00870763, 0.00801807,
       0.0450674 , 0.05704783, 0.0194346 , 0.02836399, 0.03050713,
       0.00204966, 0.0082793 , 0.01826847, 0.00164995, 0.01151221,
       0.00962588, 0.07677917, 0.00580664, 0.00437863, 0.01806756,
       0.01641922, 0.00235003, 0.00898653, 0.01070829, 0.00875008],
      dtype=float32)

In [113]:
predicted_target_test = xgb_clf.predict(X_scale_test)

In [114]:
pred = pd.DataFrame(predicted_target_test)

In [115]:
pred['prediction'] = np.where(predicted_target_test== 2, -1, predicted_target_test)

In [116]:
pred['prediction'].value_counts()

prediction
 1    231180
 0    217602
-1    114679
Name: count, dtype: int64

In [117]:
metric(y_true=test_target_features['target'], y_pred=pred['prediction'] )

78.88331073493447

In [119]:
LGBMC = LGBMClassifier()
LGBMC.fit(X_scale,  train_target_plus_book[['target']].iloc[::10])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [120]:
L_pred = LGBMC.predict(X_scale_test)

In [121]:
metric(y_true=test_target_features['target'], y_pred=L_pred )

139.44925606400585

In [132]:
X_scale.shape

(74255, 141)

In [133]:
X_data = X_scale.reshape(X_scale.shape[0],X_scale.shape[1], 1)

In [138]:
TimeSteps=X_data.shape[1]
TotalFeatures=X_data.shape[2]

In [181]:
y_d = train_target_plus_book[['target']].iloc[::10]

In [182]:
y_d = np.select([y_d == -1, y_d == 0, y_d == 1,], 
          [ 0, 1, 2], 
           default=0)

In [189]:
regressor = Sequential()
regressor.add(LSTM(units = 10, activation = 'relu', input_shape = (TimeSteps, TotalFeatures), return_sequences=True))
regressor.add(LSTM(units = 5, activation = 'relu', input_shape = (TimeSteps, TotalFeatures), return_sequences=True))
regressor.add(LSTM(units = 5, activation = 'relu', return_sequences=False ))
regressor.add(Dense(3, activation='softmax'))

regressor.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [190]:
StartTime=time.time()
regressor.fit(X_data, y_d, batch_size = 300, epochs = 30)
EndTime=time.time()
print("## Total Time Taken: ", round((EndTime-StartTime)/60), 'Minutes ##')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
## Total Time Taken:  12 Minutes ##


In [191]:
pr = regressor.predict(X_scale_test)



In [194]:
pr

array([[0.1736963 , 0.6184458 , 0.20785786],
       [0.16969083, 0.62888676, 0.2014224 ],
       [0.149612  , 0.68127537, 0.16911258],
       ...,
       [0.40931955, 0.22292131, 0.36775914],
       [0.39851695, 0.24517979, 0.35630327],
       [0.38465923, 0.27222317, 0.34311756]], dtype=float32)

In [202]:
pr[:,0]

array([0.1736963 , 0.16969083, 0.149612  , ..., 0.40931955, 0.39851695,
       0.38465923], dtype=float32)

In [203]:
neuropredict = pd.DataFrame({'minus': pr[:,0], 'zero': pr[:,1], 'plus': pr[:,2]})

In [217]:
neuropredict['predict'] = np.select([neuropredict.minus > 0.33, neuropredict.plus > 0.33], 
                                    [ -1, 1], default=0)

In [218]:
metric(y_true=test_target_features['target'], y_pred=neuropredict['predict'] )

-69.5862494034601