# Task
Нужно написать обвязку для проверки результатов классификации.

ПОсле того как модель отдает индексы предсказанных правильных ответов их нужно применить к изначальному датасету и посчитать результаты.

# Import modules

In [234]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns

%matplotlib inline
plt.style.use('seaborn')
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = (10, 8) 

from sklearn import set_config, preprocessing # предобработка
set_config(transform_output='pandas')
from sklearn.model_selection import train_test_split # сплитование выборки
from statsmodels.tsa.stattools import adfuller

import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, GRU, SimpleRNN, Dense, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Data

In [235]:
data = pd.read_parquet('..\ETH-Full-1H.parquet')

In [236]:
df = data.copy()

In [237]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 49337 entries, 2017-09-01 00:00:00 to 2023-04-23 23:00:00
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    49337 non-null  float64
 1   High    49337 non-null  float64
 2   Low     49337 non-null  float64
 3   Close   49337 non-null  float64
 4   Volume  49337 non-null  float64
 5   Trades  49337 non-null  int32  
dtypes: float64(5), int32(1)
memory usage: 2.4 MB


# Feature Enginiring
Логарифмирую все признаки

In [238]:
cols = df.columns
for col in cols:
    df[col + '_log'] = np.log(df[col])

In [239]:
# df.drop(columns=cols, inplace=True) # удаляю все старые признаки

Тени от открытия бара на лог шкале

In [240]:
df['Low_Shad_log'] = df['Open_log'] - df['Low_log']
df['High_Shad_log'] = df['High_log'] - df['Open_log']

Прирост за бар на лог шкале

Прирост и тени от открытия бара, т.к. в истории есть пропуски по много часов и они могут давать сильные искажения, вылеты для таргетов

In [241]:
df['Log_Return'] = df['Close_log'] - df['Open_log']

# Target Category Feature 
Это фактически очередные гиперпараметры.

Я могу задавать уровень риска в лог величинах - risk_level

и коэффициент доходность/риск в виде множителя - risk_factor

Т.е. текущий случай это:
* risk_level = 0.0025
* risk_factor = 4


In [3]:
risk_level = 0.0025
profit_factor = 4
profit_level = risk_level * profit_factor

In [9]:
def profit_margin_for_zero_mo(risk_level, profit_factor):
    """ Функция для расчета доли прибыльных сделок при которой матожидание нулевое 
    (без учета комиссии и проскальзывания, на самом деле тут уже минус).
    Так же это значения совпадает с минимальным значением метрики Precision который мне нужно искать.
    
    Args:
        risk_level (_type_): уровень риска по сделке
        profit_factor (_type_): во сколько раз прибыль больше убытка

    Returns:
        Возвращает долю прибыльных сделок при которой матожидание нулевое
    """
    profit_level = risk_level * profit_factor
    
    return round(risk_level / (profit_level + risk_level), 2)

In [10]:
print("Метрика Precision должна быть выше:", profit_margin_for_zero_mo(risk_level, profit_factor))

Метрика Precision должна быть выше: 0.2


По данным переменным можно создавать сетку для поиска рабочих вариантов

In [243]:
df['Label_long'] = np.where((df['High_Shad_log'] >= profit_level) & (df['Low_Shad_log'] <= risk_level), 1, 0)

### By Index Selection
Для тестирования и генерации случайного блуждания создаю функцию генерации заданного количества индексов

In [244]:
def get_random_index(total, num, shift=0):
    # Generate random numbers without repetition
    random_nums = np.random.choice(total - shift, num, replace=False)

    # Sort the numbers in ascending order
    sorted_nums = np.sort(random_nums) + shift

    # Convert the NumPy array to a regular list
    result = sorted_nums.tolist()

    return sorted_nums

In [245]:
N = df.shape[0] # Сколько всего элементов в датафрейме, ограничение на генерацию

In [246]:
mask = get_random_index(N, 5000)
mask

array([    4,    15,    36, ..., 49315, 49325, 49335])

In [247]:
ind_mask = df.iloc[mask].index

In [248]:
df['Label_Mask'] = 0

In [249]:
df.loc[ind_mask, 'Label_Mask'] = 1

In [250]:
df['Label_Mask_shift'] = df['Label_Mask'].shift(1)

In [251]:
df['Label_Mask_shift'].fillna(0, inplace=True)
df['Label_Mask_shift'] = df['Label_Mask_shift'].astype(int)

In [252]:
df.head(15)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Trades,Open_log,High_log,Low_log,Close_log,Volume_log,Trades_log,Low_Shad_log,High_Shad_log,Log_Return,Label_long,Label_Mask,Label_Mask_shift
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-09-01 00:00:00,384.79,388.74,384.79,388.74,162.97779,131,5.952698,5.962911,5.952698,5.962911,5.093614,4.875197,0.0,0.010213,0.010213,1,0,0
2017-09-01 01:00:00,388.69,394.06,387.53,391.37,248.72949,262,5.962782,5.976503,5.959793,5.969653,5.516366,5.568345,0.002989,0.013721,0.006871,0,0,0
2017-09-01 02:00:00,392.88,394.39,389.46,390.88,198.82856,225,5.973504,5.97734,5.964761,5.968401,5.292443,5.4161,0.008743,0.003836,-0.005104,0,0,0
2017-09-01 03:00:00,391.7,393.13,390.86,390.86,141.56114,138,5.970496,5.97414,5.968349,5.968349,4.952732,4.927254,0.002147,0.003644,-0.002147,0,0,0
2017-09-01 04:00:00,392.65,394.27,387.64,390.33,135.65894,128,5.972919,5.977036,5.960077,5.966993,4.910144,4.85203,0.012842,0.004117,-0.005926,0,1,0
2017-09-01 05:00:00,388.92,388.99,384.88,384.88,203.48452,194,5.963374,5.963554,5.952932,5.952932,5.31559,5.267858,0.010442,0.00018,-0.010442,0,0,1
2017-09-01 06:00:00,386.7,388.62,383.18,384.52,85.07529,63,5.957649,5.962602,5.948505,5.951996,4.443537,4.143135,0.009144,0.004953,-0.005653,0,0,0
2017-09-01 07:00:00,386.26,390.56,384.0,388.4,330.22597,226,5.956511,5.967582,5.950643,5.962036,5.799777,5.420535,0.005868,0.011071,0.005525,0,0,0
2017-09-01 08:00:00,389.48,393.08,388.16,389.68,270.82415,209,5.964813,5.974013,5.961418,5.965326,5.60147,5.342334,0.003395,0.009201,0.000513,0,0,0
2017-09-01 09:00:00,389.97,392.0,387.46,389.14,248.32828,248,5.96607,5.971262,5.959613,5.963939,5.514752,5.513429,0.006457,0.005192,-0.002131,0,0,0


In [253]:
risk_level

0.0025

In [254]:
profit_level

0.01

In [255]:
def calc_pnl_long(col):
    Label_Mask_shift = col[0]
    Low_Shad_log = (np.exp(col[1]) - 1) * 100
    High_Shad_log = (np.exp(col[2]) - 1) * 100
    Log_Return = (np.exp(col[3]) - 1) * 100
    
    if Label_Mask_shift == 1: # если была сделка
        if Low_Shad_log >= risk_level: # если дошло до стопа
            return -risk_level * 100
        elif High_Shad_log > profit_level: # если дошло до тейка
            return profit_level * 100
            
        return Log_Return # иначе по закрытию бара выход
    
    return np.nan

In [256]:
df['PnL_pct'] = df[['Label_Mask_shift', 'Low_Shad_log', 'High_Shad_log', 'Log_Return']].apply(calc_pnl_long, axis=1)

In [257]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Trades,Open_log,High_log,Low_log,Close_log,Volume_log,Trades_log,Low_Shad_log,High_Shad_log,Log_Return,Label_long,Label_Mask,Label_Mask_shift,PnL_pct
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-09-01 00:00:00,384.79,388.74,384.79,388.74,162.97779,131,5.952698,5.962911,5.952698,5.962911,5.093614,4.875197,0.000000,0.010213,0.010213,1,0,0,
2017-09-01 01:00:00,388.69,394.06,387.53,391.37,248.72949,262,5.962782,5.976503,5.959793,5.969653,5.516366,5.568345,0.002989,0.013721,0.006871,0,0,0,
2017-09-01 02:00:00,392.88,394.39,389.46,390.88,198.82856,225,5.973504,5.977340,5.964761,5.968401,5.292443,5.416100,0.008743,0.003836,-0.005104,0,0,0,
2017-09-01 03:00:00,391.70,393.13,390.86,390.86,141.56114,138,5.970496,5.974140,5.968349,5.968349,4.952732,4.927254,0.002147,0.003644,-0.002147,0,0,0,
2017-09-01 04:00:00,392.65,394.27,387.64,390.33,135.65894,128,5.972919,5.977036,5.960077,5.966993,4.910144,4.852030,0.012842,0.004117,-0.005926,0,1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-23 19:00:00,1846.76,1852.35,1844.92,1846.49,16550.18290,21648,7.521188,7.524210,7.520191,7.521042,9.714152,9.982668,0.000997,0.003022,-0.000146,0,0,0,
2023-04-23 20:00:00,1846.49,1852.30,1838.03,1848.89,13113.47550,19445,7.521042,7.524183,7.516450,7.522341,9.481396,9.875345,0.004592,0.003142,0.001299,0,0,0,
2023-04-23 21:00:00,1848.88,1859.32,1847.59,1854.66,7855.61330,13824,7.522335,7.527966,7.521637,7.525457,8.968984,9.534161,0.000698,0.005631,0.003121,0,0,0,
2023-04-23 22:00:00,1854.67,1868.16,1848.00,1861.72,15836.48430,21852,7.525462,7.532709,7.521859,7.529256,9.670072,9.992048,0.003603,0.007247,0.003794,0,1,0,


In [258]:
pnl_df = df[['Open', 'High', 'Low', 'Close', 'PnL_pct']].dropna()

In [259]:
pnl_df['Cum_PnL'] = pnl_df['PnL_pct'].cumsum()

In [260]:
pnl_df.tail(30)

Unnamed: 0_level_0,Open,High,Low,Close,PnL_pct,Cum_PnL
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-11 22:00:00,1893.28,1894.86,1889.31,1894.11,-0.25,-1097.75
2023-04-12 20:00:00,1901.31,1908.84,1900.12,1907.33,-0.25,-1098.0
2023-04-13 15:00:00,2001.02,2003.0,1990.15,1997.15,-0.25,-1098.25
2023-04-13 16:00:00,1997.15,2002.41,1991.09,2000.49,-0.25,-1098.5
2023-04-13 22:00:00,2006.51,2017.02,2004.08,2007.38,-0.25,-1098.75
2023-04-13 23:00:00,2007.39,2012.41,2004.39,2012.01,-0.25,-1099.0
2023-04-14 02:00:00,2117.23,2121.99,2096.69,2102.36,-0.25,-1099.25
2023-04-14 05:00:00,2113.73,2123.39,2111.7,2118.41,-0.25,-1099.5
2023-04-14 06:00:00,2118.42,2124.52,2105.86,2118.72,-0.25,-1099.75
2023-04-15 01:00:00,2086.01,2087.41,2071.13,2081.26,-0.25,-1100.0
