In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [3]:
def aggregation(dataframe: pd.DataFrame, key: str = 'D') -> pd.DataFrame:
    agg_dict_ohlc = {
        'open_': 'first',
        'high_': 'max',
        'low_': 'min',
        'close_': 'last',
        'volume_': 'sum'
    }

    return dataframe.resample(key).agg(agg_dict_ohlc).dropna()

def resample_index(index: pd.DatetimeIndex, freq: str = 'D'):
    assert isinstance(index, pd.DatetimeIndex)
    start_date = index.min()
    end_date = index.max() + pd.DateOffset(days=1)
    resampled_index = pd.date_range(start_date, end_date, periods=freq)[:-1]
    series = pd.Series(resampled_index, resampled_index.floor('D'))
    
    
    
def _calc_wma(dataframe: pd.DataFrame, n: int = 14, _key: str = 'close_') -> pd.DataFrame:
    data = dataframe.copy()

    wma = np.zeros(data.shape[0]) + np.nan

    for i in range(n):
        wma[i] = data[_key].iloc[:i].mean()

    weights = np.arange(1, n + 1, 1)
    for i in range(n - 1, data.shape[0]):
        vector = data[_key].iloc[i - n + 1: i + 1]
        vector = weights * vector
        wma[i] = (vector.sum())/(weights.sum())

    return wma

def calc_wma(dataframe: pd.DataFrame, n: int = 14, _key: str = 'close_') -> pd.DataFrame:

    data = dataframe.copy()
    wma = np.zeros(data.shape[0]) + np.nan

    for i in range(n):
        wma[i] = data[_key].iloc[:i].mean()

    weights = np.arange(1, n + 1, 1)
    for i in range(n - 1, data.shape[0]):
        vector = data[_key].iloc[i - n + 1: i + 1]
        vector = weights * vector
        wma[i] = (vector.sum())/(weights.sum())

    data[f'WMA_{_key[0]}_{str(n)}'] = wma
    return data


def __calc_ema(x, last_ema, n):
    return (2 / (n + 1) * (x - last_ema)) + last_ema

def _calc_ema(vector: np.ndarray, n: int = 14) -> np.ndarray:
    ema = np.zeros(vector.shape[0]) + np.nan

    for i in range(1, n):
        ema[i] = vector[:i].mean()

    for i in range(n, vector.shape[0]):
        ema[i] = __calc_ema(vector[i], ema[i-1], n)

    return ema

def calc_ema(dataframe: pd.DataFrame, n: int = 14, _key: str = 'close_') -> pd.DataFrame:
    data = dataframe.copy()

    ema = np.zeros(data.shape[0]) + np.nan

    for i in range(n):
        ema[i] = data[_key].iloc[:i].mean()

    for i in range(n, data.shape[0]):
        ema[i] = __calc_ema(data[_key].iloc[i], ema[i-1], n)

    data[f'EMA_{_key[0]}_{str(n)}'] = ema

    return data

def calc_rsi(dataframe: pd.DataFrame, n: int = 14, _key: str = 'close_') -> pd.DataFrame:

    data = dataframe.copy()

    change = data[_key].diff(1)
    u = np.where(change > 0, change, 0)
    d = np.where(change < 0, -change, 0)

    ema_u = _calc_ema(u, n)
    ema_d = _calc_ema(d, n)
    ema_d = np.where((ema_d == 0), np.nan, ema_d)

    rs = ema_u/ema_d
    rsi = 100 - 100 / (1 + rs)
    data[f'RSI_{_key[0]}_{str(n)}'] = rsi

    return data

def calc_macd(dataframe: pd.DataFrame, n_fast: int = 12, n_slow: int = 26, _key: str = 'close_'):
    assert n_slow > n_fast

    data = dataframe.copy()

    ema_fast = _calc_ema(data[_key].to_numpy(), n_fast)
    ema_slow = _calc_ema(data[_key].to_numpy(), n_slow)

    data['MACD'] = ema_fast - ema_slow

    return data


def lag_features(dataframe: pd.DataFrame, columns: list[str] = None, depth: int = 1):
    data = dataframe.copy()
    
    if columns is None:
        columns = data.columns
    for col in columns:
        for i in np.arange(1, depth + 1):
            data[col+'_lag'+str(i)] = data[col].shift(i)

    return data

def calc_techical_metrics(dataframe: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
    data = dataframe.copy()
    data = calc_wma(data, **kwargs['wma'])
    data = calc_ema(data, **kwargs['ema'])
    data = calc_rsi(data, **kwargs['rsi'])
    data = calc_macd(data, **kwargs['macd'])
    return data.dropna()

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklego.preprocessing import RepeatingBasisFunction

class DateTimeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_periods: int = 12, input_range: tuple[int, int] = (1, 365)):
        super().__init__()
        self.n_periods = n_periods
        self.column = 'day_of_year'
        self.remainder = 'passthrough'
        self.input_range = input_range
        self.rbf_estimator = RepeatingBasisFunction(n_periods=self.n_periods, column=self.column,
                                                    input_range=self.input_range, remainder=self.remainder)

    
    @staticmethod
    def _make_day_of_year(dataframe: pd.DataFrame) -> pd.DataFrame:
        data = dataframe.copy()
        data['day_of_year'] = data.index.day_of_year
        return data
    
    def fit(self, X, y=None):
        assert isinstance(X.index, pd.DatetimeIndex), 'Index must be datetime'        
        self.X_ = X
        data = self._make_day_of_year(X)
        self.rbf_estimator.fit(data)
        return self
    
    def transform(self, X, y=None):
        assert isinstance(X.index, pd.DatetimeIndex), 'Index must be datetime'
        check_is_fitted(self)
        data = self._make_day_of_year(X)
        transformed_month = self.rbf_estimator.transform(data)
        return transformed_month
    
    def get_feature_names_out(self) -> list[str]:
        return ['rbf_{}'.format(i) for i in range(self.rbf_estimator.n_periods)] + self.X_.columns.to_list()
    


In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn import set_config

set_config(transform_output='pandas')


std_cols = ['volume_']
norm_cols = ['open_', 'high_', 'low_', 'close_', 'EMA_c_14', 'WMA_c_14', 'RSI_c_14', 'MACD']

techical_metrics_params = {
    'wma': {
        'n': 14,
        '_key': 'close_'
        },
    'ema': {
        'n': 14,
        '_key': 'close_'
        },
    'rsi': {
        'n': 14,
        '_key': 'close_'
        },
    'macd': {
        'n_fast': 12,
        'n_slow': 26,
        '_key': 'close_'
        }
}
agg_params = {
    'key': 'D'
}   

scaling_transformer = ColumnTransformer([
    ('std_scaler', StandardScaler(), std_cols),
    ('mm_scaler', MinMaxScaler(), norm_cols)
    ], remainder='passthrough'
                                        )

preprocessing_pipeline = Pipeline([
    # ('agregate_days', FunctionTransformer(aggregation, kw_args=agg_params)),
    ('metrics_append', FunctionTransformer(calc_techical_metrics, kw_args=techical_metrics_params)),
    ('date_transformer', DateTimeTransformer(n_periods=12, input_range=(1, 365))),
    ('scalling', scaling_transformer)
])

# preprocessing_pipeline.fit(train)

In [6]:
df = pd.read_pickle('data/df_TSLA.pkl')
df.sample(5)

Unnamed: 0_level_0,open_,high_,low_,close_,volume_
timestamp_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-10-14 05:00:00,219.51,219.85,219.49,219.85,4358
2023-07-18 08:45:00,289.32,289.7,289.23,289.64,78835
2023-11-15 12:30:00,243.9,244.18,243.52,244.1434,1204938
2022-08-17 08:15:00,303.8,303.8,303.6667,303.6667,5397
2023-10-11 05:30:00,264.54,264.78,264.52,264.74,4968


In [7]:
df = aggregation(df, 'D')
df

Unnamed: 0_level_0,open_,high_,low_,close_,volume_
timestamp_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-04,236.3333,248.1633,236.3333,244.5333,100740699
2021-01-05,243.3767,251.4667,239.7333,250.9667,64163466
2021-01-06,249.3333,258.0000,248.8867,254.5333,92799093
2021-01-07,256.3333,278.2400,255.7333,276.5000,103405713
2021-01-08,281.6667,294.9633,279.4633,289.1667,151570050
...,...,...,...,...,...
2023-12-22,253.7000,258.2200,249.0350,252.5700,94545651
2023-12-26,253.9800,273.4309,240.8791,256.9500,89369912
2023-12-27,257.4500,277.1995,249.5321,262.6150,108466950
2023-12-28,263.0200,265.1300,252.2900,254.8500,115267688


In [8]:
round(df.shape[0] * 0.75)

565

In [9]:
from sklearn.model_selection import train_test_split

# train, test = train_test_split(df, test_size=0.25, random_state=1) #train-test надо делить по времени
bound = round(df.shape[0] * 0.75)
train, test = df.iloc[:bound], df.iloc[bound:]

In [10]:
print(train.shape, test.shape)

(565, 5) (188, 5)


In [11]:
preprocessing_pipeline.fit(train)
train_eda = preprocessing_pipeline.transform(train)

In [12]:
train_eda.sample(5)

Unnamed: 0_level_0,std_scaler__volume_,mm_scaler__open_,mm_scaler__high_,mm_scaler__low_,mm_scaler__close_,mm_scaler__EMA_c_14,mm_scaler__WMA_c_14,mm_scaler__RSI_c_14,mm_scaler__MACD,remainder__rbf_0,...,remainder__rbf_2,remainder__rbf_3,remainder__rbf_4,remainder__rbf_5,remainder__rbf_6,remainder__rbf_7,remainder__rbf_8,remainder__rbf_9,remainder__rbf_10,remainder__rbf_11
timestamp_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-08-13,-0.825664,0.44087,0.426464,0.449682,0.439682,0.446731,0.450765,0.638983,0.493617,5.607336e-10,...,2.558762e-13,4.474586e-09,1.1e-05,0.003392,0.147024,0.862492,0.6847522,0.07357371,0.00106985,2.105398e-06
2022-07-14,-0.375668,0.427794,0.413826,0.420845,0.441588,0.459211,0.453093,0.522588,0.37906,2.286315e-14,...,4.063025e-09,9.829434e-06,0.003218,0.1426,0.855129,0.693993,0.07622352,0.001133011,2.279242e-06,6.205227e-10
2021-06-01,-0.752012,0.336751,0.32049,0.347,0.334353,0.3313,0.321116,0.536497,0.321426,1.729329e-11,...,0.0001407374,0.01998904,0.384225,0.999517,0.351889,0.016766,0.0001081111,9.434503e-08,1.11424e-11,3.018061e-16
2022-04-25,-0.697205,0.752258,0.73382,0.719728,0.75181,0.87401,0.831999,0.418697,0.472751,7.342758e-07,...,0.04543766,0.5627439,0.943228,0.213961,0.006568,2.7e-05,1.534443e-08,1.167648e-12,3.980583e-15,1.469594e-10
2022-05-24,-0.316922,0.367269,0.348279,0.347022,0.347987,0.504816,0.454471,0.188321,0.000582,2.228572e-10,...,0.0006315415,0.0529305,0.600373,0.92161,0.191463,0.005383,2.048294e-05,1.054782e-08,7.350958e-13,6.591066e-15


In [13]:
train_eda.shape

(559, 21)

In [14]:
test_eda = preprocessing_pipeline.transform(test)
test_eda.sample(5)

Unnamed: 0_level_0,std_scaler__volume_,mm_scaler__open_,mm_scaler__high_,mm_scaler__low_,mm_scaler__close_,mm_scaler__EMA_c_14,mm_scaler__WMA_c_14,mm_scaler__RSI_c_14,mm_scaler__MACD,remainder__rbf_0,...,remainder__rbf_2,remainder__rbf_3,remainder__rbf_4,remainder__rbf_5,remainder__rbf_6,remainder__rbf_7,remainder__rbf_8,remainder__rbf_9,remainder__rbf_10,remainder__rbf_11
timestamp_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-07-19,1.984473,0.619277,0.611691,0.568207,0.573243,0.618626,0.61551,0.53503,0.624394,1.411691e-13,...,9.283531e-10,3.122958e-06,0.001421773,0.08760023,0.7304515,0.824306,0.1258916,0.002602051,7.278571e-06,2.755417e-09
2023-09-15,1.991271,0.565785,0.544501,0.508819,0.553557,0.557869,0.555274,0.663882,0.507752,3.944997e-06,...,5.382962e-14,9.851605e-14,2.053948e-09,5.795394e-06,0.002213031,0.114368,0.7998891,0.7571247,0.09698758,0.001681419
2023-11-14,2.205955,0.396776,0.417829,0.395068,0.444648,0.398493,0.387861,0.709637,0.337552,0.09064568,...,3.376742e-06,1.0261e-09,4.219809e-14,1.252469e-13,2.498962e-09,7e-06,0.002465914,0.1219562,0.8162825,0.739415
2023-04-27,1.110831,0.159177,0.153766,0.165897,0.173955,0.199391,0.192405,0.237553,0.272974,4.453877e-07,...,0.03587849,0.5069891,0.9695588,0.2509347,0.008789382,4.2e-05,2.672925e-08,2.320692e-12,1.854755e-15,7.812802e-11
2023-05-24,1.867958,0.255763,0.259712,0.234425,0.261082,0.217108,0.231808,0.688373,0.430183,2.228572e-10,...,0.0006315415,0.0529305,0.600373,0.9216104,0.1914629,0.005383,2.048294e-05,1.054782e-08,7.350958e-13,6.591066e-15


In [15]:
test_eda.shape

(186, 21)

In [16]:
train_eda.iloc[:-1, :]

Unnamed: 0_level_0,std_scaler__volume_,mm_scaler__open_,mm_scaler__high_,mm_scaler__low_,mm_scaler__close_,mm_scaler__EMA_c_14,mm_scaler__WMA_c_14,mm_scaler__RSI_c_14,mm_scaler__MACD,remainder__rbf_0,...,remainder__rbf_2,remainder__rbf_3,remainder__rbf_4,remainder__rbf_5,remainder__rbf_6,remainder__rbf_7,remainder__rbf_8,remainder__rbf_9,remainder__rbf_10,remainder__rbf_11
timestamp_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-12,0.746882,0.554287,0.578752,0.567550,0.589638,0.576121,0.560074,0.753643,0.402858,0.876774,...,0.068498,0.000953,0.000002,4.575512e-10,1.578277e-14,3.240497e-13,5.423065e-09,1.228256e-05,3.764820e-03,1.561746e-01
2021-01-13,0.035175,0.590948,0.571033,0.578498,0.579772,0.587340,0.570467,0.802188,0.402858,0.855129,...,0.076224,0.001133,0.000002,6.205227e-10,2.286315e-14,2.272907e-13,4.063025e-09,9.829434e-06,3.218241e-03,1.426000e-01
2021-01-14,-0.042712,0.574284,0.573239,0.584694,0.585148,0.594241,0.576860,0.766419,0.402858,0.832208,...,0.084637,0.001344,0.000003,8.397145e-10,3.304797e-14,1.590770e-13,3.037458e-09,7.849176e-06,2.745041e-03,1.299226e-01
2021-01-15,0.357714,0.585049,0.569820,0.564375,0.558345,0.600342,0.582512,0.772758,0.402858,0.808142,...,0.093774,0.001590,0.000004,1.133866e-09,4.766609e-14,1.110937e-13,2.265828e-09,6.254255e-06,2.336336e-03,1.181152e-01
2021-01-19,-0.463575,0.570951,0.558903,0.579593,0.579938,0.601933,0.583986,0.689470,0.402858,0.703188,...,0.138276,0.003053,0.000009,3.688428e-09,2.018499e-13,2.585694e-14,6.865211e-10,2.466844e-06,1.199611e-03,7.894968e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-24,0.900448,0.287658,0.264294,0.282297,0.277809,0.262772,0.263597,0.520856,0.422923,0.000670,...,0.609799,0.915731,0.186106,5.118745e-03,1.905364e-05,9.598498e-09,6.543946e-13,7.472152e-15,2.471564e-10,1.106392e-06
2023-03-27,1.022051,0.275094,0.274583,0.290344,0.286888,0.265487,0.268278,0.561453,0.427585,0.000389,...,0.525439,0.961630,0.238180,7.983850e-03,3.621856e-05,2.223622e-08,1.847574e-12,2.394746e-15,9.653613e-11,5.266613e-07
2023-03-28,0.512906,0.282193,0.263203,0.276648,0.275880,0.266039,0.271117,0.498861,0.426803,0.000323,...,0.497828,0.973195,0.257473,9.218792e-03,4.467120e-05,2.929491e-08,2.599967e-12,1.631709e-15,7.025996e-11,4.094343e-07
2023-03-29,1.087031,0.278560,0.267636,0.289818,0.286289,0.268220,0.275209,0.551814,0.429844,0.000268,...,0.470644,0.982761,0.277725,1.062164e-02,5.497687e-05,3.851052e-08,3.650816e-12,1.109384e-15,5.102487e-11,3.176092e-07


In [17]:
train_eda.shift(-1).iloc[:-1, :]

Unnamed: 0_level_0,std_scaler__volume_,mm_scaler__open_,mm_scaler__high_,mm_scaler__low_,mm_scaler__close_,mm_scaler__EMA_c_14,mm_scaler__WMA_c_14,mm_scaler__RSI_c_14,mm_scaler__MACD,remainder__rbf_0,...,remainder__rbf_2,remainder__rbf_3,remainder__rbf_4,remainder__rbf_5,remainder__rbf_6,remainder__rbf_7,remainder__rbf_8,remainder__rbf_9,remainder__rbf_10,remainder__rbf_11
timestamp_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-12,0.035175,0.590948,0.571033,0.578498,0.579772,0.587340,0.570467,0.802188,0.402858,0.855129,...,0.076224,0.001133,0.000002,6.205227e-10,2.286315e-14,2.272907e-13,4.063025e-09,9.829434e-06,3.218241e-03,1.426000e-01
2021-01-13,-0.042712,0.574284,0.573239,0.584694,0.585148,0.594241,0.576860,0.766419,0.402858,0.832208,...,0.084637,0.001344,0.000003,8.397145e-10,3.304797e-14,1.590770e-13,3.037458e-09,7.849176e-06,2.745041e-03,1.299226e-01
2021-01-14,0.357714,0.585049,0.569820,0.564375,0.558345,0.600342,0.582512,0.772758,0.402858,0.808142,...,0.093774,0.001590,0.000004,1.133866e-09,4.766609e-14,1.110937e-13,2.265828e-09,6.254255e-06,2.336336e-03,1.181152e-01
2021-01-15,-0.463575,0.570951,0.558903,0.579593,0.579938,0.601933,0.583986,0.689470,0.402858,0.703188,...,0.138276,0.003053,0.000009,3.688428e-09,2.018499e-13,2.585694e-14,6.865211e-10,2.466844e-06,1.199611e-03,7.894968e-02
2021-01-19,-0.402983,0.577606,0.570129,0.584278,0.589494,0.605644,0.587424,0.716345,0.402858,0.675472,...,0.151549,0.003574,0.000011,4.926645e-09,2.879872e-13,1.786237e-14,5.065830e-10,1.944344e-06,1.009966e-03,7.099888e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-24,1.022051,0.275094,0.274583,0.290344,0.286888,0.265487,0.268278,0.561453,0.427585,0.000389,...,0.525439,0.961630,0.238180,7.983850e-03,3.621856e-05,2.223622e-08,1.847574e-12,2.394746e-15,9.653613e-11,5.266613e-07
2023-03-27,0.512906,0.282193,0.263203,0.276648,0.275880,0.266039,0.271117,0.498861,0.426803,0.000323,...,0.497828,0.973195,0.257473,9.218792e-03,4.467120e-05,2.929491e-08,2.599967e-12,1.631709e-15,7.025996e-11,4.094343e-07
2023-03-28,1.087031,0.278560,0.267636,0.289818,0.286289,0.268220,0.275209,0.551814,0.429844,0.000268,...,0.470644,0.982761,0.277725,1.062164e-02,5.497687e-05,3.851052e-08,3.650816e-12,1.109384e-15,5.102487e-11,3.176092e-07
2023-03-29,0.746116,0.289858,0.274384,0.301905,0.294836,0.271509,0.279772,0.593618,0.435144,0.000221,...,0.443978,0.990266,0.298919,1.221139e-02,6.751316e-05,5.051527e-08,5.115262e-12,7.526222e-16,3.697532e-11,2.458430e-07


После первичной обработки данных, рассчета технических метрик и выделения даты (индекса) в качетсве признака. Стоит разбить данные на тестовую и тренировочную выборки.

Как это сделать?

Пусть у нас есть датасет размером $m * n$.
Есть такое понятие как lookback window. Мы выбираем определенный период пусть это будет $n$ и далее учитывая этот период мы делим наш датасет на тренировочную и тестовую выборки. Т. е. датасет разбивается на $\frac{m}{n}$ или $m-n$ выборок, где от 0 до $n - 1$ - тренировочные значения. А n -  тестовое значение.

Если логически предположить, то существует два подхода:
1. Разбить датасет с неповторяющимися элементами. (Overlapping)
2. Разбить датасет с повторяющимися элементами (Whitout overlapping)

В первом случае мы разбиваем датасет на $n$ выборок и не используем соседние элементы.

Во втором случае мы разбиваем датасет на $m-n$ выборок и у нас пристуствуют повторяющиеся элементы

Стоит ли использовать второй случай с повторяющимися элементами? - ответ Да, это даст больше тренировочных значений

In [18]:
from torch.utils.data import Dataset
class TSDataset(Dataset):
    def __init__(self, X: pd.DataFrame, lookback: int = 1):
        # assert  X.shape[0] == y.shape[0], "X and y must have the same number of lines"
        self.X = X.iloc[:-lookback, :]
        self.y = X.shift(-lookback).iloc[:-lookback, :1]
        
        self.dtrange = self.X.index.to_numpy()
        
        self.X = torch.from_numpy(self.X.to_numpy('float32'))
        self.y = torch.from_numpy(self.y.to_numpy('float32'))
        self.lookback = lookback
        
    def __len__(self):
        return self.X.shape[0]
        
    def __getitem__(self, index):
        return (self.X[index], self.y[index])
    
    # @staticmethod
    # def create_dataset(dataset: pd.ndarray, loockback_val: int = 14) -> tuple[torch.Tensor, torch.Tensor]:
    #     X, y = [], []
    #     for i in range(loockback_val, dataset.shape[0]):
    #         X.append(dataset[i-loockback_val:i, :].to_numpy())
    #         y.append(dataset[i - loockback_val + 1:i+1, 0:5].to_numpy())
    #     return torch.Tensor(X), torch.Tensor(y)  

In [19]:
train_dataset = TSDataset(train_eda, lookback=1)
test_dataset = TSDataset(test_eda, lookback=1)

In [20]:
test_dataset.X.shape

torch.Size([185, 21])

In [21]:
test_dataset.y.shape

torch.Size([185, 1])

In [22]:
for i in range(5):
    print(test_dataset[i])

(tensor([ 1.2332e+00,  2.8216e-01,  2.6337e-01,  2.7116e-01,  2.5772e-01,
         2.8320e-01,  2.8871e-01, -1.0329e-01,  4.0286e-01,  6.7513e-05,
         1.2211e-02,  2.9892e-01,  9.9027e-01,  4.4398e-01,  2.6939e-02,
         2.2122e-04,  2.4584e-07,  3.6975e-11,  7.5262e-16,  5.1153e-12,
         5.0515e-08]), tensor([1.0398]))
(tensor([ 1.0398e+00,  2.5703e-01,  2.4087e-01,  2.5796e-01,  2.6042e-01,
         2.7125e-01,  2.7764e-01, -1.0329e-01,  4.0286e-01,  5.4977e-05,
         1.0622e-02,  2.7772e-01,  9.8276e-01,  4.7064e-01,  3.0503e-02,
         2.6756e-04,  3.1761e-07,  5.1025e-11,  1.1094e-15,  3.6508e-12,
         3.8511e-08]), tensor([1.4847]))
(tensor([ 1.4847e+00,  2.5286e-01,  2.3657e-01,  2.4604e-01,  2.5842e-01,
         2.6610e-01,  2.7287e-01, -6.7404e-03,  4.0286e-01,  2.3654e-05,
         5.9492e-03,  2.0250e-01,  9.3281e-01,  5.8153e-01,  4.9065e-02,
         5.6024e-04,  8.6575e-07,  1.8106e-10,  5.1246e-15,  9.2691e-13,
         1.2728e-08]), tensor([0.9011])

In [23]:
from torch.utils.data import DataLoader

batch_size = 32

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True
)

for i, (X_batch, y_batch) in enumerate(train_dataloader):
    print(f'Batch {i}: X_batch: {X_batch.shape}, y_batch {y_batch.shape}')

Batch 0: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 1: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 2: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 3: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 4: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 5: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 6: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 7: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 8: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 9: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 10: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 11: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 12: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 13: X_batch: torch.Size([32, 21]), y_batch torch.Size([32, 1])
Batch 14: X_batch: torch.Size([32, 21]), y_b

In [27]:
from torch.nn import Dropout

class Kleopatra(nn.Module):
    def __init__(self, hidden_size: int = 1):
        super().__init__()
        self.dropout = Dropout(0.2)
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=21, hidden_size=self.hidden_size, num_layers=1, batch_first=True)
        self.linear = nn.Linear(self.hidden_size, 1)
    
    def forward(self, X):
        X = self.dropout(X)
        X, _ = self.lstm(X)
        X = self.linear(X)
        return X

Датасет - используем функцию lookback window с определенным шагом, далее батчим датасет

In [28]:
model = Kleopatra(hidden_size=1)
print(model)

Kleopatra(
  (dropout): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(21, 1, batch_first=True)
  (linear): Linear(in_features=1, out_features=1, bias=True)
)


In [29]:
optim = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
loss_fn = torch.nn.MSELoss()

n_epochs = 220

for epoch in range(n_epochs+1):
    model.train()
    for X_batch, y_batch in train_dataloader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optim.zero_grad()
        loss.backward()
        optim.step()
    
    if epoch % 10 != 0:
        continue
    
    model.eval()
    with torch.no_grad():
        y_pred = model(train_dataset.X)
        train_mse = loss_fn(y_pred, train_dataset.y)
        y_pred = model(test_dataset.X)
        test_mse = loss_fn(y_pred, test_dataset.y)
    
    print("Epoch %d: train MSE %.4f, test MSE %.4f" % (epoch, train_mse, test_mse))

Epoch 0: train MSE 1.2855, test MSE 1.3659
Epoch 10: train MSE 0.9599, test MSE 1.7962
Epoch 20: train MSE 0.7740, test MSE 1.8913
Epoch 30: train MSE 0.6928, test MSE 1.9817
Epoch 40: train MSE 0.6430, test MSE 2.0239
Epoch 50: train MSE 0.6049, test MSE 1.9837
Epoch 60: train MSE 0.5698, test MSE 1.9422
Epoch 70: train MSE 0.5330, test MSE 1.8830
Epoch 80: train MSE 0.5009, test MSE 1.8099
Epoch 90: train MSE 0.4716, test MSE 1.8036
Epoch 100: train MSE 0.4427, test MSE 1.7639
Epoch 110: train MSE 0.4147, test MSE 1.6879
Epoch 120: train MSE 0.3901, test MSE 1.6563
Epoch 130: train MSE 0.3707, test MSE 1.6941
Epoch 140: train MSE 0.3494, test MSE 1.6780
Epoch 150: train MSE 0.3314, test MSE 1.7303
Epoch 160: train MSE 0.3129, test MSE 1.7338
Epoch 170: train MSE 0.2968, test MSE 1.8178
Epoch 180: train MSE 0.2829, test MSE 1.8550
Epoch 190: train MSE 0.2667, test MSE 1.9049
Epoch 200: train MSE 0.2565, test MSE 1.9825
Epoch 210: train MSE 0.2434, test MSE 2.1187
Epoch 220: train MSE 

In [30]:
with torch.no_grad():
    y_pred = model(train_dataset.X)
    train_mse = loss_fn(y_pred, train_dataset.y)
    y_pred = model(test_dataset.X)
    test_mse = loss_fn(y_pred, test_dataset.y)
print("train MSE %.4f, test MSE %.4f" % (train_mse, test_mse))

train MSE 0.2287, test MSE 2.2011


In [31]:
for y_pred_el, y_test_el in zip(y_pred, test_dataset.y):
    print(y_test_el, y_pred_el)

tensor([1.0398]) tensor([0.6382])
tensor([1.4847]) tensor([0.7323])
tensor([0.9011]) tensor([0.9053])
tensor([1.6860]) tensor([0.5204])
tensor([0.8683]) tensor([0.9989])
tensor([0.4810]) tensor([0.4909])
tensor([0.9002]) tensor([0.3559])
tensor([0.3697]) tensor([0.5379])
tensor([1.0153]) tensor([0.2583])
tensor([2.7988]) tensor([0.6614])
tensor([1.0518]) tensor([1.4425])
tensor([1.4679]) tensor([0.5652])
tensor([1.0770]) tensor([0.9060])
tensor([1.7356]) tensor([0.6499])
tensor([1.1108]) tensor([1.0671])
tensor([1.0843]) tensor([0.6755])
tensor([0.7797]) tensor([0.6936])
tensor([1.1962]) tensor([0.5025])
tensor([0.9842]) tensor([0.7460])
tensor([0.4682]) tensor([0.5923])
tensor([0.6653]) tensor([0.3577])
tensor([0.7878]) tensor([0.4642])
tensor([0.2820]) tensor([0.4532])
tensor([0.9333]) tensor([0.2229])
tensor([0.5582]) tensor([0.5489])
tensor([1.7650]) tensor([0.3200])
tensor([0.6704]) tensor([0.9055])
tensor([0.5096]) tensor([0.3056])
tensor([1.0971]) tensor([0.2947])
tensor([0.7450

In [32]:
test_dataset.X[0]

tensor([ 1.2332e+00,  2.8216e-01,  2.6337e-01,  2.7116e-01,  2.5772e-01,
         2.8320e-01,  2.8871e-01, -1.0329e-01,  4.0286e-01,  6.7513e-05,
         1.2211e-02,  2.9892e-01,  9.9027e-01,  4.4398e-01,  2.6939e-02,
         2.2122e-04,  2.4584e-07,  3.6975e-11,  7.5262e-16,  5.1153e-12,
         5.0515e-08])

Pipeline надо сделать для последовательной обработки ряда после предсказания - по сути это тот же preprocessing_pipeline только без scallers 

Для предсказания нам необходимо:
1. Дата для конвертации -> надо создать список будующих дат по которым мы будем предсказывать, с возможность последовательную дату в методе predict
2. Надо создать метод, который позволит рассчитать технические метрики на основании прошлых. Но в этом случае надо будет руководствоваться датой. Т. е. нам нужен будет временный массив, который "объединит" новые предсказанные значения с предыдущими для рассчета метрики.
3.  В методе predict необходимо будет создать временной массив, который будет содержать предсказанные даты

In [None]:
# post_pred_pipeline = Pipeline([
#     ('metrics_append', FunctionTransformer(calc_techical_metrics, kw_args=techical_metrics_params)),
#     ('date_transformer', DateTimeTransformer(n_periods=12, input_range=(1, 365))),
# ])

In [None]:
def prediction(days: int = 30):
    first_input = test_dataset.X[0].unsqueeze(-2)
    pred = []
    
    
    
    # with torch.no_grad():
        
    #     step_pred = model(first_input)
    #     pred.append(step_pred)
        
    #     for i in range(1, days):
    #         y_pred = model(pred[i-1].unsqueeze(-2))
    #         print(y_pred)
    #     # y_pred = model(test_dataset.X)
    #     # y_pred.

In [None]:
prediction()

RuntimeError: input.size(-1) must be equal to input_size. Expected 21, got 5