In [31]:
import sys
import os

import pandas as pd 
import numpy as np

# Put the parent of the current working directory on the import path so the
# project-local packages imported below can be resolved from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)

from data_collection.data_collector import DataCollector

In [11]:
# Load the historical series for the EURUSD symbol through the project's
# DataCollector and preview the first rows.
# NOTE(review): the rendered preview shows Open/Close/High/Low/Volume columns
# indexed by Date — confirm this schema holds for other symbols.
data_col = DataCollector()
data = data_col.get_historical_data(symbol="EURUSD")
data.head()

Unnamed: 0_level_0,Open,Close,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1971-01-04,0.5369,0.5369,0.5369,0.5369,1
1971-01-05,0.5366,0.5366,0.5366,0.5366,1
1971-01-06,0.5365,0.5365,0.5365,0.5365,1
1971-01-07,0.5368,0.5368,0.5368,0.5368,1
1971-01-08,0.5371,0.5371,0.5371,0.5371,1


In [44]:
# Fraction of rows (taken from the end of the frame) reserved for testing.
TEST_SIZE = 0.2

# Row position at which the frame is cut: rows before it are used for
# training, rows from it onwards for testing.
split = int((1 - TEST_SIZE) * len(data))

# Features are every column except the target; the target is "Close".
X = data.drop(columns=["Close"])
y = data["Close"]

# Split without shuffling so the original row order is preserved. `.iloc`
# makes the slice explicitly positional regardless of the index type.
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

In [61]:
def create_sequences(data: pd.DataFrame, y_col: str = "Close", sequence_length: int = 32):
    """
    Build sliding feature windows and the label that follows each window.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` of every column
    except ``y_col``; its label is the ``y_col`` value at row
    ``i + sequence_length`` (the row right after the window). The target
    column itself is not part of the feature windows.

    Parameters:
    - data: pd.DataFrame - frame containing the feature columns and the target column.
    - y_col: str - name of the target column.
    - sequence_length: int - number of consecutive rows in each window (must be >= 1).

    Returns:
    - sequences: np.ndarray of shape
      (max(len(data) - sequence_length, 0), sequence_length, n_features).
    - labels: np.ndarray of shape (max(len(data) - sequence_length, 0),).

    Raises:
    - ValueError: if ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be a positive integer, got {sequence_length}"
        )

    features = data.drop(columns=[y_col]).to_numpy()
    target = data[y_col].to_numpy()

    # Clamp at zero so that a frame shorter than one window yields empty
    # arrays that still carry the (sequence_length, n_features) dimensions.
    n_windows = max(len(data) - sequence_length, 0)

    # Row positions of every window, shape (n_windows, sequence_length);
    # indexing with it gathers all windows at once and returns a fresh copy.
    window_idx = np.arange(n_windows)[:, None] + np.arange(sequence_length)
    sequences = features[window_idx]

    # The label of window i sits at row i + sequence_length; copy so the
    # result does not alias the frame's underlying data.
    labels = target[sequence_length:].copy()

    return sequences, labels

# Window the full history with the default settings and report the resulting
# array dimensions, one per line (features first, then labels).
sequences, labels = create_sequences(data)

print(sequences.shape, labels.shape, sep="\n")


(13785, 32, 4)
(13785,)


# Building a class

In [40]:
class TimeSeriesSplits:
    """
    Helpers for preparing time-series data for supervised learning.

    Methods:
    train_test_split(data, y_col, test_size):
        Splits a frame into training and test parts without shuffling.
    create_sequences(data, y_col, sequence_length):
        Builds fixed-length feature windows and the label following each window.
    """

    def train_test_split(self, data: pd.DataFrame, y_col: str, test_size: float = 0.2):
        """
        Split data into training and test parts, preserving the row order.

        The first part of the frame becomes the training set and the trailing
        part the test set; rows are never shuffled.

        Parameters:
        - data: pd.DataFrame - frame containing the feature columns and the target column.
        - y_col: str - name of the target column.
        - test_size: float - share of rows used for the test set, in [0, 1] (default 0.2).

        Returns:
        X_train, y_train, X_test, y_test - the split features and targets.

        Raises:
        - ValueError: if ``test_size`` is outside the [0, 1] interval.
        """
        if not 0 <= test_size <= 1:
            raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

        # Position of the first test row. The product is rounded before
        # truncation because floating-point error can leave it a hair below an
        # exact integer (e.g. (1 - 0.9) * 10 == 0.9999999999999998), which
        # plain int() would truncate one row short.
        split = int(round((1 - test_size) * len(data), 8))

        X = data.drop(columns=[y_col])
        y = data[y_col]

        # Explicit positional slicing keeps the split independent of the
        # index type.
        X_train, X_test = X.iloc[:split], X.iloc[split:]
        y_train, y_test = y.iloc[:split], y.iloc[split:]

        return X_train, y_train, X_test, y_test

    def create_sequences(self, data: pd.DataFrame, y_col: str = "Close", sequence_length: int = 32):
        """
        Build sliding feature windows and the label that follows each window.

        Window ``i`` holds rows ``i .. i + sequence_length - 1`` of every
        column except ``y_col``; its label is the ``y_col`` value at row
        ``i + sequence_length``. The target column itself is not part of the
        feature windows.

        Parameters:
        - data: pd.DataFrame - frame containing the feature columns and the target column.
        - y_col: str - name of the target column.
        - sequence_length: int - number of consecutive rows in each window (must be >= 1).

        Returns:
        - sequences: np.ndarray of shape
          (max(len(data) - sequence_length, 0), sequence_length, n_features).
        - labels: np.ndarray of shape (max(len(data) - sequence_length, 0),).

        Raises:
        - ValueError: if ``sequence_length`` is smaller than 1.
        """
        if sequence_length < 1:
            raise ValueError(
                f"sequence_length must be a positive integer, got {sequence_length}"
            )

        features = data.drop(columns=[y_col]).to_numpy()
        target = data[y_col].to_numpy()

        # Clamp at zero so that a frame shorter than one window yields empty
        # arrays that still carry the (sequence_length, n_features) dimensions.
        n_windows = max(len(data) - sequence_length, 0)

        # Row positions of every window, shape (n_windows, sequence_length);
        # indexing with it gathers all windows at once and returns a copy.
        window_idx = np.arange(n_windows)[:, None] + np.arange(sequence_length)
        sequences = features[window_idx]

        # The label of window i sits at row i + sequence_length; copy so the
        # result does not alias the frame's underlying data.
        labels = target[sequence_length:].copy()

        return sequences, labels

    

