In [58]:
import polars as pl
import pandas as pd
import numpy as np
from enum import Enum
%matplotlib inline
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing

columns_to_normalize = ["close", "ema5", "ema20", "macd520"]

def read_data(train_file = 'train_data.csv', val_file = 'val_data.csv', test_file = 'test_data.csv'):
    train_data = pl.read_csv(train_file)
    val_data = pl.read_csv(val_file)
    test_data = pl.read_csv(test_file)
    return train_data, val_data, test_data

#get data of x and y and return x and y, slice y
def extract_y(data):
    y = data['next_5_min']
    x = data.drop('next_5_min')
    return x,y


#we should not use min max scale cause the max changing over the time.
#we need to deal with two kinds of things : 1. train data - we need to create a scaler and fit it to the train, and then create a normalized data , 2. test/validation data - we should get as input the scaler of the train, and do normalize by him
def min_max_scaler(data):
    scaler = preprocessing.MinMaxScaler()
    scaled_data = scaler.fit_transform(data)
    return scaled_data, scaler

def z_score_normalize(data):
    scaler = preprocessing.StandardScaler()
    scaled_data = scaler.fit_transform(data)
    return scaled_data, scaler
    
def scale_back(data, scaler):
    return scaler.inverse_transform(data)

def normalized_x_y(x: pl.DataFrame, y: pl.DataFrame, columns_to_normalize: list):
    # Convert Polars DataFrame to NumPy array for normalization
    x_values = x.select(columns_to_normalize).to_numpy()
    y_values = y.to_numpy().reshape(-1, 1)
    # Normalize
    normalized_x_values, scaler_x = z_score_normalize(x_values)
    normalized_y_values, scaler_y = z_score_normalize(y_values)
    # Convert back to Polars DataFrame
    normalized_x = pl.DataFrame(normalized_x_values, schema=columns_to_normalize)
    normalized_y = pl.DataFrame(normalized_y_values, schema=['normalized_y'])

    return normalized_x, normalized_y, scaler_x, scaler_y

In [59]:
train_data, val_data, test_data = read_data()
x_train, y_train = extract_y(train_data)
x_val, y_val = extract_y(val_data)
x_test, y_test = extract_y(test_data)


In [60]:
# Split data to train val and test and normalize the data
x_train, y_train, x_train_scaler, y_train_scaler = normalized_x_y(x_train, y_train, columns_to_normalize)
x_test, y_test, x_test_scaler, y_test_scaler = normalized_x_y(x_val, y_val, columns_to_normalize)
x_val, y_val, x_val_scaler, y_val_scaler = normalized_x_y(x_test, y_test, columns_to_normalize)
print(x_test[:3])
print(y_test[:3])


shape: (3, 4)
┌───────────┬───────────┬───────────┬──────────┐
│ close     ┆ ema5      ┆ ema20     ┆ macd520  │
│ ---       ┆ ---       ┆ ---       ┆ ---      │
│ f64       ┆ f64       ┆ f64       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪══════════╡
│ -0.862746 ┆ -0.865432 ┆ -0.86851  ┆ 1.076178 │
│ -0.858736 ┆ -0.860492 ┆ -0.865335 ┆ 1.698643 │
│ -0.860193 ┆ -0.861329 ┆ -0.863936 ┆ 0.910139 │
└───────────┴───────────┴───────────┴──────────┘
shape: (3, 1)
┌──────────────┐
│ normalized_y │
│ ---          │
│ f64          │
╞══════════════╡
│ -0.863859    │
│ -0.861666    │
│ -0.857984    │
└──────────────┘
