In [129]:
import polars as pl
import pandas as pd
import numpy as np
from enum import Enum
%matplotlib inline
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing
import keras
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Input, Activation, concatenate, Input,  TimeDistributed, concatenate
from keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.utils import plot_model
import matplotlib.pyplot as plt

columns_to_normalize = ["close", "ema5", "ema20" , "macd520"]

def read_data(train_file = 'train_data.csv', val_file = 'val_data.csv', test_file = 'test_data.csv'):
    train_data = pl.read_csv(train_file)
    val_data = pl.read_csv(val_file)
    test_data = pl.read_csv(test_file)
    return train_data, val_data, test_data

#get data of x and y and return x and y, slice y
def extract_y(data):
    y = data['next_5_min']
    x = data.drop('next_5_min')
    return x,y


#we should not use min max scale cause the max changing over the time.
#we need to deal with two kinds of things : 1. train data - we need to create a scaler and fit it to the train, and then create a normalized data , 2. test/validation data - we should get as input the scaler of the train, and do normalize by him
def min_max_scaler(data):
    scaler = preprocessing.MinMaxScaler()
    scaled_data = scaler.fit_transform(data)
    return scaled_data, scaler

def z_score_normalize(data):
    scaler = preprocessing.StandardScaler()
    scaled_data = scaler.fit_transform(data)
    return scaled_data, scaler
    
def scale_back(data, scaler):
    return scaler.inverse_transform(data)

def normalized_x_y(x: pl.DataFrame, y: pl.DataFrame, columns_to_normalize: list):
    # Convert Polars DataFrame to NumPy array for normalization
    x_values = x.select(columns_to_normalize).to_numpy()
    y_values = y.to_numpy().reshape(-1, 1)
    # Normalize
    normalized_x_values, scaler_x = z_score_normalize(x_values)
    normalized_y_values, scaler_y = z_score_normalize(y_values)
    # Convert back to Polars DataFrame
    normalized_x = pl.DataFrame(normalized_x_values, schema=columns_to_normalize)
    normalized_y = pl.DataFrame(normalized_y_values, schema=['normalized_y'])

    return normalized_x, normalized_y, scaler_x, scaler_y

In [130]:
train_data, val_data, test_data = read_data()
x_train, y_train = extract_y(train_data)
x_val, y_val = extract_y(val_data)
x_test, y_test = extract_y(test_data)


In [131]:
# Split data to train val and test and normalize the data
x_train, y_train, x_train_scaler, y_train_scaler = normalized_x_y(x_train, y_train, columns_to_normalize)
x_test, y_test, x_test_scaler, y_test_scaler = normalized_x_y(x_val, y_val, columns_to_normalize)
x_val, y_val, x_val_scaler, y_val_scaler = normalized_x_y(x_test, y_test, columns_to_normalize)
print(x_test[:3])
print(y_test[:3])

shape: (3, 4)
┌───────────┬───────────┬───────────┬──────────┐
│ close     ┆ ema5      ┆ ema20     ┆ macd520  │
│ ---       ┆ ---       ┆ ---       ┆ ---      │
│ f64       ┆ f64       ┆ f64       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪══════════╡
│ -0.862746 ┆ -0.865432 ┆ -0.86851  ┆ 1.076178 │
│ -0.858736 ┆ -0.860492 ┆ -0.865335 ┆ 1.698643 │
│ -0.860193 ┆ -0.861329 ┆ -0.863936 ┆ 0.910139 │
└───────────┴───────────┴───────────┴──────────┘
shape: (3, 1)
┌──────────────┐
│ normalized_y │
│ ---          │
│ f64          │
╞══════════════╡
│ -0.863859    │
│ -0.861666    │
│ -0.857984    │
└──────────────┘


In [132]:
history_points = 10
model = Sequential()
model.add(LSTM(50, input_shape=(history_points, 4)))
model.add(Dropout(0.2))
# Add Dense layer
model.add(Dense(64, activation='sigmoid'))

# Add output Dense layer
model.add(Dense(1, activation='linear'))

# Compile the model
adam = optimizers.Adam(learning_rate=0.0005)
model.compile(optimizer=adam, loss='mse')

  super().__init__(**kwargs)


In [133]:
def prepare_lstm_data(x, y, history_points):
    x_sequences = create_sequences(x, history_points)
    y_sequences = create_sequences(y, history_points)
    return x_sequences, y_sequences

def create_sequences(data, history_points):
    sequences = []
    for i in range(len(data) - history_points + 1):
        sequences.append(data[i:i + history_points])
    return np.array(sequences)

x_train_prepared, y_train_prepared = prepare_lstm_data(x_train, y_train, history_points)
x_val_prepared, y_val_prepared = prepare_lstm_data(x_val, y_val, history_points)
# Train the model
model.fit(x=x_train_prepared, y=y_train_prepared, batch_size=25, epochs=1, verbose=0)

# Predict and evaluate
y_val_predicted = model.predict(x_val_prepared)



[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


In [134]:
#convert back to polars
y_val_predicted_np = np.array(y_val_predicted).reshape(-1, 1)
y_val_predicted_s = y_val_scaler.inverse_transform(y_val_predicted_np)
print("x_val_prepared:",y_val_prepared[1])
print("y_val_predicted:",y_val_predicted[1])
# Compute RMSE
rmse = np.sqrt(np.mean(np.square(y_val_prepared - y_val_predicted)))

# # Compute scaled RMSE
scaled_rmse = rmse / (np.max(y_val_predicted) - np.min(y_val_predicted)) * 100

print("Adjusted Prediction Root Mean Squared Error for real data: {:.2f} %".format(scaled_rmse))

x_val_prepared: [[-0.86166607]
 [-0.85798417]
 [-0.85746052]
 [-0.85767325]
 [-0.85638049]
 [-0.85606958]
 [-0.85523501]
 [-0.8544659 ]
 [-0.85557866]
 [-0.85791871]]
y_val_predicted: [-0.85995865]


ValueError: operands could not be broadcast together with shapes (19716,10,1) (19716,1) 

In [None]:
plot_model(model, to_file='model.v1.png')
plt.gcf().set_size_inches(22, 15, forward=True)

start = 0
end = -1

real = plt.plot(y_test_real[start:end], label='real')
pred = plt.plot(y_test_predicted[start:end], label='predicted')

plt.legend(['Real', 'Predicted'])

plt.show()


You must install pydot (`pip install pydot`) for `plot_model` to work.


NameError: name 'y_test_real' is not defined

<Figure size 2200x1500 with 0 Axes>