In [24]:
import pandas as pd
# Machine-specific absolute path to the cleaned EUR/USD price CSV.
# NOTE(review): replace with a path relative to a configurable data directory
# so the notebook can run on another machine.
DATA_PATH = "/Users/kassraniroumand/code/aitrading/aitrading/data/eurousd_df_clean_2.csv"
df = pd.read_csv(DATA_PATH)

# Parse the raw 'datetime' column into a datetime64 'timestamp' column
df['timestamp'] = pd.to_datetime(df['datetime'])

# Sort chronologically so shift/diff/rolling operate on consecutive rows
df = df.sort_values(by='timestamp')

# Target: 1 when the NEXT row's close is above the current close, else 0
next_close = df['close'].shift(-1)
df['target'] = (next_close > df['close']).astype(int)

# The last row has no successor: shift(-1) yields NaN there and `NaN > close`
# evaluates to False, which would silently label that row 0 ("not up").
# Its outcome is unknown, so drop every row whose next close is missing
# instead of keeping a fabricated label.
df = df.loc[next_close.notna()].copy()

# Time-based features: hour of day plus a coarse 6-hour bucket.
# right=False makes the bins [0, 6), [6, 12), [12, 18), [18, 24), so hour 0
# falls in 'Night' and hour 23 in 'Evening'.
df['hour'] = df['timestamp'].dt.hour
df['part_of_day'] = pd.cut(df['hour'], bins=[0, 6, 12, 18, 24], labels=['Night', 'Morning', 'Afternoon', 'Evening'], right=False)

In [25]:
# Inspect the labelled frame (pandas renders only the head and tail of a long frame)
df

Unnamed: 0,datetime,open,high,low,close,volume,timestamp,target,hour,part_of_day
1102,2021-01-11 00:00:00,1.15510,1.15534,1.15499,1.15529,523,2021-01-11 00:00:00,1,0,Night
1103,2021-01-11 00:05:00,1.15529,1.15553,1.15527,1.15553,408,2021-01-11 00:05:00,1,0,Night
1104,2021-01-11 00:10:00,1.15553,1.15580,1.15552,1.15575,379,2021-01-11 00:10:00,1,0,Night
1105,2021-01-11 00:15:00,1.15574,1.15594,1.15574,1.15584,405,2021-01-11 00:15:00,1,0,Night
1106,2021-01-11 00:20:00,1.15584,1.15596,1.15584,1.15589,301,2021-01-11 00:20:00,0,0,Night
...,...,...,...,...,...,...,...,...,...,...
179147,2024-12-03 23:35:00,1.09255,1.09257,1.09245,1.09251,364,2024-12-03 23:35:00,1,23,Evening
179148,2024-12-03 23:40:00,1.09251,1.09257,1.09247,1.09252,343,2024-12-03 23:40:00,0,23,Evening
179149,2024-12-03 23:45:00,1.09252,1.09256,1.09249,1.09252,435,2024-12-03 23:45:00,0,23,Evening
179150,2024-12-03 23:50:00,1.09252,1.09254,1.09235,1.09241,404,2024-12-03 23:50:00,0,23,Evening


In [26]:
# Add two look-back features derived from the close price:
#   price_change_5_intervals      -> close minus the close 5 rows earlier
#   rolling_avg_price_5_intervals -> mean close over the current row and the 4 before it
# then discard every row that holds a NaN in any column (the leading rows
# have no complete look-back window).
df = df.assign(
    price_change_5_intervals=df['close'].diff(periods=5),
    rolling_avg_price_5_intervals=df['close'].rolling(window=5).mean(),
).dropna()

# Show the frame with the new feature columns
df

Unnamed: 0,datetime,open,high,low,close,volume,timestamp,target,hour,part_of_day,price_change_5_intervals,rolling_avg_price_5_intervals
1107,2021-01-11 00:25:00,1.15589,1.15590,1.15574,1.15579,256,2021-01-11 00:25:00,1,0,Night,0.00050,1.155760
1108,2021-01-11 00:30:00,1.15579,1.15596,1.15579,1.15589,255,2021-01-11 00:30:00,1,0,Night,0.00036,1.155832
1109,2021-01-11 00:35:00,1.15589,1.15596,1.15584,1.15594,294,2021-01-11 00:35:00,1,0,Night,0.00019,1.155870
1110,2021-01-11 00:40:00,1.15595,1.15605,1.15585,1.15604,420,2021-01-11 00:40:00,0,0,Night,0.00020,1.155910
1111,2021-01-11 00:45:00,1.15605,1.15609,1.15586,1.15594,438,2021-01-11 00:45:00,1,0,Night,0.00005,1.155920
...,...,...,...,...,...,...,...,...,...,...,...,...
179147,2024-12-03 23:35:00,1.09255,1.09257,1.09245,1.09251,364,2024-12-03 23:35:00,1,23,Evening,-0.00025,1.092650
179148,2024-12-03 23:40:00,1.09251,1.09257,1.09247,1.09252,343,2024-12-03 23:40:00,0,23,Evening,-0.00017,1.092616
179149,2024-12-03 23:45:00,1.09252,1.09256,1.09249,1.09252,435,2024-12-03 23:45:00,0,23,Evening,-0.00029,1.092558
179150,2024-12-03 23:50:00,1.09252,1.09254,1.09235,1.09241,404,2024-12-03 23:50:00,0,23,Evening,-0.00028,1.092502


In [27]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

# Chronological 80/20 split: the first 80% of rows train, the rest test.
# Assumes df is already sorted by time, so no shuffling is done here.
split_point = int(len(df) * 0.8)
train_df = df.iloc[:split_point]
test_df = df.iloc[split_point:]

# Separate features and target. 'timestamp' and 'hour' are removed from the
# feature frames; 'part_of_day' stays and is one-hot encoded below.
X_train = train_df.drop(['target', 'timestamp', 'hour'], axis=1)
y_train = train_df['target']
X_test = test_df.drop(['target', 'timestamp', 'hour'], axis=1)
y_test = test_df['target']

# Standardize the numeric features. The scaler is fitted on the training rows
# only and then re-used on the test rows, so no test statistics leak into training.
scaler = StandardScaler()
numeric_features = ['close', 'price_change_5_intervals', 'rolling_avg_price_5_intervals']  # Adjust feature names as needed
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# One-hot encode the categorical feature, dropping the first level.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, where `sparse=False` raises TypeError. Try the current name
# first and fall back to the legacy one on older installations.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
categorical_features = ['part_of_day']  # Ensure this is present or adjust accordingly
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

# Build the final design matrices: scaled numeric columns followed by the
# one-hot columns. Other columns still present in X_train/X_test are not used.
X_train_preprocessed = np.concatenate((X_train[numeric_features].values, X_train_encoded), axis=1)
X_test_preprocessed = np.concatenate((X_test[numeric_features].values, X_test_encoded), axis=1)

Preprocessing complete. The data is now ready for time series modeling.




In [31]:
# Inspect the training labels ('target' values for the first 80% of rows)
y_train

1107      1
1108      1
1109      1
1110      0
1111      1
         ..
143768    0
143769    1
143770    1
143771    1
143772    1
Name: target, Length: 143748, dtype: int64

In [47]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input

# Reshape input to [samples, time steps, features], which LSTM requires.
# Every sample is a single time step here, so the LSTM sees no history
# beyond what the engineered features already encode.
X_train_reshaped = X_train_preprocessed.reshape((X_train_preprocessed.shape[0], 1, X_train_preprocessed.shape[1]))
X_test_reshaped = X_test_preprocessed.reshape((X_test_preprocessed.shape[0], 1, X_test_preprocessed.shape[1]))

# One-hot encode the 0/1 labels. num_classes is pinned to 2 so the result is
# always (n, 2), even if one split happens to contain a single class.
y_train_categorical = to_categorical(y_train, num_classes=2)
y_test_categorical = to_categorical(y_test, num_classes=2)

# Define the LSTM model. The input shape is declared with an explicit Input
# object rather than `input_shape=` on the first layer, which newer Keras
# versions warn about.
model = Sequential()
model.add(Input(shape=(1, X_train_reshaped.shape[2])))
model.add(LSTM(50, activation='relu'))
model.add(Dropout(0.2))
# Two mutually exclusive classes with one-hot targets: softmax yields a proper
# probability distribution, which categorical_crossentropy expects. Independent
# sigmoid outputs do not sum to 1.
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 5 epochs and keep the best
# weights, instead of always running all 100 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model. validation_split holds out the last 20% of the training
# samples (taken before shuffling) for validation.
history = model.fit(
    X_train_reshaped,
    y_train_categorical,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

# Evaluate the model on the held-out test rows
_, accuracy = model.evaluate(X_test_reshaped, y_test_categorical, verbose=0)
print(f'Test Accuracy: {accuracy:.2f}')

Epoch 1/100


  super().__init__(**kwargs)


[1m3594/3594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 607us/step - accuracy: 0.5069 - loss: 0.6934 - val_accuracy: 0.5146 - val_loss: 0.6928
Epoch 2/100
[1m3594/3594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 502us/step - accuracy: 0.5139 - loss: 0.6929 - val_accuracy: 0.5109 - val_loss: 0.6930
Epoch 3/100
[1m3594/3594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 506us/step - accuracy: 0.5138 - loss: 0.6928 - val_accuracy: 0.5110 - val_loss: 0.6928
Epoch 4/100
[1m3594/3594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 503us/step - accuracy: 0.5149 - loss: 0.6927 - val_accuracy: 0.5124 - val_loss: 0.6929
Epoch 5/100
[1m3594/3594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 502us/step - accuracy: 0.5126 - loss: 0.6927 - val_accuracy: 0.5143 - val_loss: 0.6927
Epoch 6/100
[1m2554/3594[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 452us/step - accuracy: 0.5155 - loss: 0.6925

KeyboardInterrupt: 