In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Load the cleaned EUR/USD price data.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv("/Users/kassraniroumand/code/aitrading/aitrading/data/eurousd_df_clean_3.csv")

# Parse the raw 'datetime' column into a new datetime64 'timestamp' column
# (the original 'datetime' column is left in place).
df['timestamp'] = pd.to_datetime(df['datetime'])

# Sort chronologically so shift/rolling operations look at the right neighbours.
df = df.sort_values(by='timestamp')

# Target: 1 when the next row's close is strictly above the current close, else 0.
next_close = df['close'].shift(-1)
df['target'] = (next_close > df['close']).astype(int)

# A comparison against NaN is False, so rows without a known current or next
# close (always the final row) would silently be labelled 0. Drop them instead
# of training on a made-up label.
has_label = next_close.notna() & df['close'].notna()
df = df[has_label].copy()

# Time-based features: hour of day and a coarse 6-hour bucket.
# Bins are left-closed: [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, [18, 24) Evening.
df['hour'] = df['timestamp'].dt.hour
df['part_of_day'] = pd.cut(
    df['hour'],
    bins=[0, 6, 12, 18, 24],
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
    right=False,
)

In [None]:
# Day of week as an integer (pandas convention: Monday = 0 ... Sunday = 6).
df['day_of_week'] = df['timestamp'].dt.dayofweek

# Map the weekday onto a circle with period 7 so the last and first day of
# the week end up next to each other in feature space.
day_angle = 2 * np.pi * df['day_of_week'] / 7
df['sin_day'] = np.sin(day_angle)
df['cos_day'] = np.cos(day_angle)

In [None]:
# Close-to-close change over the previous 5 rows.
df['price_change_5_intervals'] = df['close'].diff(periods=5)

# 5-row rolling means.
# NOTE(review): the column names say "10" but the window is 5, and the
# "..._close_intervals" column is computed from 'low', not 'close'. The names
# are referenced by later cells, so they are kept as-is — confirm the intent.
rolling_sources = {
    'rolling_avg_price_10_intervals': 'close',
    'rolling_avg_price_10_close_intervals': 'low',
}
for feature_name, source_column in rolling_sources.items():
    df[feature_name] = df[source_column].rolling(window=5).mean()

# diff/rolling leave NaN in the leading rows; remove every row that has any NaN.
df = df.dropna()

In [None]:
# Simple moving averages of the close. Windows are counted in rows; the
# column names assume 5-minute bars (288 rows = 24 h) — TODO confirm bar size.
# The 30-minute window is 6 rows, matching the span used for 'ema_30min'.
df['ma_30m'] = df['close'].rolling(window=6).mean()  # 6 x 5 min = 30-minute MA
df['ma_24h'] = df['close'].rolling(window=288).mean()  # 288 x 5 min = 24-hour MA

In [None]:
# Display the frame to inspect the feature columns added so far.
df

In [None]:
# Exponential moving averages of the close (recursive form, adjust=False).
# Spans are counted in rows; the 30min / 24h names presumably assume
# 5-minute bars (6 rows = 30 min, 288 rows = 24 h) — confirm against the data.
ema_spans = {
    'ema_30min': 6,
    'ema_24h': 288,
}
for ema_column, ema_span in ema_spans.items():
    df[ema_column] = df['close'].ewm(span=ema_span, adjust=False).mean()

In [None]:
def calculate_rsi(data, window=14):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Args:
        data: pandas Series of prices.
        window: number of consecutive price changes averaged per RSI value.

    Returns:
        Series aligned with ``data``. The first ``window`` entries are NaN
        because fewer than ``window`` price changes are available there.
        The value is 100 when the window contains no losses, and NaN when
        the window contains neither gains nor losses (0 / 0).
    """
    delta = data.diff(1)

    # clip() keeps the leading NaN (the first row has no previous price), so a
    # window that includes it yields NaN instead of quietly counting the
    # missing change as "no move", which skewed the first RSI value.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = delta.clip(upper=0).abs().rolling(window=window).mean()

    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

In [None]:
# RSI of the close over a 5-row window.
close_prices = df['close']
df['rsi'] = calculate_rsi(close_prices, window=5)

In [None]:
# Remove rows where any indicator is still NaN (warm-up rows of the rolling windows).
df = df.dropna()

In [None]:
# Split fractions. Rows are taken in their current order (no shuffling), so
# with df sorted by timestamp the validation and test sets follow the
# training set in time.
train_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15  # informational only: the test set is whatever remains

# Row positions where the training and validation slices end.
total_samples = len(df)
train_end = int(total_samples * train_ratio)
validation_end = int(train_end + total_samples * validation_ratio)

# Positional slices of the frame.
train_data, validation_data, test_data = (
    df.iloc[:train_end],
    df.iloc[train_end:validation_end],
    df.iloc[validation_end:],
)

# Separate the label from the remaining columns, dropping the columns that
# are not meant to be used as model inputs.
non_feature_columns = ['target', 'timestamp', 'hour']

X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data['target']

X_val = validation_data.drop(columns=non_feature_columns)
y_val = validation_data['target']

X_test = test_data.drop(columns=non_feature_columns)
y_test = test_data['target']

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric model inputs. The order of this list is the column order of the
# final feature matrices. ('ma_30m' is not part of the feature set.)
numeric_features = [
    'close',
    'price_change_5_intervals',
    'rolling_avg_price_10_close_intervals',
    'rolling_avg_price_10_intervals',
    'sin_day',
    'cos_day',
    'ma_24h',
    'ema_30min',
    'ema_24h',
    'rsi',
]

# Standardize the numeric columns. The scaler is fitted on the training split
# only and then applied unchanged to validation and test, so no statistics
# leak from the later splits into training.
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_val[numeric_features] = scaler.transform(X_val[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# One-hot encode the categorical columns, dropping the first category to
# avoid a redundant column. The dense-output keyword was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was removed
# in 1.4, so try the current spelling first and fall back for old releases.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

categorical_features = ['part_of_day']
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_val_encoded = encoder.transform(X_val[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

# Final 2-D matrices: scaled numeric columns followed by the one-hot columns.
X_train_preprocessed = np.concatenate((X_train[numeric_features].values, X_train_encoded), axis=1)
X_val_preprocessed = np.concatenate((X_val[numeric_features].values, X_val_encoded), axis=1)
X_test_preprocessed = np.concatenate((X_test[numeric_features].values, X_test_encoded), axis=1)

In [None]:
from keras.src.layers import Bidirectional
from keras.src.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [None]:
# The recurrent layers expect 3-D input (samples, timesteps, features).
# Insert a timestep axis of length 1, i.e. each sample is a one-step sequence.
X_train_reshaped = np.expand_dims(X_train_preprocessed, axis=1)
X_test_reshaped = np.expand_dims(X_test_preprocessed, axis=1)
X_val_reshaped = np.expand_dims(X_val_preprocessed, axis=1)

# One-hot encode the integer 0/1 labels.
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_val_categorical = to_categorical(y_val)

In [None]:
from keras.src.callbacks import ReduceLROnPlateau, EarlyStopping

# Define the model: a bidirectional LSTM feeding a second LSTM, with dropout
# after each recurrent layer.
model = Sequential()
model.add(Bidirectional(LSTM(100, return_sequences=True, activation='tanh'), input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50, activation='tanh'))
model.add(Dropout(0.2))

# The targets are one-hot vectors over two mutually exclusive classes, so the
# output layer is a softmax (probabilities sum to 1) trained with categorical
# cross-entropy. Two independent sigmoids with binary cross-entropy would
# score each output unit separately and report a per-unit accuracy.
model.add(Dense(2, activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Stop once validation accuracy has not improved for 5 epochs and roll back
# to the best weights seen, so later evaluation does not use a model that is
# 5 epochs past its best.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

# Cut the learning rate after 2 stagnant epochs. The patience must be shorter
# than the early-stopping patience and min_lr must be below the initial rate
# (0.001), otherwise this callback can never change anything.
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.2, patience=2, min_lr=1e-5)

# Train the model, validating on the held-out validation split each epoch.
history = model.fit(
    X_train_reshaped,
    y_train_categorical,
    epochs=100,
    batch_size=8,
    validation_data=(X_val_reshaped, y_val_categorical),
    verbose=1,
    callbacks=[early_stopping, reduce_lr]
)

In [None]:
# Score the model on the test split. evaluate() returns the loss followed by
# the compiled metrics; only the accuracy is reported here.
_, accuracy = model.evaluate(
    X_test_reshaped,
    y_test_categorical,
    verbose=0,
)
print(f'Test Accuracy: {accuracy:.2f}')

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Training vs. validation accuracy per epoch. Both curves are drawn and a
# legend is added so the series labels are actually shown.
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy per epoch')
plt.legend()
plt.show()

In [None]:
# Distribution of the raw model outputs on the test split
# (one histogram series per output column).
test_outputs = model.predict(X_test_reshaped)
plt.hist(test_outputs, bins=100)

In [None]:
# Histogram of the boolean mask "model output above 0.52" on the test split.
above_cutoff = model.predict(X_test_reshaped) > 0.52
plt.hist(above_cutoff, bins=100)

In [None]:
# Histogram of the boolean mask "model output below 0.48" on the test split.
below_cutoff = model.predict(X_test_reshaped) < 0.48
plt.hist(below_cutoff, bins=100)

In [None]:
# Number of prediction rows produced for the test split.
pd.DataFrame(model.predict(X_test_reshaped)).shape[0]

In [None]:
# Number of test samples (should match the number of prediction rows).
X_test_reshaped.shape[0]

In [None]:
# Keep the test-set predictions in a DataFrame; the columns are the integer
# positions of the model's output units (0 and 1).
test_predictions = model.predict(X_test_reshaped)
pred = pd.DataFrame(test_predictions)

In [None]:
# Rows where output column 0 exceeds 0.52.
pred.loc[pred[0] > 0.52]

In [None]:
# Rows where output column 1 exceeds 0.52.
pred.loc[pred[1] > 0.52]

In [None]:
# Display the full table of test-set predictions.
pred