# Algorithmic Trading 
## Part 2 - Reinforcement Learning
Project developed by Beatriz Alves Correia

Date: November-December 2023

In [17]:
import yfinance as yf  # for dataset
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt  
import mplfinance as mpf
from tqdm import tqdm

import keras
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.optimizers import Adam


import numpy as np
import random
from collections import deque


#Keras: Used for building and training the neural network.
# numpy: Provides support for large, multi-dimensional arrays and matrices.
# random: Used for random number generation.
# collections.deque: Provides a double-ended queue for storing experiences.


### Data Loading same as with random forest approach

In [18]:
axp = yf.Ticker("AXP")
data = axp.history(start="2019-01-01", end="2023-09-30")
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-02 00:00:00-05:00,86.886879,89.070379,86.757343,88.524506,4175400,0.0,0.0
2019-01-03 00:00:00-05:00,88.059981,88.422291,86.610742,86.796539,4776600,0.39,0.0
2019-01-04 00:00:00-05:00,88.598813,91.367231,88.162181,90.707642,4637200,0.0,0.0
2019-01-07 00:00:00-05:00,90.828426,91.896778,90.131675,91.200027,3289000,0.0,0.0
2019-01-08 00:00:00-05:00,91.683088,92.454154,90.596152,91.645927,3227200,0.0,0.0


### Add Daily Returns column to the dataset

In [19]:
# Calculate daily returns
data['Daily Returns'] = data['Close'].pct_change()
data['Daily Returns']

Date
2019-01-02 00:00:00-05:00         NaN
2019-01-03 00:00:00-05:00   -0.019520
2019-01-04 00:00:00-05:00    0.045061
2019-01-07 00:00:00-05:00    0.005428
2019-01-08 00:00:00-05:00    0.004889
                               ...   
2023-09-25 00:00:00-04:00    0.000980
2023-09-26 00:00:00-04:00   -0.013770
2023-09-27 00:00:00-04:00   -0.008073
2023-09-28 00:00:00-04:00    0.002201
2023-09-29 00:00:00-04:00   -0.006923
Name: Daily Returns, Length: 1195, dtype: float64

### Add Cumulative Returns column to the dataset

In [20]:
# Calculate the cumulative returns
cumulative_return = (1 + data['Daily Returns']).cumprod() - 1
data['Cumulative Returns'] = cumulative_return

# drop the na created with the calculation
data['Cumulative Returns'].dropna()
cumulative_return[-1]

  cumulative_return[-1]


0.6678277947483524

### Check missing data

In [21]:
# check missing values
missing_values = data.isnull().sum()
missing_values

Open                  0
High                  0
Low                   0
Close                 0
Volume                0
Dividends             0
Stock Splits          0
Daily Returns         1
Cumulative Returns    1
dtype: int64

### Drop the missing data

In [22]:
# drop missing values created 
data = data.dropna()

### Add lag columns to the dataset
With the following function its possible to add the desired number of lags to the dataset in order to incorporate historical context to the model which can help the agent understand trends ans patterns in the market.

In [23]:
# function to add desired lags
def add_lags(data, lags):
    for lag in range(1, lags + 1):
        data[f'Return_{lag}d'] = data['Close'].pct_change(lag) * 100
    data = data.dropna()  # Drop rows with NaN values
    return data

The model is a sequential neural network with:
- Input Layer: 64 units, ReLU activation.
- Hidden Layers: Two layers with 32 and 8 units, ReLU activation.
- Output Layer: 3 units (one for each action), linear activation.
- Loss function: Mean squared error (MSE).
- Optimizer: Adam optimizer with a learning rate of 0.001.

Agent Class:

- Initialization: Sets up the parameters, memory, and neural network models.
- Model: Creates the neural network to predict Q-values for actions given a state
- State Representation: Converts the return to a state format suitable for the neural network.
- Action Selection: Uses an epsilon-greedy policy for exploration and exploitation.
- Replay: Samples experiences from memory and updates the Q-values.
- Update Target Model: Copies weights from the training model to the target model.
- Model Save/Load: Methods to save and load the model.


In [24]:
# Class Agent 
class QAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size  # Number of previous days used for state representation
        self.action_size = action_size  # Number of possible actions: sit, buy, sell
        self.memory = deque(maxlen=2000)  # Experience replay memory
        self.gamma = 0.95  # Discount factor
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.01 # Minimum exploration rate
        self.epsilon_decay = 0.995 # Exploration rate decay
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        model = Sequential()
        model.add(Dense(64, input_dim=self.state_size, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(8, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                t = self.target_model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * np.amax(t)
            self.model.fit(state, target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_model(self):
        self.model.save("trading_model.h5")

    def load_model(self):
        self.model = keras.models.load_model("trading_model.h5")

    def get_state(self, row):
        return np.array([row[['Return_1d', 'Return_2d', 'Return_3d']]])

    def choose_action(self, state):
        return self.act(state)


TradingTrainer Class:

Initialization: Sets up the agent, data, and training parameters.
Train Method: Runs the training loop for a specified number of episodes, updating the agent's Q-values based on the trading actions and rewards.

In [25]:
class TradingTrainer:
    def __init__(self, agent, prepared_data, episodes=2, batch_size=32, initial_balance=100):
        self.agent = agent
        self.prepared_data = prepared_data
        self.episodes = episodes
        self.batch_size = batch_size
        self.initial_balance = initial_balance

    def train(self):
        for episode in tqdm(range(self.episodes), desc="Training Progress"):
            total_reward = 0
            balance = self.initial_balance
            inventory = []

            for i in range(len(self.prepared_data) - 1):
                state = self.agent.get_state(self.prepared_data.iloc[i])
                next_state = self.agent.get_state(self.prepared_data.iloc[i + 1])
                action = self.agent.choose_action(state)
                
                reward = 0  # Initialize reward

                if action == 1:  # Buy
                    inventory.append(self.prepared_data.iloc[i]['Close'])
                elif action == 2 and len(inventory) > 0:  # Sell
                    bought_price = inventory.pop(0)
                    reward = self.prepared_data.iloc[i]['Close'] - bought_price
                    balance += reward

                done = i == len(self.prepared_data) - 2
                self.agent.remember(state, action, reward, next_state, done)
                if len(self.agent.memory) > self.batch_size:
                    self.agent.replay(self.batch_size)
                total_reward += reward

            self.agent.update_target_model()
            print(f"Episode {episode + 1}/{self.episodes}, Total Reward: {total_reward}, Balance: {balance}")


In [26]:
class TradingEvaluator:
    def __init__(self, agent, prepared_data, initial_balance=10000):
        self.agent = agent
        self.prepared_data = prepared_data
        self.initial_balance = initial_balance

    def evaluate(self):
        balance = self.initial_balance
        inventory = []
        correct_actions = 0
        total_actions = 0

        for i in range(len(self.prepared_data) - 1):
            state = self.agent.get_state(self.prepared_data.iloc[i])
            action = self.agent.choose_action(state)

            if action == 1:  # Buy
                inventory.append(self.prepared_data.iloc[i]['Close'])
            elif action == 2 and len(inventory) > 0:  # Sell
                bought_price = inventory.pop(0)
                profit = self.prepared_data.iloc[i]['Close'] - bought_price
                balance += profit
                if profit > 0:
                    correct_actions += 1
            total_actions += 1

        final_return = (balance - self.initial_balance) / self.initial_balance * 100
        accuracy = (correct_actions / total_actions) * 100 if total_actions > 0 else 0

        print(f"Final Investment Return: {final_return:.2f}%")
        print(f"Accuracy of Actions: {accuracy:.2f}%")
        return final_return, accuracy




## Teste com 3 lags

In [27]:
# apply data preparation
lags = 3
data = add_lags(data, lags)
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily Returns,Cumulative Returns,Return_1d,Return_2d,Return_3d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-01-08 00:00:00-05:00,91.683088,92.454154,90.596152,91.645927,3227200,0.0,0.0,0.004889,0.035261,0.488925,1.034407,5.587075
2019-01-09 00:00:00-05:00,91.729542,92.519192,91.181435,91.813156,2954500,0.0,0.0,0.001825,0.03715,0.182473,0.67229,1.218767
2019-01-10 00:00:00-05:00,91.460159,91.66454,90.87489,91.283653,3523800,0.0,0.0,-0.005767,0.031168,-0.576718,-0.395298,0.091695
2019-01-11 00:00:00-05:00,90.930596,91.673799,90.447519,91.553032,2770600,0.0,0.0,0.002951,0.034211,0.295101,-0.283319,-0.101363
2019-01-14 00:00:00-05:00,90.819131,91.998959,90.689071,91.450851,4120500,0.0,0.0,-0.001116,0.033057,-0.111608,0.183163,-0.394611


In [28]:
state_size = 3
num_action = 3
agent = QAgent(state_size, num_action)
trainer = TradingTrainer(agent, data)
evaluator = TradingEvaluator(agent, data)
trainer.train()
agent.save_model()
evaluator.evaluate()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
Training Progress:   0%|          | 0/2 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11

Training Progress:   0%|          | 0/2 [03:47<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Example usage
evaluator = TradingEvaluator(agent, prepared_data)
final_return, accuracy = evaluator.evaluate()

Results with 3 lags 
- 

In [43]:
# Parameters
window_size = 10
episodes = 1
batch_size = 32
gamma = 0.95
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01
initial_balance = 100
data_length = len(data)

In [44]:
# Define Environment
class TradingEnvironment:
    def __init__(self, data, window_size):
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.state = deque(maxlen=window_size)
        self.reset()

    def reset(self):
        self.current_step = 0
        self.state = deque(maxlen=self.window_size)
        for _ in range(self.window_size):
            self.state.append(self.data['Close'].iloc[self.current_step])
            self.current_step += 1
        self.balance = initial_balance
        self.trades = 0
        self.correct_trades = 0
        return np.array(self.state)

    def step(self, action):
        reward = 0
        self.state.append(self.data['Close'].iloc[self.current_step])
        price_diff = self.data['Close'].iloc[self.current_step + 1] - self.data['Close'].iloc[self.current_step]
        if action == 1:  # Buy
            reward = price_diff
            self.balance += price_diff * self.balance
            if price_diff > 0:
                self.correct_trades += 1
            self.trades += 1
        self.current_step += 1
        done = self.current_step >= len(self.data) - 1
        return np.array(self.state), reward, done

# Define DQN
class DQN:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(optimizer='adam', loss='mse')
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + gamma * np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        global epsilon
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

# Training and Evaluation
env = TradingEnvironment(data, window_size)
dqn = DQN(window_size, 2)

for e in tqdm(range(episodes), desc="Episodes"):
    state = env.reset()
    state = np.reshape(state, [1, window_size])
    for time in range(data_length - window_size):
        action = dqn.act(state)
        next_state, reward, done = env.step(action)
        next_state = np.reshape(next_state, [1, window_size])
        dqn.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print(f"Episode {e+1}/{episodes}, Reward: {reward}, Balance: {env.balance}")
            break
        if len(dqn.memory) > batch_size:
            dqn.replay(batch_size)

# Calculate accuracy and final ROI
accuracy = env.correct_trades / env.trades if env.trades > 0 else 0
final_roi = (env.balance - initial_balance) / initial_balance * 100

print(f"Training completed.")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Final ROI: {final_roi:.2f}%")

Episodes:   0%|          | 0/1 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14

Episodes:   0%|          | 0/1 [02:27<?, ?it/s]


KeyboardInterrupt: 

In [36]:
state_size = 3
num_action = 3
agent = QAgent(state_size, num_action)
trainer = TradingTrainer(agent, data)
evaluator = TradingEvaluator(agent, data)
trainer.train()
agent.save_model()
evaluator.evaluate()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
Training Progress:   0%|          | 0/2 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12

Training Progress:   0%|          | 0/2 [10:45<?, ?it/s]


KeyboardInterrupt: 

In [64]:

window_size = 10
episodes = 1000
alpha = 0.1
gamma = 0.95
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01
initial_balance = 100

# Add lag features
for lag in range(1, 15):
    data[f'Lag_{lag}'] = data['Close'].shift(lag)

data = data.dropna().reset_index(drop=True)

# Define Environment
class TradingEnvironment:
    def __init__(self, data, window_size):
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.reset()

    def reset(self):
        self.current_step = 0
        self.balance = initial_balance
        self.trades = 0
        self.correct_trades = 0
        self.state = self.data.iloc[:self.window_size, -10:].values.flatten()
        return self.state

    def step(self, action):
        self.current_step += 1
        done = self.current_step + self.window_size >= len(self.data)

        if done:
            return self.state, 0, done

        next_state = self.data.iloc[self.current_step:self.current_step + self.window_size, -10:].values.flatten()
        reward = self.data['Daily Returns'].iloc[self.current_step + self.window_size - 1]

        if action == 1:  # Buy
            self.balance += reward * self.balance
            if reward > 0:
                self.correct_trades += 1
            self.trades += 1

        self.state = next_state
        return self.state, reward, done

# Q-Learning
q_table = np.zeros((len(data) - window_size, 2))

def choose_action(state_index):
    if np.random.rand() < epsilon:
        return random.choice([0, 1])
    return np.argmax(q_table[state_index])

# Training and Evaluation
env = TradingEnvironment(data, window_size)

for e in tqdm(range(episodes), desc="Episodes"):
    state = env.reset()
    state_index = 0
    done = False
    while not done:
        action = choose_action(state_index)
        next_state, reward, done = env.step(action)
        next_state_index = min(state_index + 1, len(data) - window_size - 1)

        best_next_action = np.argmax(q_table[next_state_index])
        td_target = reward + gamma * q_table[next_state_index, best_next_action] * (1 - done)
        td_error = td_target - q_table[state_index, action]
        q_table[state_index, action] += alpha * td_error

        state_index = next_state_index

    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

# Calculate accuracy and final ROI
accuracy = env.correct_trades / env.trades if env.trades > 0 else 0
final_roi = (env.balance - initial_balance) / initial_balance * 100

print(f"Training completed.")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Final ROI: {final_roi:.2f}%")

Episodes: 100%|██████████| 1000/1000 [01:23<00:00, 12.02it/s]

Training completed.
Accuracy: 53.30%
Final ROI: 134.75%





In [72]:
window_size = 10
episodes = 1
alpha_values = [0.1, 0.2, 0.3, 0.15]  # Learning rates to test
gamma_values = [0.9, 0.95, 0.99, 0.85]  # Discount factors to test
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01
initial_balance = 100

In [75]:
# Add lag features
for lag in range(1, 16):
    data[f'Lag_{lag}'] = data['Close'].shift(lag)

data = data.dropna().reset_index(drop=True)

# Define Environment
class TradingEnvironment:
    def __init__(self, data, window_size):
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.reset()

    def reset(self):
        self.current_step = 0
        self.balance = initial_balance
        self.trades = 0
        self.correct_trades = 0
        self.state = self.data.iloc[:self.window_size, -15:].values.flatten()
        return self.state

    def step(self, action):
        self.current_step += 1
        done = self.current_step + self.window_size >= len(self.data)

        if done:
            return self.state, 0, done

        next_state = self.data.iloc[self.current_step:self.current_step + self.window_size, -15:].values.flatten()
        reward = self.data['Daily Returns'].iloc[self.current_step + self.window_size - 1]

        if action == 1:  # Buy
            self.balance += reward * self.balance
            if reward > 0:
                self.correct_trades += 1
            self.trades += 1

        self.state = next_state
        return self.state, reward, done

# Grid search for alpha and gamma
best_roi = -np.inf
best_alpha = None
best_gamma = None
best_accuracy = 0

for alpha in alpha_values:
    for gamma in gamma_values:
        # Q-Learning
        q_table = np.zeros((len(data) - window_size, 2))
        env = TradingEnvironment(data, window_size)
        current_epsilon = epsilon

        for e in range(episodes):
            state = env.reset()
            state_index = 0
            done = False
            while not done:
                action = random.choice([0, 1]) if np.random.rand() < current_epsilon else np.argmax(q_table[state_index])
                next_state, reward, done = env.step(action)
                next_state_index = min(state_index + 1, len(data) - window_size - 1)

                best_next_action = np.argmax(q_table[next_state_index])
                td_target = reward + gamma * q_table[next_state_index, best_next_action] * (1 - done)
                td_error = td_target - q_table[state_index, action]
                q_table[state_index, action] += alpha * td_error

                state_index = next_state_index

            if current_epsilon > epsilon_min:
                current_epsilon *= epsilon_decay

        final_roi = (env.balance - initial_balance) / initial_balance * 100
        accuracy = env.correct_trades / env.trades if env.trades > 0 else 0
        print(f"Alpha: {alpha}, Gamma: {gamma}, Final ROI: {final_roi:.2f}%, Accuracy: {accuracy * 100:.2f}%")

        if final_roi > best_roi:
            best_roi = final_roi
            best_alpha = alpha
            best_gamma = gamma
            best_accuracy = accuracy

print(f"Best Alpha: {best_alpha}, Best Gamma: {best_gamma}, Best ROI: {best_roi:.2f}%, Best Accuracy: {best_accuracy * 100:.2f}%")

Alpha: 0.1, Gamma: 0.9, Final ROI: -7.22%, Accuracy: 49.05%
Alpha: 0.1, Gamma: 0.95, Final ROI: 13.38%, Accuracy: 49.80%
Alpha: 0.1, Gamma: 0.99, Final ROI: 33.21%, Accuracy: 48.00%
Alpha: 0.1, Gamma: 0.85, Final ROI: -28.78%, Accuracy: 49.80%
Alpha: 0.2, Gamma: 0.9, Final ROI: 0.38%, Accuracy: 52.67%
Alpha: 0.2, Gamma: 0.95, Final ROI: 48.68%, Accuracy: 52.32%
Alpha: 0.2, Gamma: 0.99, Final ROI: 21.47%, Accuracy: 50.28%
Alpha: 0.2, Gamma: 0.85, Final ROI: 124.17%, Accuracy: 51.66%
Alpha: 0.3, Gamma: 0.9, Final ROI: 38.06%, Accuracy: 50.30%
Alpha: 0.3, Gamma: 0.95, Final ROI: 23.45%, Accuracy: 51.16%
Alpha: 0.3, Gamma: 0.99, Final ROI: -5.56%, Accuracy: 51.21%
Alpha: 0.3, Gamma: 0.85, Final ROI: 41.59%, Accuracy: 50.72%
Alpha: 0.15, Gamma: 0.9, Final ROI: 75.89%, Accuracy: 50.89%
Alpha: 0.15, Gamma: 0.95, Final ROI: 29.55%, Accuracy: 50.62%
Alpha: 0.15, Gamma: 0.99, Final ROI: -37.24%, Accuracy: 50.58%
Alpha: 0.15, Gamma: 0.85, Final ROI: 86.75%, Accuracy: 48.72%
Best Alpha: 0.2, Best 