In [30]:
import yfinance as yf
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# 1. Download historical stock data and feature engineering
symbol = "AAPL"
start_date, end_date = "2020-01-01", "2025-02-14"

data = yf.download(symbol, start=start_date, end=end_date)

# Every indicator is derived from the closing price, so look it up once.
close_prices = data['Close']
for sma_column, sma_window in (('SMA_5', 5), ('SMA_20', 20)):
    data[sma_column] = close_prices.rolling(window=sma_window).mean()
data['Returns'] = close_prices.pct_change()

# The rolling means and the percentage change leave NaN in the leading rows;
# drop those rows and renumber from 0 so rows can be addressed by integer label.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

# 2. Define safe float function (to fix future warnings)
def safe_float(val):
    """Return ``val`` as a built-in ``float``.

    A ``pd.Series`` is reduced to its first element (by position) before the
    conversion; any other value is passed to ``float()`` unchanged.
    """
    if isinstance(val, pd.Series):
        val = val.iloc[0]
    return float(val)

# 3. Define trading environment with balance tracking
class TradingEnvironment:
    """Step-by-step trading simulator for one instrument, one share per trade.

    The environment walks forward through ``data`` one row at a time. Each
    observation is ``[Close, SMA_5, SMA_20, Returns]`` for the current row.

    Actions are ``0`` = HOLD, ``1`` = BUY one share, ``2`` = SELL one share.
    Trades fill at the close of the row that ``step`` advances to. A BUY that
    cannot be afforded, a SELL with no shares held, and any other action value
    leave the account unchanged.

    The reward of a step is the mark-to-market change in account equity
    (``balance + position * close``) caused by that step, so the rewards of an
    episode sum to ``final equity - initial_balance``.

    Args:
        data: DataFrame with ``Close``, ``SMA_5``, ``SMA_20`` and ``Returns``
            columns. If a column lookup yields several sub-columns (for
            example multi-level columns), the first sub-column is used.
        initial_balance: Cash available at the start of every episode.

    Raises:
        ValueError: If ``data`` has no rows.
    """

    _FEATURE_COLUMNS = ('Close', 'SMA_5', 'SMA_20', 'Returns')

    def __init__(self, data, initial_balance=10000):
        self.data = data.reset_index(drop=True)
        if self.data.empty:
            raise ValueError("TradingEnvironment requires at least one row of data")
        # Cache the observation columns as one float array (column 0 is Close)
        # so each step is an array read instead of several DataFrame lookups.
        # NOTE: later changes to ``self.data`` are not reflected in this cache.
        self._features = np.column_stack(
            [self._column_values(self.data, name) for name in self._FEATURE_COLUMNS]
        )
        self.initial_balance = initial_balance
        self.reset()

    @staticmethod
    def _column_values(frame, name):
        """Return column ``name`` of ``frame`` as a 1-D float array."""
        column = frame[name]
        if isinstance(column, pd.DataFrame):
            # Several sub-columns under one name: keep the first one.
            column = column.iloc[:, 0]
        return column.to_numpy(dtype=float)

    def reset(self):
        """Rewind to the first row with a fresh account; return the first state."""
        self.index = 0
        self.done = False
        self.balance = self.initial_balance
        self.position = 0  # number of shares held
        self.last_price = float(self._features[self.index, 0])
        return self._get_state()

    def step(self, action):
        """Advance one row, apply ``action`` and return the transition.

        Returns:
            ``(next_state, reward, done, info)`` where ``info`` is an empty
            dict. Once the last row has been reached, further calls change
            nothing and return the final state with a reward of ``0.0``.
        """
        last_index = len(self._features) - 1
        if self.index >= last_index:
            # No later row to move to (episode already over, or one-row data).
            self.done = True
            return self._get_state(), 0.0, self.done, {}

        self.index += 1
        if self.index >= last_index:
            self.done = True

        price = float(self._features[self.index, 0])
        # Only shares held *before* this step's trade were exposed to the move
        # from the previous close to this one; a share bought or sold now
        # changes hands at ``price`` and contributes no gain or loss yet.
        shares_before_trade = self.position

        if action == 1:  # BUY
            if self.balance >= price:
                self.position += 1
                self.balance -= price
        elif action == 2:  # SELL
            if self.position > 0:
                self.position -= 1
                self.balance += price

        reward = (price - self.last_price) * shares_before_trade
        self.last_price = price

        return self._get_state(), reward, self.done, {}

    def _get_state(self):
        """Return ``[Close, SMA_5, SMA_20, Returns]`` for the current row."""
        # Copy so callers that keep the state cannot alter the cached features.
        return self._features[self.index].copy()

# 4. Define DQN neural network
class DQN(nn.Module):
    """Fully connected network producing one output value per action.

    Two hidden layers of 64 units with ReLU activations are followed by a
    linear output layer of width ``action_size``.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Pass ``x`` through both hidden layers, then the linear output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)

# 5. Define DQN agent
class DQNAgent:
    """Epsilon-greedy agent that trains a ``DQN`` from a bounded replay memory.

    Args:
        state_size: Length of the state vector fed to the network.
        action_size: Number of discrete actions; actions are the integers
            ``0 .. action_size - 1``.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions are discarded first
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

    @staticmethod
    def _as_batch(state):
        """Return ``state`` as a float32 tensor with a leading batch dimension."""
        return torch.as_tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state``.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest network output is returned.
        """
        if random.random() < self.epsilon:
            # Draw from every configured action, not a hard-coded list of three.
            return random.randrange(self.action_size)
        with torch.no_grad():
            q_values = self.model(self._as_batch(state))
        return torch.argmax(q_values).item()

    def replay(self, batch_size):
        """Train on ``batch_size`` remembered transitions, then decay epsilon.

        Does nothing (and leaves epsilon unchanged) while fewer than
        ``batch_size`` transitions are stored. Each sampled transition gets
        its own optimizer step. Epsilon is multiplied by ``epsilon_decay``
        once per call while it is above ``epsilon_min``.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # The bootstrap value is used as a constant, so no autograd
                # graph is needed for this forward pass.
                with torch.no_grad():
                    next_q_values = self.model(self._as_batch(next_state))
                target += self.gamma * torch.max(next_q_values).item()

            self.optimizer.zero_grad()
            output = self.model(self._as_batch(state))
            # Reuse this prediction as the regression target and overwrite only
            # the taken action's entry, so the other outputs add zero loss.
            target_tensor = output.detach().clone()
            target_tensor[0, action] = target

            loss = self.criterion(output, target_tensor)
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# 6. Training loop
env = TradingEnvironment(data)
agent = DQNAgent(state_size=4, action_size=3)
batch_size = 32
episodes = 500
total_rewards = []

for episode in range(episodes):
    # Each episode replays the whole price history from its first row.
    state, done, total_reward = env.reset(), False, 0

    while True:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        total_reward += reward
        state = next_state
        if done:
            break

    # One replay pass (and one epsilon decay) per finished episode.
    agent.replay(batch_size)
    total_rewards.append(total_reward)
    print(f"Episode {episode+1}/{episodes}, Total Reward: {total_reward:.2f}")

print("Training Complete!")

# 7. Testing phase - evaluation
# NOTE: the test environment is built from the same `data` used for training,
# so the figures printed below are in-sample results.
# act() explores whenever random.random() < epsilon; random.random() is never
# negative, so 0.0 makes every action greedy and the evaluation deterministic.
agent.epsilon = 0.0
test_env = TradingEnvironment(data)
state = test_env.reset()
done = False

while not done:
    action = agent.act(state)
    next_state, reward, done, _ = test_env.step(action)
    state = next_state

# Value any shares still held at the close of the last row the run reached.
final_price = safe_float(test_env.data.loc[test_env.index, 'Close'])
final_balance = test_env.balance + test_env.position * final_price
profit = final_balance - test_env.initial_balance
print(f"Final Balance after testing: ${final_balance:.2f}")
print(f"Total Profit: ${profit:.2f}")


  data = yf.download(symbol, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


Episode 1/500, Total Reward: 4137.93
Episode 2/500, Total Reward: 3521.20
Episode 3/500, Total Reward: 787.63
Episode 4/500, Total Reward: 3936.66
Episode 5/500, Total Reward: 1077.54
Episode 6/500, Total Reward: 4014.79
Episode 7/500, Total Reward: 1192.12
Episode 8/500, Total Reward: 1641.85
Episode 9/500, Total Reward: 507.37
Episode 10/500, Total Reward: 8424.83
Episode 11/500, Total Reward: 482.04
Episode 12/500, Total Reward: 2536.26
Episode 13/500, Total Reward: 434.77
Episode 14/500, Total Reward: 1659.23
Episode 15/500, Total Reward: 495.00
Episode 16/500, Total Reward: 872.17
Episode 17/500, Total Reward: 2576.28
Episode 18/500, Total Reward: 7865.08
Episode 19/500, Total Reward: 8135.48
Episode 20/500, Total Reward: 7317.47
Episode 21/500, Total Reward: 8457.16
Episode 22/500, Total Reward: 7560.93
Episode 23/500, Total Reward: 10198.87
Episode 24/500, Total Reward: 8954.17
Episode 25/500, Total Reward: 552.01
Episode 26/500, Total Reward: 641.58
Episode 27/500, Total Reward