In [1]:
# Agent AI: To make decision making for the database
# Environment for the external system
# State is the represetation for the information available: Stock closing prices, moving averages, and daily returns
# Action Space for AI Agent's actions: Buy, Sell, and Hold
# Reward Function to determine the Agent's preformance by assigning a numerical value for it's actions: total profit

In [5]:
pip install yfinance

Collecting yfinance
  Downloading yfinance-0.2.61-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.1.tar.gz (3.0 MB)
     ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
     ---------------------------------------- 3.0/3.0 MB 58.8 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting curl_cffi>=0.7 (from yfinance)
  Downloading curl_cffi-0.11.1-cp39-abi3-win_amd64.whl.metadata (15 kB)
Collecting websockets>=13.0 (from yfinance)
  Downloading websockets-15.0.1-cp312-cp312-win_amd64.whl.metadata (7.0

In [7]:
import yfinance as yf
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# define a specific stock symbol (Apple in this case) and time period
symbol = "AAPL"
start_date = "2020-01-01"
end_date = "2025-04-01"

# download historical data
data = yf.download(symbol, start=start_date, end=end_date)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [9]:
#Program Technical indicators for better trading decisions
# feature engineering
data['SMA_5'] = data['Close'].rolling(window=5).mean()
data['SMA_20'] = data['Close'].rolling(window=20).mean()
data['Returns'] = data['Close'].pct_change()

In [11]:
# drop NaN values and reset index
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

In [13]:
# define actions
ACTIONS = {0: "HOLD", 1: "BUY", 2: "SELL"}

In [15]:
#Using the reinforcement training model, we extract with get state function
# get state function
def get_state(data, index):
    return np.array([
        float(data.loc[index, 'Close']),
        float(data.loc[index, 'SMA_5']),
        float(data.loc[index, 'SMA_20']),
        float(data.loc[index, 'Returns'])
    ])

In [19]:
#Build trading environment
# trading environment
class TradingEnvironment:
    def __init__(self, data):
        self.data = data
        self.initial_balance = 10000
        self.balance = self.initial_balance
        self.holdings = 0
        self.index = 0

    def reset(self):
        self.balance = self.initial_balance
        self.holdings = 0
        self.index = 0
        return get_state(self.data, self.index)

    def step(self, action):
        price = float(self.data.loc[self.index, 'Close'])
        reward = 0

        if action == 1 and self.balance >= price:  # BUY
            self.holdings = self.balance // price
            self.balance -= self.holdings * price
        elif action == 2 and self.holdings > 0:  # SELL
            self.balance += self.holdings * price
            self.holdings = 0

        self.index += 1
        done = self.index >= len(self.data) - 1

        if done:
            reward = self.balance - self.initial_balance

        next_state = get_state(self.data, self.index) if not done else None
        return next_state, reward, done, {}

In [21]:
#DQN is a neural network that approximates Q values for each state-action pair

In [23]:
# Define neural network with torch library
# deep q-network
class DQN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [27]:
# Program a DQN agent
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # Discount factor
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(list(ACTIONS.keys()))
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
                target += self.gamma * torch.max(self.model(next_state_tensor)).item()

            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            target_tensor = self.model(state_tensor).clone().detach()
            target_tensor[0][action] = target

            self.optimizer.zero_grad()
            output = self.model(state_tensor)
            loss = self.criterion(output, target_tensor)
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [29]:
# Train the agent for its capabilities
# Program the environment as well
env = TradingEnvironment(data)
agent = DQNAgent(state_size=4, action_size=3)
batch_size = 32
episodes = 500
total_rewards = []

for episode in range(episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

    agent.replay(batch_size)
    total_rewards.append(total_reward)
    print(f"Episode {episode+1}/{episodes}, Total Reward: {total_reward}")

print("Training Complete!")

  float(data.loc[index, 'Close']),
  float(data.loc[index, 'SMA_5']),
  float(data.loc[index, 'SMA_20']),
  float(data.loc[index, 'Returns'])
  price = float(self.data.loc[self.index, 'Close'])


Episode 1/500, Total Reward: 7278.934745788574
Episode 2/500, Total Reward: -9821.325180053711
Episode 3/500, Total Reward: -9894.329265594482
Episode 4/500, Total Reward: -9852.625118255615
Episode 5/500, Total Reward: -9903.610557556152
Episode 6/500, Total Reward: -9800.343521118164
Episode 7/500, Total Reward: -9917.574089050293
Episode 8/500, Total Reward: -9848.722831726074
Episode 9/500, Total Reward: 9314.805347442627
Episode 10/500, Total Reward: -9928.986518859863
Episode 11/500, Total Reward: 22824.507022857666
Episode 12/500, Total Reward: -9873.130554199219
Episode 13/500, Total Reward: -9881.269943237305
Episode 14/500, Total Reward: -9924.079879760742
Episode 15/500, Total Reward: -9942.645240783691
Episode 16/500, Total Reward: -9878.154693603516
Episode 17/500, Total Reward: -9826.156986236572
Episode 18/500, Total Reward: -9863.679512023926
Episode 19/500, Total Reward: -9859.195457458496
Episode 20/500, Total Reward: -9913.834060668945
Episode 21/500, Total Reward: 3

In [33]:
# create a new environment situation for testing
test_env = TradingEnvironment(data)
state = test_env.reset()
done = False

# simulate a trading session using the trained agent
while not done:
    # always choose the best action (exploitation)
    action = agent.act(state)
    next_state, reward, done, _ = test_env.step(action)
    state = next_state if next_state is not None else state
# Calculation with the reward function(s)
final_balance = test_env.balance
profit = final_balance - test_env.initial_balance
print(f"Final Balance after testing: ${final_balance:.2f}")
print(f"Total Profit: ${profit:.2f}")

  float(data.loc[index, 'Close']),
  float(data.loc[index, 'SMA_5']),
  float(data.loc[index, 'SMA_20']),
  float(data.loc[index, 'Returns'])
  price = float(self.data.loc[self.index, 'Close'])


Final Balance after testing: $228.16
Total Profit: $-9771.84
