In [86]:
# import base stats libraries
import numpy as np
import pandas as pd
import random 
import seaborn as sb
import matplotlib.pyplot as plt

# import yfinance for stock data
import yfinance as yf 

# import torch for model creation and sklearn to scale within reasonable range
import torch
from sklearn.preprocessing import MinMaxScaler

# import progress bar
from tqdm import tqdm

# import decoupling method numpy array split
from collections import deque

#import datetime
#import cv2

In [87]:
# cuda setup
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Trading Environment

In [88]:
class TradingEnv:
    def __init__(self, data, window_size = 30, initial_balance = 10000):
        self.data = data
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.reset()
    
    def reset(self):
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.shares_held = 0
        self.net_worth = self.initial_balance
        self.done = False
        return self.getStates()
    
    def getStates(self):
        state = self.data[self.current_step - self.window_size:self.current_step].flatten()
        # normalize balance and shares_held before 
        norm_balance = self.balance / self.initial_balance
        norm_shares = self.shares_held
        return np.concatenate((state,[norm_balance, norm_shares]))
    
    def step(self, action):
        price = self.data[self.current_step][3] # close
        prev_net = self.net_worth
        reward = 0

        if action == 1: # Buy
            if self.balance >= price:
                self.balance -= price
                self.shares_held += 1 
        elif action == 2: # sell
            if self.shares_held > 0:
                self.balance += price 
                self.shares_held -= 1
                reward = 1
    
        self.current_step += 1
        if self.current_step >= len(self.data):
            self.done = True
        self.net_worth = self.balance + self.shares_held * price
        reward = self.net_worth - prev_net

        return self.getStates(), reward, self.done

### Network architecture

In [96]:
N_features = 5 # OHLCV
window_size = 30 # 30-day trading window
N_states = N_features * window_size 

class TradingNet(torch.nn.Module):
    def __init__(self):
        super(TradingNet, self).__init__()
        self.fc1 = torch.nn.Linear(N_states+2,8192) # +2 for balance and shares_held
        self.fc2 = torch.nn.Linear(8192,1024)
        self.fc3 = torch.nn.Linear(1024,3)
        self.activ = torch.nn.LeakyReLU()
    
    def forward(self, x):
        x =self.activ(self.fc1(x))
        x = self.activ(self.fc2(x))
        return self.fc3(x)

### Replay Class

In [90]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((np.array(state), action, reward, np.array(next_state), done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.array, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)



### Parameters/Settings

In [97]:
'''Initial model settings will be here '''
# model parameter settings
episodes = 50
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.999
batch_size = 32
buffer_capacity = 10000
model_path = "../data"


### Get data

In [98]:
def stock_data(ticker='NVDA', start="2020-01-01", end="2023-12-31"):
    #load stock data while also cleaning columns of interest
    data = yf.download(ticker, start=start, end=end)[["Open","High","Low","Close","Volume"]].fillna(method='ffill')
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data)
    return scaled_data

### Training

In [None]:
def train():
    data = stock_data()
    env = TradingEnv(data,30)#30 is for days
    action_size = 3 # buy,sell, hold
    model = TradingNet() # neural network

    #network setup
    target_model = TradingNet().to(device)
    target_model.load_state_dict(model.state_dict())

    net = TradingNet()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
    criterion = torch.nn.MSELoss()
    replay_buffer = ReplayBuffer(buffer_capacity)

    global epsilon # access epsilon variable from settings
    rewards_history = []

    # Training Loop
    for episode in tqdm(range(episodes), desc="Training Episodes"):
        state = env.reset()
        total_reward = 0
        done = False
        pbar = tqdm(range(episodes), desc="Training Episodes")

        pbar.set_postfix({
        'Reward': total_reward,
        'Epsilon': f"{epsilon:.3f}",
        'Loss': f"{loss.item():.4f}" if 'loss' in locals() else 'N/A'
        })

        while not done:
            if np.random.rand() <= epsilon:
                action = random.randrange(action_size)
            else:
                with torch.no_grad():
                    action = torch.argmax(model(torch.FloatTensor(state))).item()
            
            next_state, reward, done = env.step(action)
            replay_buffer.push(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

            # Replay
            if len(replay_buffer) >= batch_size:
                states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
                states = torch.FloatTensor(states)
                next_states = torch.FloatTensor(next_states)
                actions = torch.LongTensor(actions)
                rewards = torch.FloatTensor(rewards)
                dones = torch.FloatTensor(dones)

                # values for model states
                q_values = model(states)
                next_q_values = target_model(next_states)

                # Q-values for next states
                max_next_q_values = torch.max(next_q_values, dim=1)[0]

                q_targets = rewards + gamma * max_next_q_values * (1 - dones)

                q_values_actions = q_values.gather(1, actions.unsqueeze(1)).squeeze()


                loss = criterion(q_values_actions, q_targets)


                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        
            epsilon = max(epsilon_min, epsilon * epsilon_decay)
        target_model.load_state_dict(model.state_dict())
        rewards_history.append(total_reward)
    
    # save model
    torch.save(model.state_dict(),model_path)
    print("Model saved to", model_path)

    # display progress
    plt.plot(rewards_history)
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("Reward per Episode")
    plt.grid()
    plt.show()


# start program
if __name__ == "__main__":
    train()



[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, start=start, end=end)[["Open","High","Low","Close","Volume"]].fillna(method='ffill')
Training Episodes:   0%|          | 0/50 [00:00<?, ?it/s]

[A[A

[A[A