# Stock Trading Using Deep Reinforcement Algorithm

## Cleaning and Combining the data

In [142]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

### Defining the Environment

In [143]:
from gymnasium import Env 
from gymnasium import spaces 
import numpy as np 
import enum
import pandas as pd

class Actions(enum.Enum):
    Hold = 0
    Buy = 1
    Sell = 2

class StockTradingEnv(Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, df, price_column='Close', sample_window=45, initial_balance=100000, commission_rate=0.001, norm_constant=1):
        super(StockTradingEnv, self).__init__()

        self.norm_constant = norm_constant          # used to reverse normalization on close price
        self.price_column = price_column            # Column to calculate price on
        self.sample_window = sample_window          # Window to look at when making a decision, 35 corresponds to 5 days in 30-minute chart
        self.initial_balance = initial_balance      # Initial balance in portfolio
        self.commission_rate = commission_rate      # Comission rate that the broker takes
        self.df = df                                # dataframe
        self.current_step = self.sample_window      # Current step the state is at

        # Action space consisting of three actions buy, sell, and hold each with a percentage like buy using x% of portfolio
        self.action_space = spaces.Box(low=np.array([0, 0]), high=np.array([2, 1]), dtype=np.float32)  
        
        # 
        # What state values can be observed, can change the low and high to 0 and 1 if normalization is done
        self.observation_space = spaces.Box(low=-np.inf, 
                                                high=np.inf, 
                                                shape=(self.df.shape[1] * self.sample_window + 4,),  # the 4 more observations are available cash, share value, action, and percentage
                                                dtype=np.float32)
        
        self.reward_range = (-np.inf, np.inf) # Possible reward range

        self.cash = self.initial_balance # Cash is the available money to use to buy stocks
        self.shares = 0     # Represents amount of shares in the portfolio for this stock

    def reset(self,seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = self.sample_window
        self.cash = self.initial_balance
        self.shares = 0
        return self.next_observation([Actions.Hold.value, 0]), {}

    def step(self, action):
        balance = self.current_balance

        self.take_action(action)
        obs = self.next_observation(action)
        
        reward = self.current_balance - balance
        done = self.current_step >= self.df.shape[0] - 1
        info = { 'Reward' : round(reward, 2),
                 'Action' : Actions(round(action[0])).name,
                 'Percentage': round(action[1], 2),
                 'Shares' : self.shares, 
                 'Close'  : round(self.current_close_price, 2),
                 'Cash'   : round(self.cash, 2), 
                 'Total'  : round(self.current_balance, 2) }

        if not done:
            self.current_step += 1
        else:
            self.reset()

        return obs, reward, done, False, info
    
    def take_action(self, action):
        if round(action[0]) == Actions.Buy.value:
            if (self.shares == 0):
                price = self.current_close_price * (1 + self.commission_rate)
                self.shares = action[1] * self.cash / price
                self.cash -= self.shares * price
        elif round(action[0]) == Actions.Sell.value:
            if (self.shares > 0):
                price = self.current_close_price * (1 - self.commission_rate)
                self.cash += action[1] * self.shares * price
                self.shares = 0

    def next_observation(self, action):
        observation = []
        for i in range(self.sample_window, 0, -1):
            observation = np.append(observation, self.df.iloc[self.current_step - i])
        observation = np.append(observation, [self.cash, self.shares * self.current_close_price, action[0], action[1]])
        return observation.astype(np.float32)
    
    @property
    def current_close_price(self):
        price = self.df.loc[self.current_step, self.price_column] * self.norm_constant
        if np.isnan(price):
            raise ValueError(f"Encountered NaN price at step {self.current_step}")
        return price
    
    @property
    def current_balance(self):
        return self.cash + (self.shares * self.current_close_price)

### needed modifications: adjust reward function to hate losing more than just winning

In [144]:
from stable_baselines3 import PPO
import torch as th

df = pd.read_csv("norm_reduced_data.csv")

# Create the environment
env = StockTradingEnv(df)

hyperparameters = {
    'learning_rate': 0.0001,
    'n_steps': 2048,
    'batch_size': 64,
    'n_epochs': 10,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    'clip_range': 0.2,
    'ent_coef': 0.01,
    'max_grad_norm': 0.5
}

# policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[128, 128, 128])

# Instantiate the PPO agent
policy_kwargs = dict(activation_fn=th.nn.Tanh, net_arch=[64, 64])
model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs, **hyperparameters)

# Train the agent
model.learn(total_timesteps=10000)

# Save the trained model
model.save("ppo_stock_trading")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 328  |
|    iterations      | 1    |
|    time_elapsed    | 6    |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 284           |
|    iterations           | 2             |
|    time_elapsed         | 14            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00026845478 |
|    clip_fraction        | 0.000146      |
|    clip_range           | 0.2           |
|    entropy_loss         | -2.84         |
|    explained_variance   | -1.74e-05     |
|    learning_rate        | 0.0001        |
|    loss                 | 1.94e+07      |
|    n_updates            | 10            |
|    policy_gradient_loss | -9.81e-

In [None]:
from stable_baselines3 import PPO

# Load the trained model
model = PPO.load("ppo_stock_trading")

# Number of episodes to test
num_episodes = 10

# List to store the total rewards for each episode
episode_rewards = []

for episode in range(num_episodes):
    # Reset the environment and the cumulative reward for the episode
    obs, _ = env.reset()
    total_rewards = 0
    
    # Run the episode
    while True:
        action, _states = model.predict(obs)
        obs, reward, done, _, info = env.step(action)
        
        total_rewards += reward
        if done:
            break
    
    # Store the total rewards for the episode
    episode_rewards.append(total_rewards)
    print(f"Episode {episode + 1}: Total Reward = {total_rewards}")

    model.save("final_trained_model")

# Calculate and print the average reward
average_reward = sum(episode_rewards) / num_episodes
print(f"Average Reward: {average_reward}")


Episode 1: Total Reward = -186040.14049955452
Episode 2: Total Reward = -79899.62497822676
Episode 3: Total Reward = -88869.81423101001
Episode 4: Total Reward = -104810.29758118838
Episode 5: Total Reward = -263887.987748681
Episode 6: Total Reward = -184421.67849277172
Episode 7: Total Reward = -103143.00723997173
