# DDPG Main

## Data Preprocessing

In [3]:
!pip install numpy



In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load data
#C:\Users\rohit\OneDrive\Documents\Asset-Pricing-with-Reinforcement-Learning\XOM_30_minute_6_month_data.csv
df = pd.read_csv('XOM_30_minute_6_month_data.csv', parse_dates=['Date'])
df.sort_values('Date', inplace=True)

# Normalize
scaler = MinMaxScaler()
df[['Last Price', 'Volume', 'SMAVG (15)']] = scaler.fit_transform(df[['Last Price', 'Volume', 'SMAVG (15)']])

# Split into training and testing sets
train_size = int(len(df) * 0.8)
train_df = df[:train_size]
test_df = df[train_size:]

In [2]:
print(train_df[10:])

                    Date  Last Price    Volume  SMAVG (15)
2331 2023-03-07 20:00:00    0.596608  0.049707    0.072420
2330 2023-03-07 20:30:00    0.606681  0.165330    0.086878
2329 2023-03-08 14:30:00    0.634154  0.134774    0.099523
2328 2023-03-08 15:00:00    0.608055  0.086473    0.077523
2327 2023-03-08 15:30:00    0.571654  0.099766    0.067301
...                  ...         ...       ...         ...
473  2023-09-29 20:30:00    0.881409  0.306496    0.180971
472  2023-10-02 14:30:00    0.794870  0.171930    0.194260
471  2023-10-02 15:00:00    0.755034  0.144825    0.184963
470  2023-10-02 15:30:00    0.762818  0.113712    0.166506
469  2023-10-02 16:00:00    0.764192  0.092697    0.157100

[1863 rows x 4 columns]


## Environment

In [85]:
import numpy as np
import gymnasium
from gymnasium import spaces

class DDPGTradingEnv(gymnasium.Env):
    def __init__(self, df, window_size, render_mode=None):
        super(DDPGTradingEnv, self).__init__()

        self.df = df
        self.window_size = window_size
        self.render_mode = render_mode

        self.prices, self.signal_features = self._process_data()

        self.shape = (window_size, self.signal_features.shape[1])

        discrete_actions = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
        continuous_quantity = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
        continuous_amount = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        
        # Include the amount as part of the action space
#         self.action_space = spaces.Tuple((discrete_actions, continuous_amount))
        self.action_space = spaces.Box(low=0, high=1, shape=(2,), dtype=np.float32)

        self.observation_space = spaces.Box(low=0, high=1, shape=self.shape, dtype=np.float32)

        self.profit_history = []
        self.reset()

    def reset(self, seed=None):
        super().reset(seed=1)
        self.current_step = self.window_size
        self.total_profit = 0
        self.history = []
        self.balance = 10000
        self.shares_held = 0
        self.portfolio_value = self.balance
        self.previous_portfolio_value = self.portfolio_value
        self.initial_balance = self.balance
        return self._get_observation()

    # Stores prices and features (for now just the price)
    def _process_data(self):

        # Extract prices and features
        prices = df['Last Price'].values
        #features = df[['Last Price', 'Volume', 'SMAVG (15)']].values
        features = df[['Last Price']].values

        return prices, features

    def _get_observation(self):
        start = self.current_step - self.window_size
        end = self.current_step
        observation = self.signal_features[start:end]
        return observation

    def _take_action(self, action_value):
        current_price = self.prices[self.current_step]
        if action_value[0] >= 0.5:
            self._buy_stock(current_price, amount=action_value[1])
        elif action_value[0] < 0.5:
            self._sell_stock(current_price, amount=-action_value[1])


    def _update_portfolio(self, action_value):
        current_price = self.prices[self.current_step]
        if action_value[0] >= 0.5:
            self._buy_stock(current_price, amount=action_value[1])
        elif action_value[0] < 0.5:
            self._sell_stock(current_price, amount=-action_value[1])
        # Update the portfolio value
        self.portfolio_value = self.balance + self.shares_held * current_price
        

    def _buy_stock(self, current_price, amount):
        # Determine the actual amount of stock to buy based on 'amount'
        # For example, you might interpret 'amount' as a percentage of your balance
#         buy_amount = min(self.balance / current_price, amount)
        buy_amount = round(self.balance / current_price) * amount
        self.balance -= buy_amount * current_price
        self.shares_held += buy_amount

    def _sell_stock(self, current_price, amount):
        # Determine the actual amount of stock to sell based on 'amount'
        # Ensure that you don't sell more than you hold
#         sell_amount = min(self.shares_held, amount)
        sell_amount = self.shares_held * amount
        self.balance += sell_amount * current_price
        self.shares_held -= sell_amount

    def step(self, action, seed=None, options=None):
        super().reset(seed=seed)
        action_value = action
        self._take_action(action)
        self.current_step += 1
        reward = self._calculate_reward(action_value)
        done = self.current_step >= len(self.prices) - 1
        observation = self._get_observation()
        info = {'current_step': self.current_step, 'total_profit': self.total_profit}
        self.profit_history.append(self.total_profit)
        print(f"Step: {self.current_step}, Portfolio value: {self.portfolio_value}")
        return observation, reward, done, info

    def render(self, mode='human'):
        # Simple text rendering
        if mode == 'human':
            print(f"Step: {self.current_step}, Total Profit: {self.total_profit}")

    def _calculate_reward(self, action):
        """
        Calculate the reward based on the action taken.
        Action can either be buying or selling a stock.
        The reward is the change in portfolio value as a result of the action.
        """
        # Assuming self.portfolio_value stores the current value of the portfolio
        previous_portfolio_value = self.portfolio_value

        # Update portfolio value based on the action
        self._update_portfolio(action)

        # New portfolio value
        current_portfolio_value = self.portfolio_value

        # Reward is the change in portfolio value
        reward = current_portfolio_value - previous_portfolio_value

        self._update_portfolio(action)  # Ensure this method updates the portfolio value
        current_portfolio_value = self.portfolio_value
        reward = current_portfolio_value - self.previous_portfolio_value
        self.previous_portfolio_value = current_portfolio_value
        self._update_profit(action)
        return reward

    def _update_profit(self, action):
        """
        Update the total profit based on the action taken.
        """
        current_price = self.prices[self.current_step]

        # Update the portfolio after the action
        self._update_portfolio(action)

        # Calculate total profit as the difference between current portfolio value and initial balance
        self.total_profit = self.portfolio_value - self.initial_balance
        # Print for debugging
        print(f"Updated total profit: {self.total_profit}")



## DDPG Classes

## Actor

In [6]:
!pip install --user shimmy>=0.2.1

## Training

In [86]:
import numpy as np
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise
window_size=1
# Initialises trading environment (window size set to 1)
env = DDPGTradingEnv(df, window_size)

action = env.action_space.sample()

print(env.step((1, 1)))
print(env.shares_held)
# Add some noise for exploration
# n_actions = env.action_space.shape[0]
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Initialize DDPG model. MlpPolicy refers to the neural network.
# We will have to create this.
ddpg_model = DDPG("MlpPolicy", env, action_noise=None, verbose=1, seed=None)

# # Train the model
# # Set the total number of timesteps the length of the training data
ddpg_model.learn(total_timesteps=df.shape[1])




# Reset the environment before running the model

# # Run the model on the entire dataset
# for i in range(total_timesteps):
#     action, _states = ddpg_model.predict(obs, deterministic=True)
#     obs, rewards, done, info = env.step(action)

#     # Optional: Log results
#     print(f"Step: {i}, Reward: {rewards}, Total Profit: {info['total_profit']}")

#     if done:
#         obs = env.reset()


Updated total profit: -41.223591791145736
Step: 2, Portfolio value: 9958.776408208854
(array([[0.63310104]]), -41.223591791145736, False, {'current_step': 2, 'total_profit': -41.223591791145736})
15795
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
import matplotlib.pyplot as plt

# Plotting the profit change
plt.plot(env.profit_history)
plt.title("Profit Change Over Time")
plt.xlabel("Steps")
plt.ylabel("Total Profit")
plt.show()

In [32]:
# import gym
# import numpy as np
# from gym import spaces
# from stable_baselines3 import DDPG
# from stable_baselines3.common.noise import NormalActionNoise

# # Define your DDPGTradingEnv class here

# # Initialize the environment
# env = DDPGTradingEnv(df, window_size, render_mode=False)

# # Determine the number of actions
# n_actions = env.action_space.shape[-1]

# # Define the action noise
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# # Initialize the DDPG model
# ddpg_model = DDPG("MlpPolicy", env, action_noise=action_noise, verbose=1)

# test_action = np.array([0.5])  # Sample action
# observation, reward, done, info = env.step(test_action)

# # Proceed with training and other operations
# #ddpg_model.learn(total_timesteps=10000)


In [131]:
# obs = env.reset()
# for i in range(1000):
#     action, _states = ddpg_model.predict(obs, deterministic=True)
#     obs, rewards, dones, info = env.step(action)
#     env.render()

Updated total profit: 0.0
Step:  61 . Portfolio value:  10000.0
Step: 61, Total Profit: 0.0
Updated total profit: 0.0
Step:  62 . Portfolio value:  10000.0
Step: 62, Total Profit: 0.0
Updated total profit: 0.0
Step:  63 . Portfolio value:  10000.0
Step: 63, Total Profit: 0.0
Updated total profit: 0.0
Step:  64 . Portfolio value:  10000.0
Step: 64, Total Profit: 0.0
Updated total profit: 0.0
Step:  65 . Portfolio value:  10000.0
Step: 65, Total Profit: 0.0
Updated total profit: 0.0
Step:  66 . Portfolio value:  10000.0
Step: 66, Total Profit: 0.0
Updated total profit: 0.0
Step:  67 . Portfolio value:  10000.0
Step: 67, Total Profit: 0.0
Updated total profit: 0.0
Step:  68 . Portfolio value:  10000.0
Step: 68, Total Profit: 0.0
Updated total profit: 0.0
Step:  69 . Portfolio value:  10000.0
Step: 69, Total Profit: 0.0
Updated total profit: 0.0
Step:  70 . Portfolio value:  10000.0
Step: 70, Total Profit: 0.0
Updated total profit: 0.0
Step:  71 . Portfolio value:  10000.0
Step: 71, Total 