<a href="https://colab.research.google.com/github/Dipak22/ReinforcementLearning/blob/master/SAC_Trading_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install stable-baselines3 yfinance pandas numpy matplotlib

Collecting stable-baselines3
  Downloading stable_baselines3-2.5.0-py3-none-any.whl.metadata (4.8 kB)
Collecting gymnasium<1.1.0,>=0.29.1 (from stable-baselines3)
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (f

In [3]:
!pip install 'shimmy>=2.0'

Collecting shimmy>=2.0
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Downloading Shimmy-2.0.0-py3-none-any.whl (30 kB)
Installing collected packages: shimmy
Successfully installed shimmy-2.0.0


In [4]:
import gym
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
from gym import spaces
from stable_baselines3 import SAC
from stable_baselines3.sac.policies import MlpPolicy
from stable_baselines3.common.vec_env import DummyVecEnv

# Fifteen large Nifty 50 constituents, written as Yahoo Finance symbols
# (the ".NS" suffix is how the symbols for NSE listings are spelled here).
# Double-check the symbols before relying on the results.
tickers = [
    'RELIANCE.NS',
    'TCS.NS',
    'HDFCBANK.NS',
    'INFY.NS',
    'ICICIBANK.NS',
    'KOTAKBANK.NS',
    'LT.NS',
    'SBIN.NS',
    'BHARTIARTL.NS',
    'AXISBANK.NS',
    'HINDUNILVR.NS',
    'MARUTI.NS',
    'ASIANPAINT.NS',
    'ULTRACEMCO.NS',
    'TITAN.NS',
]

# One bulk request for every symbol; the result is indexed downstream as
# data['Close'][ticker], i.e. it is expected to have (field, ticker) columns.
data = yf.download(tickers, start='2010-01-01', end='2023-01-01')

# --- Multi-Asset Trading Environment ---
class MultiAssetTradingEnv(gym.Env):
    """
    A multi-asset trading environment for a portfolio of stocks.

    At every timestep the agent proposes one non-negative preference per asset.
    The preferences are normalized into portfolio weights that sum to 1, the
    portfolio earns the weighted one-day return of the assets, and a
    proportional transaction cost is charged on the change in weights.

    The class keeps the legacy Gym interface: ``reset()`` returns only the
    observation and ``step()`` returns ``(obs, reward, done, info)``.

    Parameters
    ----------
    data : DataFrame-like
        Price history indexed so that ``data['Close'][ticker]`` yields the
        closing-price series of ``ticker``.
    tickers : sequence of str
        Symbols to trade; their order fixes the order of the action and
        observation components.
    initial_balance : float
        Portfolio value at the start of every episode.
    transaction_cost : float
        Cost rate applied to ``sum(|w_new - w_old|) * balance`` on each step.

    Raises
    ------
    ValueError
        If ``tickers`` is empty, or if fewer than two dates are shared by all
        tickers once the rows without a full 200-day SMA have been dropped.

    Notes
    -----
    * The observation holds raw (unscaled) prices and moving averages.
    * The transaction cost compares consecutive *target* weights; weight
      drift caused by price moves between steps is not modelled.
    * The reward is the absolute change in portfolio value, so its scale
      grows with the balance.
    """

    _FEATURE_COLUMNS = ['Close', 'SMA50', 'SMA200']

    def __init__(self, data, tickers, initial_balance=10000.0, transaction_cost=0.001):
        super(MultiAssetTradingEnv, self).__init__()
        self.data = data
        self.tickers = tickers
        self.n_assets = len(tickers)
        if self.n_assets == 0:
            raise ValueError("tickers must contain at least one symbol")
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.transaction_cost = transaction_cost

        # Portfolio weights; all zeros means nothing is invested yet.
        self.allocations = np.zeros(self.n_assets)

        # Three market features per asset (Close, SMA50, SMA200) followed by
        # the current weight of every asset -> 4 * n_assets values.
        self.obs_dim = self.n_assets * 3 + self.n_assets
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32
        )

        # One allocation preference in [0, 1] per asset; step() normalizes
        # the vector so the resulting weights sum to 1.
        self.action_space = spaces.Box(
            low=0.0, high=1.0, shape=(self.n_assets,), dtype=np.float32
        )

        # Per-ticker feature frames (kept public: callers read
        # env.feature_data[ticker].loc[date, 'Close']).
        self.feature_data = {}
        for ticker in tickers:
            close = self.data['Close'][ticker]
            df_features = pd.DataFrame({
                'Close': close,
                'SMA50': close.rolling(window=50).mean(),
                'SMA200': close.rolling(window=200).mean()
            })
            # Dropping NaNs removes the SMA warm-up rows and any missing closes.
            self.feature_data[ticker] = df_features.dropna()

        # Only dates on which every ticker has a complete feature row are used.
        self.dates = self.feature_data[tickers[0]].index
        for ticker in tickers[1:]:
            self.dates = self.dates.intersection(self.feature_data[ticker].index)
        self.dates = sorted(self.dates)
        self.n_steps = len(self.dates)
        if self.n_steps < 2:
            raise ValueError(
                "Need at least two dates shared by all tickers after dropping "
                "incomplete rows; got %d" % self.n_steps
            )

        # Cache the aligned data as NumPy matrices so that step() and
        # _get_observation() are plain row reads instead of repeated pandas
        # label lookups (which dominate runtime over long training runs).
        aligned = [
            self.feature_data[ticker]
            .loc[self.dates, self._FEATURE_COLUMNS]
            .to_numpy(dtype=np.float64)
            for ticker in tickers
        ]
        # Row t -> closing price of every asset on dates[t].
        self._close_prices = np.column_stack([block[:, 0] for block in aligned])
        # Row t -> [Close, SMA50, SMA200] of ticker 0, then ticker 1, ...
        self._market_features = np.hstack(aligned).astype(np.float32)

        self.current_step = 0

    def _get_observation(self):
        """
        Build the observation for ``self.current_step``.

        Layout: for each asset in ticker order ``[Close, SMA50, SMA200]``,
        followed by the current allocation vector. Returned as float32.
        """
        market = self._market_features[self.current_step]
        weights = np.asarray(self.allocations, dtype=np.float32)
        return np.concatenate([market, weights])

    def reset(self):
        """Restart the episode at the first date with the initial balance and zero weights."""
        self.balance = self.initial_balance
        self.current_step = 0
        self.allocations = np.zeros(self.n_assets)
        return self._get_observation()

    def step(self, action):
        """
        Advance one trading day.

        Parameters
        ----------
        action : array-like
            Allocation preference per asset. Values are clipped to [0, 1] and
            normalized to sum to 1; an all-zero vector means equal weights.

        Returns
        -------
        obs : np.ndarray
            Observation for the new current date.
        reward : float
            Change in portfolio value over the step (return minus cost).
        done : bool
            True once the new current date is the last available date.
        info : dict
            Always empty.

        Raises
        ------
        IndexError
            If called again after ``done`` was returned without ``reset()``.
        """
        # Turn preferences into weights.
        action = np.clip(action, 0, 1)
        total = action.sum()
        if total == 0:
            new_allocations = np.ones(self.n_assets) / self.n_assets
        else:
            new_allocations = action / total

        prices = self._close_prices[self.current_step]

        # Move to the next date. `done` is raised when that date is the last
        # one, so the index below is always valid during a regular episode
        # and the terminal transition earns its real return (falling back to
        # the current prices here would silently zero the last day's return).
        self.current_step += 1
        done = (self.current_step >= self.n_steps - 1)
        next_prices = self._close_prices[self.current_step]

        # Weighted sum of the individual one-day asset returns.
        asset_returns = (next_prices - prices) / (prices + 1e-8)
        portfolio_return = np.dot(new_allocations, asset_returns)

        # Cost is proportional to the L1 distance between old and new weights.
        allocation_change = np.abs(new_allocations - self.allocations).sum()
        cost = allocation_change * self.balance * self.transaction_cost

        self.allocations = new_allocations

        new_balance = self.balance * (1 + portfolio_return) - cost
        reward = new_balance - self.balance  # absolute profit/loss of this step
        self.balance = new_balance

        obs = self._get_observation()
        return obs, float(reward), bool(done), {}

    def render(self, mode='human'):
        """Print the current step index, balance and weights."""
        print(f"Step: {self.current_step}, Balance: {self.balance:.2f}, Allocations: {self.allocations}")

# --- Create Environment and Train with SAC ---

# Single portfolio environment, wrapped in the vectorized interface that
# Stable-Baselines3 algorithms train against. The lambda returns this same
# `env` object, so its attributes stay reachable after training.
env = MultiAssetTradingEnv(
    data=data,
    tickers=tickers,
    initial_balance=10000,
    transaction_cost=0.001,
)
vec_env = DummyVecEnv([lambda: env])

# SAC agent with an MLP policy (tune hyperparameters as needed).
model = SAC(
    MlpPolicy,
    vec_env,
    learning_rate=1e-4,
    buffer_size=100000,
    batch_size=256,
    tau=0.02,
    gamma=0.99,
    ent_coef='auto',
    verbose=1,
)

# 200,000 environment steps of training (adjust if needed).
model.learn(total_timesteps=200000)

# --- Backtesting the SAC Agent ---
# The raw `env` is stepped directly instead of `vec_env`: a vectorized env
# resets its sub-environment as soon as an episode ends, so reading
# `env.balance` after the final `vec_env.step` would return the reset
# (initial) balance rather than the terminal portfolio value.
# NOTE: this replays the same dates the agent was trained on, so the curve is
# an in-sample result, not an out-of-sample evaluation.

obs = env.reset()
# Start from the opening balance so the series has one point per date in
# `env.dates` and lines up with a baseline that starts on the first date.
agent_balances = [env.balance]
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, _ = env.step(action)
    agent_balances.append(env.balance)

# --- Buy and Hold Baseline (Equal-Weighted) ---
# Split the starting capital evenly across the assets on the first date,
# convert each slice into a (fractional) share count, and never trade again.

def _closing_prices(on_date):
    """Closing price of every ticker on `on_date`, in `tickers` order."""
    return np.array([env.feature_data[symbol].loc[on_date, 'Close'] for symbol in tickers])

initial_balance = 10000
start_date = env.dates[0]
initial_prices = _closing_prices(start_date)
# Money per asset divided by its first-day price = shares held throughout.
shares = (initial_balance / env.n_assets) / initial_prices

# Mark the fixed holdings to market on every date of the environment.
buy_and_hold = [np.sum(shares * _closing_prices(day)) for day in env.dates]

# --- Plot the Results ---
# Both equity curves on one axis; the x-axis is the step index, not the date.
fig, ax = plt.subplots(figsize=(12, 7))
ax.plot(agent_balances, label='SAC Trading Agent')
ax.plot(buy_and_hold, linestyle='--', label='Equal-Weighted Buy & Hold')
ax.set_title('SAC Multi-Asset Trading vs. Equal-Weighted Buy & Hold')
ax.set_xlabel('Time Steps')
ax.set_ylabel('Portfolio Value')
ax.legend()
plt.show()


[*********************100%***********************]  15 of 15 completed


Using cpu device


KeyboardInterrupt: 