In [None]:
!pip install stable-baselines3[extra] gym numpy pandas yfinance matplotlib
!pip install gymnasium shimmy stable-baselines3[extra]



In [None]:
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import yfinance as yf
import matplotlib.pyplot as plt

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [None]:
# Define 10 stocks for the portfolio
stocks = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA", "NVDA", "META", "JPM", "UNH", "V"]

# Fetch historical stock data from Yahoo Finance
start_date = "2020-01-01"
end_date = "2024-01-01"

# auto_adjust is passed explicitly because its default changed between yfinance
# releases; with True the "Close" column already holds split/dividend-adjusted prices.
raw_data = yf.download(stocks, start=start_date, end=end_date, auto_adjust=True)

# Pick the adjusted price level. The membership test works for both flat and
# multi-index columns, so one code path covers either layout; "Adj Close" is
# kept as a fallback in case the installed yfinance still emits it.
price_field = "Adj Close" if "Adj Close" in raw_data.columns else "Close"

# Drop missing values (important for RL training)
data = raw_data[price_field].dropna()

# A failed or partial download yields an empty frame once NaN rows are dropped;
# fail here rather than deep inside the environment.
if data.empty:
    raise ValueError("No complete price rows were downloaded; check the tickers and date range.")

# Calculate daily returns (the first row has no predecessor and is dropped)
returns = data.pct_change().dropna()

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  10 of 10 completed


In [None]:
class PortfolioEnv(gym.Env):
    """Long-only portfolio allocation environment over a table of asset returns.

    Each row of ``df`` is one period and each column one asset. An observation
    is the current row of returns concatenated with the weights currently held
    (length ``2 * num_stocks``, float32). An action is a vector of non-negative
    scores, one per asset, which ``step`` normalises into weights summing to 1.

    Timing: the weights chosen after observing row ``t`` are applied to row
    ``t + 1``, so the agent is never rewarded on returns it has already seen.

    Reward: the step's excess return over a fixed daily risk-free rate, divided
    by the standard deviation of the last ``reward_window`` realised portfolio
    returns. Until that many returns have been collected in the episode, the
    raw excess return is used because no volatility estimate is available yet.

    Args:
        df: DataFrame of per-period returns, one column per asset; needs at
            least two rows.
        initial_balance: Starting portfolio value, restored on every reset.
        reward_window: Number of most recent portfolio returns used for the
            volatility estimate in the reward; must be at least 2.

    Raises:
        ValueError: If ``df`` has fewer than two rows or ``reward_window < 2``.
    """

    # Approximate daily risk-free rate (1% annual over 252 trading days)
    RISK_FREE_RATE = 0.01 / 252
    # Keeps the reward finite when the realised volatility is exactly zero
    EPSILON = 1e-6

    def __init__(self, df, initial_balance=1_000_000, reward_window=20):
        super().__init__()

        if df.shape[0] < 2:
            raise ValueError("df must contain at least two rows of returns")
        if reward_window < 2:
            raise ValueError("reward_window must be at least 2")

        self.df = df
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.current_step = 0
        self.reward_window = reward_window

        self.num_stocks = df.shape[1]

        # State: [latest returns + current portfolio weights]
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.num_stocks * 2,), dtype=np.float32
        )

        # Action: one non-negative score per asset (normalised to weights in step)
        self.action_space = spaces.Box(
            low=0.0, high=1.0, shape=(self.num_stocks,), dtype=np.float32
        )

        # Portfolio allocation starts as equal weighting
        self.portfolio_weights = np.ones(self.num_stocks) / self.num_stocks
        # Realised portfolio returns of the current episode, oldest first
        self.return_history = []

    def _get_obs(self):
        """Return the flat float32 observation for ``self.current_step``."""
        latest_returns = self.df.iloc[self.current_step].values
        return np.concatenate([latest_returns, self.portfolio_weights]).astype(np.float32)

    def step(self, action):
        """Apply ``action`` to the next row of returns and advance one period.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. ``info``
            carries ``portfolio_return`` (the realised return of this step) and
            ``portfolio_value`` (the balance after compounding it).
            ``terminated`` becomes True once the last row has been consumed.
        """
        weights = np.clip(np.asarray(action, dtype=np.float64).reshape(-1), 0.0, 1.0)
        total = weights.sum()
        if not np.isfinite(total) or total <= 0:
            # Degenerate action (all zeros or NaN): fall back to equal weights
            weights = np.full(self.num_stocks, 1.0 / self.num_stocks)
        else:
            weights = weights / total  # Normalize to sum 1
        self.portfolio_weights = weights

        # Advance first: the allocation earns the *next* period's returns, not
        # the ones that were part of the observation the action was based on.
        self.current_step += 1
        realised_returns = self.df.iloc[self.current_step].values
        portfolio_return = float(np.dot(weights, realised_returns))

        self.balance = self.balance * (1.0 + portfolio_return)
        self.return_history.append(portfolio_return)

        # Risk-adjusted reward. The volatility must come from a history of
        # returns: the standard deviation of a single number is always zero.
        excess_return = portfolio_return - self.RISK_FREE_RATE
        recent = self.return_history[-self.reward_window:]
        if len(recent) >= self.reward_window:
            reward = excess_return / (float(np.std(recent)) + self.EPSILON)
        else:
            reward = excess_return

        terminated = self.current_step >= len(self.df) - 1
        truncated = False  # Required by Gymnasium

        info = {"portfolio_return": portfolio_return, "portfolio_value": self.balance}
        return self._get_obs(), float(reward), terminated, truncated, info

    def reset(self, seed=None, options=None):
        """Restart the episode at the first row with equal weights.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)  # Lets Gymnasium seed its own RNG
        self.current_step = 0
        self.balance = self.initial_balance
        self.portfolio_weights = np.ones(self.num_stocks) / self.num_stocks
        self.return_history = []

        return self._get_obs(), {}




In [None]:
# Create the portfolio environment. The raw environment keeps its own name so
# the factory lambda does not close over a variable that is rebound on the same
# line (that only worked because DummyVecEnv calls the factory immediately).
base_env = PortfolioEnv(returns)
env = DummyVecEnv([lambda: base_env])

# Train PPO (Actor-Critic) model; the fixed seed makes the run repeatable
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=0.0003,
    batch_size=64,
    n_steps=512,
    seed=42,
)
# Trailing semicolon keeps the returned model's repr out of the cell output
model.learn(total_timesteps=10_000);


Using cpu device
Returning from reset(): (20,) <class 'numpy.ndarray'> {}
----------------------------
| time/              |     |
|    fps             | 544 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 512 |
----------------------------
Returning from reset(): (20,) <class 'numpy.ndarray'> {}
------------------------------------------
| time/                   |              |
|    fps                  | 381          |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 1024         |
| train/                  |              |
|    approx_kl            | 4.139135e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -14.2        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.14e+09     |
|    n_updates            | 10           |
|    policy_gradient_

<stable_baselines3.ppo.ppo.PPO at 0x7bc6d7dd6a90>

In [None]:
# Inspect exactly what reset() hands back on the wrapped environment
result = env.reset()
print(f"Result from reset(): {result}")
print(f"Type of result: {type(result)}")


Returning from reset(): (20,) <class 'numpy.ndarray'> {}
Result from reset(): [[-0.00972213 -0.01213903 -0.00523129 -0.01319606 -0.00529125 -0.01245199
  -0.01600591  0.02963326 -0.01012001 -0.00795311  0.1         0.1
   0.1         0.1         0.1         0.1         0.1         0.1
   0.1         0.1       ]]
Type of result: <class 'numpy.ndarray'>


In [None]:
# reset() on a VecEnv returns only the batched observation, so unpacking two
# values raises ValueError. The per-environment reset info dicts are kept on
# the wrapper instead (getattr guards SB3 versions without that attribute).
obs = env.reset()
info = getattr(env, "reset_infos", [{}])[0]
print("Observation shape:", obs.shape)
print("Info:", info)  # Should print: {}

Returning from reset(): (20,) <class 'numpy.ndarray'> {}


ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# Roll the trained policy once through the return history and track the value
# of the portfolio it would have held.
# NOTE(review): this replays the same `returns` the model was trained on, so
# the curve is an in-sample result, not an out-of-sample backtest.

# VecEnv API: reset() returns only the batched observation, and step() returns
# four values (obs, rewards, dones, infos), each with a leading n_envs axis.
obs = env.reset()
portfolio_values = [env.get_attr("initial_balance")[0]]

for i in range(len(returns) - 1):
    # Keep the batch dimension: the VecEnv expects actions shaped (n_envs, n_assets).
    # deterministic=True evaluates the policy mean instead of a random sample.
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, infos = env.step(action)

    # Prefer the step return reported by the environment when `info` carries
    # one; otherwise recompute it with the same normalisation the environment
    # applies (weights rescaled to sum to 1, equal weights for an all-zero action).
    step_return = infos[0].get("portfolio_return")
    if step_return is None:
        weights = np.clip(np.asarray(action[0], dtype=np.float64), 0.0, 1.0)
        total = weights.sum()
        if total > 0:
            weights = weights / total
        else:
            weights = np.full(weights.shape, 1.0 / weights.size)
        step_return = float(np.dot(weights, returns.iloc[i].to_numpy()))

    # Store portfolio value
    portfolio_values.append(portfolio_values[-1] * (1 + step_return))

    # The VecEnv resets itself after a finished episode, so stop here
    if dones[0]:
        break

# Plot portfolio performance
fig, ax = plt.subplots()
ax.plot(portfolio_values)
ax.set_title("Portfolio Performance using Actor-Critic (PPO)")
ax.set_xlabel("Time Step")
ax.set_ylabel("Portfolio Value ($)")
plt.show()


Returning from reset(): (20,) <class 'numpy.ndarray'> {}


ValueError: not enough values to unpack (expected 2, got 1)