## INFINITE CASH AT HAND (NO CASH REBALANCE)

In [116]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd

class PortfolioEnv_no_cash_rebalance(gym.Env):
    """Portfolio environment without any cash or holdings bookkeeping.

    The action is stored as the weight vector exactly as received (each
    component bounded to [-1, 1] by the action space; no normalisation).
    The reward is ``alpha * return - beta * risk`` where ``return`` is the
    weighted sum of one-day log returns and ``risk`` is the quadratic form of
    the weights with the covariance of log returns over the trailing window.

    Args:
        prices_df: DataFrame of prices, one column per asset.
        window_size: number of trailing rows used for the observation and
            for the covariance estimate.
        alpha: multiplier applied to the return term of the reward.
        beta: multiplier applied to the risk term of the reward.
    """

    def __init__(self, prices_df, window_size=20, alpha=1.0, beta=0.1):
        super().__init__()

        # Configuration
        self.prices = prices_df
        self.window_size = window_size
        self.alpha = alpha
        self.beta = beta

        # Dimensions derived from the price table
        self.num_stocks = self.prices.shape[1]
        self.max_steps = len(self.prices) - 1
        self.current_step = self.window_size

        # One weight per asset, shorting allowed
        self.action_space = spaces.Box(
            low=-1.0, high=1.0, shape=(self.num_stocks,), dtype=np.float32
        )

        # Flattened normalised price window followed by the latest weights
        obs_length = self.window_size * self.num_stocks + self.num_stocks
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(obs_length,), dtype=np.float32
        )

        # Episode state
        self.done = False
        self.weights = np.zeros(self.num_stocks)

    def reset(self, *, seed=None, options=None):
        """Rewind to the first step with a full window and zero the weights.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.weights = np.zeros(self.num_stocks)
        self.done = False
        self.current_step = self.window_size
        return self._get_observation(), {}

    def _get_observation(self):
        """Build the float32 observation for the current step.

        The trailing price window (rows ``current_step - window_size`` up to,
        but excluding, ``current_step``) is divided by its own first row, with
        zeros in that row replaced by 1e-6 before dividing.
        """
        start = self.current_step - self.window_size
        window = self.prices.iloc[start:self.current_step]
        base_prices = window.iloc[0].replace(0, 1e-6)
        scaled = (window / base_prices).values.astype(np.float32)
        return np.concatenate([scaled.flatten(), self.weights.astype(np.float32)])

    def step(self, action):
        """Apply ``action`` as the weight vector and advance one row.

        Returns:
            ``(observation, reward, terminated, truncated, info)``; ``info``
            carries the raw ``'return'`` and ``'risk'`` terms. Once the
            episode has finished, further calls return a zero reward and an
            empty ``info``.
        """
        if self.done:
            return self._get_observation(), 0.0, self.done, False, {}

        # The action is used verbatim as the weights (no sum-to-one scaling).
        self.weights = action

        t = self.current_step

        # One-day log return of every asset, row t-1 -> row t
        price_now = self.prices.iloc[t]
        price_before = self.prices.iloc[t - 1]
        daily_log_returns = np.log(price_now / price_before).values
        portfolio_return = np.dot(self.weights, daily_log_returns)

        # Risk: w' * Cov * w, covariance taken over the trailing window
        window = self.prices.iloc[t - self.window_size:t]
        window_log_returns = np.log(window / window.shift(1)).dropna()
        covariance = window_log_returns.cov().values
        portfolio_risk = np.dot(self.weights.T, np.dot(covariance, self.weights))

        reward = self.alpha * portfolio_return - self.beta * portfolio_risk

        self.current_step = t + 1
        self.done = self.current_step >= self.max_steps

        info = {'return': portfolio_return, 'risk': portfolio_risk}
        return self._get_observation(), reward, self.done, False, info

    def render(self, mode='human'):
        """Print the current step index and the current weights."""
        print(f"Step: {self.current_step}")
        print(f"Portfolio Weights: {self.weights}")


## ENVIRONMENT USING INITIAL CAPITAL REBALANCE

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd

class PortfolioEnv_initial_balance(gym.Env):
    """Portfolio environment that tracks a cash balance and share holdings.

    Each step the action is scaled by its L1 norm to obtain the weights
    (so the absolute weights sum to 1; shorting is allowed). The reward is
    ``alpha * return - beta * risk`` computed from those weights, and the
    holdings are then rebalanced towards the weights, paying a proportional
    fee on the dollar value of every executed trade.

    Args:
        prices_df: DataFrame of prices, one column per asset.
        window_size: number of trailing rows used for the observation and
            for the covariance estimate.
        alpha: multiplier applied to the return term of the reward.
        beta: multiplier applied to the risk term of the reward.
        initial_balance: cash available at the start of every episode.
        transaction_fee: fee as a fraction of traded dollar value
            (default 0.001, i.e. 0.1%).
    """

    def __init__(self, prices_df, window_size=20, alpha=1.0, beta=0.1,
                 initial_balance=10000, transaction_fee=0.001):
        super().__init__()

        self.prices = prices_df
        self.window_size = window_size
        self.alpha = alpha
        self.beta = beta
        self.initial_balance = initial_balance
        self.transaction_fee = transaction_fee

        self.current_step = self.window_size
        self.num_stocks = self.prices.shape[1]
        self.max_steps = len(self.prices) - 1

        # Action: one weight per asset (shorting allowed)
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(self.num_stocks,), dtype=np.float32)

        # Observation: flattened normalised window + weights + cash balance
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(self.window_size * self.num_stocks + self.num_stocks + 1,),
            dtype=np.float32
        )

        # Episode state
        self.balance = self.initial_balance  # Cash balance
        self.stock_holdings = np.zeros(self.num_stocks)  # Shares held per asset
        self.done = False
        self.weights = np.zeros(self.num_stocks)

    def reset(self, *, seed=None, options=None):
        """Restore the initial cash, clear holdings and weights, rewind the clock.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.current_step = self.window_size
        self.done = False
        self.balance = self.initial_balance
        self.stock_holdings = np.zeros(self.num_stocks)
        self.weights = np.zeros(self.num_stocks)
        return self._get_observation(), {}

    def _get_observation(self):
        """Return the float32 observation: price window, weights, cash balance.

        The price window is divided by its own first row (zeros in that row
        are replaced by 1e-6 before dividing).
        """
        window = self.prices.iloc[self.current_step - self.window_size:self.current_step]
        first_day_prices = window.iloc[0].replace(0, 1e-6)
        normalized_window = (window / first_day_prices).values.astype(np.float32)
        flat_prices = normalized_window.flatten()

        # Every part is float32 so the result matches observation_space.dtype
        # on all return paths (including the already-done branch of step()).
        return np.concatenate([
            flat_prices,
            self.weights.astype(np.float32),
            np.array([self.balance], dtype=np.float32),
        ])

    def _rebalance(self, current_prices):
        """Trade each asset towards its target holding, in column order.

        A buy is executed only when the cash balance covers both the purchase
        and its fee; otherwise that asset is left unchanged. Sells (including
        moves into a short position) are always executed. The fee is charged
        on the dollar value of executed trades only.
        """
        total_value = self.balance + np.dot(self.stock_holdings, current_prices)
        target_holdings = self.weights * total_value / current_prices

        for i in range(self.num_stocks):
            trade_value = (target_holdings[i] - self.stock_holdings[i]) * current_prices[i]
            fee = abs(trade_value) * self.transaction_fee
            if trade_value > 0:  # Buying
                if self.balance >= trade_value + fee:
                    self.balance -= trade_value + fee
                    self.stock_holdings[i] = target_holdings[i]
            elif trade_value < 0:  # Selling
                self.balance += -trade_value - fee
                self.stock_holdings[i] = target_holdings[i]

    def step(self, action):
        """Set weights from ``action``, compute the reward, rebalance, advance.

        Returns:
            ``(observation, reward, terminated, truncated, info)``; ``info``
            carries the raw ``'return'`` and ``'risk'`` terms. Once the
            episode has finished, further calls return a zero reward and an
            empty ``info``.
        """
        if self.done:
            return self._get_observation(), 0.0, self.done, False, {}

        # Scale by the L1 norm so the absolute weights sum to 1. An all-zero
        # action would divide by zero and poison the state with NaN, so it is
        # treated as "hold no positions".
        action = np.asarray(action, dtype=np.float64)
        gross_exposure = np.sum(np.abs(action))
        if gross_exposure > 1e-8:
            self.weights = action / gross_exposure
        else:
            self.weights = np.zeros(self.num_stocks)

        # Plain arrays avoid positional [] indexing on label-indexed Series.
        current_prices = self.prices.iloc[self.current_step].to_numpy(dtype=np.float64)
        previous_prices = self.prices.iloc[self.current_step - 1].to_numpy(dtype=np.float64)
        daily_log_returns = np.log(current_prices / previous_prices)

        portfolio_return = np.dot(self.weights, daily_log_returns)

        # Portfolio risk: w' * Cov * w with the covariance over the window
        window = self.prices.iloc[self.current_step - self.window_size:self.current_step]
        log_returns = np.log(window / window.shift(1)).dropna()
        cov_matrix = log_returns.cov().values
        portfolio_risk = np.dot(self.weights.T, np.dot(cov_matrix, self.weights))

        reward = self.alpha * portfolio_return - self.beta * portfolio_risk

        self._rebalance(current_prices)

        self.current_step += 1
        if self.current_step >= self.max_steps:
            self.done = True

        info = {'return': portfolio_return, 'risk': portfolio_risk}
        return self._get_observation(), float(reward), self.done, False, info

    def render(self, mode='human'):
        """Print step index, cash, weights, holdings and total portfolio value."""
        print(f"Step: {self.current_step}")
        print(f"Cash Balance: {self.balance}")
        print(f"Portfolio Weights: {self.weights}")
        print(f"Stock Holdings: {self.stock_holdings}")
        print(f"Total Portfolio Value: {self.balance + np.dot(self.stock_holdings, self.prices.iloc[self.current_step])}")


## ENVIRONMENT WITH SHARPE RATIO REWARD 

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd

class PortfolioEnv_sharpe(gym.Env):
    """Portfolio environment whose reward is a one-step Sharpe-like ratio.

    Each step the action is mean-centred and scaled by its L1 norm to obtain
    the weights. The reward is the weighted one-day log return divided by the
    sample standard deviation of the same weights applied to the trailing
    window of log returns (risk-free rate taken as 0). Holdings are then
    rebalanced towards the weights, paying a proportional fee on the dollar
    value of every executed trade.

    Args:
        prices_df: DataFrame of prices, one column per asset.
        window_size: number of trailing rows used for the observation and
            for the rolling standard deviation.
        alpha: stored on the instance; not used by this reward.
        beta: stored on the instance; not used by this reward.
        initial_balance: cash available at the start of every episode.
        transaction_fee: fee as a fraction of traded dollar value
            (default 0.001, i.e. 0.1%).
    """

    def __init__(self, prices_df, window_size=20, alpha=1.0, beta=0.1,
                 initial_balance=10000, transaction_fee=0.001):
        super().__init__()

        self.prices = prices_df
        self.window_size = window_size
        self.alpha = alpha
        self.beta = beta
        self.initial_balance = initial_balance
        self.transaction_fee = transaction_fee

        self.current_step = self.window_size
        self.num_stocks = self.prices.shape[1]
        self.max_steps = len(self.prices) - 1

        # Action: one weight per asset (shorting allowed)
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(self.num_stocks,), dtype=np.float32)

        # Observation: flattened normalised window + weights + cash balance
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(self.window_size * self.num_stocks + self.num_stocks + 1,),
            dtype=np.float32
        )

        # Episode state
        self.balance = self.initial_balance  # Cash balance
        self.stock_holdings = np.zeros(self.num_stocks)  # Shares held per asset
        self.done = False
        self.weights = np.zeros(self.num_stocks)

    def reset(self, *, seed=None, options=None):
        """Restore the initial cash, clear holdings and weights, rewind the clock.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.current_step = self.window_size
        self.done = False
        self.balance = self.initial_balance
        self.stock_holdings = np.zeros(self.num_stocks)
        self.weights = np.zeros(self.num_stocks)
        return self._get_observation(), {}

    def _get_observation(self):
        """Return the float32 observation: price window, weights, cash balance.

        The price window is divided by its own first row (zeros in that row
        are replaced by 1e-6 before dividing).
        """
        window = self.prices.iloc[self.current_step - self.window_size:self.current_step]
        first_day_prices = window.iloc[0].replace(0, 1e-6)
        normalized_window = (window / first_day_prices).values.astype(np.float32)
        flat_prices = normalized_window.flatten()

        # Every part is float32 so the result matches observation_space.dtype
        # on all return paths (including the already-done branch of step()).
        return np.concatenate([
            flat_prices,
            self.weights.astype(np.float32),
            np.array([self.balance], dtype=np.float32),
        ])

    def _rebalance(self, current_prices):
        """Trade each asset towards its target holding, in column order.

        A buy is executed only when the cash balance covers both the purchase
        and its fee; otherwise that asset is left unchanged. Sells (including
        moves into a short position) are always executed. The fee is charged
        on the dollar value of executed trades only.
        """
        total_value = self.balance + np.dot(self.stock_holdings, current_prices)
        target_holdings = self.weights * total_value / current_prices

        for i in range(self.num_stocks):
            trade_value = (target_holdings[i] - self.stock_holdings[i]) * current_prices[i]
            fee = abs(trade_value) * self.transaction_fee
            if trade_value > 0:  # Buying
                if self.balance >= trade_value + fee:
                    self.balance -= trade_value + fee
                    self.stock_holdings[i] = target_holdings[i]
            elif trade_value < 0:  # Selling
                self.balance += -trade_value - fee
                self.stock_holdings[i] = target_holdings[i]

    def step(self, action):
        """Set weights from ``action``, compute the reward, rebalance, advance.

        Returns:
            ``(observation, reward, terminated, truncated, info)``; ``info``
            carries ``'return'`` and ``'rolling_std'``. Once the episode has
            finished, further calls return a zero reward and an empty ``info``.
        """
        if self.done:
            return self._get_observation(), 0.0, self.done, False, {}

        epsilon = 1e-6  # guards both divisions below against a zero denominator

        # Mean-centre, then scale by the L1 norm
        action = action - np.mean(action)
        self.weights = action / (np.sum(np.abs(action)) + epsilon)

        # Plain arrays avoid positional [] indexing on label-indexed Series.
        current_prices = self.prices.iloc[self.current_step].to_numpy(dtype=np.float64)
        previous_prices = self.prices.iloc[self.current_step - 1].to_numpy(dtype=np.float64)
        daily_log_returns = np.log(current_prices / previous_prices)

        portfolio_return = np.dot(self.weights, daily_log_returns)

        # Rolling risk: std-dev of the current weights applied to the window
        window = self.prices.iloc[self.current_step - self.window_size:self.current_step]
        log_returns = np.log(window / window.shift(1)).dropna()
        portfolio_returns_window = np.dot(log_returns.values, self.weights)
        std_dev = np.std(portfolio_returns_window, ddof=1) + epsilon

        # Sharpe-style reward (risk-free rate assumed 0)
        reward = portfolio_return / std_dev

        self._rebalance(current_prices)

        self.current_step += 1
        if self.current_step >= self.max_steps:
            self.done = True

        info = {'return': portfolio_return, 'rolling_std': std_dev}
        return self._get_observation(), float(reward), self.done, False, info

    def render(self, mode='human'):
        """Print step index, cash, weights, holdings and total portfolio value."""
        print(f"Step: {self.current_step}")
        print(f"Cash Balance: {self.balance}")
        print(f"Portfolio Weights: {self.weights}")
        print(f"Stock Holdings: {self.stock_holdings}")
        print(f"Total Portfolio Value: {self.balance + np.dot(self.stock_holdings, self.prices.iloc[self.current_step])}")


## Downloading the Data (10 companies)

In [100]:
import yfinance as yf

# Define the stock tickers and the date range
stock_tickers = [
    'AAPL', 'GOOGL', 'MSFT', 'AMZN', 'TSLA',
    'META', 'NVDA', 'INTC', 'NFLX', 'IBM'
]
start_date = '2007-01-01'
end_date = '2023-12-31'

# Download the data from Yahoo Finance and keep only the closing prices
stock_data = yf.download(stock_tickers, start=start_date, end=end_date)['Close']

# Keep only the dates on which every ticker has a price. Rebinding the name
# (instead of dropna(inplace=True)) avoids mutating a frame that was derived
# by column selection from the download result.
stock_data = stock_data.dropna()

# Print the shape of the cleaned dataset (rows = dates, columns = tickers)
print(stock_data.shape)


[*********************100%***********************]  10 of 10 completed

(2923, 10)





## TRAINING LOOP OF PPO WITH CASH REBALANCE

In [None]:
import warnings

# Suppress ALL warnings for the rest of the session: no `category` is passed,
# so this is not limited to deprecation warnings (numeric RuntimeWarnings such
# as divide-by-zero are hidden as well).
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from stable_baselines3 import PPO, DDPG, TD3
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback

# -------------------------------------------------------------------
# 1) Custom Callback with Early Stopping
# -------------------------------------------------------------------
class LoggingCallback(BaseCallback):
    """Track episode rewards and training losses; optionally stop early.

    Only the first environment of the vectorised env is monitored
    (index 0 of ``rewards`` / ``dones``).

    Args:
        verbose: forwarded to ``BaseCallback``.
        reward_patience: size of the trailing window of episodes whose mean
            reward is tracked, and the number of non-improving episodes that
            triggers early stopping when it is enabled.
        epsilon: minimum increase of the windowed mean that counts as an
            improvement.
        print_freq: print the episode reward every ``print_freq`` episodes.
        early_stop: when True, training is stopped after ``reward_patience``
            consecutive episodes without improvement. Disabled by default.
    """

    # (policy-loss key, value-loss key) pairs to look for in the logger.
    # The first pair is what the original code checked; the second pair is
    # the naming PPO uses, so losses are also captured when training PPO.
    _LOSS_KEYS = (
        ("train/actor_loss", "train/critic_loss"),
        ("train/policy_gradient_loss", "train/value_loss"),
    )

    def __init__(self, verbose=0, reward_patience=50, epsilon=1e-2, print_freq=10,
                 early_stop=False):
        super().__init__(verbose)
        self.episode_rewards = []
        self._current_ep_reward = 0.0
        self.actor_losses = []
        self.critic_losses = []

        # Early stopping parameters
        self.reward_patience = reward_patience
        self.epsilon = epsilon
        self.early_stop = early_stop
        self.best_mean_reward = -np.inf
        self.episodes_without_improvement = 0

        # Print frequency
        self.print_freq = print_freq

    def _on_step(self) -> bool:
        """Accumulate the step reward and handle end-of-episode bookkeeping.

        Returns:
            False to stop training (only when ``early_stop`` is enabled and
            the patience is exhausted), True otherwise.
        """
        reward = float(self.locals["rewards"][0])
        self._current_ep_reward += reward
        keep_training = True

        if self.locals["dones"][0]:
            self.episode_rewards.append(self._current_ep_reward)

            # Only print logs after a certain number of episodes
            if len(self.episode_rewards) % self.print_freq == 0:
                print(f"Episode {len(self.episode_rewards)} reward: {self._current_ep_reward:.4f}")

            # Improvement tracking starts once a full window of episodes exists
            if len(self.episode_rewards) >= self.reward_patience:
                recent_rewards = self.episode_rewards[-self.reward_patience:]
                mean_recent = np.mean(recent_rewards)

                if mean_recent > self.best_mean_reward + self.epsilon:
                    self.best_mean_reward = mean_recent
                    self.episodes_without_improvement = 0
                else:
                    self.episodes_without_improvement += 1
                    print(f"No reward improvement for {self.episodes_without_improvement} episodes")

                    if self.early_stop and self.episodes_without_improvement >= self.reward_patience:
                        print(
                            "Early stopping triggered: no improvement for "
                            f"{self.reward_patience} episodes."
                        )
                        keep_training = False

            self._current_ep_reward = 0.0

        # Capture losses (if available in logs); the values are whatever the
        # logger currently holds, so the same value may be appended repeatedly
        # between two training updates.
        logs = self.logger.name_to_value
        for actor_key, critic_key in self._LOSS_KEYS:
            if actor_key in logs and critic_key in logs:
                self.actor_losses.append(logs[actor_key])
                self.critic_losses.append(logs[critic_key])
                break

        return keep_training

# -------------------------------------------------------------------
# 2) Environment
# -------------------------------------------------------------------
# `stock_data` (the price DataFrame) and PortfolioEnv_initial_balance must
# already be defined by the earlier cells.
env = PortfolioEnv_initial_balance(prices_df=stock_data, window_size=20)
check_env(env, warn=True)

# -------------------------------------------------------------------
# 3) Instantiate Model
# -------------------------------------------------------------------
# PPO has no `action_noise` argument (it explores through its stochastic
# policy), so no NormalActionNoise object is created for it.
callback = LoggingCallback(reward_patience=50, epsilon=1e-2)

model = PPO(
    "MlpPolicy",            # Policy type
    env,                    # Environment
    learning_rate=1e-4,     # Learning rate
    gamma=0.99,             # Discount factor
    verbose=1,
)

# -------------------------------------------------------------------
# 4) Train
# -------------------------------------------------------------------
model.learn(total_timesteps=2000000, callback=callback)
model.save("PPO_portfolio_trained")





## TRAINING LOOP OF DDPG USING CASH REBALANCE ENVIRONMENT

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback

# -------------------------------------------------------------------
# 1) Custom Callback with Early Stopping
# -------------------------------------------------------------------
class LoggingCallback(BaseCallback):
    """Record per-episode rewards and actor/critic losses during training.

    Only index 0 of ``rewards`` / ``dones`` is read. The callback counts
    episodes without improvement of the trailing mean reward but never stops
    training: ``_on_step`` always returns True.

    Args:
        verbose: forwarded to ``BaseCallback``.
        reward_patience: number of most recent episodes averaged when
            checking for improvement.
        epsilon: minimum increase of that average that counts as improvement.
    """

    def __init__(self, verbose=0, reward_patience=20, epsilon=1e-2):
        super().__init__(verbose)

        # Loss histories
        self.actor_losses = []
        self.critic_losses = []

        # Reward bookkeeping
        self.episode_rewards = []
        self._current_ep_reward = 0.0

        # Improvement tracking
        self.reward_patience = reward_patience
        self.epsilon = epsilon
        self.best_mean_reward = -np.inf
        self.episodes_without_improvement = 0

    def _track_improvement(self):
        """Update the best trailing-mean reward or count a stalled episode."""
        if len(self.episode_rewards) < self.reward_patience:
            return

        mean_recent = np.mean(self.episode_rewards[-self.reward_patience:])
        if mean_recent > self.best_mean_reward + self.epsilon:
            self.best_mean_reward = mean_recent
            self.episodes_without_improvement = 0
            return

        self.episodes_without_improvement += 1
        print(f"No reward improvement for {self.episodes_without_improvement} episodes")

    def _record_losses(self):
        """Append the logger's current actor and critic loss when both exist."""
        logged = self.logger.name_to_value
        if "train/actor_loss" not in logged or "train/critic_loss" not in logged:
            return
        self.actor_losses.append(logged["train/actor_loss"])
        self.critic_losses.append(logged["train/critic_loss"])

    def _on_step(self) -> bool:
        """Add the step reward; on episode end, log it and reset the sum."""
        self._current_ep_reward += float(self.locals["rewards"][0])

        if self.locals["dones"][0]:
            self.episode_rewards.append(self._current_ep_reward)
            print(f"Episode {len(self.episode_rewards)} reward: {self._current_ep_reward:.4f}")
            self._track_improvement()
            self._current_ep_reward = 0.0

        self._record_losses()
        return True

# -------------------------------------------------------------------
# 2) Environment
# -------------------------------------------------------------------
# `stock_data` (the price DataFrame) and PortfolioEnv_initial_balance must
# already be defined by the earlier cells.
env = PortfolioEnv_initial_balance(prices_df=stock_data, window_size=20)
check_env(env, warn=True)

# -------------------------------------------------------------------
# 3) Instantiate Model
# -------------------------------------------------------------------
callback = LoggingCallback(reward_patience=20, epsilon=1e-2)

# Zero-mean Gaussian exploration noise, sigma = 0.05 on every action component
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.05),
)

model = DDPG("MlpPolicy", env, learning_rate=1e-4, action_noise=action_noise, verbose=1)

# -------------------------------------------------------------------
# 4) Train
# -------------------------------------------------------------------
model.learn(total_timesteps=1000000, callback=callback)
model.save("DDPG3_portfolio_trained")


## TRAINING LOOP OF DDPG WITHOUT CASH REBALANCE 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback

# -------------------------------------------------------------------
# 1) Custom Callback with Early Stopping
# -------------------------------------------------------------------
class LoggingCallback(BaseCallback):
    """Collect episode rewards and actor/critic losses while a model trains.

    Reads only index 0 of ``rewards`` / ``dones``. Episodes without
    improvement of the trailing mean reward are counted and reported, but
    training is never interrupted (``_on_step`` always returns True).

    Args:
        verbose: forwarded to ``BaseCallback``.
        reward_patience: how many of the latest episodes are averaged.
        epsilon: margin by which the average must beat the best so far.
    """

    def __init__(self, verbose=0, reward_patience=20, epsilon=1e-2):
        super().__init__(verbose)
        self.reward_patience = reward_patience
        self.epsilon = epsilon

        # Running state
        self._current_ep_reward = 0.0
        self.best_mean_reward = -np.inf
        self.episodes_without_improvement = 0

        # Histories
        self.episode_rewards = []
        self.actor_losses = []
        self.critic_losses = []

    def _on_step(self) -> bool:
        """Process one environment step; always lets training continue."""
        step_reward = float(self.locals["rewards"][0])
        self._current_ep_reward = self._current_ep_reward + step_reward

        episode_finished = self.locals["dones"][0]
        if episode_finished:
            self.episode_rewards.append(self._current_ep_reward)
            print(f"Episode {len(self.episode_rewards)} reward: {self._current_ep_reward:.4f}")

            # Compare the trailing mean with the best seen so far
            enough_history = len(self.episode_rewards) >= self.reward_patience
            if enough_history:
                tail = self.episode_rewards[-self.reward_patience:]
                mean_recent = np.mean(tail)
                improved = mean_recent > self.best_mean_reward + self.epsilon

                if improved:
                    self.best_mean_reward = mean_recent
                    self.episodes_without_improvement = 0
                else:
                    self.episodes_without_improvement += 1
                    print(f"No reward improvement for {self.episodes_without_improvement} episodes")

            # Start accumulating the next episode from zero
            self._current_ep_reward = 0.0

        # Store the losses only when the logger currently holds both of them
        logged = self.logger.name_to_value
        has_both_losses = "train/actor_loss" in logged and "train/critic_loss" in logged
        if has_both_losses:
            self.actor_losses.append(logged["train/actor_loss"])
            self.critic_losses.append(logged["train/critic_loss"])

        return True

# -------------------------------------------------------------------
# 2) Environment
# -------------------------------------------------------------------
# `stock_data` (the price DataFrame) and PortfolioEnv_no_cash_rebalance must
# already be defined by the earlier cells.
env = PortfolioEnv_no_cash_rebalance(prices_df=stock_data, window_size=20)
check_env(env, warn=True)

# -------------------------------------------------------------------
# 3) Instantiate Model
# -------------------------------------------------------------------
callback = LoggingCallback(reward_patience=20, epsilon=1e-2)

# Zero-mean Gaussian exploration noise, sigma = 0.05 on every action component
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.05),
)

model = DDPG("MlpPolicy", env, learning_rate=1e-4, action_noise=action_noise, verbose=1)

# -------------------------------------------------------------------
# 4) Train
# -------------------------------------------------------------------
model.learn(total_timesteps=1000000, callback=callback)
model.save("DDPG2_portfolio_trained")



## EVALUATION WITHOUT CASH REBALANCE USING DDPG 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from stable_baselines3 import DDPG
from stable_baselines3.common.vec_env import DummyVecEnv

window_size = 20

# --- Load trained model ---
model = DDPG.load("DDPG2_portfolio_trained")

# --- Create evaluation environment ---
eval_env = DummyVecEnv([lambda: PortfolioEnv_no_cash_rebalance(prices_df=stock_data, window_size=window_size)])
obs = eval_env.reset()
rl_returns = []
done = [False]

while not done[0]:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, infos = eval_env.step(action)
    # Use the raw portfolio return reported by the env, not the reward:
    # the reward also contains the risk penalty and is not comparable with
    # the return series of the baseline strategies.
    rl_returns.append(infos[0]['return'])

# --- Asset returns over the evaluated period ---
# The env's first step is taken at row `window_size` and it uses log returns,
# so the baselines use the same rows and the same return definition.
log_returns = np.log(stock_data / stock_data.shift(1))
asset_returns = log_returns.iloc[window_size:window_size + len(rl_returns)]
eval_dates = asset_returns.index

# --- 1/N Equal Weight Strategy ---
equal_weights = np.ones(stock_data.shape[1]) / stock_data.shape[1]
equal_returns = (asset_returns @ equal_weights).cumsum()

# --- Mean-Variance Strategy ---
# NOTE: the weights are estimated on the same period they are evaluated on,
# so this baseline is in-sample.
mean_returns = asset_returns.mean()
cov_matrix = asset_returns.cov()

inv_cov = np.linalg.pinv(cov_matrix.values)
weights_mv = inv_cov @ mean_returns.values
weights_mv /= weights_mv.sum()

mv_returns = (asset_returns @ weights_mv).cumsum()

# --- RL strategy returns ---
rl_returns = np.cumsum(rl_returns)

# --- Plot all ---
plt.figure(figsize=(12, 6))
plt.plot(eval_dates, equal_returns, label='1/N Strategy')
plt.plot(eval_dates, mv_returns, label='Mean-Variance Strategy')
plt.plot(eval_dates, rl_returns, label='DDPG Strategy')
plt.xlabel("Time")
plt.ylabel("Cumulative Returns")
plt.title("Portfolio Strategies Comparison")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


## EVALUATION WITH CASH REBALANCE (PPO MODEL)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from stable_baselines3 import DDPG
from stable_baselines3.common.vec_env import DummyVecEnv


# This cell loads a PPO model, so PPO is imported here rather than relying
# on an earlier cell having imported it.
from stable_baselines3 import PPO

window_size = 20

# --- Load trained model ---
model = PPO.load("PPO_portfolio_trained")

# --- Create evaluation environment ---
eval_env = DummyVecEnv([lambda: PortfolioEnv_initial_balance(prices_df=stock_data, window_size=window_size)])
obs = eval_env.reset()
rl_returns = []
done = [False]

while not done[0]:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, infos = eval_env.step(action)
    # Use the raw portfolio return reported by the env, not the reward:
    # the reward also contains the risk penalty and is not comparable with
    # the return series of the baseline strategies.
    rl_returns.append(infos[0]['return'])

# --- Asset returns over the evaluated period ---
# The env's first step is taken at row `window_size` and it uses log returns,
# so the baselines use the same rows and the same return definition.
log_returns = np.log(stock_data / stock_data.shift(1))
asset_returns = log_returns.iloc[window_size:window_size + len(rl_returns)]
eval_dates = asset_returns.index

# --- 1/N Equal Weight Strategy ---
equal_weights = np.ones(stock_data.shape[1]) / stock_data.shape[1]
equal_returns = (asset_returns @ equal_weights).cumsum()

# --- Mean-Variance Strategy ---
# NOTE: the weights are estimated on the same period they are evaluated on,
# so this baseline is in-sample.
mean_returns = asset_returns.mean()
cov_matrix = asset_returns.cov()

inv_cov = np.linalg.pinv(cov_matrix.values)
weights_mv = inv_cov @ mean_returns.values
weights_mv /= weights_mv.sum()

mv_returns = (asset_returns @ weights_mv).cumsum()

# --- RL strategy returns ---
rl_returns = np.cumsum(rl_returns)

# --- Plot all ---
plt.figure(figsize=(12, 6))
plt.plot(eval_dates, equal_returns, label='1/N Strategy')
plt.plot(eval_dates, mv_returns, label='Mean-Variance Strategy')
plt.plot(eval_dates, rl_returns, label='PPO Strategy')
plt.xlabel("Time")
plt.ylabel("Cumulative Returns")
plt.title("Portfolio Strategies Comparison")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
