# Regime-Aware Portfolio Optimization (Three Agent Folder Migration)
This notebook consolidates `hmm.py` and `Finrlmain.py` for execution in Google Colab.

In [None]:
# Install dependencies
!pip install git+https://github.com/AI4Finance-Foundation/FinRL.git
!pip install yfinance pandas numpy torch matplotlib seaborn hmmlearn gym gymnasium

In [None]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
import os
from hmmlearn import hmm
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer
from gym import spaces
from gym.utils import seeding
from finrl.meta.env_portfolio_allocation.env_portfolio import StockPortfolioEnv

# Run on the GPU when PyTorch reports one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

## 1. HMM Regime Detection (from `hmm.py`)
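Fits a Gaussian HMM to the daily returns of a set of tickers (the exogenous benchmark ETFs in this notebook) and labels each trading day with a regime, ordered from lowest to highest mean return.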

In [None]:
class MarketRegimeHMM:
    """Gaussian HMM over per-ticker daily returns with return-ordered regime labels.

    The raw hidden states of the fitted model carry no meaning in their
    numbering, so after fitting they are ranked by their mean return
    (averaged over the feature columns). Public methods speak in these
    ranked labels: 0 is the lowest-return regime and ``n_regimes - 1`` the
    highest.
    """

    def __init__(self, n_regimes=4):
        """Create an unfitted model with ``n_regimes`` hidden states."""
        self.n_regimes = n_regimes
        self.is_fitted = False
        self.model = hmm.GaussianHMM(
            n_components=n_regimes,
            covariance_type="diag",
            n_iter=1000,
            random_state=42,
        )

    def prepare_data(self, df):
        """Turn a long ``date``/``tic``/``close`` frame into a returns matrix.

        Returns:
            A ``(values, dates)`` pair: the percentage-change matrix with one
            row per date and one column per ticker (missing values, including
            the first row, replaced by 0), and the matching date index.
        """
        price_table = df.pivot(index='date', columns='tic', values='close')
        daily_returns = price_table.pct_change().fillna(0)
        return daily_returns.values, daily_returns.index

    def fit(self, df):
        """Fit the HMM on ``df`` and record the return-based state ranking.

        Returns:
            ``self``, so the call can be chained.
        """
        observations, _ = self.prepare_data(df)
        self.model.fit(observations)
        self.is_fitted = True
        # Rank raw states by their mean across feature columns:
        # sorted_states[i] is the raw state with the i-th lowest mean return.
        mean_return_per_state = self.model.means_.mean(axis=1)
        self.sorted_states = np.argsort(mean_return_per_state)
        print(f"HMM fitted. Regime order (Low Return -> High Return): {self.sorted_states}")
        return self

    def predict(self, df):
        """Label every date in ``df`` with its ranked regime.

        Returns:
            A DataFrame with columns ``date`` and ``regime``.

        Raises:
            ValueError: if the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("HMM not fitted yet.")
        observations, dates = self.prepare_data(df)
        raw_states = self.model.predict(observations)
        # sorted_states is a permutation, so its argsort is the inverse
        # mapping: raw state -> rank position.
        rank_of_state = np.argsort(self.sorted_states)
        mapped_regimes = rank_of_state[raw_states].astype(raw_states.dtype)
        return pd.DataFrame({'date': dates, 'regime': mapped_regimes})

    def predict_next_regime(self, current_regime):
        """Return the most likely ranked regime following ``current_regime``.

        The ranked label is translated back to the raw state, the largest
        entry of that state's transition-matrix row is taken, and the winner
        is translated to a ranked label again.

        Raises:
            ValueError: if the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("HMM not fitted yet.")
        raw_state = self.sorted_states[current_regime]
        transition_row = self.model.transmat_[raw_state]
        likeliest_raw_state = np.argmax(transition_row)
        return np.flatnonzero(self.sorted_states == likeliest_raw_state)[0]

def plot_regimes(df, regime_df, save_path='results/regimes.png'):
    """Plot equal-weight cumulative market returns with regime shading.

    Args:
        df: Long price frame with ``date``, ``tic`` and ``close`` columns.
            The market curve is the cumulative product of the mean daily
            return across tickers.
        regime_df: Frame with ``date`` and ``regime`` columns. Regimes 0-3
            are shaded; any other value is left unshaded.
        save_path: Where the figure is written. Its parent directory is
            created if it does not exist.

    Side effects:
        Saves the figure to ``save_path``, shows it, and prints the path.
    """
    from matplotlib.lines import Line2D

    # Create the directory save_path actually points at, rather than a
    # hard-coded 'results' folder that may be unrelated to save_path.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Put both data sets on a real datetime axis so regime dates that do not
    # appear in `df` still land at the right position.
    prices = df.pivot(index='date', columns='tic', values='close')
    prices.index = pd.to_datetime(prices.index)
    returns_df = prices.pct_change().dropna()
    market_return = returns_df.mean(axis=1)
    cum_returns = (1 + market_return).cumprod()

    fig, ax = plt.subplots(figsize=(15, 7))
    ax.plot(cum_returns.index, cum_returns.values, color='black', alpha=0.3, label='Market Cum. Returns')

    regime_colors = ['red', 'orange', 'blue', 'green']
    labels = ['High Bear', 'Sideways/Low Bear', 'Sideways/Low Bull', 'High Bull']

    timeline = pd.DataFrame({
        'date': pd.to_datetime(regime_df['date'].to_numpy()),
        'regime': regime_df['regime'].to_numpy(),
    }).sort_values('date', kind='stable')
    span_dates = timeline['date'].to_numpy()
    span_regimes = timeline['regime'].to_numpy()

    if len(span_dates):
        # Merge consecutive dates sharing a regime into a single span so the
        # figure holds one patch per run instead of one zero-width patch per
        # trading day.
        run_starts = np.flatnonzero(np.r_[True, span_regimes[1:] != span_regimes[:-1]])
        run_stops = np.r_[run_starts[1:], len(span_dates)]
        for start, stop in zip(run_starts, run_stops):
            regime = int(span_regimes[start])
            if not 0 <= regime < len(regime_colors):
                continue
            if stop < len(span_dates):
                # A run extends up to the first date of the next run.
                right_edge = span_dates[stop]
            else:
                # Give the final run one calendar day of width.
                right_edge = span_dates[-1] + np.timedelta64(1, 'D')
            ax.axvspan(span_dates[start], right_edge, color=regime_colors[regime], alpha=0.15, linewidth=0)

    custom_lines = [Line2D([0], [0], color=color, lw=4, alpha=0.5) for color in regime_colors]
    ax.legend(custom_lines + [Line2D([0], [0], color='black', alpha=0.3)], labels + ['Market'])
    plt.title('Market Regimes Detected by HMM')
    plt.savefig(save_path)
    plt.show()
    print(f"Regime plot saved to {save_path}")

## 2. Helper Functions and Environment Overrides (from `Finrlmain.py`)
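Replaces `__init__`, `step`, `reset` and `_seed` on FinRL's `StockPortfolioEnv` so that each observation is a lookback window of daily states with an appended regime row, and actions are passed through a per-asset weight cap.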

In [None]:
def enforce_portfolio_constraints(weights, max_weight=0.60):
    """Project raw weights onto a long-only, fully-invested, capped portfolio.

    Weights are clipped to [0, 1], normalised to sum to one, and any weight
    above ``max_weight`` is cut to the cap with the excess handed to the
    remaining assets (proportionally to their weight, or equally when they
    hold no weight at all). The redistribution repeats until no weight
    exceeds the cap.

    Args:
        weights: Array-like of raw weights. The input is never modified.
        max_weight: Upper bound for any single weight.

    Returns:
        A float64 array of the same shape that sums to one. Equal weights are
        returned when the input has no usable mass (all zeros or NaN) or when
        the cap is too tight to be satisfiable (``max_weight * n <= 1``).
    """
    weights = np.clip(np.array(weights, dtype=np.float64), 0.0, 1.0)
    n_assets = weights.size
    if n_assets == 0:
        return weights

    equal_weights = np.full_like(weights, 1.0 / n_assets)
    total = weights.sum()
    if not np.isfinite(total) or total <= 0.0:
        # Nothing to scale: staying uninvested (all zeros) or propagating NaN
        # would corrupt the portfolio value downstream.
        weights = equal_weights.copy()
    else:
        weights = weights / total

    if max_weight * n_assets <= 1.0:
        # The cap cannot be met by any fully-invested portfolio; equal
        # weights minimise the largest position.
        return equal_weights

    # Each pass pins at least one more asset at the cap for good, so at most
    # n_assets passes are ever needed.
    for _ in range(n_assets):
        over = weights > max_weight
        if not over.any():
            break
        total_excess = (weights[over] - max_weight).sum()
        weights[over] = max_weight
        under = weights < max_weight
        if not under.any():
            break
        current_mass = weights[under].sum()
        if current_mass > 0:
            weights[under] += total_excess * (weights[under] / current_mass)
        else:
            weights[under] += total_excess / under.sum()
    return weights

def _init_override(self, df, stock_dim, hmax, initial_amount, transaction_cost_pct, reward_scaling, state_space, action_space, tech_indicator_list, turbulence_threshold=None, lookback=252, day=0, **kwargs):
    """Initialise the portfolio environment with a regime-augmented state.

    The per-day state is the covariance matrix stacked on one row per
    technical indicator, plus a final row filled with the regime looked up
    for that date. The observation handed to the agent is ``lookback``
    copies of that daily state stacked along a new leading axis.

    Args:
        df: Long frame with a ``date`` column, a ``cov_list`` column and one
            column per entry of ``tech_indicator_list``.
            # assumes every row of a date carries that date's covariance
            # matrix in ``cov_list`` - TODO confirm against the data pipeline
        stock_dim, hmax, initial_amount, transaction_cost_pct,
        reward_scaling, state_space: Stored on the instance unchanged.
        action_space: Number of action dimensions (one weight per asset).
        tech_indicator_list: Indicator column names added to the state.
        turbulence_threshold: Stored on the instance; not used here.
        lookback: Number of daily states kept in the observation window.
        day: Index into the unique dates at which the episode starts.
        **kwargs: ``regime_df`` (optional) - frame with ``date`` and
            ``future_regime`` columns; dates without a match use regime 0.
    """
    self.day = day
    self.lookback = lookback
    self.df = df
    self.stock_dim = stock_dim
    self.hmax = hmax
    self.initial_amount = initial_amount
    self.transaction_cost_pct = transaction_cost_pct
    self.reward_scaling = reward_scaling
    self.state_space = state_space
    self.action_space_dim = action_space
    self.tech_indicator_list = tech_indicator_list
    # Previously accepted but dropped; keep it so callers can read it back.
    self.turbulence_threshold = turbulence_threshold
    self.action_space = spaces.Box(low=0, high=1, shape=(self.action_space_dim,), dtype=np.float32)

    self.unique_dates = self.df.date.unique()
    self.data = self.df[self.df.date == self.unique_dates[self.day]]
    self.covs = self.data["cov_list"].iloc[0]
    tech_array = np.array([self.data[tech].values for tech in self.tech_indicator_list])

    self.regime_df = kwargs.get('regime_df', None)
    current_regime = 0
    if self.regime_df is not None:
        regime_match = self.regime_df.loc[self.regime_df.date == self.unique_dates[self.day], 'future_regime']
        if not regime_match.empty:
            current_regime = regime_match.values[0]

    self.state = np.vstack([self.covs, tech_array]).astype(np.float32)
    # The regime is broadcast across a full row so it can be stacked onto the
    # 2-D state without changing its width.
    regime_row = np.full((1, self.state.shape[1]), current_regime).astype(np.float32)
    self.state = np.vstack([self.state, regime_row])
    self.state_memory = [self.state] * self.lookback

    # Describe the observation actually returned by reset()/step(), so the
    # caller does not have to patch observation_space after construction.
    self.observation_space = spaces.Box(
        low=-np.inf,
        high=np.inf,
        shape=(self.lookback,) + self.state.shape,
        dtype=np.float32,
    )

    self.terminal = False
    # Defined up front so a terminal step on a fresh environment has a reward
    # to return.
    self.reward = 0.0
    self.portfolio_value = self.initial_amount
    self.asset_memory = [self.initial_amount]
    self.portfolio_return_memory = [0]
    self.actions_memory = [[1/self.stock_dim]*self.stock_dim]
    self.date_memory = [self.unique_dates[self.day]]

def _step_override(self, actions):
    """Advance one trading day using ``actions`` as target portfolio weights.

    Args:
        actions: Raw weights, one per asset. They are passed through
            ``enforce_portfolio_constraints`` before being applied.

    Returns:
        ``(observation, reward, terminated, truncated, info)`` where the
        observation is always the stacked lookback window of daily states,
        ``truncated`` is always ``False`` and ``info`` is an empty dict.
        On the terminal call no trade is made: the Sharpe ratio of the
        recorded daily returns is stored in ``self.sharpe`` (when their
        standard deviation is non-zero) and the last reward is returned again.
    """
    self.terminal = self.day >= len(self.unique_dates) - 1
    if self.terminal:
        df = pd.DataFrame(self.portfolio_return_memory, columns=['daily_return'])
        if df['daily_return'].std() != 0:
            self.sharpe = (252**0.5) * df['daily_return'].mean() / df['daily_return'].std()
        # Return the same window-shaped observation as every other step (the
        # single-day state has a different rank), and tolerate an environment
        # that has not produced a reward yet.
        # NOTE(review): this repeats the previous step's reward, as the
        # original did - confirm that is intended rather than 0.
        last_reward = getattr(self, 'reward', 0.0)
        return np.array(self.state_memory), last_reward, self.terminal, False, {}

    weights = enforce_portfolio_constraints(actions)
    self.actions_memory.append(weights)
    last_day_memory = self.data

    # Load the next day's rows and rebuild the daily state from them.
    self.day += 1
    self.data = self.df[self.df.date == self.unique_dates[self.day]]
    self.covs = self.data["cov_list"].iloc[0]
    tech_array = np.array([self.data[tech].values for tech in self.tech_indicator_list])
    self.state = np.vstack([self.covs, tech_array]).astype(np.float32)

    current_regime = 0
    if self.regime_df is not None:
        regime_match = self.regime_df.loc[self.regime_df.date == self.unique_dates[self.day], 'future_regime']
        if not regime_match.empty:
            current_regime = regime_match.values[0]
    regime_row = np.full((1, self.state.shape[1]), current_regime).astype(np.float32)
    self.state = np.vstack([self.state, regime_row])

    # Slide the lookback window forward by one day.
    self.state_memory.pop(0)
    self.state_memory.append(self.state)

    # Weighted sum of each asset's close-to-close return.
    # assumes both days list the assets in the same order - TODO confirm
    asset_returns = (self.data.close.values / last_day_memory.close.values) - 1
    portfolio_return = np.sum(asset_returns * weights)
    self.portfolio_return_memory.append(portfolio_return)
    self.date_memory.append(self.unique_dates[self.day])
    self.portfolio_value *= (1 + portfolio_return)
    self.asset_memory.append(self.portfolio_value)
    self.reward = portfolio_return * self.reward_scaling
    return np.array(self.state_memory), self.reward, self.terminal, False, {}

def _reset_override(self, seed=None, options=None):
    """Rewind the environment to the first date and return the first observation.

    Args:
        seed: When given, re-seeds ``self.np_random``.
        options: Accepted for API compatibility; not used.

    Returns:
        ``(observation, info)``: the first day's state repeated ``lookback``
        times along a new leading axis, and an empty dict.
    """
    if seed is not None:
        self.np_random, seed = seeding.np_random(seed)

    # Point the environment at the first trading day.
    self.day = 0
    first_date = self.unique_dates[self.day]
    self.data = self.df[self.df.date == first_date]
    self.covs = self.data["cov_list"].iloc[0]

    # Daily state: covariance rows, then one row per technical indicator.
    indicator_rows = np.array([self.data[name].values for name in self.tech_indicator_list])
    base_state = np.vstack([self.covs, indicator_rows]).astype(np.float32)

    # Append a row filled with the regime for this date (0 when unknown).
    current_regime = 0
    if self.regime_df is not None:
        matches = self.regime_df.loc[self.regime_df.date == first_date, 'future_regime']
        if not matches.empty:
            current_regime = matches.values[0]
    regime_row = np.full((1, base_state.shape[1]), current_regime).astype(np.float32)
    self.state = np.vstack([base_state, regime_row])
    self.state_memory = [self.state] * self.lookback

    # Fresh bookkeeping for the new episode.
    self.terminal = False
    self.portfolio_value = self.initial_amount
    self.asset_memory = [self.initial_amount]
    self.portfolio_return_memory = [0]
    self.actions_memory = [[1 / self.stock_dim] * self.stock_dim]
    self.date_memory = [first_date]
    return np.array(self.state_memory), {}

# Monkey-patch FinRL's StockPortfolioEnv in place: every instance created
# after this point uses the regime-aware overrides defined above.
StockPortfolioEnv.reset = _reset_override
StockPortfolioEnv.step = _step_override
StockPortfolioEnv.__init__ = _init_override
# _seed simply delegates to gym's seeding helper.
StockPortfolioEnv._seed = lambda self, seed=None: seeding.np_random(seed)

## 3. Data Pipeline and HMM Fitting

In [None]:
# Candidate equity universe. 'GS' is listed twice; the set() pass removes the
# duplicate before sorting.
TICKER_LIST = sorted(set([
    'AAPL', 'MSFT', 'NVDA', 'GOOGL', 'META', 'JNJ', 'UNH', 'PFE', 'JPM', 'BAC',
    'GS', 'XOM', 'CVX', 'WMT', 'PG', 'BA', 'CAT', 'AMZN', 'AMD', 'NFLX',
    'V', 'HD', 'MCD', 'KO', 'PEP', 'DIS', 'COST', 'CRM', 'INTC', 'TXN',
    'GE', 'MMM', 'HON', 'C', 'GS', 'MS', 'ABT', 'ABBV', 'MRK',
]))
START_DATE = "2015-01-01"
END_DATE = "2024-01-01"
INITIAL_AMOUNT = 1_000_000

# Equity prices plus the technical indicators used in the RL state.
print("Fetching data...")
df = YahooDownloader(
    start_date=START_DATE,
    end_date=END_DATE,
    ticker_list=TICKER_LIST,
).fetch_data()
fe = FeatureEngineer(
    use_technical_indicator=True,
    tech_indicator_list=["macd", "rsi", "cci", "adx"],
    use_vix=False,
    use_turbulence=False,
)
df = fe.preprocess_data(df)

# Benchmark tickers that feed the regime HMM.
print("Fetching exogenous benchmarks for HMM...")
df_exo = YahooDownloader(
    start_date=START_DATE,
    end_date=END_DATE,
    ticker_list=['SPY', 'DBC', 'LQD', 'EMB', 'TLT', 'TIP'],
).fetch_data()
df_exo = df_exo.sort_values(["date", "tic"]).reset_index(drop=True)

# Keep only the tickers that have the maximum number of rows, so every
# remaining ticker has the same number of observations.
print("Cleaning data...")
df = df.drop_duplicates(subset=["date", "tic"])
counts = df.groupby('tic').size()
max_c = counts.max()
df = df[df.tic.isin(counts[counts == max_c].index.tolist())]
df = df.sort_values(["date", "tic"]).reset_index(drop=True)
TICKER_LIST = sorted(df.tic.unique().tolist())

def add_covariance_matrix(df, lookback=20):
    """Attach a rolling return-covariance matrix to every row of ``df``.

    For each date from position ``lookback`` onwards, the close prices of
    that date and the ``lookback`` preceding dates are turned into
    percentage returns (rows containing NaN are dropped) and their
    covariance across tickers is stored in a ``cov_list`` column.

    Args:
        df: Long frame with ``date``, ``tic`` and ``close`` columns. It is
            not modified.
        lookback: Number of preceding dates in each window.

    Returns:
        A new frame sorted by ``date`` then ``tic`` with a fresh integer
        index. The first ``lookback`` dates are absent because they have no
        full window.
    """
    df = df.sort_values(['date', 'tic'], ignore_index=True)
    unique_dates = df.date.unique()

    cov_list = []
    if len(unique_dates) > lookback:
        # Pivot once to a date x ticker price matrix instead of re-pivoting
        # the long frame for every window. Reindexing guarantees row i is
        # unique_dates[i], so windows can be sliced by position.
        price_matrix = df.pivot_table(index='date', columns='tic', values='close').reindex(unique_dates)
        cov_list = [
            price_matrix.iloc[end - lookback:end + 1].pct_change().dropna().cov().values
            for end in range(lookback, len(unique_dates))
        ]

    df_cov = pd.DataFrame({'date': unique_dates[lookback:], 'cov_list': cov_list})
    return df.merge(df_cov, on='date').sort_values(['date', 'tic']).reset_index(drop=True)

# Add the rolling covariance matrices the environment stacks into its state.
print("Computing covariance matrix...")
df = add_covariance_matrix(df, lookback=20)

# Fit the regime model on the benchmark tickers, label every date, and derive
# the most likely follow-up regime for each label.
print("Fitting HMM...")
hmm_model = MarketRegimeHMM(n_regimes=4)
hmm_model.fit(df_exo)
regime_df = hmm_model.predict(df_exo)
regime_df['future_regime'] = regime_df['regime'].map(hmm_model.predict_next_regime)
plot_regimes(df, regime_df)

## 4. Reinforcement Learning (A2C)

In [None]:
class Actor(nn.Module):
    """Policy network with a shared trunk and one output head per regime.

    The last input feature of every sample is read as the regime index and
    selects which head produces that sample's output. The output is
    ``softplus(head) + 1``, so every value is strictly greater than 1.

    Args:
        input_dim: Size of the flattened observation.
        num_assets: Number of outputs (one concentration value per asset).
        hidden: Width of the two shared hidden layers.
        num_regimes: Number of regime-specific heads.
    """

    def __init__(self, input_dim, num_assets, hidden=256, num_regimes=4):
        super().__init__()
        self.num_regimes = num_regimes
        self.feature_extractor = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
        )
        self.heads = nn.ModuleList([nn.Linear(hidden, num_assets) for _ in range(num_regimes)])

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, num_assets)`` values."""
        # The regime index travels as the last feature of the flattened input.
        regime_indices = x[:, -1].long()
        features = self.feature_extractor(x)
        # Allocate with the features' dtype and device so the per-head writes
        # below never mix precisions.
        out = features.new_zeros(x.shape[0], self.heads[0].out_features)
        for regime, head in enumerate(self.heads):
            mask = regime_indices == regime
            if mask.any():
                out[mask] = head(features[mask])
        # Samples whose regime matches no head keep a zero pre-activation,
        # i.e. the same value for every asset.
        return torch.nn.functional.softplus(out) + 1.0

class Critic(nn.Module):
    """State-value network: two ReLU hidden layers followed by a scalar output."""

    def __init__(self, input_dim, hidden=256):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one value per sample, with the trailing size-1 axis removed."""
        values = self.net(x)
        return values.squeeze(-1)

def train_a2c(env, epochs=100, gamma=0.99, lr=1e-4, value_coef=0.5, entropy_coef=0.01, batch_size=20):
    """Train a Dirichlet-policy actor and a value critic with an A2C-style loss.

    Each step samples portfolio weights from ``Dirichlet(actor(state))`` and
    applies them to ``env``. Once ``batch_size`` transitions are buffered, an
    update runs on the buffered transitions after every step, and the oldest
    transition is then dropped.
    NOTE(review): this makes consecutive updates share all but one
    transition; confirm the sliding window is intended, as opposed to
    clearing the buffer after each update.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step()`` returns ``(obs, reward, done, truncated, info)``. It
            must expose ``observation_space`` and ``action_space``.
        epochs: Number of episodes.
        gamma: Discount factor for the bootstrapped returns.
        lr: Adam learning rate shared by actor and critic.
        value_coef: Weight of the critic loss.
        entropy_coef: Weight of the entropy bonus.
        batch_size: Number of transitions per update.

    Returns:
        ``(actor, critic, rewards_history)`` with one total reward per episode.
    """
    # nn.Linear expects a plain int, not a NumPy integer.
    obs_dim = int(np.prod(env.observation_space.shape))
    act_dim = env.action_space.shape[0]
    actor = Actor(obs_dim, act_dim).to(DEVICE)
    critic = Critic(obs_dim).to(DEVICE)
    optimizer = optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=lr)
    rewards_history = []

    def to_batch(observation):
        # Flatten one observation into a (1, obs_dim) float32 tensor.
        return torch.tensor(observation.flatten(), dtype=torch.float32).unsqueeze(0).to(DEVICE)

    for ep in range(epochs):
        state, _ = env.reset()
        done = False
        ep_reward = 0
        s_buf, w_buf, r_buf, m_buf = [], [], [], []

        while not done:
            s_in = to_batch(state)
            with torch.no_grad():
                weights = torch.distributions.Dirichlet(actor(s_in)).sample()
            action = weights.cpu().numpy()[0]
            next_state, reward, done, _, _ = env.step(action)

            s_buf.append(s_in)
            w_buf.append(weights)
            # Store rewards and masks as float32 so the returns, advantages
            # and loss keep the networks' precision instead of being
            # promoted to float64 by a NumPy reward.
            r_buf.append(torch.tensor([reward], dtype=torch.float32, device=DEVICE))
            m_buf.append(torch.tensor([1.0 - float(done)], dtype=torch.float32, device=DEVICE))
            state = next_state
            ep_reward += reward

            if len(r_buf) >= batch_size:
                b_s = torch.cat(s_buf)
                b_w = torch.cat(w_buf)
                dist_b = torch.distributions.Dirichlet(actor(b_s))
                vals = critic(b_s).reshape(-1)
                log_probs = dist_b.log_prob(b_w)
                ents = dist_b.entropy()

                # Bootstrap from the critic unless the episode has ended; the
                # terminal observation is never fed to the network.
                with torch.no_grad():
                    if done:
                        R = torch.zeros(1, device=DEVICE)
                    else:
                        R = critic(to_batch(next_state)).reshape(1)

                # Discounted returns, accumulated from the newest transition
                # backwards; the mask zeroes the tail after a terminal step.
                rets = []
                for r, m in zip(reversed(r_buf), reversed(m_buf)):
                    R = r + gamma * R * m
                    rets.append(R)
                rets.reverse()
                rets = torch.cat(rets)
                advs = rets - vals

                actor_loss = -(log_probs * advs.detach()).mean()
                critic_loss = advs.pow(2).mean()
                loss = actor_loss + value_coef * critic_loss - entropy_coef * ents.mean()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Slide the window: drop only the oldest transition.
                s_buf.pop(0)
                w_buf.pop(0)
                r_buf.pop(0)
                m_buf.pop(0)

        rewards_history.append(ep_reward)
        if ep % 10 == 0:
            print(f"Episode {ep:03d} | Reward: {ep_reward:.4f}")
    return actor, critic, rewards_history

## 5. Execution

In [None]:
from gym import spaces

# Build the patched environment over the full, regime-labelled data set.
env = StockPortfolioEnv(
    df=df,
    stock_dim=len(TICKER_LIST),
    hmax=100,
    initial_amount=INITIAL_AMOUNT,
    transaction_cost_pct=0.001,
    reward_scaling=1000.0,
    state_space=len(TICKER_LIST),
    action_space=len(TICKER_LIST),
    tech_indicator_list=["macd", "rsi", "cci", "adx"],
    lookback=20,
    day=0,
    regime_df=regime_df,
)
# The observation is the lookback window of 2-D daily states.
env.observation_space = spaces.Box(
    low=-np.inf,
    high=np.inf,
    shape=(env.lookback, env.state.shape[0], env.state.shape[1]),
    dtype=np.float32,
)

actor, critic, rewards = train_a2c(env, epochs=100)

plt.figure(figsize=(10, 6))
plt.plot(rewards)
plt.title('A2C Training Progress')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

# Evaluation: act with the normalised actor output instead of sampling.
print("\n[Evaluating...]")
state, _ = env.reset()
done = False
all_w = []
while not done:
    s = torch.tensor(state.flatten(), dtype=torch.float32).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        alpha = actor(s)
        weights = (alpha / alpha.sum(dim=-1, keepdim=True)).cpu().numpy()[0]
    weights = enforce_portfolio_constraints(weights)
    all_w.append(weights)
    state, _, done, _, _ = env.step(weights)

print(f"Final Portfolio Value: ${env.portfolio_value:,.2f}")
print(f"Total Return: {(env.portfolio_value / INITIAL_AMOUNT - 1) * 100:.2f}%")

print("\nFinal Portfolio Allocation:")
if all_w:
    last_weights = all_w[-1]
    for i, ticker in enumerate(TICKER_LIST):
        print(f"  {ticker:5s}: {last_weights[i]*100:6.2f}%")