In [None]:
pip install hmmlearn pytorch

In [None]:
"""
hmm
"""
import os
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from hmmlearn.hmm import GaussianHMM

# ------------------ Config ------------------
# Tickers to model; one HMM is trained per symbol, and a symbol's position in this
# list is later used as its integer "Asset" id.
SYMBOLS = ["AAPL", "MSFT", "GOOG", "META", "AMZN"]
# Date range and bar interval passed to yfinance's Ticker.history().
START_DATE = "2020-01-01"
END_DATE = "2025-06-27"
PERIOD = "1d"
# Horizons (in rows) for the log-return / volatility / momentum features.
LOOKBACKS = [1, 5, 10, 21]
# Rolling window (in rows) for the price-vs-moving-average z-score.
WINDOW_SIZE = 20
# Number of hidden states in each per-asset GaussianHMM.
HMM_COMPONENTS = 5
# Number of principal components kept before fitting the HMM.
PCA_COMPONENTS = 5
# Output directory for saved regime plots; created when this cell runs.
DATA_DIR = "multi_asset_hmm"
os.makedirs(DATA_DIR, exist_ok=True)

# ------------------ Feature Generator ------------------
def create_features(data):
    """Derive return, volatility, momentum and z-score features from price data.

    Reads the ``Close`` and ``Volume`` columns of ``data`` and, working on a copy,
    adds for every horizon ``d`` in ``LOOKBACKS``:

    * ``Log_Return_d``  - log of Close over Close ``d`` rows earlier;
    * ``Volatility_d``  - rolling std of that log return, only for horizons at an
      odd position (0-based) in ``LOOKBACKS``;
    * ``Momentum_d``    - lagged price difference, only when ``d >= 10``.

    It also adds ``Z_Log_Volume`` (5-row z-score of the previous row's log volume)
    and ``Z_Price_vs_MA`` (z-score of Close against its ``WINDOW_SIZE`` rolling mean).

    Returns:
        A new DataFrame with the extra columns and every row containing a NaN removed.
    """
    frame = data.copy()
    close = frame["Close"]

    for horizon in LOOKBACKS:
        return_col = f"Log_Return_{horizon}"
        frame[return_col] = np.log(close / close.shift(horizon))

        # Volatility is only produced for every second configured horizon.
        at_odd_position = LOOKBACKS.index(horizon) % 2 == 1
        if at_odd_position:
            frame[f"Volatility_{horizon}"] = (
                frame[return_col].rolling(horizon, min_periods=1).std()
            )

        if horizon >= 10:
            # Both terms are shifted by one extra row, so the current Close is not used.
            frame[f"Momentum_{horizon}"] = close.shift(1) - close.shift(horizon + 1)

    # Volume is lagged by one row before being standardised over a 5-row window.
    lagged_log_volume = np.log(frame["Volume"].shift(1) + 1e-6)
    volume_window = lagged_log_volume.rolling(window=5, min_periods=5)
    frame["Z_Log_Volume"] = (lagged_log_volume - volume_window.mean()) / (
        volume_window.std() + 1e-6
    )

    price_window = close.rolling(WINDOW_SIZE)
    frame["Z_Price_vs_MA"] = (close - price_window.mean()) / (price_window.std() + 1e-6)

    return frame.dropna()

# ------------------ HMM Training Function ------------------
def train_asset_hmm(symbol, random_state=42):
    """Download one ticker, build features and fit a Gaussian HMM on their PCA projection.

    Pipeline: yfinance history -> ``create_features`` -> StandardScaler -> PCA
    (``PCA_COMPONENTS``) -> GaussianHMM (``HMM_COMPONENTS`` states). The scaler and
    PCA are fitted on the full downloaded history.

    Args:
        symbol: Ticker symbol understood by yfinance.
        random_state: Seed forwarded to the HMM so repeated runs give the same
            state assignment.

    Returns:
        dict with keys ``"symbol"``, ``"model"`` (fitted GaussianHMM), ``"features"``
        (feature DataFrame with ``Most_Likely_Regime`` and ``Regime_Prob`` added),
        ``"pca"`` and ``"scaler"``.

    Raises:
        ValueError: If no price data is returned or no rows survive feature creation.
    """
    print(f"\n--- {symbol} ---")

    data = yf.Ticker(symbol).history(start=START_DATE, end=END_DATE, interval=PERIOD)
    if data.empty:
        raise ValueError(f"No price data returned for {symbol}")

    # These columns are not always present in the download; dropping them must
    # not fail when they are missing.
    data = data.drop(columns=["Dividends", "Stock Splits"], errors="ignore")
    df = create_features(data)
    if df.empty:
        raise ValueError(f"No usable feature rows for {symbol}")

    feature_keys = ('Log_Return', 'Volatility', 'Momentum', 'Z_')
    feature_cols = [col for col in df.columns if any(key in col for key in feature_keys)]
    X = df[feature_cols]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    pca = PCA(n_components=PCA_COMPONENTS)
    X_pca = pca.fit_transform(X_scaled)

    hmm_model = GaussianHMM(
        n_components=HMM_COMPONENTS,
        covariance_type="full",
        n_iter=400,
        tol=1e-4,
        verbose=False,
        init_params="stmc",
        random_state=random_state,
    )
    hmm_model.fit(X_pca)

    # Posterior state probabilities per row; keep the arg-max state and its probability.
    regime_probs = hmm_model.predict_proba(X_pca)
    most_likely = np.argmax(regime_probs, axis=1)
    confidence = regime_probs[np.arange(len(most_likely)), most_likely]

    df["Most_Likely_Regime"] = most_likely
    df["Regime_Prob"] = confidence

    #plot_regimes(df, symbol)

    return {
        "symbol": symbol,
        "model": hmm_model,
        "features": df,
        "pca": pca,
        "scaler": scaler
    }


# ------------------ Plotting ------------------
def plot_regimes(df, symbol, regime_col="Most_Likely_Regime"):
    """Scatter the Close price coloured by regime and save it as a PNG in ``DATA_DIR``.

    Args:
        df: DataFrame with a ``Close`` column and a regime-label column.
        symbol: Ticker used in the title and in the output file name.
        regime_col: Column holding the regime labels. Defaults to the column
            written by ``train_asset_hmm``; a legacy ``"Regime"`` column is used
            when the requested one is absent.

    Raises:
        KeyError: If neither ``regime_col`` nor ``"Regime"`` exists in ``df``.
    """
    if regime_col not in df.columns:
        if "Regime" not in df.columns:
            raise KeyError(f"No regime column ('{regime_col}' or 'Regime') in DataFrame")
        regime_col = "Regime"

    fig, ax = plt.subplots(figsize=(15, 6))
    for regime in sorted(df[regime_col].unique()):
        mask = df[regime_col] == regime
        ax.plot(df.index[mask], df.loc[mask, "Close"], ".", label=f"Regime {regime}")
    ax.set_title(f"{symbol}: Regimes Detected by HMM")
    ax.set_xlabel("Date")
    ax.set_ylabel("Close Price")
    ax.legend()
    fig.tight_layout()
    fig.savefig(os.path.join(DATA_DIR, f"{symbol}_regimes.png"))
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# ------------------ Main Loop ------------------
# Train one HMM per ticker. A failure for one symbol is reported and skipped so
# that the remaining symbols are still processed.
asset_models = {}

for symbol in SYMBOLS:
    try:
        result = train_asset_hmm(symbol)
    except Exception as e:
        print(f"Failed on {symbol}: {e}")
    else:
        asset_models[symbol] = result


In [None]:
"""
    Regime implementation
"""
# Per-symbol cache of the frames produced by extend_features().
extended_dfs = {}

def extend_features(data, symbol, return_horizon=5, vol_multiplier=0.75):
    """Attach the market-state label and asset id to one asset's feature frame.

    The label ``Market_State_ZScore`` is the forward return over ``return_horizon``
    rows, z-scored against its own rolling window of ``LOOKBACKS[-1]`` rows and
    binned into 0 (z <= -0.5), 1 (-0.5 < z <= 0.5) and 2 (z > 0.5).

    Args:
        data: Frame with at least ``Open``, ``High``, ``Low``, ``Close``, ``Volume``,
            ``Volatility_5`` and ``Most_Likely_Regime`` columns.
        symbol: Ticker; must be in ``SYMBOLS`` (its position becomes ``Asset``).
        return_horizon: Number of rows to look ahead for the forward return. The
            intermediate column names keep a ``_5`` suffix regardless of this value.
        vol_multiplier: Currently unused; kept for interface compatibility.

    Returns:
        A new DataFrame without the raw OHLCV and intermediate columns, with
        ``Regime_Multiplier``, ``Asset`` and ``Market_State_ZScore`` added and
        NaN rows removed. The same frame is stored in ``extended_dfs[symbol]``.
    """
    df = data.copy()

    future_close = df["Close"].shift(-return_horizon)
    df["Forward_Return_5"] = future_close / df["Close"] - 1
    df["Forward_Log_Return_5"] = np.log(future_close / df["Close"])

    # NOTE(review): this map treats the regime id as an ordered intensity level;
    # HMM state numbering is not ordered by construction - confirm this is intended.
    regime_mult_map = {
        0: 0.5,
        1: 0.75,
        2: 1.0,
        3: 1.25,
        4: 1.5
    }
    # Regimes missing from the map default to 1.0. The default is applied to the
    # stored column as well, so such rows are not discarded by dropna() below.
    df["Regime_Multiplier"] = df["Most_Likely_Regime"].map(regime_mult_map).fillna(1.0)
    df["Vol_Adj_Threshold"] = df["Volatility_5"] * df["Regime_Multiplier"]

    # Vectorised Bull/Bear/Neutral label; comparisons with NaN are False, so rows
    # without a forward return fall through to "Neutral".
    df["Market_State_Threshold"] = np.select(
        [
            df["Forward_Return_5"] > df["Vol_Adj_Threshold"],
            df["Forward_Return_5"] < -df["Vol_Adj_Threshold"],
        ],
        ["Bull", "Bear"],
        default="Neutral",
    )

    rolling_window = df["Forward_Return_5"].rolling(LOOKBACKS[-1])
    df["Z_Score"] = (df["Forward_Return_5"] - rolling_window.mean()) / rolling_window.std()

    df["Asset"] = SYMBOLS.index(symbol)

    df["Market_State_ZScore"] = pd.cut(
        df["Z_Score"],
        bins=[-np.inf, -0.5, 0.5, np.inf],
        labels=[0, 1, 2]
    )

    # Remove raw prices and every intermediate/forward-looking column so that only
    # features and the final label remain.
    df = df.drop(
        [
            "Z_Score", "Market_State_Threshold", "Forward_Return_5",
            "Forward_Log_Return_5", "Open", "High", "Low", "Close", "Volume",
            "Vol_Adj_Threshold",
        ],
        axis=1,
    )
    df = df.dropna()

    extended_dfs[symbol] = df

    return df

# Pool the labelled frames of every asset whose HMM was trained successfully.
# Symbols that failed in the training loop are absent from asset_models and are
# skipped here instead of raising KeyError.
general_data_features = []
for symbol in SYMBOLS:
    if symbol not in asset_models:
        print(f"Skipping {symbol}: no trained HMM available")
        continue
    general_data_features.append(extend_features(asset_models[symbol]["features"], symbol))

if not general_data_features:
    raise RuntimeError("No asset produced features; cannot build the pooled dataset")

general_data = pd.concat(general_data_features, axis=0)
# Stable sort keeps rows that share a timestamp in a deterministic (asset) order.
general_data = general_data.sort_index(kind="stable")
general_data["Market_State_ZScore"].value_counts()

In [None]:
"""
    General ML Regime Detection Model (General_Model)
"""
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Layer, Input, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.models import Model
from sklearn.pipeline import Pipeline
from tensorflow.keras import layers
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow as tf
import seaborn as sns
import pandas as pd
import numpy as np
import pickle
import os

# ------------------ Config ------------------
# Number of training epochs passed to General_Model.train__nn below.
EPOCHS = 30
# Used below both as the mini-batch size and as General_Model's seq_length.
BATCH_SIZE = 32
# Learning rate for the Adam optimiser (via train__nn) and for XGBClassifier.
LEARNING_RATE = 1e-3
# One LSTM expert is built per entry; each entry is that expert's base unit count.
MODEL_SHAPES = [16, 16]
# n_components = 
# --------------------------------------------

class AttentionLayer(Layer):
    """Attention pooling: collapse axis 1 of the input into a weighted sum.

    A single-unit tanh Dense layer scores every position along axis 1, the scores
    are soft-maxed over that axis, and the input is summed with those weights.
    In this file it receives the output of a Bidirectional LSTM with
    ``return_sequences=True`` (presumably shaped (batch, time, features)).
    """

    def __init__(self, **kwargs):
        # Forward standard Layer arguments (name, dtype, trainable, ...) so the
        # layer can be named and re-created from its config when a model is reloaded.
        super().__init__(**kwargs)
        self.dense = Dense(1, activation='tanh')

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` over axis 1."""
        scores = self.dense(inputs)
        attention_weights = tf.nn.softmax(scores, axis=1)
        # The weights have a trailing dimension of 1 and broadcast over the last axis.
        return tf.reduce_sum(inputs * attention_weights, axis=1)

def create_sequences(x, y, seq_length=16):
    """Cut sliding windows of ``seq_length`` rows from ``x`` with next-row targets.

    Window ``i`` covers rows ``i .. i + seq_length - 1`` of ``x`` and is paired
    with ``y[i + seq_length]``, i.e. the target of the row that follows the window.

    Args:
        x: Array-like of shape (n_rows, ...) - ndarray, DataFrame or list.
        y: Array-like of length n_rows - ndarray, Series or list.
        seq_length: Number of rows per window.

    Returns:
        Tuple ``(xs, ys)`` of NumPy arrays with ``max(n_rows - seq_length, 0)``
        entries each.
    """
    # Convert up front so indexing is always positional. Integer keys on a pandas
    # Series with a non-integer index (e.g. dates) are not reliably positional.
    x_values = np.asarray(x)
    y_values = np.asarray(y)

    n_windows = len(x_values) - seq_length
    xs = [x_values[start:start + seq_length] for start in range(n_windows)]
    ys = [y_values[start + seq_length] for start in range(n_windows)]
    return np.array(xs), np.array(ys)

class General_Model:
  """Market-state classifier trained on the pooled, multi-asset feature table.

  Two backends share the same train/test preparation:

  * ``model_type="nn"``  - rows are standardised and cut into sliding windows of
    ``seq_length`` rows for an ensemble of bidirectional-LSTM "experts".
  * ``model_type="xgb"`` - rows are used as-is by a StandardScaler + XGBoost pipeline.

  The target is the ``Market_State_ZScore`` column; every other column is a feature.
  """

  def __init__(self, general_data, seq_length=16, model_type="nn"):
    """Split ``general_data`` into train/test sets and shape them for ``model_type``.

    Args:
      general_data: DataFrame with the feature columns plus ``Market_State_ZScore``.
      seq_length: Number of consecutive rows per LSTM input window (``"nn"`` only).
      model_type: ``"nn"`` or ``"xgb"``.

    Raises:
      ValueError: If ``model_type`` is not a supported backend.
    """
    if model_type not in ("nn", "xgb"):
      raise ValueError(f"Unknown model_type {model_type!r}; expected 'nn' or 'xgb'")

    self.seq_length = seq_length
    self.model_type = model_type
    self.feature_cols = general_data.columns.difference(["Market_State_ZScore"])
    gen_model_x = general_data[self.feature_cols]
    gen_model_y = general_data["Market_State_ZScore"]

    # A categorical target is converted to its integer codes.
    if hasattr(gen_model_y, 'cat'):
        gen_model_y = gen_model_y.cat.codes

    # Sequence windows are cut from consecutive rows, so the row order must be
    # preserved for the "nn" backend; only the row-wise "xgb" backend shuffles.
    # NOTE(review): the pooled frame built earlier in this notebook is sorted by
    # date across all assets, so a window can mix rows of different assets -
    # confirm this is intended.
    x_train, x_test, y_train, y_test = train_test_split(
        gen_model_x, gen_model_y, test_size=0.15, shuffle=(model_type != "nn")
    )

    if model_type == "nn":

      gen_model_scaler = StandardScaler()
      x_train = gen_model_scaler.fit_transform(x_train)
      x_test = gen_model_scaler.transform(x_test)

      # gen_model_pca = PCA(n_components=n_components)
      # x_train = gen_model_pca.fit_transform(x_train)
      # x_test = gen_model_pca.transform(x_test)

      # Targets are passed as plain arrays so they are indexed by position.
      self.gen_x_train, self.gen_y_train = create_sequences(x_train, np.asarray(y_train), seq_length)
      self.gen_x_test, self.gen_y_test = create_sequences(x_test, np.asarray(y_test), seq_length)

    else:
      self.gen_x_train, self.gen_y_train = x_train, y_train
      self.gen_x_test, self.gen_y_test = x_test, y_test

  def xgb_init(self):
    """Create the StandardScaler + XGBClassifier pipeline (3-class soft-prob)."""
    self.xgb_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('xgb', XGBClassifier(
          objective='multi:softprob',
          num_class=3,
          n_estimators=1500,
          max_depth=2,
          learning_rate=LEARNING_RATE,
          subsample=0.8,
          colsample_bytree=1,
          min_child_weight=3,
          gamma=1,
          reg_alpha=0.5,
          reg_lambda=1.0,
          verbosity=2,
          use_label_encoder=False,
          eval_metric="mlogloss"
      ))
    ])

  def train_xgb(self):
    """Fit the XGBoost pipeline, print test accuracy and return the pipeline.

    Raises:
      ValueError: If the instance was not created with ``model_type="xgb"``.
    """
    if self.model_type != "xgb":
      raise ValueError("train_xgb() requires General_Model(..., model_type='xgb')")

    self.xgb_init()
    self.xgb_pipeline.fit(self.gen_x_train, self.gen_y_train)

    self.gen_y_pred = self.xgb_pipeline.predict(self.gen_x_test)

    acc = accuracy_score(self.gen_y_test, self.gen_y_pred)

    print(f"Test Accuracy: {acc:.4f}")
    return self.xgb_pipeline

  def init__nn(self, input_shape, lr = 1e-2):
    """Build and compile the LSTM ensemble into ``self.gen_ensemble_model``.

    Args:
      input_shape: Number of time steps per input window.
      lr: Learning rate for the Adam optimiser.
    """
    def build_lstm_model(input_shape, units=64):
        # One expert: two stacked bidirectional LSTMs -> attention pooling -> softmax.
        inputs = Input(shape=input_shape)

        lstm_out = Bidirectional(LSTM(units, return_sequences=True))(inputs)
        lstm_out = Bidirectional(LSTM(units*2, return_sequences=True))(lstm_out)

        attention_out = AttentionLayer()(lstm_out)

        outputs = Dense(3, activation='softmax')(attention_out)
        return Model(inputs, outputs)

    input_shape = (input_shape, len(self.feature_cols))

    experts = [build_lstm_model(input_shape, units=units) for units in MODEL_SHAPES]

    inputs = Input(shape=input_shape)
    expert_outputs = [expert(inputs) for expert in experts]

    # The experts' class probabilities are concatenated and mixed by a final softmax layer.
    combined = Concatenate()(expert_outputs)
    outputs = Dense(3, activation='softmax')(combined)
    self.gen_ensemble_model = Model(inputs, outputs)
    self.gen_ensemble_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                          loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  def train__nn(self, epochs=50, batch_size=16, lr = 1e-2):
    """Build, train and evaluate the LSTM ensemble; return the fitted Keras model.

    The held-out split is used both as validation data during fitting and for
    the final evaluation.

    Raises:
      ValueError: If the instance was not created with ``model_type="nn"``.
    """
    if self.model_type != "nn":
      raise ValueError("train__nn() requires General_Model(..., model_type='nn')")

    # The network's time dimension is the window length used to build the
    # sequences, independent of the mini-batch size.
    self.init__nn(input_shape=self.seq_length, lr = lr)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
    self.gen_ensemble_model.fit(
        self.gen_x_train, self.gen_y_train,
        validation_data=(self.gen_x_test, self.gen_y_test),
        epochs=epochs,
        batch_size=batch_size,
        #callbacks=[reduce_lr],
        verbose=1
    )
    self.gen_ensemble_model.evaluate(
        self.gen_x_test, self.gen_y_test,
        batch_size=batch_size,
        verbose=1
    )
    return self.gen_ensemble_model

  def plot_xgb(self):
    """Print the classification report and show the confusion-matrix heat map.

    Raises:
      RuntimeError: If ``train_xgb()`` has not been run yet.
    """
    if not hasattr(self, "gen_y_pred"):
      raise RuntimeError("Call train_xgb() before plot_xgb()")

    print("Classification Report:")
    print(classification_report(self.gen_y_test, self.gen_y_pred))

    # Use one sorted label list for both the matrix and the tick labels so the
    # axes always match the matrix rows and columns.
    labels = np.unique(
        np.concatenate([np.asarray(self.gen_y_test), np.asarray(self.gen_y_pred)])
    )
    cm = confusion_matrix(self.gen_y_test, self.gen_y_pred, labels=labels)
    plt.figure(figsize=(6,5))

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.show()

# Build the pooled model with the default "nn" backend. The window length is set
# to BATCH_SIZE here, and the same value is passed as the batch size below.
gem_model = General_Model(general_data, seq_length=BATCH_SIZE)
# xgb_pipeline = gem_model.train_xgb()
# Train the LSTM ensemble and keep the fitted Keras model.
ensemble_model = gem_model.train__nn(epochs=EPOCHS, batch_size=BATCH_SIZE, lr = LEARNING_RATE)
# gem_model.plot_xgb()

In [None]:
"""
    Asset-Specific (Residual_Model_A) ML Regime Detection Model
"""

from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import yfinance as yf
import pandas as pd
import numpy as np
import pickle
import torch
import os


class Residual_Model_A(nn.Module):
    """Placeholder for the asset-specific residual model; no layers are defined yet."""

    def __init__(self):
        # nn.Module keeps its parameter/sub-module registries on the instance, so
        # the base initialiser must run before the object can be used.
        super().__init__()

res_model = Residual_Model_A()

In [None]:
"""
     final output General_Model(x) + Residual_Model_A(x)
"""

# NOTE(review): `gen_model` and `x` are not defined in any cell shown here (the
# objects created earlier are `gem_model` and `ensemble_model`) - confirm the
# intended names and input before running this cell.
out = gen_model(x) + res_model(x)

In [None]:
"""
    Monte Carlo simulation stress test
"""

                     +-------------------+
                     | Raw Market Data   |
                     | (multi-asset)     |
                     +--------+----------+
                              |
                     +--------v----------+
                     | Feature Generator |
                     | - Rolling stats   |
                     | - Regime probs    |
                     +--------+----------+
                              |
               +--------------v-------------+
               | General Market State Model |
               | (Trained on pooled data)   |
               +--------------+-------------+
                              |
        +---------------------v---------------------+
        | For each asset A:                         |
        | - Compute residuals                       |
        | - Train Residual_Model_A                  |
        |   (on asset-specific patterns)            |
        +---------------------+---------------------+
                              |
          +-------------------v--------------------+
          | Final Prediction for asset A:          |
          | General_Model(x) + Residual_Model_A(x) |
          +-------------------+--------------------+

                          ↓
          +------------------------------------+
          | Monte Carlo Simulation / Scenarios |
          | (for stress-testing and robustness)|
          +------------------------------------+

                          ↓
          +----------------+------------------+
          | Evaluation & Metrics               |
          | - Asset-level & portfolio-level    |
          +------------------------------------+
