In [None]:
import yfinance as yf
import statsmodels.api as sms
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Universe and sample period for the price download.
assets = ["TSLA", "PLTR", "NVDA"]
START = "2021-01-01"
END = "2025-01-01"
# The rolling-window length WINDOW is intentionally not set here: the CONFIG
# block of the next cell defines it, and a second, different value in this cell
# would silently change the features whenever this cell is re-run on its own.

data = yf.download(assets, start=START, end=END)
# Fail early with a readable message instead of deep inside a later cell.
assert not data.empty, "yfinance returned no rows - check tickers, dates and network access"

# Simple daily returns per ticker; the first row is NaN by construction and is dropped.
portfolio_returns = data["Close"].pct_change().dropna()
# Cumulative return of each ticker since the first return date, in percent.
return_cumprod = portfolio_returns.add(1).cumprod().sub(1) * 100
correlation = portfolio_returns.corr()

fig, ax = plt.subplots()
sns.heatmap(correlation, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation of daily returns");

In [None]:
from scipy.optimize import minimize

# -------- CONFIG ----------
# Module-level settings. In the visible cells only n, rebalance_step, WINDOW
# and n_return are read; the remaining names are not referenced there.
# NOTE(review): presumably the unreferenced ones are consumed elsewhere - confirm or remove.
n = 7  # rows of look-back in each max-Sharpe window; also the offset used when attaching targets
seq_len = 200
rebalance_step = 1  # stride, in rows, between consecutive optimisation windows
WINDOW = 30  # rows used for the rolling pair regression and the rolling spread statistics
n_return = 5  # period passed to pct_change for the n-day return feature
BATCH_SIZE = 1
LR = 5e-5
EMBD_SIZE = 8
NUM_HEADS = 2
DIM_FFN = 1024
DROPOUT = 0
CONV1D_EMBD = False
KERNEL_SIZE = 5
GAMMA = 0.9
# --------------------------


class SharpeOptim:
    """Long-only, fully-invested maximum-Sharpe-ratio optimiser.

    Parameters
    ----------
    rfr : float
        Risk-free rate subtracted from the portfolio's mean return. It is
        compared directly with the column means of the returns passed to
        ``max_sharpe``, so it must be expressed per the same period.
        NOTE(review): the default 0.009 looks large for daily returns -
        confirm the intended units.
    min_weight : float
        Positions whose optimised weight is below this value are set to zero
        before the remaining weights are renormalised. Defaults to 0.05.
    """

    def __init__(self, rfr: float = 0.009, min_weight: float = 0.05):
        self.rfr = rfr
        self.min_weight = min_weight

    def neg_sharpe(self, weights):
        """Return the negative Sharpe ratio of ``weights``.

        Uses ``self.mean_returns`` and ``self.cov_mtx``, which ``max_sharpe``
        sets before optimising. Returns a large penalty (1e6) when the
        portfolio volatility is zero or not finite.
        """
        pf_ret = np.dot(weights, self.mean_returns)
        pf_vol = np.sqrt(np.dot(weights.T, np.dot(self.cov_mtx, weights)))
        if not np.isfinite(pf_vol) or pf_vol <= 0:
            return 1e6
        return -(pf_ret - self.rfr) / pf_vol

    def max_sharpe(self, window_returns: pd.DataFrame):
        """Return max-Sharpe weights for the columns of ``window_returns``.

        Parameters
        ----------
        window_returns : pd.DataFrame
            One column of returns per asset.

        Returns
        -------
        np.ndarray
            Non-negative weights, one per column, summing to 1. Equal weights
            are returned when the mean or covariance of the window is not
            finite (for example a window with fewer than two rows) or when
            the optimiser yields no usable solution.
        """
        n_assets = window_returns.shape[1]
        if n_assets == 0:
            return np.empty(0)
        equal_weights = np.full(n_assets, 1.0 / n_assets)

        self.mean_returns = window_returns.mean(axis=0).values
        # A tiny ridge keeps the covariance matrix positive definite.
        self.cov_mtx = window_returns.cov().values + np.eye(n_assets) * 1e-8

        inputs_ok = np.isfinite(self.mean_returns).all() and np.isfinite(self.cov_mtx).all()
        if not inputs_ok:
            return equal_weights

        cons = {"type": "eq", "fun": lambda w: np.sum(w) - 1}
        bounds = tuple((0, 1) for _ in range(n_assets))

        result = minimize(self.neg_sharpe, equal_weights, method="SLSQP",
                          bounds=bounds, constraints=cons)

        raw = result.x if np.isfinite(result.x).all() else equal_weights

        # Drop dust positions, then renormalise what is left.
        pruned = np.where(raw < self.min_weight, 0, raw)
        if pruned.sum() <= 0:
            # Every position fell below the threshold (possible once there are
            # more than 1 / min_weight assets); keep the unpruned solution
            # rather than dividing zero by zero.
            pruned = np.clip(raw, 0.0, None)

        total = pruned.sum()
        if total <= 0:
            return equal_weights
        return pruned / total

# --------------------------
# Rolling / sliding window
# --------------------------
# For every date with at least n earlier rows, compute max-Sharpe weights over
# the n returns ending on that date, then attach them to `data` as "Target".
opt = SharpeOptim(rfr=0.009)

# Keep the cell safe to re-run: discard "Target" columns attached by an earlier
# execution instead of stacking a second copy next to them.
data = data.loc[:, data.columns.get_level_values(0) != "Target"]

weights_history = {}
df = data["Close"]
dates = df.index[n:]
for i in range(0, len(dates), rebalance_step):
    # dates[i] is row i + n of df, so rows i..i+n hold the n + 1 prices
    # (n returns) that end on that date.
    window = df.iloc[i:i + n + 1].pct_change().dropna()
    weights_history[dates[i]] = opt.max_sharpe(window)

weights = pd.DataFrame(weights_history).T
weights.columns = df.columns

# Label the targets in df's own column order. The optimiser returns weights in
# that order, which need not match the order of the `assets` list, so labelling
# with `assets` could attach one ticker's weights to another ticker.
columns = pd.MultiIndex.from_arrays(
    [["Target"] * len(df.columns), list(df.columns)],
    names=["Price", "Ticker"],
)
weights_df = pd.DataFrame(weights.values,
                          index=weights.index,
                          columns=columns)

# Shift only the targets: the row for date t then carries the weights computed
# over the window t..t+n, while prices stay under their own dates. Shifting the
# whole frame would move the prices as well and leave nothing forward-looking.
data = pd.concat([data, weights_df.reindex(data.index).shift(-n)], axis=1)

data["Target"]


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from itertools import combinations
import statsmodels.api as sm
import pandas as pd
import numpy as np


class ModelSelector:
  """Build pair-spread features from prices and fit a regressor on the "Target" weights.

  The input frame must expose "Close" and "Target" column groups keyed by
  ticker. The module-level ``assets``, ``WINDOW`` and ``n_return`` settings
  control which pairs are built and the window lengths used.

  Typical use: ``preprocessing()`` -> ``regressor()`` -> ``train()``.
  """

  def __init__(self, data):
    """Store a NaN-free copy of ``data``; the caller's frame is not modified."""
    self.data = data.dropna()

  def create_features(self):
    """Compute spread features for every pair of ``assets``.

    For each pair, stock_2 is regressed on stock_1 (with intercept) over the
    previous ``WINDOW`` rows, excluding the current row. The fitted alpha and
    beta give the spread for the current row; its rolling mean, rolling
    standard deviation and z-score, plus the ``n_return``-row return of
    stock_2, are added as features.

    The result is stored in ``self.spread_data`` with incomplete rows dropped.
    """
    close = self.data["Close"]
    n_rows = len(close)
    features_list = []

    for stock_1, stock_2 in combinations(assets, 2):
        print(f"Processing pair: {stock_1} - {stock_2}")

        # Pre-sized so the arrays always match the index, even when the frame
        # has fewer than WINDOW rows.
        alpha = np.full(n_rows, np.nan)
        beta = np.full(n_rows, np.nan)

        for i in range(WINDOW, n_rows):
            past_data = close.iloc[i - WINDOW:i]
            exog = sm.add_constant(past_data[stock_1].values)
            fit = sm.OLS(past_data[stock_2].values, exog).fit()
            alpha[i] = fit.params[0]
            beta[i] = fit.params[1]

        spread = close[stock_2] - (alpha + beta * close[stock_1])

        rolling_mean = spread.rolling(WINDOW).mean()
        rolling_std = spread.rolling(WINDOW).std()
        z_score = (spread - rolling_mean) / rolling_std
        n_day_return = close[stock_2].pct_change(n_return)

        pair_features = pd.DataFrame({
            f"{stock_1}_{stock_2}_spread": spread,
            f"{stock_1}_{stock_2}_rolling_mean": rolling_mean,
            f"{stock_1}_{stock_2}_rolling_std": rolling_std,
            f"{stock_1}_{stock_2}_z_score": z_score,
            f"{stock_1}_{stock_2}_return_{n_return}d": n_day_return
        }, index=close.index)

        features_list.append(pair_features)

    self.spread_data = pd.concat(features_list, axis=1).dropna()
    print(self.spread_data.columns)

  def preprocessing(self, test_size=0.2, shuffle=False, random_state=42):
    """Build features, split into train/test sets and standardise them.

    Parameters
    ----------
    test_size : float
        Fraction of rows held out for testing.
    shuffle : bool
        Defaults to False so the test set is the most recent block of rows.
        The features come from overlapping rolling windows, so a shuffled
        split places near-duplicate neighbouring rows on both sides.
        Pass True to restore a random split.
    random_state : int
        Seed for the split; only used when ``shuffle`` is True.

    Sets ``x_train``, ``x_test``, ``y_train``, ``y_test`` and ``scaler``.
    """
    self.create_features()

    x = self.spread_data
    y = self.data["Target"].reindex(x.index).values

    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
        x, y,
        test_size=test_size,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    # Kept on the instance so new data can be transformed the same way later.
    self.scaler = StandardScaler()
    self.x_train = self.scaler.fit_transform(self.x_train)
    self.x_test = self.scaler.transform(self.x_test)

  def regressor(self):
    """Create the randomized hyper-parameter search and store it in ``self.reg``.

    Returns the (unfitted) search object.
    """
    param_grid = {
        "estimator__n_estimators": [200, 300, 500],
        "estimator__max_depth": [5, 10, 15],
        "estimator__min_samples_split": [2, 5, 10],
        "estimator__min_samples_leaf": [1, 2, 3, 5],
        "estimator__max_features": ["sqrt", "log2"]
    }

    self.reg = RandomizedSearchCV(
        estimator=MultiOutputRegressor(RandomForestRegressor(random_state=42)),
        param_distributions=param_grid,
        n_iter=10,
        scoring="r2",
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1
    )
    return self.reg

  def train(self):
    """Fit the search on the training set and report test-set performance.

    Predictions are clipped to [0, 1] and rescaled so each row sums to 1
    before scoring. Returns the fitted search object.
    """
    self.reg.fit(self.x_train, self.y_train)
    y_pred = self.reg.predict(self.x_test)
    y_pred = np.clip(y_pred, 0, 1)

    row_sums = y_pred.sum(axis=1, keepdims=True)
    # A row whose predictions all clip to zero would divide 0 by 0; give it
    # equal weights instead.
    empty_rows = row_sums[:, 0] <= 0
    y_pred[empty_rows] = 1.0
    row_sums[empty_rows] = y_pred.shape[1]
    y_pred = y_pred / row_sums

    print("Best params:", self.reg.best_params_)
    print(r2_score(self.y_test, y_pred))

    for i in range(len(y_pred)):
        print(f"pred: {y_pred[i]}, act: {self.y_test[i]}")

    return self.reg


# Run the pipeline: build features and split, configure the search, then fit.
# Only train() returns the fitted search, so only its result is bound to `model`.
model_selector = ModelSelector(data)
model_selector.preprocessing()
model_selector.regressor()
model = model_selector.train()