## Objective of the Test

This test explores the effectiveness of using **historical returns vs. forecasted returns** in portfolio modeling and prediction. Additionally, it compares models that rely solely on **price history** with those that incorporate **alternative data** (macroeconomic indicators and Google Trends). The goal is to assess whether incorporating forecasted returns and external signals provides a measurable advantage over traditional methods based only on past prices.


The time interval was chosen because of a data constraint: weekly Google Trends data can only be downloaded for the past five years.

In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
import random
import warnings
warnings.filterwarnings("ignore")

# Fix the seed so the randomly drawn asset subsets are reproducible
random.seed(10)

# Read the merged dataset; the first CSV column is parsed as the date index
merged = pd.read_csv("merged.csv", index_col=0, parse_dates=True)
df = merged

# Chronological split: rows before 2024 are used to fit, the rest to evaluate
train = df.loc[df.index < "2024-01-01"]
test = df.loc[df.index >= "2024-01-01"]

# Weekly risk-free rate for each period
train_rf = train['risk_free_rate_weekly']
test_rf = test['risk_free_rate_weekly']

# Every column other than the risk-free rate is treated as an asset return series
returns_columns = [name for name in df.columns if name != "risk_free_rate_weekly"]

# One entry per successfully optimized portfolio
all_returns, all_stds, all_sharpes = [], [], []

# Draw 1000 random portfolios of 6-10 assets and evaluate each on the test period
for _ in range(1000):
    portfolio_size = random.randint(6, 10)
    selected_assets = random.sample(returns_columns, k=portfolio_size)

    # Infinities are treated as missing; only the training window drops incomplete rows
    train_returns = train[selected_assets].replace([np.inf, -np.inf], np.nan).dropna()
    test_returns = test[selected_assets].replace([np.inf, -np.inf], np.nan)
    test_returns = test_returns[train_returns.columns]

    if train_returns.empty or test_returns.empty:
        continue

    # Annualize the weekly estimates (52 periods per year)
    mu = train_returns.mean() * 52
    cov = train_returns.cov() * 52
    rf = train_rf.mean() * 52

    n = len(mu)
    x0 = np.full(n, 1 / n)  # start from equal weights
    bounds = ((0, 1),) * n  # no short-selling: every weight stays in [0, 1]
    constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}  # weights sum to 1

    def negative_sharpe(w):
        """Return the negated in-sample Sharpe ratio for weights ``w``."""
        excess = np.dot(w, mu) - rf
        volatility = np.sqrt(np.dot(w.T, np.dot(cov, w)))
        return -(excess / volatility)

    try:
        result = minimize(negative_sharpe, x0, method='SLSQP',
                          bounds=bounds, constraints=constraints)

        # Portfolios whose optimization did not converge are not recorded
        if not result.success:
            continue

        weights = result.x
        test_portfolio_returns = test_returns.dot(weights)
        test_excess_returns = test_portfolio_returns - test_rf.loc[test_returns.index].values

        # Compound the mean weekly excess return; scale volatility by sqrt(52)
        ann_return = (1 + test_excess_returns.mean()) ** 52 - 1
        ann_std = test_excess_returns.std() * np.sqrt(52)
        sharpe = ann_return / ann_std

        all_returns.append(ann_return)
        all_stds.append(ann_std)
        all_sharpes.append(sharpe)

    except Exception:
        continue

# Summarize the simulated portfolios by their medians
median_return = np.median(all_returns)
median_std = np.median(all_stds)
median_sharpe = np.median(all_sharpes)

# Bare expression: displayed as the cell result
{
    "Median Annualized Return": np.round(median_return, 4),
    "Median Annualized Volatility": np.round(median_std, 4),
    "Median Sharpe Ratio": np.round(median_sharpe, 4)
}



{'Median Annualized Return': np.float64(0.4279),
 'Median Annualized Volatility': np.float64(0.3414),
 'Median Sharpe Ratio': np.float64(1.3252)}

In [None]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import ParameterGrid
from scipy.optimize import minimize
import numpy as np
import pandas as pd
import random

# Reuse the merged dataset loaded earlier in the notebook
df = merged

# Asset return columns: everything except the weekly risk-free rate
returns_columns = [name for name in df.columns if name != "risk_free_rate_weekly"]

# Chronological split at the start of 2024
train_df = df.loc[df.index < "2024-01-01"]
test_df = df.loc[df.index >= "2024-01-01"]
test_rf = test_df['risk_free_rate_weekly']

# Candidate values explored by the GRU grid search
param_grid = dict(
    units=[16, 32],
    dropout=[0.1, 0.2],
    recurrent_dropout=[0.1],
    batch_size=[8],
    epochs=[30],
    lr=[0.001, 0.0005],
    seq_len=[20],
)

def create_sequences(data, seq_length):
    """Build sliding-window inputs and next-step targets from ``data``.

    Args:
        data: Sequence supporting ``len``, slicing and integer indexing.
        seq_length: Number of consecutive items in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays where ``X[i]`` is
        ``data[i:i + seq_length]`` and ``y[i]`` is ``data[i + seq_length]``.
        Both arrays are empty when ``len(data) <= seq_length``.
    """
    starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in starts]
    targets = [data[start + seq_length] for start in starts]
    return np.array(windows), np.array(targets)

# Grid search: for every hyperparameter combination, fit one GRU per asset,
# forecast 52 steps ahead, optimize a max-Sharpe portfolio on the forecasts and
# keep the combination whose portfolio scores the highest Sharpe ratio.
#
# NOTE(review): the winning combination is chosen by its Sharpe ratio on
# test_df, i.e. on the same period later used for evaluation. A validation
# window carved out of train_df would avoid that selection bias -- confirm
# whether this is intended.
returns_columns = [col for col in train_df.columns if col != 'risk_free_rate_weekly']
test_rf = test_df['risk_free_rate_weekly']

# Annualized risk-free rate from the training window only, so the optimizer does
# not see test-period data (same convention as the other simulation cells).
# Computed once instead of on every objective evaluation.
rf = train_df['risk_free_rate_weekly'].mean() * 52

best_sharpe = -np.inf
best_params = None

for params in ParameterGrid(param_grid):
    predicted_returns = {}
    valid_assets = []

    for asset in returns_columns:
        series = train_df[asset].dropna().values.reshape(-1, 1)
        if len(series) < 60:
            continue

        # Smooth with a 3-period rolling mean, then scale to [0, 1]
        series = pd.Series(series.flatten()).rolling(3, min_periods=1).mean().values.reshape(-1, 1)
        scaler = MinMaxScaler()
        scaled_series = scaler.fit_transform(series)

        X, y = create_sequences(scaled_series, params['seq_len'])
        if len(X) == 0:
            continue

        try:
            # Single-layer GRU followed by a one-unit regression head
            model = Sequential([
                GRU(params['units'], input_shape=(params['seq_len'], 1),
                    dropout=params['dropout'], recurrent_dropout=params['recurrent_dropout']),
                Dense(1)
            ])
            model.compile(optimizer=Adam(learning_rate=params['lr']), loss='mse')
            model.fit(X, y, epochs=params['epochs'], batch_size=params['batch_size'], verbose=0,
                      callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])

            # Recursive 52-step forecast: each prediction is fed back as the newest input
            last_seq = scaled_series[-params['seq_len']:]
            preds = []
            for _ in range(52):
                input_seq = last_seq.reshape(1, params['seq_len'], 1)
                pred = model.predict(input_seq, verbose=0)
                preds.append(pred[0][0])
                last_seq = np.append(last_seq[1:], pred).reshape(params['seq_len'], 1)

            # Map the forecasts back to the original (unscaled) units
            preds = scaler.inverse_transform(np.array(preds).reshape(-1, 1)).flatten()
            predicted_returns[asset] = preds
            valid_assets.append(asset)
        except Exception as exc:
            # Skip this asset but report why; `except Exception` (unlike a bare
            # `except:`) still lets KeyboardInterrupt stop the long-running search.
            print(f"Skipping {asset} for {params}: {exc!r}")
            continue

    # Optimize a portfolio only if enough assets produced a forecast
    if len(valid_assets) < 6:
        continue

    try:
        mu = pd.DataFrame(predicted_returns).mean().values * 52
        cov = train_df[valid_assets].cov().values * 52
        n = len(valid_assets)
        x0 = np.array([1/n] * n)
        bounds = tuple((0, 1) for _ in range(n))  # no short-selling
        constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}  # weights sum to 1

        result = minimize(
            lambda w: -((np.dot(w, mu) - rf) /
                        np.sqrt(np.dot(w.T, np.dot(cov, w)))),
            x0, method='SLSQP', bounds=bounds, constraints=constraints
        )

        if result.success:
            weights = result.x
            test_returns = test_df[valid_assets].dot(weights)
            excess = test_returns - test_rf.loc[test_returns.index].values
            ann_return = (1 + excess.mean()) ** 52 - 1
            ann_std = excess.std() * np.sqrt(52)
            sharpe = ann_return / ann_std

            if sharpe > best_sharpe:
                best_sharpe = sharpe
                best_params = params
    except Exception as exc:
        print(f"Portfolio optimization failed for {params}: {exc!r}")
        continue

print("Best GRU Hyperparameters:", best_params)


Best GRU Hyperparameters: {'batch_size': 8, 'dropout': 0.1, 'epochs': 30, 'lr': 0.001, 'recurrent_dropout': 0.1, 'seq_len': 20, 'units': 32}

In [None]:
# GRU-based Simulation with Best Hyperparameters

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from scipy.optimize import minimize
import random

# Reuse the merged dataset loaded earlier in the notebook
df = merged

# Asset columns (everything but the risk-free rate) and the 2024 train/test split
returns_columns = [name for name in df.columns if name != "risk_free_rate_weekly"]
train_df = df.loc[df.index < "2024-01-01"]
test_df = df.loc[df.index >= "2024-01-01"]
test_rf = test_df['risk_free_rate_weekly']

# Helper that turns a series into GRU training windows
def create_sequences(data, seq_length):
    """Pair every window of ``seq_length`` items with the item that follows it.

    Args:
        data: Sequence supporting ``len``, slicing and integer indexing.
        seq_length: Number of consecutive items in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays where ``X[i]`` is
        ``data[i:i + seq_length]`` and ``y[i]`` is ``data[i + seq_length]``.
        Both arrays are empty when ``len(data) <= seq_length``.
    """
    pairs = [
        (data[start:start + seq_length], data[start + seq_length])
        for start in range(len(data) - seq_length)
    ]
    windows = [window for window, _ in pairs]
    targets = [target for _, target in pairs]
    return np.array(windows), np.array(targets)

# Fixed GRU configuration; these values match the grid-search result printed above
GRU_PARAMS = dict(
    units=32,
    dropout=0.1,
    recurrent_dropout=0.1,
    batch_size=8,
    epochs=30,
    lr=0.001,
    seq_len=20,
)

# Run 10 simulations, each on a random subset of 6-10 assets
all_returns, all_stds, all_sharpes = [], [], []
random.seed(10)

for _ in range(10):
    selected_assets = random.sample(returns_columns, k=random.randint(6, 10))
    predicted_returns = {}
    valid_assets = []

    for asset in selected_assets:
        history = train_df[asset].dropna()
        if len(history) < 60:
            continue

        # Smooth with a 3-period rolling mean, then scale to [0, 1]
        series = history.rolling(window=3, min_periods=1).mean().values.reshape(-1, 1)
        scaler = MinMaxScaler()
        scaled_series = scaler.fit_transform(series)

        # Sliding windows and their next-step targets
        seq_len = GRU_PARAMS['seq_len']
        X, y = create_sequences(scaled_series, seq_len)

        # Single-layer GRU followed by a one-unit regression head
        gru_layer = GRU(
            GRU_PARAMS['units'],
            input_shape=(X.shape[1], 1),
            dropout=GRU_PARAMS['dropout'],
            recurrent_dropout=GRU_PARAMS['recurrent_dropout'],
        )
        model = Sequential([gru_layer, Dense(1)])
        model.compile(optimizer=Adam(learning_rate=GRU_PARAMS['lr']), loss='mse')
        model.fit(
            X, y,
            epochs=GRU_PARAMS['epochs'],
            batch_size=GRU_PARAMS['batch_size'],
            verbose=0,
            callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
        )

        # Recursive 52-step forecast: each prediction becomes the newest input
        last_seq = scaled_series[-seq_len:]
        preds = []
        for _step in range(52):
            pred = model.predict(last_seq.reshape(1, seq_len, 1), verbose=0)[0][0]
            preds.append(pred)
            last_seq = np.append(last_seq[1:], [[pred]], axis=0)

        # Map the forecasts back to the original (unscaled) units
        preds = scaler.inverse_transform(np.array(preds).reshape(-1, 1)).flatten()
        predicted_returns[asset] = preds
        valid_assets.append(asset)

    # Require at least six forecastable assets to form a portfolio
    if len(valid_assets) < 6:
        continue

    # Expected returns from the forecasts; covariance and risk-free rate from training data
    mu = pd.DataFrame(predicted_returns).mean().values * 52
    cov = train_df[valid_assets].cov().values * 52
    rf = train_df['risk_free_rate_weekly'].mean() * 52

    n = len(valid_assets)
    x0 = np.full(n, 1 / n)  # start from equal weights
    bounds = ((0, 1),) * n  # every weight stays in [0, 1]
    constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}  # weights sum to 1

    def negative_sharpe(w):
        """Return the negated forecast-based Sharpe ratio for weights ``w``."""
        return -((np.dot(w, mu) - rf) / np.sqrt(np.dot(w.T, np.dot(cov, w))))

    try:
        result = minimize(negative_sharpe, x0, method='SLSQP',
                          bounds=bounds, constraints=constraints)

        # Runs whose optimization did not converge are not recorded
        if not result.success:
            continue

        weights = result.x
        test_returns = test_df[valid_assets].dot(weights)
        test_excess_returns = test_returns - test_rf.loc[test_returns.index].values

        # Compound the mean weekly excess return; scale volatility by sqrt(52)
        ann_return = (1 + test_excess_returns.mean()) ** 52 - 1
        ann_std = test_excess_returns.std() * np.sqrt(52)
        sharpe = ann_return / ann_std

        all_returns.append(ann_return)
        all_stds.append(ann_std)
        all_sharpes.append(sharpe)

    except Exception:
        continue

# Report the medians across the simulated portfolios
summary = {
    "Median Annualized Return": np.round(np.median(all_returns), 4),
    "Median Annualized Volatility": np.round(np.median(all_stds), 4),
    "Median Sharpe Ratio": np.round(np.median(all_sharpes), 4)
}
print(summary)


S&P 500 results, used as a benchmark for comparison

In [None]:
import yfinance as yf
import numpy as np
import pandas as pd

# Fetch S&P 500 index (^GSPC) prices for the 2024 comparison window
sp500 = yf.download('^GSPC', start='2024-01-01', end='2024-12-31', progress=False)

# Day-over-day percentage change of the closing price; the first row has no prior value
sp500['Daily Return'] = sp500['Close'].pct_change()
daily_returns = sp500['Daily Return'].dropna()

# Annualize using 252 periods per year
periods_per_year = 252
mean_daily_return = daily_returns.mean()
annualized_return = (1 + mean_daily_return) ** periods_per_year - 1
annualized_volatility = daily_returns.std() * np.sqrt(periods_per_year)

# Sharpe ratio against a fixed 4.5% annual risk-free rate
risk_free_rate = 0.045
excess_return = annualized_return - risk_free_rate
sharpe_ratio = excess_return / annualized_volatility

# Report the benchmark metrics
print(f"Annualized Return: {annualized_return:.4%}")
print(f"Annualized Volatility: {annualized_volatility:.4%}")
print(f"Sharpe Ratio: {sharpe_ratio:.4f}")

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize
import warnings
warnings.filterwarnings("ignore")

# Read the alternative-data table, indexed by its "Date" column
df_alt = pd.read_csv("df_alt.csv", index_col="Date", parse_dates=True)

# Column groups: asset returns, macro indicators and per-asset trend series
returns_columns = [
    'ADBE', 'AVGO', 'BRK-B', 'COST', 'DECK', 'HD', 'HUBB', 'INTC', 'JNJ',
    'KO', 'MANH', 'MRK', 'NDSN', 'PEP', 'PG', 'POOL', 'RPM', 'SMCI', 'STLD', 'WLK'
]
macro_columns = ['FedFundsRate', 'CPI', 'WeeklyInflationRate', 'UnemploymentRate']
trend_columns = [ticker + "_trend" for ticker in returns_columns]

# Chronological split at the start of 2024
train_df = df_alt.loc[df_alt.index < "2024-01-01"]
test_df = df_alt.loc[df_alt.index >= "2024-01-01"]

# Candidate values sampled by the randomized search
param_dist = dict(
    n_estimators=[100, 150, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

best_params = {}  # asset -> best parameter set found for that asset

for asset in returns_columns:
    # Features: the asset's own return, its trend series and the macro indicators
    cols = [asset, f"{asset}_trend", *macro_columns]

    # Target is the next row's return; rows with any missing value are dropped
    df_feat = (
        train_df[cols]
        .assign(target=lambda frame: frame[asset].shift(-1))
        .dropna()
    )

    # Skip assets with fewer than 80 usable training rows
    if len(df_feat) < 80:
        continue

    y = df_feat["target"].values
    X = df_feat[cols].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Randomized search: 10 sampled configurations, 3-fold cross-validation
    model = RandomForestRegressor(random_state=10)
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=10,
        cv=3,
        n_jobs=-1,
        random_state=10,
    )
    search.fit(X_scaled, y)
    best_params[asset] = search.best_params_

# Show the selected parameters for each asset
print("\n Best Parameters Used:")
for asset, params in best_params.items():
    print(f"{asset}: {params}")


### Portfolio optimization with RF


In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize
import warnings
warnings.filterwarnings("ignore")

# Read the alternative-data table, indexed by its "Date" column
df_alt = pd.read_csv("/content/df_alt.csv", index_col="Date", parse_dates=True)

# Column groups: asset returns, macro indicators and per-asset trend series
returns_columns = [
    'ADBE', 'AVGO', 'BRK-B', 'COST', 'DECK', 'HD', 'HUBB', 'INTC', 'JNJ',
    'KO', 'MANH', 'MRK', 'NDSN', 'PEP', 'PG', 'POOL', 'RPM', 'SMCI', 'STLD', 'WLK'
]
macro_columns = ['FedFundsRate', 'CPI', 'WeeklyInflationRate', 'UnemploymentRate']
trend_columns = [ticker + "_trend" for ticker in returns_columns]

# Chronological split at the start of 2024, plus the test-period risk-free rate
train_df = df_alt.loc[df_alt.index < "2024-01-01"]
test_df = df_alt.loc[df_alt.index >= "2024-01-01"]
test_rf = test_df["risk_free_rate_weekly"]

# Candidate values sampled by the randomized search
param_dist = dict(
    n_estimators=[100, 150, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)
best_params = {}  # asset -> best parameter set found for that asset

for asset in returns_columns:
    # Features: the asset's own return, its trend series and the macro indicators
    cols = [asset, f"{asset}_trend", *macro_columns]

    # Target is the next row's return; rows with any missing value are dropped
    df_feat = (
        train_df[cols]
        .assign(target=lambda frame: frame[asset].shift(-1))
        .dropna()
    )

    # Skip assets with fewer than 80 usable training rows
    if len(df_feat) < 80:
        continue

    y = df_feat["target"].values
    X = df_feat[cols].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Randomized search: 10 sampled configurations, 3-fold cross-validation
    model = RandomForestRegressor(random_state=10)
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=10,
        cv=3,
        n_jobs=-1,
        random_state=10,
    )
    search.fit(X_scaled, y)
    best_params[asset] = search.best_params_

# Forecasting and optimization
def clip_predictions(preds, lower=-0.01, upper=0.01):
    """Limit every prediction to the closed interval ``[lower, upper]``.

    Args:
        preds: Array-like of predicted values.
        lower: Smallest value allowed in the result.
        upper: Largest value allowed in the result.

    Returns:
        The result of ``np.clip`` applied to ``preds`` with the given limits.
    """
    clipped = np.clip(preds, lower, upper)
    return clipped

all_returns, all_stds, all_sharpes = [], [], []
random.seed(10)

# Annualized risk-free rate from the training window only, so the optimizer does
# not see test-period data (same convention as the baseline and GRU cells).
rf = train_df["risk_free_rate_weekly"].mean() * 52

# Fit each asset's forest once. Training data, parameters and random_state do not
# depend on the simulation index, so refitting inside the simulation loop would
# only repeat identical work.
#
# NOTE(review): the forecasts are produced from test-period feature rows (which
# include the asset's own return column), so the expected returns fed to the
# optimizer draw on the evaluation window -- confirm this is intended.
asset_forecasts = {}
for asset in best_params:
    cols = [asset, f"{asset}_trend"] + macro_columns
    df_feat = train_df[cols].copy()
    df_feat["target"] = df_feat[asset].shift(-1)  # next row's return
    df_feat.dropna(inplace=True)

    if len(df_feat) < 80:
        continue

    X = df_feat.drop(columns="target").values
    y = df_feat["target"].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    model = RandomForestRegressor(**best_params[asset], random_state=10, max_samples=0.8)
    model.fit(X_scaled, y)

    # Only keep assets whose test features are complete for every test row
    test_features = test_df[cols].dropna()
    if len(test_features) < len(test_df):
        continue

    X_test_scaled = scaler.transform(test_features.values)
    asset_forecasts[asset] = clip_predictions(model.predict(X_test_scaled))

# 100 random portfolios of 6-10 assets drawn from the tuned assets
for _ in range(100):
    selected_assets = random.sample(list(best_params.keys()), k=random.randint(6, 10))
    valid_assets = [name for name in selected_assets if name in asset_forecasts]

    if len(valid_assets) < 6:
        continue

    predicted_returns = {name: asset_forecasts[name] for name in valid_assets}

    # Expected returns from the forecasts; covariance from training returns
    mu = pd.DataFrame(predicted_returns).mean().values * 52
    cov = train_df[valid_assets].cov().values * 52

    n = len(mu)
    x0 = np.array([1 / n] * n)
    bounds = tuple((0, 1) for _ in range(n))  # no short-selling
    constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}  # weights sum to 1

    try:
        result = minimize(
            lambda w: -((np.dot(w, mu) - rf) / np.sqrt(np.dot(w.T, np.dot(cov, w)))),
            x0, method='SLSQP', bounds=bounds, constraints=constraints
        )

        if result.success:
            weights = result.x
            test_returns = test_df[valid_assets].dot(weights)
            test_excess_returns = test_returns - test_rf.loc[test_returns.index].values

            # Compound the mean weekly excess return; scale volatility by sqrt(52)
            ann_return = (1 + test_excess_returns.mean()) ** 52 - 1
            ann_std = test_excess_returns.std() * np.sqrt(52)
            sharpe = ann_return / ann_std

            all_returns.append(ann_return)
            all_stds.append(ann_std)
            all_sharpes.append(sharpe)
    except Exception as exc:
        # Skip this portfolio but report why; `except Exception` (unlike a bare
        # `except:`) still lets KeyboardInterrupt stop the run.
        print(f"Portfolio optimization failed for {valid_assets}: {exc!r}")
        continue

# Medians across the simulated portfolios
results = {
    "Median Annualized Return": np.round(np.median(all_returns), 4),
    "Median Annualized Volatility": np.round(np.median(all_stds), 4),
    "Median Sharpe Ratio": np.round(np.median(all_sharpes), 4)
}

results

### Portfolio Optimization with XGB

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from scipy.optimize import minimize
import warnings
warnings.filterwarnings("ignore")

# Read the alternative-data table, indexed by its "Date" column
df_alt = pd.read_csv("/content/df_alt.csv", index_col="Date", parse_dates=True)

# Column groups: asset returns and macro indicators
returns_columns = [
    'ADBE', 'AVGO', 'BRK-B', 'COST', 'DECK', 'HD', 'HUBB', 'INTC', 'JNJ',
    'KO', 'MANH', 'MRK', 'NDSN', 'PEP', 'PG', 'POOL', 'RPM', 'SMCI', 'STLD', 'WLK'
]
macro_columns = ['FedFundsRate', 'CPI', 'WeeklyInflationRate', 'UnemploymentRate']

# Chronological split at the start of 2024, plus the test-period risk-free rate
train_df = df_alt.loc[df_alt.index < "2024-01-01"]
test_df = df_alt.loc[df_alt.index >= "2024-01-01"]
test_rf = test_df["risk_free_rate_weekly"]

# Candidate XGBoost values sampled by the randomized search
param_dist = dict(
    n_estimators=[50, 100, 150],
    max_depth=[2, 3, 4, 5],
    learning_rate=[0.01, 0.05, 0.1],
)
best_params = {}  # asset -> best parameter set found for that asset

for asset in returns_columns:
    # Features: the asset's own return, its trend series and the macro indicators
    cols = [asset, f"{asset}_trend", *macro_columns]

    # Target is the next row's return; rows with any missing value are dropped
    df_feat = (
        train_df[cols]
        .assign(target=lambda frame: frame[asset].shift(-1))
        .dropna()
    )

    # Skip assets with fewer than 80 usable training rows
    if len(df_feat) < 80:
        continue

    y = df_feat["target"].values
    X = df_feat[cols].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Randomized search: 10 sampled configurations, 3-fold cross-validation
    model = XGBRegressor(random_state=10)
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=10,
        cv=3,
        n_jobs=-1,
        random_state=10,
    )
    search.fit(X_scaled, y)
    best_params[asset] = search.best_params_

# Forecasting and optimization
def clip_predictions(preds, lower=-0.01, upper=0.01):
    """Bound predictions so that none falls outside ``[lower, upper]``.

    Args:
        preds: Array-like of predicted values.
        lower: Smallest value allowed in the result.
        upper: Largest value allowed in the result.

    Returns:
        The result of ``np.clip`` applied to ``preds`` with the given limits.
    """
    bounded = np.clip(preds, lower, upper)
    return bounded

all_returns, all_stds, all_sharpes = [], [], []
random.seed(10)

# Annualized risk-free rate from the training window only, so the optimizer does
# not see test-period data (same convention as the baseline and GRU cells).
rf = train_df["risk_free_rate_weekly"].mean() * 52

# Fit each asset's model once. Training data, parameters and random_state do not
# depend on the simulation index, so refitting inside the simulation loop would
# only repeat identical work.
#
# NOTE(review): the forecasts are produced from test-period feature rows (which
# include the asset's own return column), so the expected returns fed to the
# optimizer draw on the evaluation window -- confirm this is intended.
asset_forecasts = {}
for asset in best_params:
    cols = [asset, f"{asset}_trend"] + macro_columns
    df_feat = train_df[cols].copy()
    df_feat["target"] = df_feat[asset].shift(-1)  # next row's return
    df_feat.dropna(inplace=True)

    if len(df_feat) < 80:
        continue

    X = df_feat.drop(columns="target").values
    y = df_feat["target"].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    model = XGBRegressor(**best_params[asset], random_state=10)
    model.fit(X_scaled, y)

    # Only keep assets whose test features are complete for every test row
    test_features = test_df[cols].dropna()
    if len(test_features) < len(test_df):
        continue

    X_test_scaled = scaler.transform(test_features.values)
    asset_forecasts[asset] = clip_predictions(model.predict(X_test_scaled))

# 100 random portfolios of 6-10 assets drawn from the tuned assets
for _ in range(100):
    selected_assets = random.sample(list(best_params.keys()), k=random.randint(6, 10))
    valid_assets = [name for name in selected_assets if name in asset_forecasts]

    if len(valid_assets) < 6:
        continue

    predicted_returns = {name: asset_forecasts[name] for name in valid_assets}

    # Expected returns from the forecasts; covariance from training returns
    mu = pd.DataFrame(predicted_returns).mean().values * 52
    cov = train_df[valid_assets].cov().values * 52

    n = len(mu)
    x0 = np.array([1 / n] * n)
    bounds = tuple((0, 1) for _ in range(n))  # no short-selling
    constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}  # weights sum to 1

    try:
        result = minimize(
            lambda w: -((np.dot(w, mu) - rf) / np.sqrt(np.dot(w.T, np.dot(cov, w)))),
            x0, method='SLSQP', bounds=bounds, constraints=constraints
        )

        if result.success:
            weights = result.x
            test_returns = test_df[valid_assets].dot(weights)
            test_excess_returns = test_returns - test_rf.loc[test_returns.index].values

            # Compound the mean weekly excess return, as the other simulation
            # cells do, so the reported metrics are directly comparable.
            ann_return = (1 + test_excess_returns.mean()) ** 52 - 1
            ann_std = test_excess_returns.std() * np.sqrt(52)
            sharpe = ann_return / ann_std

            all_returns.append(ann_return)
            all_stds.append(ann_std)
            all_sharpes.append(sharpe)
    except Exception as exc:
        # Skip this portfolio but report why; `except Exception` (unlike a bare
        # `except:`) still lets KeyboardInterrupt stop the run.
        print(f"Portfolio optimization failed for {valid_assets}: {exc!r}")
        continue

# Medians across the simulated portfolios
results = {
    "Median Annualized Return": np.round(np.median(all_returns), 4),
    "Median Annualized Volatility": np.round(np.median(all_stds), 4),
    "Median Sharpe Ratio": np.round(np.median(all_sharpes), 4)
}

results