In [2]:
!pip install "u8darts[all]"
!pip install catboost
!pip install scikit-learn
!pip install yfinance
!pip install arch



In [15]:
import yfinance as yf
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



In [3]:
# --- Ticker and Date Setup ---
tickers = ['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'META', 'TSLA', 'BRK-B', 'JPM', 'V']
start_date = '2010-01-01'
end_date = '2024-12-31'
train_end = '2023-12-31'
test_start = '2024-01-01'

# --- Download Data ---
data = yf.download(tickers, start=start_date, end=end_date)
close_data = data['Close'].copy()
log_close_data = np.log(close_data)
log_data = pd.concat([close_data, log_close_data], axis=1, keys=['Close', 'LogClose'])


  data = yf.download(tickers, start=start_date, end=end_date)
[*********************100%***********************]  10 of 10 completed


In [5]:
# --- Parameters ---
max_lag = 5
target_returns = {}

# --- Create lagged features ---
logclose_df = log_data['LogClose']
feature_df = pd.DataFrame(index=logclose_df.index)

for t in tickers:
    for lag in range(1, max_lag + 1):
        feature_df[f"{t}_lag{lag}"] = logclose_df[t].shift(lag)


In [7]:
# --- Model training loop per ticker ---
results = {}

for t in tickers:
    print(f"\n📈 Training model for {t}")
    
    # Target: next-day log return
    feature_df[f"target_{t}"] = logclose_df[t].shift(-1) - logclose_df[t]

    model_data = feature_df.dropna()

    # Time-based split
    train = model_data.loc[:train_end]
    test = model_data.loc[test_start:]

    X_train = train.drop(columns=[f"target_{t}"])
    y_train = train[f"target_{t}"]

    X_test = test.drop(columns=[f"target_{t}"])
    y_test = test[f"target_{t}"]
    # CatBoost Model
    model = CatBoostRegressor(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        loss_function='RMSE'
    )
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"✅ {t} RMSE: {rmse:.6f}")

    # Store results
    results[t] = {
        'model': model,
        'rmse': rmse,
        'y_test': y_test,
        'y_pred': pd.Series(y_pred, index=y_test.index)
    }


📈 Training model for AAPL
✅ AAPL RMSE: 0.015074

📈 Training model for MSFT
✅ MSFT RMSE: 0.011251

📈 Training model for NVDA
✅ NVDA RMSE: 0.030358

📈 Training model for AMZN
✅ AMZN RMSE: 0.014709

📈 Training model for GOOGL
✅ GOOGL RMSE: 0.014105

📈 Training model for META
✅ META RMSE: 0.019523

📈 Training model for TSLA
✅ TSLA RMSE: 0.038336

📈 Training model for BRK-B
✅ BRK-B RMSE: 0.009729

📈 Training model for JPM
✅ JPM RMSE: 0.011997

📈 Training model for V
✅ V RMSE: 0.010601
