# 04 Model Training

Model residual returns using technical, fundamental, and LLM sentiment features.


In [None]:
from src.data_utils import load_pickle, save_pickle
from src.residual_model import compute_residuals, train_residual_model

import pandas as pd

# Demo pipeline: build a placeholder target from the feature table, strip the
# factor exposure from it, fit the residual model and persist its predictions.
features = load_pickle('../data/processed/features.pkl')

# Placeholder "returns": the following row's RSI within each ticker group,
# with the trailing gap of every group filled by 0.
# Dummy returns (replace!)
next_rsi = features.groupby('ticker')['rsi'].shift(-1)
returns = next_rsi.fillna(0)
returns.name = "returns"

# Align the factor table to the distinct values of the first index level
# (presumably dates -- confirm), carrying the last known row forward.
raw_factors = load_pickle('../data/raw/ff_factors.pkl')
feature_dates = features.index.get_level_values(0).unique()
factors = raw_factors.reindex(feature_dates, method='ffill')

residuals = compute_residuals(returns, factors)

# The model is trained on every feature column except 'ticker'.
design_matrix = features.drop(columns=['ticker'])
model, preds, y_test = train_residual_model(design_matrix, residuals)

predictions = pd.DataFrame({'preds': preds})
save_pickle(predictions, '../data/processed/predictions.pkl')


In [None]:
# 04 Model Training

from src.data_utils import load_pickle, save_pickle
from src.residual_model import compute_residuals, train_residual_model

import pandas as pd
import numpy as np

# Full training pipeline for the residual-return model:
#   1. join the feature table with the news sentiment score,
#   2. derive close-to-close returns per ticker from the OHLCV data,
#   3. compute per-date residuals of those returns against the factor table,
#   4. fit the model on the finite rows and persist predictions with targets.

features = load_pickle('../data/processed/features.pkl')
news_sentiment = load_pickle('../data/processed/news_sentiment.pkl')

# Merge features and sentiment on their shared index
# (presumably (date, ticker) -- confirm against the upstream notebooks).
full_features = features.join(news_sentiment[['sentiment_score']], how='left')
full_features.fillna(0, inplace=True)

# Simulate returns: daily close-to-close returns per ticker
ohlcv = load_pickle('../data/raw/ohlcv.pkl')
returns_list = []
# Iterate the tickers actually present in the columns. `columns.levels[0]` may
# still list tickers that were dropped from the frame, which would raise a
# KeyError on lookup.
for ticker in ohlcv.columns.get_level_values(0).unique():
    closes = ohlcv[ticker]['Close'].dropna()
    ret = closes.pct_change().rename('return').to_frame()
    ret['ticker'] = ticker
    ret['date'] = ret.index
    returns_list.append(ret)
returns = pd.concat(returns_list).set_index(['date', 'ticker'])
# After this reindex, `returns` is row-for-row aligned with `full_features`;
# the residual loop below relies on that positional alignment.
returns = returns.reindex(full_features.index)
returns.fillna(0, inplace=True)

# Factor data (FF factors) - forward fill to all available dates
ff_factors = load_pickle('../data/raw/ff_factors.pkl')
ff_factors.index = pd.to_datetime(ff_factors.index)
ff_factors = ff_factors.reindex(pd.to_datetime(full_features.index.get_level_values(0).unique()), method='ffill')

# Compute residuals (cross-sectional for simplicity)
date_level = full_features.index.get_level_values(0)
dates = date_level.unique()
return_values = returns.to_numpy()
# Residuals are written back through the same boolean mask that selected the
# rows, so the result stays aligned with `full_features` whatever the row
# order. Rows never visited stay NaN and are removed by the finite mask below.
resid = np.full(len(full_features), np.nan, dtype=float)
failed_dates = []
for date in dates:
    # Prepare a cross-section for this day
    day_idx = np.asarray(date_level == date)
    if not day_idx.any():
        continue
    r = return_values[day_idx].flatten()
    # NOTE(review): every row of `factors` is identical within a date, so the
    # regressors have no cross-sectional variation -- confirm that
    # compute_residuals is meant to be used this way.
    factors = ff_factors.loc[date].values.reshape(1, -1).repeat(len(r), axis=0)
    try:
        residuals = compute_residuals(r, factors)
    except Exception as exc:
        # Best effort: keep the zero fallback, but remember the failure so it
        # is reported instead of vanishing silently.
        failed_dates.append((date, exc))
        residuals = np.zeros_like(r)
    resid[day_idx] = np.asarray(residuals, dtype=float).ravel()

if failed_dates:
    first_date, first_error = failed_dates[0]
    print(
        f"compute_residuals failed for {len(failed_dates)} of {len(dates)} dates; "
        f"residuals set to 0 there (first failure {first_date}: {first_error!r})"
    )

full_features['residual'] = resid

# Prepare data for ML model
model_features = full_features.drop(columns=['residual'])
target = full_features['residual']

# Remove inf/nan
mask = model_features.notnull().all(axis=1) & np.isfinite(target)
model_features = model_features[mask]
target = target[mask]

model, preds, y_test = train_residual_model(model_features, target)
save_pickle(pd.DataFrame({'preds': preds, 'y_test': y_test}, index=y_test.index), '../data/processed/predictions.pkl')
