In [1]:
import pandas as pd
import numpy as np

# Define the target ETF and the feature tickers
TARGET_TICKER = 'SPY'
FEATURE_TICKERS = ['QQQ', 'UVXY', 'TLT', 'GLD', 'AAPL', 'MSFT']
ALL_TICKERS = [TARGET_TICKER] + FEATURE_TICKERS

# Load all tickers into a single DataFrame
all_data = []
for ticker in ALL_TICKERS:
    df = pd.read_csv(f'../data/raw/{ticker}.csv', index_col='Date', parse_dates=True)
    all_data.append(df)

# Concatenate along columns
full_df = pd.concat(all_data, axis=1).dropna()


In [2]:
# Define the target: 1 if next day's close is higher, 0 otherwise
full_df['target'] = (full_df['SPY'].shift(-1) > full_df['SPY']).astype(int)

In [3]:
# Feature Creation

returns_df = full_df.drop('target', axis=1).pct_change()

# Moving Averages
full_df['spy_ma_50'] = full_df['SPY'].rolling(window=50).mean()
full_df['spy_ma_200'] = full_df['SPY'].rolling(window=200).mean()

# Relative Strength Index (RSI)
delta = full_df['SPY'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
full_df['spy_rsi'] = 100 - (100 / (1 + rs))

for ticker in ALL_TICKERS:
    for lag in [1, 3, 5, 10]:
        returns_df[f'{ticker}_return_lag_{lag}'] = returns_df[ticker].shift(lag)

full_df['spy_vol_30'] = returns_df['SPY'].rolling(window=30).std()

In [4]:
# Merge features - this requires careful index alignment
features = pd.concat([
    full_df[['spy_ma_50', 'spy_ma_200', 'spy_rsi', 'spy_vol_30']],
    returns_df.drop(ALL_TICKERS, axis=1) # Drop the raw return columns, keep the lagged ones
], axis=1)

# Combine features with the target
final_dataset = pd.concat([features, full_df['target']], axis=1)

# Drop all rows with NaN values created by rolling windows and lags
final_dataset.dropna(inplace=True)

print("Final Dataset Shape:", final_dataset.shape)
print("Final Dataset Head:")
print(final_dataset.head())

Final Dataset Shape: (3288, 33)
Final Dataset Head:
             spy_ma_50  spy_ma_200    spy_rsi  spy_vol_30  SPY_return_lag_1  \
Date                                                                          
2012-07-19  105.600180  103.310573  68.405222    0.009913          0.007407   
2012-07-20  105.622857  103.413266  51.627099    0.010106          0.002621   
2012-07-23  105.619226  103.502369  44.246455    0.010235         -0.009148   
2012-07-24  105.603668  103.578793  36.190334    0.010075         -0.010113   
2012-07-25  105.612253  103.658380  38.217947    0.009873         -0.008587   

            SPY_return_lag_3  SPY_return_lag_5  SPY_return_lag_10  \
Date                                                                
2012-07-19         -0.002357         -0.004845          -0.004512   
2012-07-20          0.006867          0.016778          -0.009503   
2012-07-23          0.007407         -0.002357          -0.001255   
2012-07-24          0.002621          0.006867   

In [5]:
PROCESSED_DATA_PATH = '../data/processed/etf_features.parquet'
final_dataset.to_parquet(PROCESSED_DATA_PATH)
print(f"Saved processed data to {PROCESSED_DATA_PATH}")

Saved processed data to ../data/processed/etf_features.parquet
