In [1]:
import pandas as pd
import numpy as np
import os
import sys

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
    
from config import RAW_DATA_DIR, PROCESSED_DATA_PATH, ALL_TICKERS, TARGET_TICKER

# Load all tickers into a single DataFrame
all_data = []
for ticker in ALL_TICKERS:
    df = pd.read_csv(RAW_DATA_DIR / f'{ticker}.csv', index_col='Date', parse_dates=True)
    all_data.append(df)

# Concatenate along columns
full_df = pd.concat(all_data, axis=1).dropna()


In [2]:
# Define the target: 1 if next day's close is higher, 0 otherwise
full_df['target'] = (full_df['SPY'].shift(-1) > full_df['SPY']).astype(int)

In [None]:
# --- ROBUST DATA DOWNLOAD SCRIPT ---
import requests

# Create a session object to add a browser-like header
session = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
session.headers.update(headers)

print("Attempting to download all ticker data...")
# Download the historical data for all tickers using the session
data = yf.download(
    ALL_TICKERS,
    start=START_DATE,
    end=END_DATE,
    session=session  # <-- This is the crucial addition
)

# --- ROBUSTNESS CHECK ---
# Stop execution if the download failed and the DataFrame is empty.
if data.empty or 'Close' not in data:
    raise ConnectionError("Failed to download data from yfinance. The response was empty. This could be a network block or an API issue.")

print("Download successful.")

# yfinance downloads a multi-level column DataFrame. Let's save each ticker's data.
# We are primarily interested in 'Adj Close' as it accounts for splits and dividends.
adj_close_data = data['Close']

# Save each ticker's adjusted close price to a separate CSV file
for ticker in ALL_TICKERS:
    # Check if the ticker column exists and has data
    if ticker in adj_close_data and not adj_close_data[ticker].dropna().empty:
        ticker_data = adj_close_data[[ticker]].dropna()
        file_path = os.path.join(RAW_DATA_DIR, f'{ticker}.csv')
        ticker_data.to_csv(file_path)
        print(f"Saved raw data for {ticker} to {file_path}")
    else:
        print(f"Warning: No data found for {ticker} after download. Skipping file save.")

# Display the head of the SPY data to verify
print("\nSPY Raw Data Head:")
print(pd.read_csv(os.path.join(RAW_DATA_DIR, 'SPY.csv')).head())

Failed to get ticker 'UVXY' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'SPY' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'QQQ' reason: Expecting value: line 1 column 1 (char 0)
Failed to get ticker 'TLT' reason: Expecting value: line 1 column 1 (char 0)
[                       0%%                      ]Failed to get ticker 'GLD' reason: Expecting value: line 1 column 1 (char 0)
[                       0%%                      ]Failed to get ticker 'AAPL' reason: Expecting value: line 1 column 1 (char 0)
[********************* 43%%                      ]  3 of 7 completedFailed to get ticker 'MSFT' reason: Expecting value: line 1 column 1 (char 0)
[*********************100%%**********************]  4 of 7 completed

7 Failed downloads:
[**********************86%%***************       ]  6 of 7 completed['UVXY', 'SPY', 'QQQ', 'TLT', 'GLD', 'AAPL', 'MSFT']: Exception('%ticker%: No timezone found, symbol may be delisted')


Attempting to download all ticker data...


ConnectionError: Failed to download data from yfinance. The response was empty. This could be a network block or an API issue.

In [3]:
# Feature Creation

returns_df = full_df.drop('target', axis=1).pct_change()

# Moving Averages
full_df['spy_ma_50'] = full_df['SPY'].rolling(window=50).mean()
full_df['spy_ma_200'] = full_df['SPY'].rolling(window=200).mean()

# Relative Strength Index (RSI)
delta = full_df['SPY'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
full_df['spy_rsi'] = 100 - (100 / (1 + rs))

for ticker in ALL_TICKERS:
    for lag in [1, 3, 5, 10]:
        returns_df[f'{ticker}_return_lag_{lag}'] = returns_df[ticker].shift(lag)

full_df['spy_vol_30'] = returns_df['SPY'].rolling(window=30).std()

In [4]:
# Merge features - this requires careful index alignment
features = pd.concat([
    full_df[['spy_ma_50', 'spy_ma_200', 'spy_rsi', 'spy_vol_30']],
    returns_df.drop(ALL_TICKERS, axis=1) # Drop the raw return columns, keep the lagged ones
], axis=1)

# Combine features with the target
final_dataset = pd.concat([features, full_df['target']], axis=1)

# Drop all rows with NaN values created by rolling windows and lags
final_dataset.dropna(inplace=True)

print("Final Dataset Shape:", final_dataset.shape)
print("Final Dataset Head:")
print(final_dataset.head())

Final Dataset Shape: (3288, 33)
Final Dataset Head:
             spy_ma_50  spy_ma_200    spy_rsi  spy_vol_30  SPY_return_lag_1  \
Date                                                                          
2012-07-19  105.600181  103.310573  68.404999    0.009913          0.007407   
2012-07-20  105.622857  103.413266  51.627126    0.010106          0.002621   
2012-07-23  105.619227  103.502369  44.246325    0.010235         -0.009148   
2012-07-24  105.603668  103.578793  36.190600    0.010075         -0.010112   
2012-07-25  105.612253  103.658380  38.218211    0.009873         -0.008587   

            SPY_return_lag_3  SPY_return_lag_5  SPY_return_lag_10  \
Date                                                                
2012-07-19         -0.002357         -0.004845          -0.004512   
2012-07-20          0.006867          0.016778          -0.009504   
2012-07-23          0.007407         -0.002357          -0.001255   
2012-07-24          0.002621          0.006867   

In [5]:
final_dataset.to_parquet(PROCESSED_DATA_PATH)
print(f"Saved processed data to {PROCESSED_DATA_PATH}")

Saved processed data to C:\Users\dawso\Dev\Personal\AIGrind\mlops-etf-forecasting\data\processed\etf_features.parquet
