In [4]:
!pip install yfinance pandas numpy statsmodels arch xgboost scikit-learn hmmlearn shap matplotlib seaborn --quiet

In [5]:
# Import libraries
import pandas as pd
import numpy as np
import yfinance as yf
import statsmodels.api as sm
from statsmodels.tsa.api import ARIMA
from arch import arch_model
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
from hmmlearn.hmm import GaussianHMM
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Notebook-wide settings: silence warnings, use seaborn's grid theme and a
# wider default figure size.
warnings.simplefilter('ignore')
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def _simulate_random_walk(rng, index, start_level, step_scale, name):
    """Return a Gaussian random walk on ``index`` that starts near ``start_level``."""
    steps = rng.standard_normal(len(index)) * step_scale
    return pd.Series(start_level + steps.cumsum(), index=index, name=name)


def _align_to_trading_days(series, trading_index):
    """Place each observation on the first trading day on or after its own date."""
    # Position of the first trading day >= each observation date
    # (trading_index is expected to be sorted ascending).
    positions = trading_index.searchsorted(series.index, side='left')
    in_range = positions < len(trading_index)
    aligned = pd.Series(
        series.to_numpy()[in_range],
        index=trading_index[positions[in_range]],
        name=series.name,
    )
    # If several observations land on the same trading day (e.g. after a long
    # market closure), keep the most recent one.
    aligned = aligned[~aligned.index.duplicated(keep='last')]
    return aligned.reindex(trading_index)


def ingest_data(seed=42):
    """
    Ingest the raw time series used by the notebook.

    Daily closing prices are downloaded from Yahoo Finance (2005-01-01 up to
    today) and combined with *simulated* weekly (EIA inventories) and monthly
    (CPI, industrial production) series generated as Gaussian random walks.

    Parameters
    ----------
    seed : int or None, default 42
        Seed of the random generator that produces the simulated series, so
        repeated runs yield identical placeholder data. Pass ``None`` for a
        fresh, non-reproducible draw.

    Returns
    -------
    df : pandas.DataFrame
        Indexed by the dates of the downloaded data. Holds the five renamed
        market columns followed by ``eia_inventories``, ``cpi`` and
        ``industrial_production``. The low-frequency columns are NaN except
        on the row each observation is aligned to (its own date, or the next
        available date when its own date is missing from the index).
    metadata : dict
        Per-column catalog with ``frequency``, ``source`` and
        ``publication_lag`` entries. The lags are descriptive only; they are
        not applied to the data inside this function.

    Raises
    ------
    ValueError
        If no complete rows remain after the download and cleaning step.
    """
    # Yahoo Finance symbols mapped to the column names used in this notebook.
    # TODO: consider switching to a more reliable source than yfinance.
    tickers = {
        'CL=F': 'wti_price',   # WTI Crude Oil Futures
        'BZ=F': 'brent_price', # Brent Crude Oil Futures
        'DX-Y.NYB': 'dxy',     # US Dollar Index
        '^TNX': '10y_yield',   # 10-Year Treasury Yield
        '^VIX': 'vix'          # CBOE Volatility Index
    }

    # `datetime` is not among the notebook's imports, so the end date is built
    # with pandas (already imported) to avoid a NameError on a fresh kernel.
    end = pd.Timestamp.now().strftime('%Y-%m-%d')

    # From 2005 to today; progress=False only hides the download progress bar.
    data = yf.download(list(tickers.keys()), start='2005-01-01', end=end, progress=False)

    # Only end-of-day prices are used, so everything is on a daily scale.
    # NOTE(review): whether 'Close' is already adjusted depends on yfinance's
    # `auto_adjust` default for the installed version -- confirm.
    data = data['Close'].rename(columns=tickers)

    # Forward-fill gaps, then drop the leading rows where at least one series
    # has not started yet so that no missing values remain.
    data = data.ffill().dropna()
    if data.empty:
        raise ValueError(
            "No usable price data was downloaded from Yahoo Finance; "
            "check the tickers and the network connection."
        )

    rng = np.random.default_rng(seed)
    first_day, last_day = data.index.min(), data.index.max()

    # --- Simulate Weekly Data (EIA Inventories) ---
    # Placeholder for the real EIA inventories: a driftless random walk with
    # one observation per week, stamped on Wednesdays.
    weekly_dates = pd.date_range(start=first_day, end=last_day, freq='W-WED')
    eia_inventories = _simulate_random_walk(rng, weekly_dates, 500, 10, 'eia_inventories')

    # --- Simulate Monthly Data (CPI, Industrial Production) ---
    # An explicit MonthEnd offset is used instead of the deprecated 'M' alias.
    monthly_dates = pd.date_range(start=first_day, end=last_day, freq=pd.offsets.MonthEnd())
    # Step size 0.5: inflation-like series, less volatile than inventories.
    cpi = _simulate_random_walk(rng, monthly_dates, 250, 0.5, 'cpi')
    # Step size 0.2: very stable series.
    industrial_production = _simulate_random_walk(
        rng, monthly_dates, 100, 0.2, 'industrial_production'
    )

    # Merge all datasets into a single daily-frequency DataFrame. A plain left
    # merge on the index would silently drop every low-frequency observation
    # stamped on a non-trading day (e.g. a month-end on a weekend), so each
    # observation is moved to the next available trading day instead.
    df = data.copy()
    for simulated in (eia_inventories, cpi, industrial_production):
        df[simulated.name] = _align_to_trading_days(simulated, data.index)

    # --- Data Catalog ---
    metadata = {
        'wti_price': {'frequency': 'daily', 'source': 'Yahoo Finance', 'publication_lag': '0D'},
        'brent_price': {'frequency': 'daily', 'source': 'Yahoo Finance', 'publication_lag': '0D'},
        'dxy': {'frequency': 'daily', 'source': 'Yahoo Finance', 'publication_lag': '0D'},
        '10y_yield': {'frequency': 'daily', 'source': 'Yahoo Finance', 'publication_lag': '0D'},
        'vix': {'frequency': 'daily', 'source': 'Yahoo Finance', 'publication_lag': '0D'},
        'eia_inventories': {'frequency': 'weekly', 'source': 'Simulated', 'publication_lag': '5B'},
        'cpi': {'frequency': 'monthly', 'source': 'Simulated', 'publication_lag': '10B'},
        'industrial_production': {'frequency': 'monthly', 'source': 'Simulated', 'publication_lag': '12B'},
    }

    print("Data ingestion complete.")
    return df, metadata

# Run the ingestion step, then preview the first rows of the combined frame
(raw_data, metadata) = ingest_data()
print("Raw Data Head:", raw_data.head(), sep="\n")

Data ingestion complete.
Raw Data Head:
            brent_price  wti_price        dxy  10y_yield        vix  \
Date                                                                  
2007-07-30    75.739998  76.830002  80.849998      4.804  20.870001   
2007-07-31    77.050003  78.209999  80.769997      4.771  23.520000   
2007-08-01    75.349998  76.529999  80.870003      4.759  23.670000   
2007-08-02    75.760002  76.860001  80.709999      4.753  21.219999   
2007-08-03    74.750000  75.480003  80.180000      4.700  25.160000   

            eia_inventories         cpi  industrial_production  
Date                                                            
2007-07-30              NaN         NaN                    NaN  
2007-07-31              NaN  250.041367             100.045445  
2007-08-01       493.205851         NaN                    NaN  
2007-08-02              NaN         NaN                    NaN  
2007-08-03              NaN         NaN                    NaN  
