## Imports

In [43]:
import math
import os
import time
import warnings

warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

# import matplotlib.pyplot as plt
# %matplotlib inline

from fredapi import Fred
# import numpy as np
import polars as pl
# import seaborn as sns
import yfinance as yf

from dotenv import load_dotenv
from pathlib import Path
from typing import Iterable, List, Dict

In [37]:
env_path = Path.cwd().parent / ".env"
load_dotenv(dotenv_path=env_path)

True

## Functions

### Download Ticker Data

In [25]:
def download_ticker_data(tickers: Iterable[str], start: str, end: str, dest_dir: str = "data", retries: int = 3, pause: float = 1.0) -> List[str]:
    """
    Downloads historical stock data for a list of tickers using yfinance and saves each to a CSV file.

    Parameters
    ----------
    tickers : Iterable[str]
        List or iterable of ticker symbols to download.
    start : str
        Start date for historical data in 'YYYY-MM-DD' format.
    end : str
        End date for historical data in 'YYYY-MM-DD' format.
    dest_dir : str, optional
        Directory to save CSV files (default is "data").
    retries : int, optional
        Number of retry attempts for each ticker if download fails (default is 3).
    pause : float, optional
        Seconds to wait between retries (default is 1.0).

    Returns
    -------
    List[str]
        List of file paths to the saved CSV files.
    """
    cwd = Path.cwd()
    dest_path = Path(dest_dir)
    if dest_path.is_absolute():
        dest = dest_path
    else:
        dest = cwd.parent / dest_path
    dest.mkdir(parents=True, exist_ok=True)
    saved_files: List[str] = []
    for t in tickers:
        t = str(t).upper().strip()
        out_path = dest / f"{t}.csv"
        attempt = 0
        while attempt < retries:
            try:
                print(f"Downloading data for {t} (attempt {attempt+1}/{retries}) from {start} to {end} ...")
                ticker = yf.Ticker(t)
                historical_data = ticker.history(start=start, end=end)
                if historical_data is None or historical_data.empty:
                    raise ValueError(f"No data returned for {t} (start={start}, end={end})")
                historical_data.to_csv(out_path)
                print(f"Data for {t} saved to {out_path}")
                saved_files.append(str(out_path))
                break
            except Exception as exc:
                attempt += 1
                print(f"Failed to download {t}: {exc}")
                if attempt < retries:
                    print(f"Retrying after {pause} seconds...")
                    time.sleep(pause)
                else:
                    print(f"Giving up on {t} after {retries} attempts.")
    return saved_files

## Data Ingestion

### Yahoo Finance

In [26]:
tickers = ["PG", "KO", "PEP", "COST", "WMT", "^VIX"]
start_date = "2010-01-01"
end_date = "2024-12-31"

saved = download_ticker_data(tickers, start_date, end_date)
print(saved)

Downloading data for PG (attempt 1/3) from 2010-01-01 to 2024-12-31 ...
Data for PG saved to c:\Users\elibf\Documents\MSDS-451-Final\MSDS-451-Final\data\PG.csv
Downloading data for KO (attempt 1/3) from 2010-01-01 to 2024-12-31 ...
Data for KO saved to c:\Users\elibf\Documents\MSDS-451-Final\MSDS-451-Final\data\KO.csv
Downloading data for PEP (attempt 1/3) from 2010-01-01 to 2024-12-31 ...
Data for PEP saved to c:\Users\elibf\Documents\MSDS-451-Final\MSDS-451-Final\data\PEP.csv
Downloading data for COST (attempt 1/3) from 2010-01-01 to 2024-12-31 ...
Data for COST saved to c:\Users\elibf\Documents\MSDS-451-Final\MSDS-451-Final\data\COST.csv
Downloading data for WMT (attempt 1/3) from 2010-01-01 to 2024-12-31 ...
Data for WMT saved to c:\Users\elibf\Documents\MSDS-451-Final\MSDS-451-Final\data\WMT.csv
Downloading data for ^VIX (attempt 1/3) from 2010-01-01 to 2024-12-31 ...
Data for ^VIX saved to c:\Users\elibf\Documents\MSDS-451-Final\MSDS-451-Final\data\^VIX.csv
['c:\\Users\\elibf\\Do

In [27]:
pg_data = pl.read_csv(saved[0])
ko_data = pl.read_csv(saved[1])
pep_data = pl.read_csv(saved[2])
cost_data = pl.read_csv(saved[3])
wmt_data = pl.read_csv(saved[4])
vix_data = pl.read_csv(saved[5])

### Federal Reserve Economic Data

In [38]:
fred = Fred(api_key=os.getenv("FRED_API_KEY"))

In [39]:
indicators = {
    'GDP': 'A191RL1Q225SBEA', # Real Gross Domestic Product
    'CPI': 'CPIAUCSL', # Consumer Price Index for All Urban Consumers: All Items
    'UNRATE': 'UNRATE', # Unemployment Rate
    'FEDFUNDS': 'FEDFUNDS', # Effective Federal Funds Rate
    'VIX': 'VIXCLS', # CBOE Volatility Index
    'T10Y3M': 'T10Y3M' # 10-Year Treasury Constant Maturity Minus 3-Month Treasury Constant Maturity
}

In [45]:
fred_dfs: Dict[str, pl.DataFrame] = {}

for name, code in indicators.items():
    series = fred.get_series(code)
    # handle empty or missing series
    if series is None or len(series) == 0:
        fred_dfs[name] = pl.DataFrame({})
        continue

    # If the series has an index, align values to index; otherwise treat as sequence
    try:
        idx = list(series.index)
        dates = [d for d in idx]
        values = []
        for v in series.values:
            if isinstance(v, float) and math.isnan(v):
                values.append(None)
            else:
                values.append(v)
    except Exception:
        vals_list = list(series)
        dates = list(range(len(vals_list)))
        values = [None if (isinstance(v, float) and math.isnan(v)) else v for v in vals_list]

    df = pl.DataFrame({"DATE": dates, name: values})
    try:
        df = df.with_column(pl.col("DATE").str.strptime(pl.Date, "%Y-%m-%d").alias("DATE"))
    except Exception:
        pass

    fred_dfs[name] = df

## Exploratory Data Analysis

### Procter & Gamble ($PG)

In [28]:
pg_data.schema

Schema([('Date', String),
        ('Open', Float64),
        ('High', Float64),
        ('Low', Float64),
        ('Close', Float64),
        ('Volume', Int64),
        ('Dividends', Float64),
        ('Stock Splits', Float64)])

In [29]:
pg_data.head()

Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
str,f64,f64,f64,f64,i64,f64,f64
"""2010-01-04 00:00:00-05:00""",38.679348,38.805938,38.375534,38.685677,9190800,0.0,0.0
"""2010-01-05 00:00:00-05:00""",38.692005,38.786946,38.356542,38.698334,8649400,0.0,0.0
"""2010-01-06 00:00:00-05:00""",38.571743,38.584403,38.293248,38.514778,9908400,0.0,0.0
"""2010-01-07 00:00:00-05:00""",38.343886,38.419839,38.128684,38.305908,8972800,0.0,0.0
"""2010-01-08 00:00:00-05:00""",38.20463,38.31223,38.014746,38.255264,8464600,0.0,0.0


In [30]:
pg_data.describe()

statistic,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
str,str,f64,f64,f64,f64,f64,f64,f64
"""count""","""3773""",3773.0,3773.0,3773.0,3773.0,3773.0,3773.0,3773.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,85.236579,85.848934,84.663057,85.275298,8757200.0,0.011324,0.0
"""std""",,38.879625,39.202039,38.588737,38.900924,5194200.0,0.091097,0.0
"""min""","""2010-01-04 00:00:00-05:00""",38.027422,38.147681,25.291183,38.103374,2022100.0,0.0,0.0
"""25%""",,56.137858,56.490187,55.769029,56.094166,6144900.0,0.0,0.0
"""50%""",,68.792523,69.225196,68.486249,68.870911,7663800.0,0.0,0.0
"""75%""",,122.254526,123.03277,121.253833,122.241798,9819000.0,0.0,0.0
"""max""","""2024-12-30 00:00:00-05:00""",176.593111,177.014965,175.347145,176.298798,123735700.0,1.007,0.0


### Coca-Cola ($KO)

In [31]:
ko_data.head()

Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
str,f64,f64,f64,f64,i64,f64,f64
"""2010-01-04 00:00:00-05:00""",17.594805,17.613274,17.514773,17.557867,13870400,0.0,0.0
"""2010-01-05 00:00:00-05:00""",17.499377,17.542472,17.280827,17.345469,23172400,0.0,0.0
"""2010-01-06 00:00:00-05:00""",17.345476,17.37318,17.231585,17.339321,19264600,0.0,0.0
"""2010-01-07 00:00:00-05:00""",17.339316,17.351628,17.160782,17.296221,13234600,0.0,0.0
"""2010-01-08 00:00:00-05:00""",17.071516,17.126923,16.852966,16.976093,28712400,0.0,0.0


### PepsiCo ($PEP)

In [32]:
pep_data.describe()

statistic,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
str,str,f64,f64,f64,f64,f64,f64,f64
"""count""","""3773""",3773.0,3773.0,3773.0,3773.0,3773.0,3773.0,3773.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,93.969341,94.616491,93.291766,93.978312,5244100.0,0.013296,0.0
"""std""",,42.155884,42.467232,41.783274,42.132156,2401000.0,0.109972,0.0
"""min""","""2010-01-04 00:00:00-05:00""",37.113316,37.344928,36.775302,36.906754,883300.0,0.0,0.0
"""25%""",,57.474385,57.774879,57.071079,57.464031,3760800.0,0.0,0.0
"""50%""",,86.44788,86.934984,85.88974,86.456772,4760000.0,0.0,0.0
"""75%""",,125.346911,126.120993,124.533382,125.525894,5987800.0,0.0,0.0
"""max""","""2024-12-30 00:00:00-05:00""",180.996784,181.282222,179.505153,180.582458,27559700.0,1.355,0.0


### Costco ($COST)

In [33]:
cost_data.describe()

statistic,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
str,str,f64,f64,f64,f64,f64,f64,f64
"""count""","""3773""",3773.0,3773.0,3773.0,3773.0,3773.0,3773.0,3773.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,247.197242,249.27975,245.106105,247.320245,2391400.0,0.02044,0.0
"""std""",,216.153611,218.03324,214.167997,216.229205,1304300.0,0.352758,0.0
"""min""","""2010-01-04 00:00:00-05:00""",38.455259,38.86299,38.204901,38.347946,491000.0,0.0,0.0
"""25%""",,91.03559,91.616998,90.524865,91.075737,1616700.0,0.0,0.0
"""50%""",,144.170424,145.548046,143.651506,144.957626,2060600.0,0.0,0.0
"""75%""",,354.559691,356.436957,351.686524,354.013245,2728400.0,0.0,0.0
"""max""","""2024-12-30 00:00:00-05:00""",997.889962,1004.435155,989.830639,990.926453,24233000.0,15.0,0.0


### Walmart ($WMT)

In [34]:
wmt_data.describe()

statistic,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
str,str,f64,f64,f64,f64,f64,f64,f64
"""count""","""3773""",3773.0,3773.0,3773.0,3773.0,3773.0,3773.0,3773.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,30.276485,30.510099,30.057381,30.287885,25686000.0,0.002619,0.000795
"""std""",,16.028062,16.152691,15.898288,16.032867,14549000.0,0.02088,0.04884
"""min""","""2010-01-04 00:00:00-05:00""",11.65102,11.726111,11.571086,11.626796,6284700.0,0.0,0.0
"""25%""",,19.043536,19.17178,18.918589,19.030716,16995300.0,0.0,0.0
"""50%""",,23.212726,23.354764,23.089677,23.237837,21722400.0,0.0,0.0
"""75%""",,43.003178,43.436576,42.657617,43.016922,29998800.0,0.0,0.0
"""max""","""2024-12-30 00:00:00-05:00""",94.756085,95.251255,94.409464,94.775887,242694300.0,0.208,3.0


### FRED

In [47]:
fred_dfs.keys()

dict_keys(['GDP', 'CPI', 'UNRATE', 'FEDFUNDS', 'VIX', 'T10Y3M'])

In [48]:
fred_dfs['GDP'].head()

DATE,GDP
datetime[μs],f64
1947-04-01 00:00:00,-1.0
1947-07-01 00:00:00,-0.8
1947-10-01 00:00:00,6.4
1948-01-01 00:00:00,6.2
1948-04-01 00:00:00,6.8


In [49]:
fred_dfs['CPI'].head()

DATE,CPI
datetime[μs],f64
1947-01-01 00:00:00,21.48
1947-02-01 00:00:00,21.62
1947-03-01 00:00:00,22.0
1947-04-01 00:00:00,22.0
1947-05-01 00:00:00,21.95


In [50]:
fred_dfs['UNRATE'].head()

DATE,UNRATE
datetime[μs],f64
1948-01-01 00:00:00,3.4
1948-02-01 00:00:00,3.8
1948-03-01 00:00:00,4.0
1948-04-01 00:00:00,3.9
1948-05-01 00:00:00,3.5


In [51]:
fred_dfs['FEDFUNDS'].head()

DATE,FEDFUNDS
datetime[μs],f64
1954-07-01 00:00:00,0.8
1954-08-01 00:00:00,1.22
1954-09-01 00:00:00,1.07
1954-10-01 00:00:00,0.85
1954-11-01 00:00:00,0.83


In [52]:
fred_dfs['VIX'].head()

DATE,VIX
datetime[μs],f64
1990-01-02 00:00:00,17.24
1990-01-03 00:00:00,18.19
1990-01-04 00:00:00,19.22
1990-01-05 00:00:00,20.11
1990-01-08 00:00:00,20.26


In [53]:
fred_dfs['T10Y3M'].head()

DATE,T10Y3M
datetime[μs],f64
1982-01-04 00:00:00,2.32
1982-01-05 00:00:00,2.24
1982-01-06 00:00:00,2.43
1982-01-07 00:00:00,2.46
1982-01-08 00:00:00,2.5
