# Program Starter

In [29]:
import json
from pathlib import Path
import pandas as pd
import yfinance as yf

## Configuration

In [30]:
# Download window and bar size used as defaults for the price download below.
START_DATE = "2006-01-01"  # first date requested
# NOTE(review): yfinance appears to treat `end` as exclusive (the saved run
# further down stops at 2025-12-30) — confirm if 2025-12-31 must be included.
END_DATE = "2025-12-31"
INTERVAL = "1d"  # daily bars

# Output directory for the generated CSV files; created on first run and
# left untouched if it already exists.
BASE = Path("data")
BASE.mkdir(exist_ok=True)

In [31]:
def load_markets(path: str = "markets.json") -> dict:
    """Load the market definitions from a JSON file.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the JSON file. Defaults to ``markets.json`` relative to
        the current working directory.

    Returns
    -------
    dict
        The parsed JSON document. In this notebook it maps a market name to
        a list of ticker symbols, e.g. ``{"some_market": ["AAA", "BBB"]}``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # JSON files are UTF-8; decode explicitly instead of relying on the
    # platform's locale encoding (which differs between machines).
    with open(path, "r", encoding="utf-8") as f:
        markets = json.load(f)
    print("Loaded Markets:", markets)
    return markets

In [32]:
# Sanity check: load the default markets.json; the bare call also displays
# the returned mapping as the cell output.
load_markets()

Loaded Markets: {'primary_total_market': ['VGT', 'VEU', 'BND', 'TLT', 'GLD', 'TIP']}


{'primary_total_market': ['VGT', 'VEU', 'BND', 'TLT', 'GLD', 'TIP']}

In [33]:
def formulate_monthly_prices(tickers, start=START_DATE, end=END_DATE, interval=INTERVAL):
    """Download close prices for one or more tickers via yfinance.

    Despite the name, no resampling is done here: the row frequency of the
    result is whatever ``interval`` requests (daily with the module default).

    Parameters
    ----------
    tickers : str or list of str
        Ticker symbol(s) passed straight to ``yf.download``.
    start, end : str
        Date window for the download (``YYYY-MM-DD``). Default to the
        module-level ``START_DATE`` / ``END_DATE``.
    interval : str
        Bar size passed to ``yf.download``. Defaults to ``INTERVAL``.

    Returns
    -------
    pandas.DataFrame
        The "Close" field (falling back to "Adj Close" if "Close" is absent)
        with rows where every column is NaN removed. An empty DataFrame is
        returned when the download produced no data.
    """
    data = yf.download(
        tickers,
        # Use the caller's window. The module constants were previously
        # hard-wired here, so an incremental update that passed a later
        # `start` still re-downloaded the whole history.
        start=start,
        end=end,
        interval=interval,
        auto_adjust=True,
        progress=False,
        group_by="column",
    )

    # Nothing downloaded (e.g. window beyond available data): return an empty
    # frame rather than failing on the column lookup below.
    if data is None or data.empty:
        return pd.DataFrame()

    # keep close prices; with MultiIndex columns the price field is level 0
    if isinstance(data.columns, pd.MultiIndex):
        fields = data.columns.get_level_values(0)
    else:
        fields = data.columns
    field = "Close" if "Close" in fields else "Adj Close"
    data = data[field]

    # A flat single-ticker result selects down to a Series; callers expect a frame
    if isinstance(data, pd.Series):
        data = data.to_frame()

    return data.dropna(how="all")

In [34]:
# Smoke test on a single ticker using the default date window; the bare call
# displays the resulting price frame as the cell output.
formulate_monthly_prices("AAPL")

Ticker,AAPL
Date,Unnamed: 1_level_1
2006-01-03,2.239734
2006-01-04,2.246327
2006-01-05,2.228649
2006-01-06,2.286177
2006-01-09,2.278687
...,...
2025-12-23,272.105377
2025-12-24,273.554016
2025-12-26,273.144409
2025-12-29,273.504089


In [None]:
def update_monthly_dataset(name, tickers):
    """Create or incrementally extend the price and return CSVs for a market.

    Prices are written to ``BASE/{name}_prices_monthly.csv`` and simple
    returns to ``BASE/{name}_returns_monthly.csv``. The row frequency is
    whatever ``INTERVAL`` requests; the "monthly" in the file names is kept
    so existing files continue to be found.

    Parameters
    ----------
    name : str
        Market name, used as the file-name prefix.
    tickers : str or sequence of str
        Ticker symbols that must all be present in the price data.

    Raises
    ------
    ValueError
        If a requested ticker has no column in the combined data, or if no
        row has a price for every ticker.
    """
    # Accept a bare ticker string; iterating it in the missing-ticker check
    # would otherwise test single characters.
    if isinstance(tickers, str):
        tickers = [tickers]

    price_file = BASE / f"{name}_prices_monthly.csv"
    returns_file = BASE / f"{name}_returns_monthly.csv"

    existing = None
    if price_file.exists():
        existing = pd.read_csv(price_file, index_col=0, parse_dates=True)
        # A file without usable rows has no last date to resume from
        # (index.max() would be NaT); rebuild it from scratch instead.
        if existing.empty or pd.isna(existing.index.max()):
            existing = None

    if existing is None:
        print(f"[NEW] Creating dataset for {name}")
        data = formulate_monthly_prices(tickers, START_DATE, END_DATE, INTERVAL)
    else:
        print(f"[UPDATE] Updating dataset for {name}")
        last_date = existing.index.max()

        # Resume on the day after the last stored row. Jumping to the start
        # of the next month would silently skip the remaining days of the
        # current month when the rows are daily. Any overlap is removed by
        # the duplicate filter below.
        start_ts = last_date + pd.Timedelta(days=1)
        end_ts = pd.to_datetime(END_DATE)

        # yfinance documents `end` as exclusive, so start == end is also an
        # empty window and needs no request.
        if start_ts >= end_ts:
            data = existing
        else:
            new_data = formulate_monthly_prices(tickers, start_ts.strftime("%Y-%m-%d"), END_DATE, INTERVAL)
            if new_data.empty:
                data = existing
            else:
                data = pd.concat([existing, new_data])
                data = data[~data.index.duplicated(keep="last")]
        # NOTE(review): prices are fetched with auto_adjust=True, so rows
        # appended after a dividend/split may be on a different adjustment
        # basis than the stored rows — confirm whether a full refresh is
        # preferable to appending.

    # Ensure all tickers exist
    missing = [t for t in tickers if t not in data.columns]
    if missing:
        raise ValueError(f"{name} -> Missing tickers: {missing}")

    # Clean rows where any asset missing; keep chronological order so the
    # return computation compares consecutive periods.
    data = data.dropna().sort_index()

    # Fail before touching the files: an empty frame would overwrite the
    # stored CSVs and then crash on the NaT date in the final message.
    if data.empty:
        raise ValueError(f"{name} -> No rows with prices for all tickers")

    # Save prices
    data.to_csv(price_file)

    # Recompute returns every time.
    # Simple returns as fractions: (latest - old) / old  (NOT multiplied by 100)
    rets = data.pct_change().dropna()
    rets.to_csv(returns_file)

    print(f"{name}: updated through {data.index.max().date()}")


In [36]:
# Load the market definitions once; the loop further down iterates over them.
markets = load_markets()

# Bare expression so the notebook displays the mapping as the cell output.
markets

Loaded Markets: {'primary_total_market': ['VGT', 'VEU', 'BND', 'TLT', 'GLD', 'TIP']}


{'primary_total_market': ['VGT', 'VEU', 'BND', 'TLT', 'GLD', 'TIP']}

In [37]:
# Create or refresh the CSV datasets for every market in the mapping.
for name, tickers in markets.items():
    # Trace which market and ticker list is being processed.
    print("name",name, "\ntic", tickers)
    update_monthly_dataset(name, tickers)

name primary_total_market 
tic ['VGT', 'VEU', 'BND', 'TLT', 'GLD', 'TIP']
[NEW] Creating dataset for primary_total_market
primary_total_market: updated through 2025-12-30
