# Imports & config

In [2]:
from pathlib import Path
from datetime import datetime
import pandas as pd
import yfinance as yf
from dotenv import load_dotenv
import os

# Load secrets if you later switch to an API that needs keys.
# load_dotenv() loads environment variables from a .env file into the process
# environment, which keeps keys out of the code; safe even if none are set.
load_dotenv()

# Download parameters used by the fetch cell, and the label used in the raw filename.
TICKER = "AAPL"
START = "2015-01-01"
END   = None  # or "2025-08-22"
SOURCE = "yfinance"


# Helper: reproducible raw path

In [3]:
def raw_path(prefix: str, source: str, ident: str, ext: str = "csv",
             base_dir: "str | Path" = "data/raw") -> Path:
    """Return a timestamped file path for a raw data dump, creating its folder.

    The file name has the form ``{prefix}_{source}_{ident}_{YYYYmmdd-HHMM}.{ext}``.
    Type hints are not enforced at runtime; they only document intent.

    Args:
        prefix: Leading tag of the file name (e.g. ``"api"``).
        source: Name of the data source (e.g. ``"yfinance"``).
        ident: Identifier of the dataset (e.g. a ticker symbol).
        ext: File extension without the leading dot.
        base_dir: Directory the file will live in; defaults to ``data/raw``.

    Returns:
        A ``pathlib.Path`` inside ``base_dir``. The file itself is not created.
    """
    out_dir = Path(base_dir)
    # parents=True creates missing parent folders; exist_ok=True makes the
    # call a no-op when the folder is already there.
    out_dir.mkdir(parents=True, exist_ok=True)
    # Local time at minute resolution: two calls within the same minute
    # produce the same path.
    ts = datetime.now().strftime("%Y%m%d-%H%M")
    return out_dir / f"{prefix}_{source}_{ident}_{ts}.{ext}"

# Helper: validation

In [5]:
def validate_df(df: pd.DataFrame,
                required_cols: list[str],
                expected_types: dict[str, str],
                date_col: str = "date") -> dict:
    """Run schema, dtype, completeness and sanity checks on a price table.

    Note: columns listed in ``expected_types`` are converted *in place* on
    ``df`` (dates via ``pd.to_datetime``, numbers via ``pd.to_numeric`` with
    unparseable values coerced to NaN, which the completeness check then
    reports for required columns).

    Args:
        df: Table to validate; mutated by the type conversions above.
        required_cols: Column names that must be present and free of NAs.
        expected_types: Mapping of column name to a type tag. Tags starting
            with ``"datetime"`` and the tags ``"float"`` / ``"int"`` are
            handled; any other tag is ignored.
        date_col: Column that must be monotonically increasing, if present.

    Returns:
        A dict mapping a short issue key to a human-readable message.
        An empty dict means no issue was found.
    """
    msgs = {}

    # Schema
    missing = set(required_cols) - set(df.columns)
    if missing:
        msgs["missing_cols"] = f"Missing columns: {sorted(missing)}"

    # Types
    for col, t in expected_types.items():
        is_datetime = t.startswith("datetime")
        if not is_datetime and t not in ("float", "int"):
            continue  # unknown tag: nothing to convert
        if col not in df.columns:
            msgs[f"type_{col}"] = f"Failed to parse {col} as {t}: column not found"
            continue
        try:
            if is_datetime:
                df[col] = pd.to_datetime(df[col], errors="raise")
            else:
                df[col] = pd.to_numeric(df[col], errors="coerce")
        except (ValueError, TypeError, OverflowError) as e:
            msgs[f"type_{col}"] = f"Failed to parse {col} as {t}: {e}"

    # Completeness: only look at required columns that exist, so a missing
    # column is reported above instead of raising KeyError here.
    present = [c for c in required_cols if c in df.columns]
    na_counts = df[present].isna().sum()
    if na_counts.any():
        msgs["na_counts"] = f"NA counts: {na_counts.to_dict()}"

    # Sanity
    if date_col in df and not df[date_col].is_monotonic_increasing:
        msgs["date_order"] = "Dates are not monotonically increasing."
    n_dupes = int(df.duplicated().sum())
    if n_dupes:
        msgs["dupes"] = f"Found {n_dupes} duplicate rows."
    # domain sanity: prices positive, volume nonnegative
    if "adj_close" in df and (df["adj_close"] <= 0).any():
        msgs["price_nonpositive"] = "Found non-positive adjusted prices."
    if "volume" in df and (df["volume"] < 0).any():
        msgs["volume_negative"] = "Found negative volumes."

    return msgs


# Fetch via yfinance (API)

In [10]:
# Alternative entry point, kept for reference:
# raw = yf.download(TICKER, start=START, end=END, auto_adjust=False, progress=False)
# raw is indexed by DatetimeIndex, columns: ['Open','High','Low','Close','Adj Close','Volume']

# Pick one of these two options
# A) Use Ticker().history() → always single-level columns
df = yf.Ticker(TICKER).history(start=START, end=END, auto_adjust=False)
# columns: ['Open','High','Low','Close','Adj Close','Volume'], plus 'Dividends' and
# 'Stock Splits', which the following cell drops.
# auto_adjust=False keeps the raw OHLC prices and a separate 'Adj Close' column;
# auto_adjust=True would instead adjust OHLC (Open, High, Low, Close) for dividends and stock splits.

# B) Download data for one or multiple tickers, so ensure you pass a STRING ticker (not a list) to download()
# df = yf.download(TICKER, start=START, end=END, auto_adjust=False, progress=False)
# If TICKER is 'AAPL' (string), most versions return single-level columns.

# (If you ever still get MultiIndex)
# if isinstance(df.columns, pd.MultiIndex):
#     df = df.xs(TICKER, axis=1, level='Ticker')  # collapse to single level


In [11]:
# Drop the corporate-action columns that Ticker().history() adds.
# errors="ignore" keeps the cell re-runnable and lets it work when the frame
# never had these columns (e.g. when it came from yf.download()).
df = df.drop(columns=["Dividends", "Stock Splits"], errors="ignore")
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-02 00:00:00-05:00,27.847500,27.860001,26.837500,27.332500,24.261045,212818400
2015-01-05 00:00:00-05:00,27.072500,27.162500,26.352501,26.562500,23.577576,257142000
2015-01-06 00:00:00-05:00,26.635000,26.857500,26.157499,26.565001,23.579792,263188400
2015-01-07 00:00:00-05:00,26.799999,27.049999,26.674999,26.937500,23.910433,160423600
2015-01-08 00:00:00-05:00,27.307501,28.037500,27.174999,27.972500,24.829128,237458000
...,...,...,...,...,...,...
2025-08-18 00:00:00-04:00,231.699997,233.119995,230.110001,230.889999,230.889999,37476200
2025-08-19 00:00:00-04:00,231.279999,232.869995,229.350006,230.559998,230.559998,39402600
2025-08-20 00:00:00-04:00,229.979996,230.470001,225.770004,226.009995,226.009995,42263900
2025-08-21 00:00:00-04:00,226.270004,226.520004,223.779999,224.899994,224.899994,30621200


# Parse dtypes (dates, floats)

In [8]:
# Normalize the yfinance frame to the schema the validation cell expects:
# a "date" column plus lower-case snake_case price columns
# (e.g. "Adj Close" -> "adj_close").
if isinstance(df.columns, pd.MultiIndex):
    # yf.download() can return (Price, Ticker) columns; keep the price level.
    df.columns = df.columns.get_level_values(0)
df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
if "date" not in df.columns and isinstance(df.index, pd.DatetimeIndex):
    # history() keeps the trading day in the index; move it into a column.
    df = df.rename_axis("date").reset_index()
df["date"] = pd.to_datetime(df["date"], errors="raise")

MultiIndex([(     'date',     ''),
            ('adj_close', 'AAPL'),
            (    'close', 'AAPL'),
            (     'high', 'AAPL'),
            (      'low', 'AAPL'),
            (     'open', 'AAPL'),
            (   'volume', 'AAPL')],
           names=['Price', 'Ticker'])


# Validate (fail fast if issues)

In [None]:
# Expected type tag per column; the key order is also the required column list.
expected = {
    "date": "datetime64[ns]",
    **{price_col: "float" for price_col in ("open", "high", "low", "close", "adj_close")},
    "volume": "int",  # yfinance returns int-like; coerce ok
}
required = list(expected)

# Fail fast: any reported issue aborts the notebook run.
msgs = validate_df(df, required, expected, date_col="date")
if msgs:
    raise ValueError(msgs)

# Optional: quick completeness print
print("Shape:", df.shape)
print("Head:\n", df.head(3))

# Save raw CSV (timestamped)

In [None]:
# Build a timestamped path under data/raw/ (raw_path also creates the folder).
fp = raw_path("api", SOURCE, TICKER)
# index=False: the row index is not written, so only DataFrame columns reach the CSV.
df.to_csv(fp, index=False)
fp  # last expression in the cell, so the notebook displays the saved path