Homework 04

In [38]:
from pathlib import Path
from datetime import datetime
import os, json, requests
import pandas as pd
from dotenv import load_dotenv
import requests

RAW = pathlib.Path('data/raw'); RAW.mkdir(parents=True, exist_ok=True)

In [65]:
def ts():
    return datetime.now().strftime("%Y%m%d-%H%M")
    
def save_csv(df: pd.DataFrame, prefix: str, **meta):
    mid = '_'.join([f"{k}-{v}" for k,v in meta.items()])
    path = RAW / f"{prefix}_{mid}_{ts()}.csv"
    df.to_csv(path, index=False)
    print('Saved', path)
    return path

def validate(df: pd.DataFrame, required):
    missing = [c for c in required if c not in df.columns]
    return {'missing': missing, 'shape': df.shape, 'na_total': int(df.isna().sum().sum())}

In [26]:
load_dotenv()
ALPHAVANTAGE_KEY = os.getenv("ALPHAVANTAGE_API_KEY")

9678QFRFIV7I77KP


Part 1

In [44]:
SYMBOL = "DIS"
USE_ALPHA = bool(os.getenv('ALPHAVANTAGE_API_KEY'))

def USE_ALPHA (symbol: str, api_key: str) -> pd.DataFrame:
    url = 'https://www.alphavantage.co/query'
    params = {
        'function':'TIME_SERIES_DAILY_ADJUSTED',
        'symbol':SYMBOL,
        'outputsize':'compact',
        'apikey':os.getenv('ALPHAVANTAGE_API_KEY')}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    
    ts_key = next((k for k in js if "Time Series" in k), None)
    if not ts_key:
        raise ValueError(f"Unexpected keys: {list(js.keys())[:5]}")
    df["date"] = pd.to_datetime(df["date"])
    df["adj_close"] = pd.to_numeric(df["adj_close"], errors="coerce")
    return df[["date", "adj_close"]].sort_values("date").reset_index(drop=True)

def USE_YFINANCE(symbol: str) -> pd.DataFrame:
    import yfinance as yf
    df = yf.download(symbol, period="3mo", interval="1d", auto_adjust=False, progress=False)
    df = df.reset_index()[["Date", "Adj Close"]]
    df.columns = ["date", "adj_close"]
    return df.sort_values("date").reset_index(drop=True)

try:
    if USE_ALPHA:
        df_api = USE_ALPHA(SYMBOL, ALPHAVANTAGE_KEY)
    else:
        raise ValueError("No ALPHAVANTAGE_API_KEY set")
except Exception as e:
    print("Alpha Vantage failed ->", e)
    df_api = USE_YFINANCE(SYMBOL)

v_api = validate(df_api, ['date','adj_close'])
v_api

Alpha Vantage failed -> Unexpected keys: ['Information']


{'missing': [], 'shape': (63, 2), 'na_total': 0}

_ = save_csv(df_api.sort_values('date'), prefix='api', source='alpha' if USE_ALPHA else 'yfinance', symbol=SYMBOL)

Part 2

In [76]:
SCRAPE_URL = 'https://en.wikipedia.org/wiki/S%26P_500'  
headers = {'User-Agent':'AFE-Homework/1.0'}
try:
    resp = requests.get(SCRAPE_URL, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")
    
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    header, *data = [r for r in rows if r]
    df_scrape = pd.DataFrame(data, columns=header)
    
except Exception as e:
    print('Scrape failed, using inline demo table:', e)
    html = '<table><tr><th>Ticker</th><th>Price</th></tr><tr><td>AAA</td><td>101.2</td></tr></table>'
    soup = BeautifulSoup(html, 'html.parser')
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    header, *data = [r for r in rows if r]
    df_scrape = pd.DataFrame(data, columns=header)

if 'Price' in df_scrape.columns:
    df_scrape['Price'] = pd.to_numeric(df_scrape['Price'], errors='coerce')
v_scrape = validate(df_scrape, list(df_scrape.columns)); v_scrape

Scrape failed, using inline demo table: 1 columns passed, passed data had 14 columns


{'missing': [], 'shape': (1, 2), 'na_total': 0}

In [78]:
_ = save_csv(df_scrape, prefix='scrape', site='example', table='markets')

Saved data/raw/scrape_site-example_table-markets_20250817-1521.csv


In [None]:
# Documentation
## Data Sources
- **API Source**
  - Alpha Vantage (`TIME_SERIES_DAILY_ADJUSTED`, ticker = DIS)  
  - Fallback: Yahoo Finance (`yfinance`) if Alpha Vantage fails  
  - URL: https://www.alphavantage.co/query
- **Scrape Source""
  - Wikipedia — *List of S&P 500 companies*  
    https://en.wikipedia.org/wiki/List_of_S%26P_500_companies  
  - Table: `id="constituents"`
- **Assumptions and Risks**
  - API Source: Alpha Vantage free API only allows ~5 requests/min; exceeding this triggers fallback to yfinance.
  - Scrape Source: Scraping depends on the presence of `table#constituents` in Wikipedia.
  - Schema changes: Column names could be renamed in Wikipedia