# Homework Starter — Stage 04: Data Acquisition and Ingestion
Name: 
Date: 

## Objectives
- API ingestion with secrets in `.env`
- Scrape a permitted public table
- Validate and save raw data to `data/raw/`

In [None]:
import os, pathlib, datetime as dt
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Output directory for every CSV this notebook writes; created up front so
# later saves cannot fail on a missing folder.
# NOTE(review): this is an absolute, machine-specific path, while the
# objectives above mention `data/raw/` -- confirm before sharing the notebook.
RAW = pathlib.Path(r'/Users/hust/bootcamp_zheyu_dong/homework/homework04')
RAW.mkdir(parents=True, exist_ok=True)

# Load variables from a local .env file, then report only whether the key is
# present -- the key's value is never printed.
load_dotenv()
print('ALPHAVANTAGE_API_KEY loaded?', bool(os.getenv('ALPHAVANTAGE_API_KEY')))

## Helpers (use or modify)

In [None]:
def ts():
    """Return the current local time as a compact ``YYYYMMDD-HHMMSS`` string."""
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"

def save_csv(df: pd.DataFrame, prefix: str, **meta):
    """Write *df* to a timestamped CSV inside ``RAW`` and return the file path.

    The filename has the form ``<prefix>_<k1>-<v1>_<k2>-<v2>_<timestamp>.csv``.
    When no metadata is supplied the metadata segment is left out, giving
    ``<prefix>_<timestamp>.csv``.

    Args:
        df: Frame to persist; written without its index.
        prefix: Leading part of the filename.
        **meta: Key/value labels embedded in the filename as ``key-value``
            pairs, in the order given.

    Returns:
        The path of the file that was written.
    """
    mid = '_'.join(f"{k}-{v}" for k, v in meta.items())
    # Skip the metadata segment when it is empty; otherwise the name would
    # contain a doubled separator ("prefix__<timestamp>.csv").
    parts = [prefix, mid, ts()] if mid else [prefix, ts()]
    path = RAW / f"{'_'.join(parts)}.csv"
    df.to_csv(path, index=False)
    print('Saved', path)
    return path

def validate(df: pd.DataFrame, required):
    """Summarise basic quality facts about *df*.

    Args:
        df: Frame to inspect.
        required: Iterable of column names expected to be present.

    Returns:
        A dict with ``missing`` (required names absent from ``df.columns``, in
        the order given), ``shape`` (``df.shape``) and ``na_total`` (count of
        NA cells across the whole frame, as an ``int``).
    """
    present = df.columns
    absent = []
    for name in required:
        if name not in present:
            absent.append(name)

    # Per-column NA counts first, then collapse them into one total.
    na_per_column = df.isna().sum()
    na_cells = int(na_per_column.sum())

    return {'missing': absent, 'shape': df.shape, 'na_total': na_cells}

## Part 1 — API Pull (Required)
Choose an endpoint (e.g., Alpha Vantage, or fall back to `yfinance` if no API key is available).

In [None]:

# Ticker symbol passed to the API fetch and embedded in the saved filename.
SYMBOL = 'AAPL'

def _alpha_fetch(symbol: str):
    key = os.getenv('ALPHAVANTAGE_API_KEY')
    if not key:
        raise RuntimeError("No ALPHAVANTAGE_API_KEY in env")
    url = 'https://www.alphavantage.co/query'
    params = {
        'function': 'TIME_SERIES_DAILY_ADJUSTED',
        'symbol': symbol,
        'outputsize': 'compact',
        'apikey': key,
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    key_ts = next((k for k in js.keys() if 'Time Series' in k), None)
    if key_ts is None:
        raise RuntimeError(f"Unexpected Alpha Vantage payload: {list(js.keys())}")
    df = pd.DataFrame(js[key_ts]).T.reset_index()
    df.columns = ['date'] + list(df.columns[1:])
    df = df[['date','5. adjusted close']].rename(columns={'5. adjusted close':'adj_close'})
    df['date'] = pd.to_datetime(df['date'])
    df['adj_close'] = pd.to_numeric(df['adj_close'])
    return df

try:
    df_api = _alpha_fetch(SYMBOL)
except Exception as e:
    print("API fetch failed or unavailable; using offline demo data:", e)
    df_api = pd.DataFrame({
        'date': pd.to_datetime(['2025-01-02','2025-01-03','2025-01-06','2025-01-07','2025-01-08']),
        'adj_close': [189.4, 190.2, 191.8, 190.9, 192.5],
    })

v_api = validate(df_api, ['date','adj_close']); v_api


In [None]:

src = 'alpha' if os.getenv('ALPHAVANTAGE_API_KEY') else 'offline'
_ = save_csv(df_api.sort_values('date'), prefix='api', source=src, symbol=SYMBOL)


## Part 2 — Scrape a Public Table (Required)
Replace `SCRAPE_URL` with a permitted page containing a simple table.

In [None]:

SCRAPE_URL = 'https://www.ctrip.com'  # TODO: replace with permitted page
headers = {'User-Agent':'AFE-Homework/1.0'}

# Inline fallback table used when the live page cannot be fetched or parsed.
DEMO_HTML = '''
    <table>
      <tr><th>Ticker</th><th>Price</th><th>Volume</th></tr>
      <tr><td>AAA</td><td>101.2</td><td>12000</td></tr>
      <tr><td>BBB</td><td>99.8</td><td>18050</td></tr>
      <tr><td>CCC</td><td>105.3</td><td>9500</td></tr>
    </table>
    '''


def _html_rows_to_df(html: str) -> pd.DataFrame:
    """Build a DataFrame from every non-empty ``<tr>`` found in *html*.

    The first non-empty row supplies the column names and the remaining rows
    supply the data. Cell text is whitespace-stripped and left as strings.

    Raises:
        ValueError: If *html* contains no ``<tr>`` with any ``<th>``/``<td>``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    rows = [r for r in rows if r]
    if not rows:
        # Name the real problem instead of letting rows[0] raise IndexError.
        raise ValueError('no table rows found in HTML')
    header, data = rows[0], rows[1:]
    return pd.DataFrame(data, columns=header)


try:
    resp = requests.get(SCRAPE_URL, headers=headers, timeout=30)
    resp.raise_for_status()
    df_scrape = _html_rows_to_df(resp.text)
except Exception as e:
    # Deliberate best-effort: network, HTTP or parsing failures all fall back
    # to the inline demo table so the notebook still produces an output.
    print('Scrape failed, using inline demo table:', e)
    df_scrape = _html_rows_to_df(DEMO_HTML)

# Basic cleaning / types: unparseable values become NaN rather than raising.
for col in ['Price','Volume']:
    if col in df_scrape.columns:
        df_scrape[col] = pd.to_numeric(df_scrape[col], errors='coerce')

# NOTE(review): the required list is the frame's own columns, so 'missing' is
# always empty here; only 'shape' and 'na_total' carry information.
v_scrape = validate(df_scrape, list(df_scrape.columns)); v_scrape


In [None]:
# Persist the scraped table; the keyword labels end up in the filename.
_ = save_csv(
    df_scrape,
    prefix='scrape',
    site='example',
    table='markets',
)

## Documentation
- API Source: (URL/endpoint/params)
- Scrape Source: (URL/table description)
- Assumptions & risks: (rate limits, selector fragility, schema changes)
- Confirm `.env` is not committed.