# In [25]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import pandas as pd
import requests

# Root URL of the IFCB data server; the feed and .hdr request URLs below are built from it.
BASE = "https://ifcb-data.whoi.edu"
# Dataset slug placed directly after BASE in every request path.
DATASET = "mvco"  # change to another dataset slug if needed (e.g., 'arctic')

def fetch_feed(start_date: str, end_date: str, metric: str = "temperature"):
    """
    Request the bin feed for a date range and return the decoded JSON body.

    start_date / end_date are 'YYYY-MM-DD' strings inserted as-is into the URL path.
    metric names the timeline stream segment of the URL (defaults to 'temperature').
    An HTTP error status is surfaced through raise_for_status(); the request times out after 30 s.
    """
    feed_root = f"{BASE}/{DATASET}/api/feed/{metric}"
    date_range = f"start/{start_date}/end/{end_date}"
    response = requests.get(f"{feed_root}/{date_range}", timeout=30)
    response.raise_for_status()
    return response.json()

def parse_hdr_text(hdr_text: str) -> dict:
    """
    Parse the text of an IFCB .hdr file into a dict of ALL metadata fields present.

    Each non-blank line that does not start with '#' is expected to look like
    'key = value' or 'key: value' (whitespace around the delimiter is optional).
    The line is split at whichever delimiter ('=' or ':') appears FIRST, so the
    other character may occur freely inside the value (e.g. 'url: http://h/x?a=b'
    or 'start = 12:30:00'). Keys and values are stripped of surrounding whitespace.

    Lines containing neither delimiter are ignored. If a key repeats, the last
    occurrence wins.
    """
    meta = {}
    for raw_line in hdr_text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        # Split at the earliest delimiter only; trying '=' before ':' unconditionally
        # would cut a 'key: value' line in the middle of a value that contains '='.
        positions = [pos for pos in (line.find("="), line.find(":")) if pos != -1]
        if not positions:
            continue
        cut = min(positions)
        meta[line[:cut].strip()] = line[cut + 1:].strip()
    return meta

def fetch_hdr(pid: str) -> dict:
    """
    Fetch '<BASE>/<DATASET>/<pid>.hdr', parse it, and return the metadata dict.

    The result holds every field parse_hdr_text() extracted, plus a 'pid' entry set to
    the requested PID (replacing any 'pid' key that came from the header itself).
    No 'time' field is added here. An HTTP error status is surfaced through
    raise_for_status(); the request times out after 30 s.
    """
    response = requests.get(f"{BASE}/{DATASET}/{pid}.hdr", timeout=30)
    response.raise_for_status()
    fields = parse_hdr_text(response.text)
    fields["pid"] = pid
    return fields

def collect_metadata(start_date: str, end_date: str, max_workers: int = 8) -> list[dict]:
    """
    Returns a list of dicts — one per bin, in feed order — containing:
      - pid (bin id)
      - time (timestamp from the feed when present; otherwise a 'time' field from the .hdr, if any)
      - all available .hdr metadata fields

    The .hdr files are downloaded concurrently on up to 'max_workers' threads.
    A bin whose .hdr cannot be fetched or parsed still produces a row holding 'pid',
    'time' and an '_error' message, so a single bad bin does not abort the run.
    Errors from fetching the feed itself are not caught and propagate to the caller.
    """
    feed = fetch_feed(start_date, end_date)
    # Keep only the last '/'-separated segment of each feed pid (covers pids given as URLs/paths).
    items = [{"pid": e["pid"].split("/")[-1], "time": e.get("date")} for e in feed]

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = [ex.submit(fetch_hdr, it["pid"]) for it in items]
        # Walk the futures in submission order so the output order is deterministic
        # (feed order) rather than whichever download happens to finish first.
        for base, fut in zip(items, futs):
            feed_time = base.get("time")
            try:
                meta = fut.result()
            except Exception as e:
                # Best-effort: record the failure against this bin and keep going.
                results.append({"pid": base["pid"], "time": feed_time, "_error": str(e)})
                continue
            # Prefer the feed timestamp; only fall back to an HDR 'time' field when
            # the feed did not provide one.
            if feed_time not in (None, "") or meta.get("time") in (None, ""):
                meta["time"] = feed_time
            results.append(meta)
    return results

if __name__ == "__main__":
    START = "2013-01-01"
    END   = "2014-01-01"

    def sort_key(r):
        """
        Order rows by time, then pid; rows with a missing or unparseable time go last.

        Timezone-aware timestamps are converted to naive UTC so they can be compared
        with naive ones (and with datetime.max) without raising TypeError.
        """
        t = r.get("time")
        # try to parse ISO times; if not parseable, push to end
        try:
            dt = datetime.fromisoformat(t.replace("Z", "+00:00")) if isinstance(t, str) else None
        except ValueError:
            dt = None
        if dt is not None:
            offset = dt.utcoffset()
            if offset is not None:
                dt = (dt - offset).replace(tzinfo=None)
        return (dt is None, dt if dt is not None else datetime.max, r.get("pid", ""))

    # sort BEFORE saving so the CSV rows come out ordered: time (None last), then pid
    data = sorted(collect_metadata(START, END), key=sort_key)

    # convert list of dicts → DataFrame (columns are the union of all keys seen)
    df = pd.DataFrame(data)

    # save as CSV inside Colab
    out_file = "ifcb_metadata.csv"
    df.to_csv(out_file, index=False)

    print(f"Saved {len(df)} rows to {out_file}")


# KeyboardInterrupt: 

# In [None]:
# Colab-only cell: copy the metadata table to Google Drive.
# NOTE(review): relies on `df` having been created by an earlier cell — confirm that cell
# ran to completion before executing this one.
from google.colab import drive
# Mount the user's Drive at /content/drive.
drive.mount('/content/drive')

# Write the same DataFrame as CSV (without the index column) into the mounted Drive.
out_file = "/content/drive/MyDrive/ifcb_metadata.csv"
df.to_csv(out_file, index=False)
print("Saved to", out_file)