In [1]:
# level42_vpin_flow_toxicity.py
# Free-only VPIN using Binance public 1m klines (no API key).
# Outputs:
#   - CSV : level42_vpin_buckets.csv  (bucket_time, buy, sell, vol, imb, vpin, toxic_flag)
#   - JSON: level42_vpin_metrics.json (means/quantiles/config)
# Usage:
#   python level42_vpin_flow_toxicity.py --symbol BTCUSDT --days 3 --bucket-mult 3 --m 50 --toxic-q 0.9
# Notes:
#   - This uses a bar-sign proxy for buy/sell splits. If you have trade prints, replace split_volume_signed().

import os
import time
import json
import math
import argparse
from dataclasses import dataclass, asdict
from typing import Optional, Dict
from collections import deque

import requests
from requests.adapters import HTTPAdapter, Retry
import numpy as np
import pandas as pd


# ----------------------------- Config -----------------------------
@dataclass
class Config:
    symbol: str = "BTCUSDT"        # Binance spot symbol
    interval: str = "1m"           # 1-minute klines
    days: int = 3                  # how many days back to pull
    bucket_mult: float = 3.0       # bucket size = bucket_mult * median(1m volume)
    m: int = 50                    # VPIN smoothing window (in buckets)
    toxic_q: float = 0.90          # toxic when VPIN >= this quantile (computed from result)
    base_url: str = "https://api.binance.com"
    csv_path: str = "level42_vpin_buckets.csv"
    json_path: str = "level42_vpin_metrics.json"
    timeout: float = 10.0          # HTTP timeout
    backoff_sleep: float = 0.15    # sleep between paged requests
    verify_ssl: bool = True        # set False only if your env has broken certs


# ----------------------------- HTTP Session (Retry/Backoff) -----------------------------
def build_http_session(cfg: Config) -> requests.Session:
    session = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=("GET",),
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    session.verify = cfg.verify_ssl
    return session


# ----------------------------- Loader (Binance public klines) -----------------------------
def fetch_klines(session: requests.Session, cfg: Config,
                 start_ms: Optional[int] = None,
                 end_ms: Optional[int] = None,
                 limit: int = 1000):
    url = f"{cfg.base_url}/api/v3/klines"
    params = {"symbol": cfg.symbol, "interval": cfg.interval, "limit": limit}
    if start_ms is not None:
        params["startTime"] = int(start_ms)
    if end_ms is not None:
        params["endTime"] = int(end_ms)

    resp = session.get(url, params=params, timeout=cfg.timeout)
    resp.raise_for_status()
    return resp.json()


def load_minutes(session: requests.Session, cfg: Config) -> pd.DataFrame:
    # Pull last `days` of klines with forward pagination from start -> end
    end = int(time.time() * 1000)
    start = end - cfg.days * 24 * 60 * 60 * 1000
    frames = []
    cur_start = start
    last_end_guard = -1

    while True:
        data = fetch_klines(session, cfg, start_ms=cur_start, end_ms=end, limit=1000)
        if not data:
            break

        df = pd.DataFrame(data, columns=[
            "open_time", "open", "high", "low", "close", "volume",
            "close_time", "qav", "num_trades", "taker_buy_vol", "taker_buy_qav", "ignore"
        ])

        # Basic numeric casting & timezone to UTC
        for col in ["open", "high", "low", "close", "volume"]:
            df[col] = pd.to_numeric(df[col], errors="coerce")

        df["open_time"] = pd.to_datetime(df["open_time"], unit="ms", utc=True)
        df["close_time"] = pd.to_datetime(df["close_time"], unit="ms", utc=True)

        frames.append(df[["open_time", "close_time", "open", "high", "low", "close", "volume"]])

        # pagination: next start = last open_time + 1 minute
        last_open = int(df["open_time"].iloc[-1].value / 1e6)  # ms
        if last_open == last_end_guard:  # safety: prevent infinite loops
            break
        last_end_guard = last_open
        cur_start = last_open + 60_000

        if df.shape[0] < 1000 or cur_start >= end:
            break

        time.sleep(cfg.backoff_sleep)

    if not frames:
        raise RuntimeError("No klines returned; check symbol/interval or network.")

    out = pd.concat(frames).drop_duplicates(subset=["open_time"]).set_index("open_time").sort_index()
    # Drop rows with missing essential fields
    out = out.dropna(subset=["open", "close", "volume"])
    # Remove zero/negative volumes (shouldn't happen, but guard)
    out = out[out["volume"] > 0]
    if out.empty:
        raise RuntimeError("Klines loaded, but all rows invalid after cleaning.")
    return out


# ----------------------------- Signed Volume (bar-sign proxy) -----------------------------
def split_volume_signed(df: pd.DataFrame) -> pd.DataFrame:
    """Approximate buy/sell volume using the sign of (close - open)."""
    ret = df["close"] - df["open"]
    sign = np.sign(ret.to_numpy(dtype=float))
    vol = df["volume"].to_numpy(dtype=float)

    buy = np.where(sign > 0, vol, np.where(sign < 0, 0.0, 0.5 * vol))
    sell = np.where(sign < 0, vol, np.where(sign > 0, 0.0, 0.5 * vol))

    out = df.copy()
    out["buy_vol"] = buy
    out["sell_vol"] = sell
    return out


# ----------------------------- Volume-Time Bucketing -----------------------------
def build_volume_buckets(bars: pd.DataFrame, bucket_vol: float, m: int) -> pd.DataFrame:
    """
    Convert 1m bars with (buy_vol, sell_vol, volume) into equal-volume buckets.
    - Proportionally split a bar if it straddles a bucket boundary.
    - Maintain a deque for O(1) VPIN smoothing over last m buckets.
    """
    if not np.isfinite(bucket_vol) or bucket_vol <= 0:
        raise ValueError("bucket_vol must be positive and finite.")

    buy_cum = sell_cum = vol_cum = 0.0
    vpin_window = deque(maxlen=m)

    rows = []
    for ts, row in bars.iterrows():
        v = float(row["volume"])
        if v <= 0 or not np.isfinite(v):
            continue
        buy = float(row["buy_vol"])
        sell = float(row["sell_vol"])

        remaining = v
        while remaining > 1e-12:
            need = bucket_vol - vol_cum
            take = remaining if remaining <= need + 1e-12 else need
            frac = (take / v) if v > 0 else 0.0

            buy_cum += buy * frac
            sell_cum += sell * frac
            vol_cum += take
            remaining -= take

            if vol_cum >= bucket_vol - 1e-12:  # close bucket
                imb = abs(buy_cum - sell_cum) / bucket_vol  # 0..1+ (rare >1 if data dirty)
                vpin_window.append(imb)
                vpin = float(sum(vpin_window) / len(vpin_window))

                rows.append({
                    "bucket_time": ts,   # timestamp of bar that closed the bucket
                    "buy": buy_cum,
                    "sell": sell_cum,
                    "vol": vol_cum,
                    "imb": imb,
                    "vpin": vpin
                })
                # reset bucket accumulators for next bucket
                buy_cum = sell_cum = vol_cum = 0.0

    if not rows:
        raise RuntimeError("No buckets formed (bucket too large?). Try lowering --bucket-mult or increase --days.")

    out = pd.DataFrame(rows).set_index("bucket_time")
    # basic sanity: remove impossible values
    out = out[(out["vol"] > 0) & out["imb"].notna() & out["vpin"].notna()]
    return out


# ----------------------------- Pipeline -----------------------------
def compute_vpin(cfg: Config) -> (pd.DataFrame, Dict):
    session = build_http_session(cfg)

    # 1) Load 1m bars
    bars = load_minutes(session, cfg)

    # 2) Signed volume proxy
    bars = split_volume_signed(bars)

    # 3) Bucket size from median 1m volume
    med_vol = float(bars["volume"].median())
    if not np.isfinite(med_vol) or med_vol <= 0:
        raise RuntimeError("Median volume is invalid. Data may be too sparse.")
    bucket_vol = cfg.bucket_mult * med_vol

    # 4) Build buckets + VPIN
    buckets = build_volume_buckets(bars, bucket_vol, cfg.m)

    # 5) Toxic flag via quantile threshold
    q_cut = float(buckets["vpin"].quantile(cfg.toxic_q))
    buckets["toxic_flag"] = (buckets["vpin"] >= q_cut).astype(int)

    # 6) Metrics
    metrics = {
        "config": asdict(cfg),
        "symbol": cfg.symbol,
        "interval": cfg.interval,
        "median_1m_vol": med_vol,
        "bucket_vol": bucket_vol,
        "buckets": int(buckets.shape[0]),
        "vpin_mean": float(buckets["vpin"].mean()),
        "vpin_p50": float(buckets["vpin"].quantile(0.50)),
        "vpin_p80": float(buckets["vpin"].quantile(0.80)),
        "vpin_p90": float(buckets["vpin"].quantile(0.90)),
        "toxic_quantile_cut": q_cut,
        "toxic_share_pct": float(100.0 * buckets["toxic_flag"].mean()),
    }
    return buckets, metrics


# ----------------------------- I/O -----------------------------
def save_outputs(buckets: pd.DataFrame, metrics: Dict, cfg: Config):
    # Ensure directory exists
    os.makedirs(os.path.dirname(cfg.csv_path) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(cfg.json_path) or ".", exist_ok=True)

    # CSV (append-safe: dedupe on index)
    if os.path.exists(cfg.csv_path):
        prev = pd.read_csv(cfg.csv_path, parse_dates=["bucket_time"]).set_index("bucket_time")
        out = pd.concat([prev, buckets], axis=0)
        out = out[~out.index.duplicated(keep="last")].sort_index()
    else:
        out = buckets.sort_index()
    out.to_csv(cfg.csv_path, index=True)

    # JSON metrics
    with open(cfg.json_path, "w") as f:
        json.dump(metrics, f, indent=2)

    print(f"[OK] Saved buckets → {cfg.csv_path}")
    print(f"[OK] Saved metrics → {cfg.json_path}")
    print("Metrics summary:", {k: (round(v, 4) if isinstance(v, (int, float)) else v) for k, v in metrics.items()})


# ----------------------------- CLI -----------------------------
def parse_args() -> Config:
    p = argparse.ArgumentParser(description="Level-42 VPIN (Flow Toxicity) — Binance 1m klines")
    p.add_argument("--symbol", type=str, default="BTCUSDT", help="Binance spot symbol (e.g., BTCUSDT)")
    p.add_argument("--days", type=int, default=3, help="How many recent days of 1m bars to fetch")
    p.add_argument("--bucket-mult", type=float, default=3.0, help="Bucket size multiplier on median 1m volume")
    p.add_argument("--m", type=int, default=50, help="Smoothing window in buckets for VPIN")
    p.add_argument("--toxic-q", type=float, default=0.90, help="Quantile for toxic flag (0-1)")
    p.add_argument("--csv", type=str, default="level42_vpin_buckets.csv", help="Output CSV path")
    p.add_argument("--json", type=str, default="level42_vpin_metrics.json", help="Output JSON path")
    p.add_argument("--interval", type=str, default="1m", help="Binance kline interval (default 1m)")
    p.add_argument("--base-url", type=str, default="https://api.binance.com", help="Binance REST base URL")
    p.add_argument("--timeout", type=float, default=10.0, help="HTTP timeout seconds")
    p.add_argument("--no-verify-ssl", action="store_true", help="Disable SSL cert verification (not recommended)")

    args = p.parse_args()
    cfg = Config(
        symbol=args.symbol,
        interval=args.interval,
        days=args.days,
        bucket_mult=args.bucket_mult,
        m=args.m,
        toxic_q=args.toxic_q,
        base_url=args.base_url,
        csv_path=args.csv,
        json_path=args.json,
        timeout=args.timeout,
        verify_ssl=not args.no_verify_ssl,
    )
    return cfg


# ----------------------------- Main -----------------------------
def main():
    cfg = parse_args()
    buckets, metrics = compute_vpin(cfg)
    save_outputs(buckets, metrics, cfg)


if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--symbol SYMBOL] [--days DAYS]
                             [--bucket-mult BUCKET_MULT] [--m M]
                             [--toxic-q TOXIC_Q] [--csv CSV] [--json JSON]
                             [--interval INTERVAL] [--base-url BASE_URL]
                             [--timeout TIMEOUT] [--no-verify-ssl]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\adity\AppData\Roaming\jupyter\runtime\kernel-0d441352-f13f-4320-bfd8-3e818416b5b1.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
