In [None]:
"""
===============================================================================
5-Minute → 15-Minute OHLC Builder (Polygon Raw Data)
===============================================================================

PURPOSE
-------
This script converts raw 5-minute price data into 15-minute OHLC bars and
stores them as parquet files for faster downstream use (e.g., trailing stop-loss
simulations).

Instead of resampling 5-minute data every time inside the stop-loss logic,
this script builds and caches 15-minute bars once (or incrementally).

This significantly reduces runtime for repeated backtests.

-------------------------------------------------------------------------------
INPUT DATA STRUCTURE
-------------------------------------------------------------------------------
Expected raw structure:

RAW_5M_ROOT/
    AAOI/
        <5-min csv or parquet files>
    MSFT/
        <5-min files>
    ...
    (one folder per symbol)

Each raw file should contain the following columns (names are matched
case-insensitively; files missing any of them are skipped):
    - date / trade_date / datetime / timestamp
    - open  (or open_close, o)
    - high  (or high_close, h)
    - low   (or low_close, l)
    - close (or close_close, c)

Multiple files per symbol are supported (they are concatenated).

-------------------------------------------------------------------------------
OUTPUT STRUCTURE
-------------------------------------------------------------------------------
OUT_15M_ROOT/
    symbol=AAOI/
        rs_15T.parquet
    symbol=MSFT/
        rs_15T.parquet
    ...

Each output file contains:
    - _dt
    - open
    - high
    - low
    - close

These are 15-minute resampled OHLC bars.

-------------------------------------------------------------------------------
KEY FEATURES
-------------------------------------------------------------------------------

1) Market Hours Filtering (Optional)
   If FILTER_MARKET_HOURS = True, raw 5-minute data is filtered to the
   time-of-day window
       MARKET_START → MARKET_END   (both endpoints inclusive)
   before resampling.

2) 5-Min → 15-Min Resampling
   Uses standard OHLC aggregation:
       open  = first
       high  = max
       low   = min
       close = last

3) Incremental Mode (Recommended)
   If INCREMENTAL = True:
       - Script reads the existing rs_15T.parquet
       - Only processes 5-minute data after the last saved timestamp
       - Appends new 15-minute bars
   This makes daily updates very fast.

4) Robust File Handling
   - Supports both CSV and parquet input
   - Auto-detects common column name variations
   - Sorts and removes duplicate timestamps

-------------------------------------------------------------------------------
WHY THIS EXISTS
-------------------------------------------------------------------------------
Your trailing stop-loss backtest runs on 15-minute candles.

If resampling is done inside the stop-loss script:
    - It repeats work for every run
    - It increases memory and runtime
    - It slows multi-strategy iterations

By building a dedicated 15-minute dataset once:
    - Stop-loss code becomes much faster
    - Data consistency improves
    - Only incremental updates are required

-------------------------------------------------------------------------------
HOW TO RUN
-------------------------------------------------------------------------------

1) Set paths in CONFIG:
       RAW_5M_ROOT
       OUT_15M_ROOT

2) (Optional) Adjust:
       FILTER_MARKET_HOURS
       MARKET_START / MARKET_END
       INCREMENTAL

3) Run:
       python build_15T_from_5m.py

4) Output will be saved under OUT_15M_ROOT.

-------------------------------------------------------------------------------
IMPORTANT ASSUMPTIONS
-------------------------------------------------------------------------------
- Raw timestamps are already in the correct timezone.
- No timezone conversion is applied in this script.
- Market hours filter is purely time-based (no exchange calendar logic).
- Parquet writing requires pyarrow or fastparquet installed.

-------------------------------------------------------------------------------
INTENDED WORKFLOW
-------------------------------------------------------------------------------
Step 1: Run this builder script (one-time or daily incremental update)
Step 2: Stop-loss simulation loads rs_15T.parquet directly
Step 3: ATR and trailing logic run on prebuilt 15-minute bars

-------------------------------------------------------------------------------
"""

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

# =========================================================
# CONFIG
# =========================================================
# Root of the raw 5-minute data: one sub-folder per symbol (AAOI, MSFT, ...).
RAW_5M_ROOT = Path(r"D:\work\Trade Analysis\Polygon_5min_2016_to_today")

# Root of the 15-minute cache written by this script (created if missing).
OUT_15M_ROOT = Path(r"D:\work\Trade Analysis\Polygon_15min_from_5min")
OUT_15M_ROOT.mkdir(parents=True, exist_ok=True)

# pandas offset alias for the target bar size. "15min" is the same frequency
# as the legacy "15T" spelling, whose "T" alias is deprecated since pandas 2.2.
# The output file name (rs_15T.parquet) is independent of this value.
RESAMPLE_RULE = "15min"

# If you want to match your stop-loss market window:
# time-of-day bounds, applied to the raw timestamps as-is (no tz conversion).
FILTER_MARKET_HOURS = True
MARKET_START = "14:30"
MARKET_END   = "21:00"

# Incremental build: only append bars after the last saved _dt
INCREMENTAL = True

# After concatenating all raw files of a symbol, sort the rows by timestamp
# and keep a single row per duplicate timestamp.
SORT_AND_DEDUP = True

# =========================================================
# Helpers
# =========================================================
def normalize_ticker(x: str) -> str:
    """Return *x* as an upper-case ticker made of alphanumeric characters only.

    The value is converted with ``str()``, stripped of surrounding whitespace
    and upper-cased; every character for which ``str.isalnum()`` is false
    (dots, dashes, inner spaces, ...) is then removed.
    """
    cleaned = str(x).strip().upper()
    return "".join(filter(str.isalnum, cleaned))

def _pick_first_existing(df: pd.DataFrame, candidates):
    cols = {c.lower(): c for c in df.columns}
    for c in candidates:
        if c.lower() in cols:
            return cols[c.lower()]
    return ""

def _load_any_file(fp: Path) -> pd.DataFrame:
    """Loads either .csv or .parquet"""
    if fp.suffix.lower() in [".parquet"]:
        return pd.read_parquet(fp)
    if fp.suffix.lower() in [".csv"]:
        return pd.read_csv(fp, low_memory=False)
    return pd.DataFrame()

def load_symbol_5m(symbol_dir: Path) -> pd.DataFrame:
    """
    Load every raw 5-minute file of one symbol into a single OHLC frame.

    All ``*.parquet`` and ``*.csv`` files found recursively under
    *symbol_dir* are read (parquet files first, then csv files, each group
    sorted by path) and concatenated. Column names are detected
    case-insensitively from the variants date/trade_date/datetime/timestamp
    and open/high/low/close (or ``*_close`` / single-letter variants).

    Files that are empty or lack any required column are skipped. Rows whose
    timestamp cannot be parsed, or whose OHLC values are not numeric, are
    dropped.

    Returns a frame with columns ``_dt, open, high, low, close`` and a fresh
    0..n-1 index, or an empty DataFrame when nothing usable was found. When
    SORT_AND_DEDUP is set, rows are sorted by ``_dt`` and only one row per
    timestamp is kept (the one from the file loaded last).
    """
    # is_dir() is already False for a path that does not exist.
    if not symbol_dir.is_dir():
        return pd.DataFrame()

    files = []
    for ext in ("*.parquet", "*.csv"):
        files.extend(sorted(symbol_dir.rglob(ext)))

    if not files:
        return pd.DataFrame()

    price_cols = ["open", "high", "low", "close"]
    parts = []
    for fp in files:
        df = _load_any_file(fp)
        if df is None or df.empty:
            continue

        # detect columns
        dt_col = _pick_first_existing(df, ["date", "trade_date", "datetime", "timestamp"])
        o_col  = _pick_first_existing(df, ["open", "open_close", "o"])
        h_col  = _pick_first_existing(df, ["high", "high_close", "h"])
        l_col  = _pick_first_existing(df, ["low", "low_close", "l"])
        c_col  = _pick_first_existing(df, ["close", "close_close", "c"])

        # _pick_first_existing returns "" for a missing column.
        if not all([dt_col, o_col, h_col, l_col, c_col]):
            continue

        x = df[[dt_col, o_col, h_col, l_col, c_col]].copy()
        x.columns = ["_dt"] + price_cols

        # Unparseable values become NaT / NaN and are dropped.
        x["_dt"] = pd.to_datetime(x["_dt"], errors="coerce")
        x = x.dropna(subset=["_dt"])
        for col in price_cols:
            x[col] = pd.to_numeric(x[col], errors="coerce")
        x = x.dropna(subset=price_cols)

        if not x.empty:
            parts.append(x)

    if not parts:
        return pd.DataFrame()

    out = pd.concat(parts, ignore_index=True)

    if SORT_AND_DEDUP:
        # The sort must be stable: with the default (unstable) quicksort the
        # relative order of rows sharing a timestamp is arbitrary, so
        # keep="last" would not reliably keep the row from the last file.
        out = out.sort_values("_dt", kind="mergesort")
        out = out.drop_duplicates(subset=["_dt"], keep="last")

    return out.reset_index(drop=True)

def filter_market_hours(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows whose ``_dt`` time of day is inside the market window.

    The window is MARKET_START..MARKET_END with both endpoints included. An
    empty input is returned unchanged; otherwise the result is sorted by
    ``_dt`` and has ``_dt`` back as a regular column.
    """
    if df.empty:
        return df

    # between_time works on the index, so move the timestamps there first.
    by_time = df.set_index("_dt")
    by_time = by_time.sort_index()
    in_session = by_time.between_time(MARKET_START, MARKET_END, inclusive="both")
    return in_session.reset_index()

def resample_ohlc(df: pd.DataFrame, rule: str) -> pd.DataFrame:
    """Aggregate OHLC rows into bars of frequency *rule*.

    Each bar takes the first open, the highest high, the lowest low and the
    last close of the rows that fall into it. Bars with any missing value
    (i.e. bins that received no rows) are dropped. An empty input is returned
    unchanged; otherwise the result has columns ``_dt, open, high, low,
    close``.
    """
    if df.empty:
        return df

    how = {"open": "first", "high": "max", "low": "min", "close": "last"}
    prices = df.set_index("_dt").sort_index()[list(how)]
    bars = prices.resample(rule).agg(how)
    return bars.dropna().reset_index()

def get_out_path(symbol: str) -> Path:
    """Return the parquet path for *symbol*, creating its folder if needed.

    The file lives at ``OUT_15M_ROOT/symbol=<TICKER>/rs_15T.parquet`` where
    ``<TICKER>`` is the normalized ticker.
    """
    partition = OUT_15M_ROOT / ("symbol=" + normalize_ticker(symbol))
    partition.mkdir(parents=True, exist_ok=True)
    return partition / "rs_15T.parquet"

def load_existing_last_dt(out_fp: Path):
    """Return the latest ``_dt`` stored in *out_fp*, or ``None``.

    ``None`` is returned when the file does not exist, holds no rows, or
    cannot be read/parsed for any reason.
    """
    if out_fp.exists():
        try:
            # Only the timestamp column is needed to find the last bar.
            stamps = pd.read_parquet(out_fp, columns=["_dt"])
            if not stamps.empty:
                return pd.to_datetime(stamps["_dt"]).max()
        except Exception:
            # Best effort: an unreadable file is treated like a missing one.
            pass
    return None

def build_symbol(symbol: str):
    """
    Build (or incrementally update) the 15-minute parquet file of one symbol.

    Steps: load the raw 5-minute rows from ``RAW_5M_ROOT/<TICKER>``, optionally
    cut them to the not-yet-saved tail (INCREMENTAL), optionally filter to
    market hours (FILTER_MARKET_HOURS), resample with RESAMPLE_RULE, merge
    with the existing output file and write the result.

    Returns a tuple ``(ticker, status, n_bars)`` where *status* is one of:
        NO_INPUT_DATA              no usable raw rows were found
        UP_TO_DATE                 no raw rows newer than the last saved bar
        EMPTY_AFTER_MARKET_FILTER  all new rows fall outside market hours
        EMPTY_AFTER_RESAMPLE       resampling produced no bars
        MERGE_FAILED               incremental run could not read/merge the
                                   existing file; the file is left untouched
        BUILT                      file written
    and *n_bars* is the number of 15-minute bars produced by this run (in
    incremental mode this includes the refreshed last saved bar); it is 0 for
    every status other than BUILT.
    """
    sym = normalize_ticker(symbol)
    in_dir = RAW_5M_ROOT / sym
    out_fp = get_out_path(sym)

    df5 = load_symbol_5m(in_dir)
    if df5.empty:
        return sym, "NO_INPUT_DATA", 0

    # incremental cut
    last_dt = load_existing_last_dt(out_fp) if INCREMENTAL else None
    if last_dt is not None:
        if not (df5["_dt"] > last_dt).any():
            return sym, "UP_TO_DATE", 0
        # Keep the rows FROM the start of the last saved bar (>=, not >).
        # last_dt is the label (start) of that bar; cutting strictly after it
        # would drop the bar's first 5-minute row, and resampling the rest
        # would rebuild the bar with a wrong open/high/low and overwrite the
        # correct one.
        df5 = df5[df5["_dt"] >= last_dt].copy()

    if FILTER_MARKET_HOURS:
        df5 = filter_market_hours(df5)
        # In incremental mode the retained last-bar rows alone do not count
        # as new data.
        no_new_rows = last_dt is not None and not (df5["_dt"] > last_dt).any()
        if df5.empty or no_new_rows:
            return sym, "EMPTY_AFTER_MARKET_FILTER", 0

    df15 = resample_ohlc(df5, RESAMPLE_RULE)
    if df15.empty:
        return sym, "EMPTY_AFTER_RESAMPLE", 0

    # append/merge with existing
    merged = df15
    if out_fp.exists():
        try:
            old = pd.read_parquet(out_fp)
            merged = pd.concat([old, df15], ignore_index=True)
            # Stable sort so that, for a repeated _dt, the freshly built bar
            # (concatenated after the old ones) is the one keep="last" keeps.
            merged = (
                merged.sort_values("_dt", kind="mergesort")
                .drop_duplicates(subset=["_dt"], keep="last")
                .reset_index(drop=True)
            )
        except Exception:
            if last_dt is not None:
                # df15 only covers the tail after last_dt; writing it alone
                # would wipe the saved history.
                return sym, "MERGE_FAILED", 0
            # Full rebuild: df15 already holds every bar, so it can safely
            # replace an unreadable file.
            merged = df15

    merged.to_parquet(out_fp, index=False)
    return sym, "BUILT", len(df15)

def list_symbols(root: Path):
    """Return the sorted, de-duplicated tickers of the sub-folders of *root*.

    Only directories are considered (folders like AAOI, MSFT ...). Folder
    names are passed through ``normalize_ticker``, so the returned value can
    differ from the folder name on disk (e.g. a folder ``BRK.B`` is reported
    as ``BRKB``).
    """
    return sorted({normalize_ticker(entry.name) for entry in root.iterdir() if entry.is_dir()})

def main():
    """Build the 15-minute file of every symbol folder and save a status log.

    Progress is printed every 25 symbols. At the end the per-status counts
    are printed and one row per symbol (Symbol, Status, New15MBarsWritten) is
    written to ``build_15T_log.csv`` under OUT_15M_ROOT.
    """
    symbols = list_symbols(RAW_5M_ROOT)
    print(f"Found {len(symbols)} symbol folders under: {RAW_5M_ROOT}")

    results = []
    for i, ticker in enumerate(symbols, start=1):
        name, status, n_bars = build_symbol(ticker)
        results.append((name, status, n_bars))
        if not i % 25:
            print(f"Processed {i}/{len(symbols)} symbols...")

    log_columns = ["Symbol", "Status", "New15MBarsWritten"]
    res = pd.DataFrame(results, columns=log_columns)

    print("\nStatus counts:")
    status_counts = res["Status"].value_counts(dropna=False)
    print(status_counts.to_string())

    out_log = OUT_15M_ROOT / "build_15T_log.csv"
    res.to_csv(out_log, index=False)
    print(f"\nSaved log: {out_log}")

if __name__ == "__main__":
    main()

Found 1920 symbol folders under: D:\work\Trade Analysis\Polygon_5min_2016_to_today
Processed 25/1920 symbols...
Processed 50/1920 symbols...
Processed 75/1920 symbols...
Processed 100/1920 symbols...
Processed 125/1920 symbols...
Processed 150/1920 symbols...
Processed 175/1920 symbols...
Processed 200/1920 symbols...
Processed 225/1920 symbols...
Processed 250/1920 symbols...
Processed 275/1920 symbols...
Processed 300/1920 symbols...
