# Price Features and Target

Computes:
1. **Target**: log return `log(close_{t+1} / close_t)`
2. **Cross-sectional targets**: demeaned return, rank
3. **Price features**: same-day overnight gap and intraday return, plus short-term returns, volatility, and distance from 5-day highs/lows, each z-scored within the day

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Load the raw price table from disk.
prices = pd.read_parquet("data/prices.pqt")

# Quick sanity report: table size, number of distinct symbols, and schema.
n_rows, n_symbols = len(prices), prices["symbol"].nunique()
column_names = list(prices.columns)
print(f"Prices: {n_rows:,} rows, {n_symbols:,} symbols")
print(f"Columns: {column_names}")

In [None]:
# Convert "date" to plain calendar dates, then order rows by symbol and date
# so that per-symbol shifts operate on chronologically adjacent rows.
prices = (
    prices.assign(date=pd.to_datetime(prices["date"]).dt.date)
    .sort_values(by=["symbol", "date"])
    .reset_index(drop=True)
)

first_day, last_day = prices["date"].min(), prices["date"].max()
print(f"Date range: {first_day} to {last_day}")

In [None]:
# Target (README section 10): log(close_{t+1} / close_t).
# For every row, pull the next row's close and date within the same symbol;
# the final row of each symbol has no successor, so these stay missing there.
for shifted_name, source_name in (("next_close", "close"), ("target_date", "date")):
    prices[shifted_name] = prices.groupby("symbol")[source_name].shift(-1)

close_ratio = prices["next_close"].div(prices["close"])
prices["target_return"] = np.log(close_ratio)

# Preview the alignment between each row and its target
preview_cols = ["symbol", "date", "close", "next_close", "target_return", "target_date"]
prices.loc[:, preview_cols].head(10)

In [None]:
# Keep only rows whose target exists (e.g. the last day of each symbol has
# no next close, so its target_return is NaN).
has_target = prices["target_return"].notna()
prices = prices.loc[has_target].copy()
print(f"Rows with target: {len(prices):,}")

In [None]:
# Alignment check: every row's target_date must be strictly after its date.
feature_day = pd.to_datetime(prices["date"])
target_day = pd.to_datetime(prices["target_date"])

invalid = prices[target_day <= feature_day]
print(f"Invalid rows (target_date <= date): {len(invalid)} (should be 0)")

# Calendar-day distance between the feature date and the target date
gap = (target_day - feature_day).dt.days
print("\nGap distribution (days):")
print(gap.value_counts().sort_index())

In [None]:
# Summary statistics of the raw (log-return) target
target_summary = prices["target_return"].describe()
print("Target return (log) stats:")
print(target_summary)

## Cross-sectional targets

Per README section 10:
- **Demeaned return**: `r[i,t] - mean_j r[j,t]` (for regression)
- **Rank**: percentile rank within day (for ranking loss)

In [None]:
# Cross-sectional demeaned return (for regression objective):
# each row's target minus the mean target over all rows sharing its date.
# The built-in "mean" transform is vectorised, unlike a per-date Python lambda.
same_day_mean = prices.groupby("date")["target_return"].transform("mean")
prices["target_demean"] = prices["target_return"] - same_day_mean

# Cross-sectional rank (for ranking objective).
# rank(pct=True) returns rank / group size, so values lie in (0, 1]
# (the smallest return on a date gets 1/n, not 0); ties share the average rank.
prices["target_rank"] = prices.groupby("date")["target_return"].rank(pct=True)

print("Demeaned return stats:")
print(prices["target_demean"].describe())
print("\nRank stats:")
print(prices["target_rank"].describe())

## Same-day price features

Using close(t) as proxy for price at 15:30 ET (when model runs):
- `overnight_gap`: reaction to overnight news
- `intraday_ret`: same-day continuation/reversal

This helps the model know if news has already been priced in.

In [None]:
# Previous row's close within each symbol: the reference price for the
# overnight move.
prices["close_lag1"] = prices.groupby("symbol")["close"].shift(1)

# overnight_gap = open(t) / close(t-1) - 1
# intraday_ret  = close(t) / open(t) - 1   (close(t) stands in for the 15:30 price)
todays_open = prices["open"]
prices["overnight_gap"] = todays_open.div(prices["close_lag1"]).sub(1)
prices["intraday_ret"] = prices["close"].div(todays_open).sub(1)

same_day_cols = ["overnight_gap", "intraday_ret"]
print("Same-day feature stats:")
print(prices[same_day_cols].describe())

## Historical price features

Short-term returns, volatility, and distance from the 5-day high/low, all computed from data up to close(t-1).

In [None]:
def _prior_window(values, min_periods):
    """Rolling 5-row window over ``values`` shifted down one row (current row excluded)."""
    return values.shift(1).rolling(5, min_periods=min_periods)


# Trailing returns measured from the lagged close:
# ret_{k}d = close(t-1) / close(t-1-k) - 1, e.g. ret_1d = close(t-1) / close(t-2) - 1
for horizon in (1, 2, 3, 5):
    earlier_close = prices.groupby("symbol")["close_lag1"].shift(horizon)
    prices[f"ret_{horizon}d"] = prices["close_lag1"] / earlier_close - 1

# Short-term volatility: std of the previous 5 daily returns (at least 3 required)
prices["daily_ret"] = prices.groupby("symbol")["close"].pct_change()
prices["vol_5d"] = prices.groupby("symbol")["daily_ret"].transform(
    lambda returns: _prior_window(returns, 3).std()
)

# Highest high / lowest low over the previous 5 rows of each symbol
prices["high_5d"] = prices.groupby("symbol")["high"].transform(
    lambda highs: _prior_window(highs, 1).max()
)
prices["low_5d"] = prices.groupby("symbol")["low"].transform(
    lambda lows: _prior_window(lows, 1).min()
)

# Relative distance of the lagged close from those extremes
lagged_close = prices["close_lag1"]
prices["dist_from_high_5d"] = (lagged_close - prices["high_5d"]) / prices["high_5d"]
prices["dist_from_low_5d"] = (lagged_close - prices["low_5d"]) / prices["low_5d"]

# The helper columns are not features themselves; remove them
helper_cols = ["close_lag1", "daily_ret", "high_5d", "low_5d"]
prices = prices.drop(columns=helper_cols)

print("Historical price features added")

In [None]:
# Full list of raw price features, grouped by how they were built
same_day_features = ["overnight_gap", "intraday_ret"]
historical_features = [
    "ret_1d",
    "ret_2d",
    "ret_3d",
    "ret_5d",
    "vol_5d",
    "dist_from_high_5d",
    "dist_from_low_5d",
]
price_feature_cols = same_day_features + historical_features

feature_summary = prices[price_feature_cols].describe()
print("Price feature stats:")
print(feature_summary)

## Cross-sectional normalization of price features

Per README section 8: z-score each feature within each day, then clip the z-scores to ±3.

In [None]:
def cross_sectional_zscore(
    df: pd.DataFrame, col: str, clip: float = 3.0
) -> pd.Series:
    """Z-score ``col`` within each ``date`` group and clip the result.

    The per-date mean and sample standard deviation (pandas default, ddof=1)
    are computed over the finite values of ``col``; the resulting z-scores
    are then clipped to ``[-clip, clip]``.

    Edge cases:
        * ``+/-inf`` inputs are treated as missing, so a single bad value
          cannot turn the whole date's mean/std (and every z-score on that
          date) into NaN.
        * On a date where all valid values are identical (std == 0), valid
          rows get 0.0 rather than the NaN produced by 0 / 0.
        * Missing inputs, and dates with fewer than two valid values
          (std undefined), yield NaN.

    Args:
        df: Frame containing a ``"date"`` column and the column ``col``.
        col: Name of the numeric column to normalise.
        clip: Symmetric bound applied to the z-scores (default 3.0).

    Returns:
        A Series of clipped z-scores aligned with ``df.index``.
    """
    # Neutralise infinities before aggregating; mean/std skip NaN but not inf.
    values = df[col].replace([np.inf, -np.inf], np.nan)
    grouped = values.groupby(df["date"])
    mean = grouped.transform("mean")
    std = grouped.transform("std")

    # Divide only by strictly positive std; zero/undefined std becomes NaN here.
    z = (values - mean) / std.where(std > 0)
    # A constant cross-section carries no dispersion: every valid value sits
    # exactly at the mean, i.e. a z-score of 0.
    z = z.mask(std.eq(0) & values.notna(), 0.0)

    return z.clip(-clip, clip)

# Add a "<feature>_z" column for every raw price feature and remember the
# new column names in the same order as price_feature_cols.
normalized_cols = []
for feature in price_feature_cols:
    z_name = f"{feature}_z"
    prices[z_name] = cross_sectional_zscore(prices, feature)
    normalized_cols.append(z_name)

normalized_summary = prices[normalized_cols].describe()
print("Normalized feature stats:")
print(normalized_summary)

In [None]:
# Assemble the output table: identifiers, targets, raw prices, and the
# normalized features. "date" is exposed as "feature_date" so it is
# unambiguous next to "target_date" when merging later.

id_cols = ["symbol", "date", "target_date"]
target_cols = ["target_return", "target_demean", "target_rank"]
raw_price_cols = ["open", "high", "low", "close", "volume"]
feature_cols = normalized_cols  # Use normalized features

selected_cols = id_cols + target_cols + raw_price_cols + feature_cols
out = prices[selected_cols].rename(columns={"date": "feature_date"})

print(f"Output columns: {list(out.columns)}")
print(f"Rows: {len(out):,}")
out.head()

In [None]:
# Keep only fully populated rows. Early dates lack the history needed for
# the lagged features, so those rows contain NaN; note that a NaN in any
# output column removes the row.
n_before = len(out)
is_complete = out.notna().all(axis=1)
out = out[is_complete]
n_after = len(out)
n_dropped = n_before - n_after
print(f"Dropped {n_dropped:,} rows with missing features")
print(f"Final rows: {n_after:,}")

In [None]:
# Write the feature table to disk (without the index) and report its size.
out.to_parquet("data/price_features.pqt", index=False)
print("Saved to data/price_features.pqt")

saved_bytes = Path("data/price_features.pqt").stat().st_size
print(f"File size: {saved_bytes / 1e6:.1f} MB")

In [None]:
# Final coverage report for the saved table
feature_dates = out["feature_date"]
print(f"Date range: {feature_dates.min()} to {feature_dates.max()}")
print(f"Symbols: {out['symbol'].nunique():,}")
print(f"Days: {feature_dates.nunique():,}")