# Price Features and Target

Computes:
1. **Target**: log return `log(close_{t+1} / close_t)`
2. **Cross-sectional targets**: demeaned return, rank
3. **Recent price features**: short-term returns, volatility, distance from highs/lows

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
prices = pd.read_parquet("data/prices.pqt")

# Filter to universe (symbols with fundamentals)
universe = pd.read_parquet("data/universe.pqt")
universe_symbols = set(universe["symbol"].unique())
n_before_symbols = prices["symbol"].nunique()
prices = prices[prices["symbol"].isin(universe_symbols)]
n_after_symbols = prices["symbol"].nunique()

print(f"Filtered to universe: {n_before_symbols:,} -> {n_after_symbols:,} symbols")
print(f"Prices: {len(prices):,} rows")
print(f"Columns: {list(prices.columns)}")

Filtered to universe: 5,644 -> 5,560 symbols
Prices: 5,851,737 rows
Columns: ['symbol', 'date', 'open', 'high', 'low', 'close', 'volume', 'change', 'changePercent', 'vwap']


In [3]:
# Parse dates and sort
prices["date"] = pd.to_datetime(prices["date"]).dt.date
prices = prices.sort_values(["symbol", "date"]).reset_index(drop=True)

print(f"Date range: {prices['date'].min()} to {prices['date'].max()}")

Date range: 2020-12-21 to 2025-12-19


In [4]:
# Compute log return for target (as per README section 10)
prices["next_close"] = prices.groupby("symbol")["close"].shift(-1)
prices["target_date"] = prices.groupby("symbol")["date"].shift(-1)
prices["target_return"] = np.log(prices["next_close"] / prices["close"])

prices[["symbol", "date", "close", "next_close", "target_return", "target_date"]].head(10)

Unnamed: 0,symbol,date,close,next_close,target_return,target_date
0,A,2020-12-21,117.78,117.37,-0.003487,2020-12-22
1,A,2020-12-22,117.37,117.3,-0.000597,2020-12-23
2,A,2020-12-23,117.3,117.31,8.5e-05,2020-12-24
3,A,2020-12-24,117.31,117.83,0.004423,2020-12-28
4,A,2020-12-28,117.83,117.23,-0.005105,2020-12-29
5,A,2020-12-29,117.23,117.39,0.001364,2020-12-30
6,A,2020-12-30,117.39,118.49,0.009327,2020-12-31
7,A,2020-12-31,118.49,118.64,0.001265,2021-01-04
8,A,2021-01-04,118.64,119.61,0.008143,2021-01-05
9,A,2021-01-05,119.61,122.89,0.027053,2021-01-06


In [5]:
# Drop rows without target (last day per symbol)
prices = prices.dropna(subset=["target_return"]).copy()
print(f"Rows with target: {len(prices):,}")

Rows with target: 5,846,177


In [6]:
# Verify alignment: target_date > date
invalid = prices[pd.to_datetime(prices["target_date"]) <= pd.to_datetime(prices["date"])]
print(f"Invalid rows (target_date <= date): {len(invalid)} (should be 0)")

# Gap distribution
gap = (pd.to_datetime(prices["target_date"]) - pd.to_datetime(prices["date"])).dt.days
print(f"\nGap distribution (days):")
print(gap.value_counts().sort_index())

Invalid rows (target_date <= date): 0 (should be 0)

Gap distribution (days):
1       4574416
2         59484
3       1042296
4        168941
5           272
         ...   
354           1
400           1
451           1
549           1
1600          1
Name: count, Length: 90, dtype: int64


In [7]:
# Target stats
print("Target return (log) stats:")
print(prices["target_return"].describe())

Target return (log) stats:
count    5.846177e+06
mean    -6.689940e-04
std      6.952715e-02
min     -1.354025e+01
25%     -1.438874e-02
50%      0.000000e+00
75%      1.278077e-02
max      1.624830e+01
Name: target_return, dtype: float64


## Cross-sectional targets

Per README section 10:
- **Demeaned return**: `r[i,t] - mean_j r[j,t]` (for regression)
- **Rank**: percentile rank within day (for ranking loss)

In [8]:
# Cross-sectional demeaned return (for regression objective)
prices["target_demean"] = prices.groupby("date")["target_return"].transform(
    lambda x: x - x.mean()
)

# Cross-sectional rank (for ranking objective), scaled to [0, 1]
prices["target_rank"] = prices.groupby("date")["target_return"].rank(pct=True)

print("Demeaned return stats:")
print(prices["target_demean"].describe())
print("\nRank stats:")
print(prices["target_rank"].describe())

Demeaned return stats:
count    5.846177e+06
mean    -1.693960e-20
std      6.832344e-02
min     -1.355233e+01
25%     -1.312194e-02
50%      9.053858e-05
75%      1.297793e-02
max      1.627505e+01
Name: target_demean, dtype: float64

Rank stats:
count    5.846177e+06
mean     5.001078e-01
std      2.886625e-01
min      1.802451e-04
25%      2.501054e-01
50%      5.002199e-01
75%      7.500527e-01
max      1.000000e+00
Name: target_rank, dtype: float64


## Same-day price features

Using close(t) as proxy for price at 15:30 ET (when model runs):
- `overnight_gap`: reaction to overnight news
- `intraday_ret`: same-day continuation/reversal

This helps the model know if news has already been priced in.

In [9]:
# Same-day features (using close(t) as proxy for 15:30 price)
prices["close_lag1"] = prices.groupby("symbol")["close"].shift(1)

# Overnight gap: open(t) vs close(t-1)
prices["overnight_gap"] = prices["open"] / prices["close_lag1"] - 1

# Intraday return: close(t) vs open(t)
prices["intraday_ret"] = prices["close"] / prices["open"] - 1

print("Same-day feature stats:")
print(prices[["overnight_gap", "intraday_ret"]].describe())

Same-day feature stats:
       overnight_gap  intraday_ret
count   5.840617e+06  5.846177e+06
mean    3.992294e+00           inf
std     4.967638e+03           NaN
min    -1.000000e+00 -9.999998e-01
25%    -5.602241e-03 -1.344955e-02
50%     0.000000e+00  0.000000e+00
75%     6.410256e-03  1.138647e-02
max     1.139062e+07           inf


  sqr = _ensure_numeric((avg - values) ** 2)


## Historical price features

Short-term returns and volatility using data up to close(t-1).

In [10]:
# Short-term returns (using lagged data)
# ret_1d: close(t-1) / close(t-2) - 1
for lag in [1, 2, 3, 5]:
    prices[f"ret_{lag}d"] = prices.groupby("symbol")["close_lag1"].transform(
        lambda x: x / x.shift(lag) - 1
    )

# Short-term volatility: std of daily returns over past 5 days
prices["daily_ret"] = prices.groupby("symbol")["close"].pct_change()
prices["vol_5d"] = prices.groupby("symbol")["daily_ret"].transform(
    lambda x: x.shift(1).rolling(5, min_periods=3).std()
)

# Distance from 5-day high/low (using lagged data)
prices["high_5d"] = prices.groupby("symbol")["high"].transform(
    lambda x: x.shift(1).rolling(5, min_periods=1).max()
)
prices["low_5d"] = prices.groupby("symbol")["low"].transform(
    lambda x: x.shift(1).rolling(5, min_periods=1).min()
)
prices["dist_from_high_5d"] = (prices["close_lag1"] - prices["high_5d"]) / prices["high_5d"]
prices["dist_from_low_5d"] = (prices["close_lag1"] - prices["low_5d"]) / prices["low_5d"]

# Clean up intermediate columns
prices = prices.drop(columns=["close_lag1", "daily_ret", "high_5d", "low_5d"])

print("Historical price features added")

Historical price features added


In [11]:
# Check feature stats
price_feature_cols = [
    "overnight_gap", "intraday_ret",  # same-day
    "ret_1d", "ret_2d", "ret_3d", "ret_5d", "vol_5d",  # historical
    "dist_from_high_5d", "dist_from_low_5d"
]
print("Price feature stats:")
print(prices[price_feature_cols].describe())

Price feature stats:


  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


       overnight_gap  intraday_ret        ret_1d        ret_2d        ret_3d  \
count   5.840617e+06  5.846177e+06  5.835057e+06  5.829499e+06  5.823941e+06   
mean    3.992294e+00           inf  4.179436e+00  4.345554e+00  6.205359e+00   
std     4.967638e+03           NaN  5.005467e+03  5.015137e+03  6.956365e+03   
min    -1.000000e+00 -9.999998e-01 -9.999987e-01 -9.999987e-01 -9.999985e-01   
25%    -5.602241e-03 -1.344955e-02 -1.428571e-02 -2.066929e-02 -2.564673e-02   
50%     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
75%     6.410256e-03  1.138647e-02  1.286125e-02  1.843347e-02  2.273626e-02   
max     1.139062e+07           inf  1.139062e+07  1.139062e+07  1.168269e+07   

             ret_5d        vol_5d  dist_from_high_5d  dist_from_low_5d  
count  5.812837e+06  5.823941e+06       5.840617e+06      5.840617e+06  
mean   2.452639e+00  6.634032e+00                inf               inf  
std    1.730051e+03  4.436850e+03                NaN        

## Cross-sectional normalization of price features

Per README section 8: z-score within each day

In [12]:
def cross_sectional_zscore(df: pd.DataFrame, col: str) -> pd.Series:
    """Z-score within each date, with winsorization at 3 std."""
    grouped = df.groupby("date")[col]
    mean = grouped.transform("mean")
    std = grouped.transform("std")
    z = (df[col] - mean) / std
    # Winsorize at +/- 3
    return z.clip(-3, 3)

# Normalize price features
for col in price_feature_cols:
    prices[f"{col}_z"] = cross_sectional_zscore(prices, col)

normalized_cols = [f"{col}_z" for col in price_feature_cols]
print("Normalized feature stats:")
print(prices[normalized_cols].describe())

Normalized feature stats:
       overnight_gap_z  intraday_ret_z      ret_1d_z      ret_2d_z  \
count     5.840611e+06    5.762911e+06  5.835051e+06  5.829493e+06   
mean     -1.064199e-02   -9.362218e-03 -1.187876e-02 -1.379948e-02   
std       5.381729e-01    7.751883e-01  6.436465e-01  6.449470e-01   
min      -3.000000e+00   -3.000000e+00 -3.000000e+00 -3.000000e+00   
25%      -1.456647e-01   -3.131814e-01 -2.186163e-01 -2.233674e-01   
50%      -1.563429e-02   -8.535514e-03 -1.487982e-02 -1.492273e-02   
75%       1.081883e-01    2.940735e-01  1.853740e-01  1.832536e-01   
max       3.000000e+00    3.000000e+00  3.000000e+00  3.000000e+00   

           ret_3d_z      ret_5d_z      vol_5d_z  dist_from_high_5d_z  \
count  5.823935e+06  5.812831e+06  5.823935e+06         5.814416e+06   
mean  -1.508260e-02 -1.689781e-02 -2.405513e-02         3.224844e-02   
std    6.410034e-01  6.342465e-01  4.470659e-01         8.226461e-01   
min   -3.000000e+00 -3.000000e+00 -1.269033e+00        

In [13]:
# Select columns to save
# Rename date -> feature_date for clarity when merging later

id_cols = ["symbol", "date", "target_date"]
target_cols = ["target_return", "target_demean", "target_rank"]
raw_price_cols = ["open", "high", "low", "close", "volume"]
feature_cols = normalized_cols  # Use normalized features

out = prices[id_cols + target_cols + raw_price_cols + feature_cols].copy()
out = out.rename(columns={"date": "feature_date"})

print(f"Output columns: {list(out.columns)}")
print(f"Rows: {len(out):,}")
out.head()

Output columns: ['symbol', 'feature_date', 'target_date', 'target_return', 'target_demean', 'target_rank', 'open', 'high', 'low', 'close', 'volume', 'overnight_gap_z', 'intraday_ret_z', 'ret_1d_z', 'ret_2d_z', 'ret_3d_z', 'ret_5d_z', 'vol_5d_z', 'dist_from_high_5d_z', 'dist_from_low_5d_z']
Rows: 5,846,177


Unnamed: 0,symbol,feature_date,target_date,target_return,target_demean,target_rank,open,high,low,close,volume,overnight_gap_z,intraday_ret_z,ret_1d_z,ret_2d_z,ret_3d_z,ret_5d_z,vol_5d_z,dist_from_high_5d_z,dist_from_low_5d_z
0,A,2020-12-21,2020-12-22,-0.003487,-0.008086,0.429827,117.7,118.45,115.96,117.78,2082000,,-0.239989,,,,,,,
1,A,2020-12-22,2020-12-23,-0.000597,-0.013092,0.316701,118.0,118.43,116.85,117.37,1670440,-0.09332,-0.143735,,,,,,0.476999,-0.420938
2,A,2020-12-23,2020-12-24,8.5e-05,0.002869,0.515245,118.0,118.89,117.22,117.3,1202100,-0.016099,-0.282491,-0.159268,,,,,0.538753,-0.392031
3,A,2020-12-24,2020-12-28,0.004423,0.001621,0.609302,117.04,118.37,116.84,117.31,733600,-0.128909,0.121416,-0.01628,-0.016431,,,,0.423266,-0.016796
4,A,2020-12-28,2020-12-29,-0.005105,0.005174,0.568587,118.51,118.78,116.67,117.83,1003100,-0.016067,0.022077,-0.003748,-0.016129,-0.016169,,-0.016782,0.463628,-0.016266


In [14]:
# Drop rows with missing features (early dates lacking history)
n_before = len(out)
out = out.dropna()
n_after = len(out)
print(f"Dropped {n_before - n_after:,} rows with missing features")
print(f"Final rows: {n_after:,}")

Dropped 315,637 rows with missing features
Final rows: 5,530,540


In [15]:
out.to_parquet("data/price_features.pqt", index=False)
print(f"Saved to data/price_features.pqt")
print(f"File size: {Path('data/price_features.pqt').stat().st_size / 1e6:.1f} MB")

Saved to data/price_features.pqt
File size: 635.8 MB


In [16]:
# Summary
print(f"Date range: {out['feature_date'].min()} to {out['feature_date'].max()}")
print(f"Symbols: {out['symbol'].nunique():,}")
print(f"Days: {out['feature_date'].nunique():,}")

Date range: 2020-12-30 to 2025-12-18
Symbols: 5,547
Days: 1,195
