# 0.2c Infer BMO/AMC Timing from Price Data

**Problem:** Historical earnings data from Nasdaq has ~0% BMO/AMC timing coverage because Nasdaq only provides timing for *upcoming* earnings, not historical.

**Solution:** Infer timing by comparing overnight gap magnitudes:
- **BMO (Before Market Open):** Large gap on earnings day T
- **AMC (After Market Close):** Large gap on day T+1

**Why this matters:** Without timing, ~50% of training data has misaligned price moves - we're computing moves on the wrong days for AMC earnings.

In [None]:
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta, date
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Directory holding the earnings parquet files this notebook reads and writes
DATA_DIR = Path('../data/earnings')

# Nasdaq headers
# Browser-like headers sent with every request to api.nasdaq.com below.
# NOTE(review): presumably the API rejects requests that lack a browser
# User-Agent/Origin - confirm before trimming any of these.
NASDAQ_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'application/json',
    'Origin': 'https://www.nasdaq.com',
    'Referer': 'https://www.nasdaq.com/',
}

In [None]:
# Parameters (can be overridden by papermill)
# NOTE(review): USE_CACHE is not read by any cell visible in this notebook;
# confirm whether it is still needed or was meant to gate the network fetches.
USE_CACHE = True

## 1. Load Historical Data

In [None]:
# Read the historical earnings moves; prices are present, BMO/AMC timing is not
moves_df = pd.read_parquet(DATA_DIR / 'historical_earnings_moves.parquet')

# Phase 3.1: collapse duplicate (symbol, earnings_date) pairs, keeping the
# first occurrence of each (21 duplicate pairs were found in review)
original_len = len(moves_df)
moves_df = moves_df.drop_duplicates(subset=['symbol', 'earnings_date'], keep='first')
n_duplicates = original_len - len(moves_df)
if n_duplicates > 0:
    print(f"Removed {n_duplicates} duplicate (symbol, earnings_date) pairs")

# Phase 3.2: drop known-bad records
#   * ADTX: prices above $394M caused by reverse-split data issues
#   * overnight moves above 100%: treated as data errors
#   * earnings dated on a Saturday/Sunday: treated as data errors
moves_df['earnings_date'] = pd.to_datetime(moves_df['earnings_date'])

# ADTX
is_adtx = moves_df['symbol'] == 'ADTX'
n_adtx = is_adtx.sum()
if n_adtx > 0:
    moves_df = moves_df[~is_adtx]
    print(f"Removed {n_adtx} ADTX records (bad price data from reverse split)")

# Overnight moves above 100% (only when the column is present in the input)
if 'overnight_move_abs' in moves_df.columns:
    over_100_pct = moves_df['overnight_move_abs'] > 1.0
    n_over_100_pct = over_100_pct.sum()
    if n_over_100_pct > 0:
        print(f"Removed {n_over_100_pct} records with >100% moves")
        moves_df = moves_df[~over_100_pct]

# Weekend earnings dates (dayofweek 5 = Saturday, 6 = Sunday)
on_weekend = moves_df['earnings_date'].dt.dayofweek.isin([5, 6])
n_weekend = on_weekend.sum()
if n_weekend > 0:
    print(f"Removed {n_weekend} records with weekend earnings dates")
    moves_df = moves_df[~on_weekend]

event_dates = moves_df['earnings_date']
print(f"\nHistorical earnings records: {len(moves_df):,}")
print(f"Columns: {moves_df.columns.tolist()}")
print(f"\nDate range: {event_dates.min()} to {event_dates.max()}")

In [None]:
# Daily prices are needed to look up the T+1 gap (the AMC signature).
# The 'date' column is converted to datetimes in place, keeping its position.
prices_df = pd.read_parquet('../data/prices.pqt').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
n_price_rows = len(prices_df)
n_price_symbols = prices_df['symbol'].nunique()
print(f"Prices: {n_price_rows:,} rows, {n_price_symbols:,} symbols")

In [None]:
# Check what price columns we have
# (the lookups further down read the 'symbol', 'date', 'open' and 'close' columns)
print("Price columns:", prices_df.columns.tolist())
# Last expression in the cell, so the notebook renders the first rows
prices_df.head()

## 2. Compute Gap Ratios

For each earnings event, compute:
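- `gap_T`: Absolute overnight gap on earnings day, |Open_T - Close_T-1| / Close_T-1
- `gap_T1`: Absolute overnight gap on the day after, |Open_T+1 - Close_T| / Close_T
- `gap_ratio`: gap_T / (gap_T1 + eps), capped at 100. Values well above 1 suggest BMO and values well below 1 suggest AMC; the classifier in section 3 uses >2.0 for BMO and <0.5 for AMC.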

In [None]:
def compute_gap_t1(symbol: str, earnings_date, prices_cache: dict) -> float:
    """Return the absolute overnight gap into T+1: |Open_T+1 - Close_T| / Close_T.

    T is the first row of the symbol's price frame dated on or after
    ``earnings_date`` and T+1 is the first row dated after T (this assumes
    the frame's index is sorted ascending, which is how the notebook's price
    cache is built). A large value is the signature of an AMC announcement.

    Args:
        symbol: Key into ``prices_cache``.
        earnings_date: Anything ``pd.to_datetime`` accepts.
        prices_cache: Maps symbol -> DataFrame indexed by date with ``open``
            and ``close`` columns.

    Returns:
        The gap as a non-negative fraction, or ``np.nan`` when the symbol is
        not cached, T or T+1 cannot be found, the close on T is missing or
        not positive, or the lookup is ambiguous (e.g. duplicate dates).
    """
    if symbol not in prices_cache:
        return np.nan

    pdf = prices_cache[symbol]
    earn_date = pd.to_datetime(earnings_date)

    try:
        # Find T (earnings day or next trading day)
        t_candidates = pdf[pdf.index >= earn_date].head(1)
        if t_candidates.empty:
            return np.nan
        t = t_candidates.index[0]

        # Find T+1
        t1_candidates = pdf[pdf.index > t].head(1)
        if t1_candidates.empty:
            return np.nan
        t1 = t1_candidates.index[0]

        close_t = pdf.loc[t, 'close']
        open_t1 = pdf.loc[t1, 'open']

        # A zero close would produce inf, which turns into gap_ratio == 0 and
        # is then read as a confident AMC signal. Treat it as missing instead.
        # NaN also fails this test; a Series (duplicate dates) raises
        # ValueError here and is handled below.
        if not close_t > 0:
            return np.nan

        return abs(open_t1 - close_t) / close_t
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing columns, incomparable index types or ambiguous lookups:
        # this event simply has no usable T+1 gap.
        return np.nan

# One date-indexed price frame per symbol, so each event lookup only touches
# that symbol's rows. Sorting first keeps every frame in chronological order.
print("Building price cache...")
prices_df_sorted = prices_df.sort_values(['symbol', 'date'])
price_cache = {
    ticker: frame.set_index('date')
    for ticker, frame in prices_df_sorted.groupby('symbol')
}
print(f"Price cache: {len(price_cache)} symbols")

In [None]:
# Gap on T comes straight from columns already in moves_df
prev_close = moves_df['close_t_minus_1']
moves_df['gap_t'] = (moves_df['open_t'] - prev_close).abs() / prev_close

# Gap on T+1 needs a price lookup per event
print("Computing T+1 gaps (this may take a minute)...")
n_events = len(moves_df)
gap_t1_values = []
event_keys = zip(moves_df['symbol'], moves_df['earnings_date'])
for n_done, (ticker, event_date) in enumerate(event_keys, start=1):
    gap_t1_values.append(compute_gap_t1(ticker, event_date, price_cache))

    # Report progress every 10,000 events
    if n_done % 10000 == 0:
        print(f"  Progress: {n_done:,}/{n_events:,}")

moves_df['gap_t1'] = gap_t1_values
print(f"\nComputed {len(gap_t1_values):,} T+1 gaps")
print(f"Missing T+1 gaps: {moves_df['gap_t1'].isna().sum():,}")

In [None]:
# Gap ratio = gap_t / (gap_t1 + eps), capped at 100.
# Phase 3.4: eps is 0.001 rather than the earlier 1e-6, which let tiny gap_t1
# values push the ratio above 1,000,000. The cap keeps the remaining extremes
# (data issues or tiny movements) in a reasonable range for classification.
eps = 0.001
raw_ratio = moves_df['gap_t'] / (moves_df['gap_t1'] + eps)
moves_df['gap_ratio'] = raw_ratio.clip(upper=100)

# Distribution summary over the events that have a ratio
valid_ratios = moves_df['gap_ratio'].dropna()
looks_bmo = valid_ratios > 2.0
looks_amc = valid_ratios < 0.5
in_between = valid_ratios.between(0.5, 2.0)  # inclusive on both ends

print("Gap ratio distribution (gap_T / gap_T+1):")
print(f"  Mean: {valid_ratios.mean():.2f}")
print(f"  Median: {valid_ratios.median():.2f}")
for pct in (25, 75):
    print(f"  {pct}th percentile: {valid_ratios.quantile(pct / 100):.2f}")
print(f"\n  >2.0 (likely BMO): {looks_bmo.sum():,} ({looks_bmo.mean()*100:.1f}%)")
print(f"  <0.5 (likely AMC): {looks_amc.sum():,} ({looks_amc.mean()*100:.1f}%)")
print(f"  0.5-2.0 (ambiguous): {in_between.sum():,}")

In [None]:
# Visualize gap ratio distribution (skipped when matplotlib is not installed)
try:
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Log-scale histogram; ratios are clipped to [0.01, 100] so log10 is finite
    ax = axes[0]
    log_ratios = np.log10(valid_ratios.clip(0.01, 100))
    ax.hist(log_ratios, bins=100, edgecolor='black', alpha=0.7)
    ax.axvline(np.log10(2.0), color='green', linestyle='--', label='BMO threshold (ratio>2)')
    ax.axvline(np.log10(0.5), color='red', linestyle='--', label='AMC threshold (ratio<0.5)')
    ax.set_xlabel('log10(gap_ratio)')
    ax.set_ylabel('Count')
    ax.set_title('Gap Ratio Distribution (log scale)')
    ax.legend()

    # Scatter: gap_T vs gap_T1
    ax = axes[1]
    # Size the sample from the rows that survive dropna. Sizing it from
    # len(moves_df) raises ValueError when fewer complete rows remain than
    # the requested sample.
    plottable = moves_df.dropna(subset=['gap_t', 'gap_t1'])
    sample = plottable.sample(min(5000, len(plottable)))
    ax.scatter(sample['gap_t']*100, sample['gap_t1']*100, alpha=0.3, s=10)
    ax.plot([0, 20], [0, 20], 'k--', label='Equal gaps')
    ax.set_xlabel('Gap on T (%) - BMO reaction')
    ax.set_ylabel('Gap on T+1 (%) - AMC reaction')
    ax.set_title('Gap T vs Gap T+1')
    ax.set_xlim(0, 20)
    ax.set_ylim(0, 20)
    ax.legend()

    plt.tight_layout()
    plt.show()
except ImportError:
    print("matplotlib not available")

## 3. Classify Timing Based on Gap Ratio

In [None]:
def infer_timing(gap_ratio: float, bmo_threshold: float = 2.0, amc_threshold: float = 0.5) -> str:
    """Classify an earnings event as 'BMO', 'AMC' or 'unknown' from its gap ratio.

    ``gap_ratio`` is gap_T / gap_T1:
    - strictly above ``bmo_threshold``: the gap on T dominates -> 'BMO'
    - strictly below ``amc_threshold``: the gap on T+1 dominates -> 'AMC'
    - anything else, including a missing ratio or a value exactly on a
      threshold -> 'unknown'
    """
    if not pd.isna(gap_ratio):
        if gap_ratio > bmo_threshold:
            return 'BMO'
        if gap_ratio < amc_threshold:
            return 'AMC'
    return 'unknown'

# Classify every event with the default thresholds
moves_df['inferred_timing'] = moves_df['gap_ratio'].map(infer_timing)

inferred = moves_df['inferred_timing']
is_classified = inferred != 'unknown'
print("Inferred timing distribution:")
print(inferred.value_counts())
print(f"\nBMO/AMC coverage: {is_classified.mean()*100:.1f}%")

## 4. Validate Against Known Timings

Fetch upcoming earnings (which include BMO/AMC timing) to validate the inference algorithm.

In [None]:
import time

def fetch_upcoming_earnings_with_timing(days_ahead: int = 30) -> pd.DataFrame:
    """Fetch upcoming earnings from Nasdaq (has BMO/AMC timing).

    Queries the Nasdaq earnings-calendar endpoint once per calendar day, from
    today through ``today + days_ahead`` inclusive, sleeping 0.1 s between
    requests. A day that fails (exception or non-200 status) is reported on
    stdout and skipped, so the result is best-effort.

    Args:
        days_ahead: Number of days after today to include.

    Returns:
        One row per calendar entry with the API's fields plus
        ``earnings_date`` (the queried date, ``YYYY-MM-DD``) and
        ``actual_timing`` ('BMO', 'AMC' or 'unknown'). ``actual_timing`` is
        always present, even when nothing was fetched.
    """
    def parse_timing(time_str):
        """Map the API's ``time`` field to 'BMO', 'AMC' or 'unknown'."""
        if pd.isna(time_str):
            return 'unknown'
        label = str(time_str).lower()
        if 'pre-market' in label or 'before' in label:
            return 'BMO'
        if 'after' in label:  # also covers 'after-hours'
            return 'AMC'
        return 'unknown'

    from_date = date.today()
    to_date = from_date + timedelta(days=days_ahead)

    all_rows = []
    current_date = from_date

    while current_date <= to_date:
        date_str = current_date.strftime('%Y-%m-%d')
        url = f"https://api.nasdaq.com/api/calendar/earnings?date={date_str}"

        try:
            r = requests.get(url, headers=NASDAQ_HEADERS, timeout=10)
            if r.status_code == 200:
                payload = r.json() or {}
                # 'data' or 'rows' may be null rather than absent, so fall
                # back explicitly instead of relying on .get defaults.
                rows = (payload.get('data') or {}).get('rows') or []
                for row in rows:
                    row['earnings_date'] = date_str
                all_rows.extend(rows)
            else:
                # Surface throttling/blocks instead of silently losing the day
                print(f"  {date_str}: HTTP {r.status_code}")
        except Exception as e:
            # Best-effort fetch: report the failure and move on
            print(f"  {date_str}: {e}")

        current_date += timedelta(days=1)
        time.sleep(0.1)

    df = pd.DataFrame(all_rows)

    # Without any rows (or without a 'time' field) there is nothing to parse.
    # Still provide the column so callers can rely on it.
    if 'time' in df.columns:
        df['actual_timing'] = df['time'].apply(parse_timing)
    else:
        df['actual_timing'] = 'unknown'
    return df

print("Fetching upcoming earnings with known timing...")
upcoming_df = fetch_upcoming_earnings_with_timing(days_ahead=60)
print(f"\nFetched {len(upcoming_df)} upcoming earnings")
# Guard the lookup the same way the historical-fetch cell does, so an empty
# or timing-less result does not end the cell with a KeyError.
if 'actual_timing' in upcoming_df.columns:
    print(f"\nTiming distribution:")
    print(upcoming_df['actual_timing'].value_counts())

In [None]:
# Validation needs PAST earnings for which we have both a known timing and prices.
#
# Fetch the most recent historical dates (the last 60 days in the call below)
# from the same Nasdaq calendar endpoint and check whether any of them still
# carry a timing.

def fetch_recent_historical_earnings(days_back: int = 60) -> pd.DataFrame:
    """Fetch recent historical earnings to check if any have timing.

    Queries the Nasdaq earnings-calendar endpoint once per calendar day, from
    ``yesterday - days_back`` through yesterday inclusive, sleeping 0.1 s
    between requests and printing progress every 10 days. A day that fails
    (exception or non-200 status) is reported on stdout and skipped.

    Args:
        days_back: Number of days before yesterday to include.

    Returns:
        One row per calendar entry with the API's fields plus
        ``earnings_date`` (the queried date, ``YYYY-MM-DD``). An
        ``actual_timing`` column ('BMO', 'AMC' or 'unknown') is added only
        when the response carried a ``time`` field, so callers must check
        for it.
    """
    def parse_timing(time_str):
        """Map the API's ``time`` field to 'BMO', 'AMC' or 'unknown'."""
        if pd.isna(time_str):
            return 'unknown'
        label = str(time_str).lower()
        if 'pre-market' in label or 'before' in label:
            return 'BMO'
        if 'after' in label:  # also covers 'after-hours'
            return 'AMC'
        return 'unknown'

    to_date = date.today() - timedelta(days=1)  # Yesterday
    from_date = to_date - timedelta(days=days_back)

    all_rows = []
    current_date = from_date

    print(f"Fetching {from_date} to {to_date}...")

    while current_date <= to_date:
        date_str = current_date.strftime('%Y-%m-%d')
        url = f"https://api.nasdaq.com/api/calendar/earnings?date={date_str}"

        try:
            r = requests.get(url, headers=NASDAQ_HEADERS, timeout=10)
            if r.status_code == 200:
                payload = r.json() or {}
                # 'data' or 'rows' may be null rather than absent
                rows = (payload.get('data') or {}).get('rows') or []
                for row in rows:
                    row['earnings_date'] = date_str
                all_rows.extend(rows)
            else:
                print(f"  {date_str}: HTTP {r.status_code}")
        except Exception as e:
            # Still best-effort, but report the failure (as the upcoming
            # fetch does) so an all-empty result can be explained.
            print(f"  {date_str}: {e}")

        current_date += timedelta(days=1)
        time.sleep(0.1)

        days_done = (current_date - from_date).days
        if days_done % 10 == 0:
            print(f"  Progress: {days_done}/{days_back} days, {len(all_rows)} records")

    df = pd.DataFrame(all_rows)

    if 'time' in df.columns:
        df['actual_timing'] = df['time'].apply(parse_timing)
    return df

# Pull the last 60 days of past calendar entries (one request per day)
print("Fetching recent historical earnings...")
recent_hist_df = fetch_recent_historical_earnings(days_back=60)
print(f"\nFetched {len(recent_hist_df)} historical earnings")
# 'actual_timing' only exists when the response carried a 'time' field
if 'actual_timing' in recent_hist_df.columns:
    print(f"\nTiming distribution:")
    print(recent_hist_df['actual_timing'].value_counts())

In [None]:
# Since historical Nasdaq data doesn't have timing, let's validate differently:
# Use yfinance to get timing for a sample of stocks and compare with our inference

import yfinance as yf

def get_yf_timing(symbol: str) -> tuple:
    """Look up an earnings date and BMO/AMC timing for ``symbol`` via yfinance.

    Reads ``earningsTimestamp`` from the ticker's ``info`` dict, converts it
    from UTC to US/Eastern and classifies by hour: before 10 -> 'BMO',
    16 or later -> 'AMC', otherwise 'unknown'.

    Returns:
        ``(earnings_date, timing)``, or ``(None, None)`` when the timestamp
        is missing/falsy or anything raises (lookup failure, missing pytz, ...).

    NOTE(review): presumably ``earningsTimestamp`` is a single timestamp for
    one earnings event rather than a history - confirm which event it is.
    """
    try:
        earnings_ts = yf.Ticker(symbol).info.get('earningsTimestamp')
        if not earnings_ts:
            return None, None

        import pytz
        eastern = pytz.timezone('US/Eastern')
        local_dt = datetime.fromtimestamp(earnings_ts, tz=pytz.UTC).astimezone(eastern)

        # Classify by the Eastern-time hour of the announcement
        hour = local_dt.hour
        if hour >= 16:
            timing = 'AMC'
        elif hour < 10:
            timing = 'BMO'
        else:
            timing = 'unknown'

        return local_dt.date(), timing
    except Exception:
        return None, None

# Sample symbols from our historical data: the 100 with the most records
sample_symbols = moves_df['symbol'].value_counts().head(100).index.tolist()
print(f"Validating against yfinance for {len(sample_symbols)} symbols...")

yf_timings = []
for i, symbol in enumerate(sample_symbols):
    earn_date, timing = get_yf_timing(symbol)
    # Keep only symbols with a definite BMO/AMC answer
    if earn_date and timing != 'unknown':
        yf_timings.append({
            'symbol': symbol,
            'yf_earnings_date': earn_date,
            'yf_timing': timing
        })

    if (i+1) % 20 == 0:
        print(f"  Progress: {i+1}/{len(sample_symbols)}, found {len(yf_timings)} with timing")
    time.sleep(0.05)  # Be nice to yfinance

# Name the columns explicitly so the frame is well-formed even when no symbol
# returned a timing. get_yf_timing swallows every failure, so an empty list
# is a realistic outcome and would otherwise raise KeyError on 'yf_timing'.
yf_df = pd.DataFrame(yf_timings, columns=['symbol', 'yf_earnings_date', 'yf_timing'])
print(f"\nFound {len(yf_df)} symbols with yfinance timing")
print(yf_df['yf_timing'].value_counts())

In [None]:
# Alternative validation: once upcoming earnings become historical, their
# prices can be checked against the known timing. Until then, check that the
# inference is internally consistent:
#   * inferred BMO -> gap_t should be much larger than gap_t1
#   * inferred AMC -> gap_t1 should be much larger than gap_t
# NOTE(review): the labels are derived from gap_t / gap_t1 in the first place,
# so this confirms the split was applied as intended, not that it is accurate.

bmo_data = moves_df[moves_df['inferred_timing'] == 'BMO']
amc_data = moves_df[moves_df['inferred_timing'] == 'AMC']

bmo_mean_gap_t = bmo_data['gap_t'].mean()
bmo_mean_gap_t1 = bmo_data['gap_t1'].mean()
amc_mean_gap_t = amc_data['gap_t'].mean()
amc_mean_gap_t1 = amc_data['gap_t1'].mean()

print("Validation - Internal Consistency:")
print("\nBMO inferred (should have larger gap_t):")
print(f"  Mean gap_t: {bmo_mean_gap_t*100:.2f}%")
print(f"  Mean gap_t1: {bmo_mean_gap_t1*100:.2f}%")
print(f"  Ratio: {bmo_mean_gap_t / bmo_mean_gap_t1:.2f}x")

print("\nAMC inferred (should have larger gap_t1):")
print(f"  Mean gap_t: {amc_mean_gap_t*100:.2f}%")
print(f"  Mean gap_t1: {amc_mean_gap_t1*100:.2f}%")
print(f"  Ratio: {amc_mean_gap_t1 / amc_mean_gap_t:.2f}x")

## 5. Tune Thresholds

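Compare threshold pairs by coverage and BMO/AMC balance. No labelled ground truth is available here, so accuracy is not measured directly; a roughly even BMO/AMC split is used as a sanity check instead.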

In [None]:
# Sweep a grid of (BMO, AMC) threshold pairs and record, for each pair, how
# many events get classified and how the classified ones split
bmo_grid = [1.5, 2.0, 2.5, 3.0]
amc_grid = [0.3, 0.4, 0.5, 0.67]

threshold_results = []
for bmo_thresh in bmo_grid:
    for amc_thresh in amc_grid:
        timing = moves_df['gap_ratio'].map(
            lambda ratio: infer_timing(ratio, bmo_threshold=bmo_thresh, amc_threshold=amc_thresh)
        )

        total = len(timing)
        bmo_count = (timing == 'BMO').sum()
        amc_count = (timing == 'AMC').sum()
        unknown_count = (timing == 'unknown').sum()

        bmo_pct = bmo_count / total * 100
        amc_pct = amc_count / total * 100
        coverage = (bmo_count + amc_count) / total * 100

        # A ratio of 0 marks "no AMC events at all" rather than dividing by zero
        if amc_pct > 0:
            balance = bmo_pct / amc_pct
        else:
            balance = 0

        threshold_results.append({
            'bmo_threshold': bmo_thresh,
            'amc_threshold': amc_thresh,
            'coverage_pct': coverage,
            'bmo_pct': bmo_pct,
            'amc_pct': amc_pct,
            'bmo_amc_ratio': balance,
        })

results_df = pd.DataFrame(threshold_results)
print("Threshold tuning results:")
print(results_df.to_string(index=False))

# Guidance for picking a pair: decent coverage with a roughly even split
print("\nRecommended: Choose thresholds with ~50% coverage and ~1.0 BMO/AMC ratio")

In [None]:
# Thresholds chosen for the final labels
BMO_THRESHOLD = 2.0  # gap_ratio > 2.0 -> BMO
AMC_THRESHOLD = 0.5  # gap_ratio < 0.5 -> AMC

# 'timing' is the column the rest of the notebook works from
moves_df['timing'] = moves_df['gap_ratio'].map(
    lambda ratio: infer_timing(ratio, bmo_threshold=BMO_THRESHOLD, amc_threshold=AMC_THRESHOLD)
)

final_timing = moves_df['timing']
print(f"Final timing distribution (thresholds: BMO>{BMO_THRESHOLD}, AMC<{AMC_THRESHOLD}):")
print(final_timing.value_counts())
print(f"\nCoverage: {(final_timing != 'unknown').mean()*100:.1f}%")

## 6. Correct Price Move Alignment

For AMC earnings, the current data computes moves on the wrong day. We need to recalculate.

In [None]:
# Current approach: Close_T-1 -> Open_T -> Close_T -> Close_T+1
# For BMO: Reaction is Open_T (correct - gap from T-1 close to T open)
# For AMC: Reaction is Open_T+1 (currently we capture Close_T-1 -> Close_T which is BEFORE earnings)

# Phase 3.3: Fix BMO/AMC base price inconsistency
# Both BMO and AMC should use close_t_minus_1 as base for comparable percentages
# This allows apples-to-apples comparison across timing types

def compute_corrected_moves(row, price_cache):
    """Return the timing-aware reaction moves for one earnings event.

    ``row`` must provide 'symbol', 'timing' and 'earnings_date';
    ``price_cache`` maps symbol -> date-indexed frame with 'open'/'close'.
    T-1 is the last row before the earnings date, T the first row on or
    after it, and T+1 the first row after T.

    Every move is expressed as a fraction of the T-1 close so BMO and AMC
    percentages are comparable:
    - 'BMO': gap = T-1 close -> T open, full = T-1 close -> T close
    - 'AMC': gap = T close -> T+1 open, full = T close -> T+1 close
    - anything else: gap is NaN, full = T-1 close -> T+1 close

    Returns a Series with 'corrected_gap' and 'corrected_full'. Both are NaN
    when the symbol is not cached, a needed day is missing, or the lookup
    raises.
    """
    def _no_result():
        return pd.Series({'corrected_gap': np.nan, 'corrected_full': np.nan})

    ticker = row['symbol']
    timing = row['timing']

    if ticker not in price_cache:
        return _no_result()

    pdf = price_cache[ticker]
    earn_date = pd.to_datetime(row['earnings_date'])

    try:
        # Locate T-1 and T around the earnings date
        rows_before = pdf[pdf.index < earn_date].tail(1)
        rows_from = pdf[pdf.index >= earn_date].head(1)
        if rows_before.empty or rows_from.empty:
            return _no_result()
        t_minus_1 = rows_before.index[0]
        t = rows_from.index[0]

        # Locate T+1
        rows_after = pdf[pdf.index > t].head(1)
        if rows_after.empty:
            return _no_result()
        t_plus_1 = rows_after.index[0]

        # Read all five prices up front, whichever branch ends up using them
        close_t_minus_1 = pdf.loc[t_minus_1, 'close']
        open_t = pdf.loc[t, 'open']
        close_t = pdf.loc[t, 'close']
        open_t_plus_1 = pdf.loc[t_plus_1, 'open']
        close_t_plus_1 = pdf.loc[t_plus_1, 'close']

        # Shared denominator for both timing types
        base_price = close_t_minus_1

        if timing == 'AMC':
            # Reaction starts from the T close, still scaled by the T-1 close
            corrected_gap = (open_t_plus_1 - close_t) / base_price
            corrected_full = (close_t_plus_1 - close_t) / base_price
        elif timing == 'BMO':
            # Reaction starts from the T-1 close
            corrected_gap = (open_t - base_price) / base_price
            corrected_full = (close_t - base_price) / base_price
        else:
            # Unknown timing: no gap, full move spans T-1 close to T+1 close
            corrected_gap = np.nan
            corrected_full = (close_t_plus_1 - base_price) / base_price

        return pd.Series({'corrected_gap': corrected_gap, 'corrected_full': corrected_full})

    except Exception:
        return _no_result()

print("Computing timing-corrected moves (this may take a few minutes)...")
# Row-wise apply; price_cache is forwarded to the function as a keyword argument
corrected = moves_df.apply(compute_corrected_moves, axis=1, price_cache=price_cache)

move_columns = ('corrected_gap', 'corrected_full')
# Signed moves first, then their magnitudes
for column in move_columns:
    moves_df[column] = corrected[column]
for column in move_columns:
    moves_df[f'{column}_abs'] = moves_df[column].abs()

print("Done!")

In [None]:
# Side-by-side summary of the original (timing-agnostic) gap and the
# timing-corrected gap, overall and per inferred timing
original_abs = moves_df['gap_move_abs']
print("Original vs Corrected Moves (Absolute Values):")
print("\nOriginal gap_move_abs (all data, not timing-aware):")
print(f"  Mean: {original_abs.mean()*100:.2f}%")
print(f"  Median: {original_abs.median()*100:.2f}%")

print("\nCorrected gap_abs (timing-aware):")
valid_corrected = moves_df['corrected_gap_abs'].dropna()
print(f"  Mean: {valid_corrected.mean()*100:.2f}%")
print(f"  Median: {valid_corrected.median()*100:.2f}%")

print("\n--- By Timing ---")
for timing in ['BMO', 'AMC', 'unknown']:
    subset = moves_df[moves_df['timing'] == timing]
    if len(subset) == 0:
        continue
    # 'unknown' rows carry a NaN corrected gap, so their mean prints as nan
    original_mean = subset['gap_move_abs'].mean()
    corrected_mean = subset['corrected_gap_abs'].mean()
    print(f"\n{timing} ({len(subset):,} records):")
    print(f"  Original gap: {original_mean*100:.2f}%")
    print(f"  Corrected gap: {corrected_mean*100:.2f}%")

## 7. Save Updated Data

In [None]:
# Select columns to save
output_cols = [
    'symbol', 'earnings_date',
    'close_t_minus_1', 'open_t', 'close_t', 'close_t_plus_1',
    'gap_move', 'gap_move_abs', 'full_move', 'full_move_abs',
    'overnight_move', 'overnight_move_abs',
    'timing',  # Inferred timing
    'gap_t', 'gap_t1', 'gap_ratio',  # Inference features
    'corrected_gap', 'corrected_gap_abs',  # Timing-corrected moves
    'corrected_full', 'corrected_full_abs',
]

# The load step treats 'overnight_move_abs' as optional, so do not let a
# missing input column abort the save after all the work above. Report what
# is skipped instead.
missing_cols = [col for col in output_cols if col not in moves_df.columns]
if missing_cols:
    print(f"Warning: columns missing from moves_df, not saved: {missing_cols}")

output_df = moves_df[[col for col in output_cols if col in moves_df.columns]].copy()

# === OUTLIER FILTERS ===
print(f"Before filters: {len(output_df):,} rows")

# Filter 1: Remove penny stocks (price < $1.00) - unreliable % moves
penny_mask = output_df['close_t_minus_1'] < 1.00
if penny_mask.sum() > 0:
    print(f"  Removing {penny_mask.sum()} penny stock records (price < $1.00)")
    output_df = output_df[~penny_mask]

# Filter 2: Remove extreme moves (> 50%) - likely data errors or reverse splits
# Rows whose move is NaN do not satisfy "> 0.50" and are therefore kept.
move_col = 'corrected_full_abs' if 'corrected_full_abs' in output_df.columns else 'overnight_move_abs'
if move_col in output_df.columns:
    extreme_mask = output_df[move_col] > 0.50
    if extreme_mask.sum() > 0:
        print(f"  Removing {extreme_mask.sum()} records with {move_col} > 50%")
        output_df = output_df[~extreme_mask]

print(f"After filters: {len(output_df):,} rows")

# Save
# NOTE: this is the same file the notebook loads in section 1, so the input
# is overwritten in place with the filtered, timing-annotated data.
output_path = DATA_DIR / 'historical_earnings_moves.parquet'
output_df.to_parquet(output_path, index=False)
print(f"\nSaved to {output_path}")
print(f"\nFinal dataset:")
print(f"  Records: {len(output_df):,}")
print(f"  Timing coverage: {(output_df['timing'] != 'unknown').mean()*100:.1f}%")
print(f"  BMO: {(output_df['timing'] == 'BMO').sum():,}")
print(f"  AMC: {(output_df['timing'] == 'AMC').sum():,}")
print(f"  Unknown: {(output_df['timing'] == 'unknown').sum():,}")

## 8. Summary

In [None]:
# Final report: restates the approach and thresholds, then prints the label
# counts and shares computed from output_df (the frame that was just saved).
print("=" * 60)
print("TIMING INFERENCE SUMMARY")
print("=" * 60)

print(f"""
Problem: Historical Nasdaq data has 0% BMO/AMC timing
         (Nasdaq only provides timing for UPCOMING earnings)

Solution: Infer timing from overnight gap magnitudes
  - BMO: Large gap on T (earnings day open)
  - AMC: Large gap on T+1 (day after)
  - gap_ratio = gap_T / gap_T+1

Thresholds:
  - BMO: gap_ratio > {BMO_THRESHOLD}
  - AMC: gap_ratio < {AMC_THRESHOLD}

Results:
  - Total records: {len(output_df):,}
  - BMO inferred: {(output_df['timing'] == 'BMO').sum():,} ({(output_df['timing'] == 'BMO').mean()*100:.1f}%)
  - AMC inferred: {(output_df['timing'] == 'AMC').sum():,} ({(output_df['timing'] == 'AMC').mean()*100:.1f}%)
  - Unknown: {(output_df['timing'] == 'unknown').sum():,} ({(output_df['timing'] == 'unknown').mean()*100:.1f}%)
  - Coverage: {(output_df['timing'] != 'unknown').mean()*100:.1f}%

Files Updated:
  - {output_path}

Next Steps:
  1. Retrain ML model on timing-corrected data
  2. Compare model performance before/after correction
  3. Consider filtering to only BMO/AMC records for cleaner training
""")