In [None]:
# This module implements quantitative trading signals for the Alphathon competition
# All signals are designed to be investable (no look-ahead bias) and reproducible

import databento as db          
import pandas as pd             
import numpy as np             
from datetime import datetime, timedelta  
import warnings                
warnings.filterwarnings('ignore', category=DeprecationWarning)  


# Signal calculation parameters
SIGNAL_WINDOWS = [20, 60, 120]  
MIN_PERIODS_RATIO = 0.5

print("Financial Signal Generation Module Initialized")
print(f"Signal calculation windows: {SIGNAL_WINDOWS}")
print("All signals designed for investable implementation (no look-ahead bias)")

In [2]:
raw_data = pd.read_parquet('/Users/david/Desktop/Alphathon/Data for Alphathon/20180501_20240101_agg_data.parquet')

In [None]:
raw_data_shifted = raw_data.copy()

# IMPORTANT: Shift all OHLCV data by 1 period to avoid look-ahead bias
# This ensures signals are only based on information available at market open
print("Applying 1-period lag to OHLCV data to prevent look-ahead bias...")
raw_data_shifted[['open', 'high', 'low', 'close', 'volume']] = raw_data.groupby('symbol')[['open', 'high', 'low', 'close', 'volume']].shift(1)


raw_data_shifted = raw_data_shifted.dropna(subset=['open', 'high', 'low', 'close', 'volume']).reset_index(drop=True)

print(f"Data preprocessing complete!")
print(f"Original shape: {raw_data.shape}")
print(f"Processed shape: {raw_data_shifted.shape}")
print(f"Rows removed due to lagging: {raw_data.shape[0] - raw_data_shifted.shape[0]}")

raw_data_shifted

Unnamed: 0,symbol,ts_event,open,high,low,close,volume
0,A,2018-05-02 00:00:00+00:00,65.63,66.34,65.49,66.24,803289.0
1,A,2018-05-03 00:00:00+00:00,66.00,66.86,65.81,65.91,685405.0
2,A,2018-05-04 00:00:00+00:00,65.77,66.46,64.86,66.34,676332.0
3,A,2018-05-07 00:00:00+00:00,66.03,67.25,65.61,67.00,409636.0
4,A,2018-05-08 00:00:00+00:00,67.16,67.98,67.07,67.39,532910.0
...,...,...,...,...,...,...,...
792676,ZTS,2023-12-22 00:00:00+00:00,196.00,197.00,193.88,194.66,1067697.0
792677,ZTS,2023-12-26 00:00:00+00:00,195.32,195.81,192.78,194.98,700415.0
792678,ZTS,2023-12-27 00:00:00+00:00,194.88,196.34,194.12,195.50,316032.0
792679,ZTS,2023-12-28 00:00:00+00:00,195.41,197.01,194.75,196.90,337393.0


In [None]:
def calculate_corwin_schultz_signal(df):
    """
    Calculate Corwin-Schultz high-low price spread signal
    
    Formula:
    β_t = (ln(H_t/L_t))² + (ln(H_{t+1}/L_{t+1}))²
    γ_t = (ln(max(H_t, H_{t+1})/min(L_t, L_{t+1})))²
    S_t = 2(e^{α_t} - 1) / (1 + e^{α_t})
    
    where α_t is derived from β_t and γ_t
    """
    
    # Create working copy to avoid modifying original data
    result_df = df.copy()
    
    # CRITICAL: Sort by symbol and timestamp to ensure proper time series ordering
    # This is essential for forward-looking calculations (t+1 references)
    result_df = result_df.sort_values(['symbol', 'ts_event']).reset_index(drop=True)
    
    # Define nested function to calculate signals for each symbol group
    def calc_signal_group(group):
        """
        Calculate Corwin-Schultz signals for a single symbol group
        
        This function processes one symbol at a time to ensure proper
        time series calculations without cross-contamination between stocks.
        """
        group = group.copy()
        
        # ========================================================================
        # STEP 1: Calculate β_t component
        # β_t = (ln(H_t/L_t))² + (ln(H_{t+1}/L_{t+1}))²
        # ========================================================================
        
        # Calculate squared log high-low ratios for current period
        ln_hl_t = np.log(group['high'] / group['low']) ** 2
        
        # Get squared log high-low ratios for next period (forward-looking)
        ln_hl_t_plus_1 = ln_hl_t.shift(-1)
        
        # Sum the two components to get β_t
        beta_t = ln_hl_t + ln_hl_t_plus_1
        
        # Calculate γ_t = (ln(max(H_t, H_{t+1})/min(L_t, L_{t+1})))²
        h_t = group['high']
        h_t_plus_1 = group['high'].shift(-1)
        l_t = group['low']
        l_t_plus_1 = group['low'].shift(-1)
        
        max_h = np.maximum(h_t, h_t_plus_1)
        min_l = np.minimum(l_t, l_t_plus_1)
        gamma_t = (np.log(max_h / min_l)) ** 2
        
        # Calculate α_t (effective spread parameter)
        # α_t = (√(2*β_t) - √(γ_t)) / (3 - 2*√2) - √(γ_t/(3 - 2*√2))
        sqrt_2_beta = np.sqrt(2 * beta_t)
        sqrt_gamma = np.sqrt(gamma_t)
        denominator = 3 - 2 * np.sqrt(2)
        
        alpha_t = (sqrt_2_beta - sqrt_gamma) / denominator - sqrt_gamma / denominator
        
        # Calculate S_t = 2(e^{α_t} - 1) / (1 + e^{α_t})
        exp_alpha = np.exp(alpha_t)
        s_t = 2 * (exp_alpha - 1) / (1 + exp_alpha)
        
        group['beta_t'] = beta_t
        group['gamma_t'] = gamma_t
        group['alpha_t'] = alpha_t
        group['corwin_schultz_signal'] = s_t
        
        return group
    
    # Apply calculation to each symbol group
    result_df = result_df.groupby('symbol', group_keys=False).apply(calc_signal_group)
    
    # Remove rows with NaN values (last row of each symbol due to forward looking)
    result_df = result_df.dropna(subset=['corwin_schultz_signal']).reset_index(drop=True)
    
    return result_df

# Calculate the Corwin-Schultz signal
data_with_signal = calculate_corwin_schultz_signal(raw_data_shifted)
data_with_signal[['symbol', 'ts_event', 'high', 'low', 'corwin_schultz_signal']].head(10)


  result_df = result_df.groupby('symbol', group_keys=False).apply(calc_signal_group)


Unnamed: 0,symbol,ts_event,high,low,corwin_schultz_signal
0,A,2018-05-02 00:00:00+00:00,66.34,65.49,-0.073014
1,A,2018-05-03 00:00:00+00:00,66.86,65.81,-0.11437
2,A,2018-05-04 00:00:00+00:00,66.46,64.86,-0.135668
3,A,2018-05-07 00:00:00+00:00,67.25,65.61,-0.181303
4,A,2018-05-08 00:00:00+00:00,67.98,67.07,-0.052806
5,A,2018-05-09 00:00:00+00:00,67.44,66.94,-0.090921
6,A,2018-05-10 00:00:00+00:00,68.4,67.18,-0.115842
7,A,2018-05-11 00:00:00+00:00,68.87,68.12,-0.127469
8,A,2018-05-14 00:00:00+00:00,69.59,68.91,-0.067448
9,A,2018-05-15 00:00:00+00:00,70.44,69.0,-1.09421


In [None]:
# data_with_signal.to_parquet('/Users/david/Desktop/Alphathon/Data for Alphathon/corwin_schultz_signal.parquet')

In [4]:
def calculate_mean_reversion_signals(df, k_values=[20, 60, 120]):
    """
    Calculate Mean Reversion/Volatility signals
    
    Formula:
    z_MA^(k)(t) = (P_t - MA_k(P)_t) / stdev_k(P)_t
    ΔEMA_t^(k) = EMA_t^(k) - EMA_{t-1}^(k)
    
    where k ∈ {20, 60, 120}
    """
    
    result_df = df.copy()
    result_df = result_df.sort_values(['symbol', 'ts_event']).reset_index(drop=True)
    
    def calc_mr_signals_group(group):
        group = group.copy()
        price = group['close']  # Using close price as P_t
        
        for k in k_values:
            # Calculate Moving Average
            ma_k = price.rolling(window=k, min_periods=1).mean()
            
            # Calculate Rolling Standard Deviation
            stdev_k = price.rolling(window=k, min_periods=1).std()
            
            # Calculate z-score signal: z_MA^(k)(t) = (P_t - MA_k(P)_t) / stdev_k(P)_t
            z_ma_signal = (price - ma_k) / stdev_k
            group[f'z_MA_{k}'] = z_ma_signal
            
            # Calculate EMA (Exponential Moving Average)
            alpha = 2 / (k + 1)  # Standard EMA smoothing factor
            ema_k = price.ewm(alpha=alpha, adjust=False).mean()
            
            # Calculate EMA difference: ΔEMA_t^(k) = EMA_t^(k) - EMA_{t-1}^(k)
            delta_ema = ema_k.diff()
            group[f'delta_EMA_{k}'] = delta_ema
            
        return group
    
    # Apply calculation to each symbol group
    result_df = result_df.groupby('symbol', group_keys=False).apply(calc_mr_signals_group)
    
    return result_df

def calculate_donchian_breakout_signals(df, k_values=[20, 60, 120]):
    """
    Calculate Donchian Channel Breakout signals
    
    Formula:
    BRK_t^U = 1{C_t > max_{τ∈[t-k,t-1]} C_τ}
    BRK_t^D = 1{C_t < min_{τ∈[t-k,t-1]} C_τ}
    
    where k ∈ {20, 60, 120}
    """
    
    result_df = df.copy()
    result_df = result_df.sort_values(['symbol', 'ts_event']).reset_index(drop=True)
    
    def calc_donchian_signals_group(group):
        group = group.copy()
        price = group['close']  # Using close price as C_t
        
        for k in k_values:
            # Calculate rolling max and min over [t-k, t-1] window
            # We use shift(1) to exclude current period and look at previous k periods
            rolling_max = price.shift(1).rolling(window=k, min_periods=1).max()
            rolling_min = price.shift(1).rolling(window=k, min_periods=1).min()
            
            # Upper breakout: BRK_t^U = 1{C_t > max_{τ∈[t-k,t-1]} C_τ}
            brk_upper = (price > rolling_max).astype(int)
            group[f'BRK_U_{k}'] = brk_upper
            
            # Lower breakout: BRK_t^D = 1{C_t < min_{τ∈[t-k,t-1]} C_τ}
            brk_lower = (price < rolling_min).astype(int)
            group[f'BRK_D_{k}'] = brk_lower
            
        return group
    
    # Apply calculation to each symbol group
    result_df = result_df.groupby('symbol', group_keys=False).apply(calc_donchian_signals_group)
    
    return result_df

# Calculate Mean Reversion signals
print("Calculating Mean Reversion/Volatility signals...")
data_with_mr = calculate_mean_reversion_signals(raw_data_shifted)

# Calculate Donchian Breakout signals
print("Calculating Donchian Channel Breakout signals...")
data_with_all_dc = calculate_donchian_breakout_signals(raw_data_shifted)




Calculating Mean Reversion/Volatility signals...


  result_df = result_df.groupby('symbol', group_keys=False).apply(calc_mr_signals_group)


Calculating Donchian Channel Breakout signals...


  result_df = result_df.groupby('symbol', group_keys=False).apply(calc_donchian_signals_group)


In [None]:
# data_with_mr.to_parquet('/Users/david/Desktop/Alphathon/signals/mean_reversion_signals.parquet')
# data_with_all_dc.to_parquet('/Users/david/Desktop/Alphathon/signals/donchian_breakout_signals.parquet')

In [None]:
def calculate_low_volatility_beta_signals(df, k_values=[20, 60, 120]):
    """
    Calculate Low Volatility/Low Beta signals
    
    Formula:
    β_{i,t}^{(k)} = Cov_k(r_i, r_m)_t / Var_k(r_m)_t
    
    where:
    - r_i is the return of stock i
    - r_m is the market return (equal-weighted average of all stocks)
    - k ∈ {20, 60, 120} is the rolling window
    """
    
    result_df = df.copy()
    result_df = result_df.sort_values(['symbol', 'ts_event']).reset_index(drop=True)
    
    # Calculate individual stock returns
    result_df['return'] = result_df.groupby('symbol')['close'].pct_change()
    
    # Calculate market return (equal-weighted average of all stocks)
    market_returns = result_df.groupby('ts_event')['return'].mean().reset_index()
    market_returns.columns = ['ts_event', 'market_return']
    
    # Merge market returns back to the main dataframe
    result_df = result_df.merge(market_returns, on='ts_event', how='left')
    
    def calc_beta_signals_group(group):
        group = group.copy()
        stock_returns = group['return']
        market_returns = group['market_return']
        
        for k in k_values:
            # Calculate rolling covariance between stock and market returns
            cov_k = stock_returns.rolling(window=k, min_periods=max(1, k//2)).cov(market_returns)
            
            # Calculate rolling variance of market returns
            var_k = market_returns.rolling(window=k, min_periods=max(1, k//2)).var()
            
            # Calculate beta: β_{i,t}^{(k)} = Cov_k(r_i, r_m)_t / Var_k(r_m)_t
            beta_k = cov_k / var_k
            
            group[f'beta_{k}'] = beta_k
            
            # Also calculate rolling correlation for additional insight
            corr_k = stock_returns.rolling(window=k, min_periods=max(1, k//2)).corr(market_returns)
            group[f'correlation_{k}'] = corr_k
            
            # Calculate rolling volatility (standard deviation) of stock returns
            vol_k = stock_returns.rolling(window=k, min_periods=max(1, k//2)).std()
            group[f'volatility_{k}'] = vol_k
        
        return group
    
    # Apply calculation to each symbol group
    result_df = result_df.groupby('symbol', group_keys=False).apply(calc_beta_signals_group)
    
    # Remove rows with NaN returns (first row of each symbol)
    result_df = result_df.dropna(subset=['return']).reset_index(drop=True)
    
    return result_df

# Calculate Low Volatility/Beta signals
print("Calculating Low Volatility/Low Beta signals...")
data_with_beta = calculate_low_volatility_beta_signals(data_with_all_signals)

print("Beta signal calculation complete!")
print(f"Data shape: {data_with_beta.shape}")

# Show sample of beta signals
beta_columns = [col for col in data_with_beta.columns if any(x in col for x in ['beta_', 'correlation_', 'volatility_'])]
print(f"\nGenerated Beta-related columns: {beta_columns}")

# Display sample for one symbol
sample_symbol = data_with_beta['symbol'].iloc[0]
sample_data = data_with_beta[data_with_beta['symbol'] == sample_symbol].head(10)
display_cols = ['symbol', 'ts_event', 'return', 'market_return'] + beta_columns[:6]
sample_data[display_cols]


In [None]:
# Summary of all generated signals
all_signal_columns = [col for col in data_with_beta.columns if any(x in col for x in 
                     ['z_MA_', 'delta_EMA_', 'BRK_U_', 'BRK_D_', 'beta_', 'correlation_', 'volatility_'])]

print("=" * 50)
print("COMPLETE SIGNAL SUMMARY")
print("=" * 50)

signal_categories = {
    'Mean Reversion Z-Scores': [col for col in all_signal_columns if 'z_MA_' in col],
    'EMA Momentum': [col for col in all_signal_columns if 'delta_EMA_' in col],
    'Donchian Upper Breakouts': [col for col in all_signal_columns if 'BRK_U_' in col],
    'Donchian Lower Breakouts': [col for col in all_signal_columns if 'BRK_D_' in col],
    'Beta Coefficients': [col for col in all_signal_columns if 'beta_' in col],
    'Market Correlations': [col for col in all_signal_columns if 'correlation_' in col],
    'Rolling Volatilities': [col for col in all_signal_columns if 'volatility_' in col]
}

for category, signals in signal_categories.items():
    print(f"\n{category}: {len(signals)} signals")
    for signal in signals:
        print(f"  - {signal}")

print(f"\nTotal signals generated: {len(all_signal_columns)}")
print(f"Final dataset shape: {data_with_beta.shape}")
print(f"Number of symbols: {data_with_beta['symbol'].nunique()}")
print(f"Date range: {data_with_beta['ts_event'].min()} to {data_with_beta['ts_event'].max()}")

# Save the complete dataset
output_path = '/Users/david/Desktop/Alphathon/Data for Alphathon/complete_signals_dataset.parquet'
data_with_beta.to_parquet(output_path)
print(f"\nDataset saved to: {output_path}")


In [None]:
# Alpha Factors Generation (OHLCV-based factors) - Adapted for raw_data_shifted
import numpy as np
import pandas as pd
from pathlib import Path



# Utility functions
def pct_change_safe(series: pd.Series) -> pd.Series:
    out = series.pct_change()
    return out.replace([np.inf, -np.inf], np.nan)

def log_safe(series: pd.Series) -> pd.Series:
    return np.log(series.replace(0, np.nan))

# Prepare data from raw_data_shifted
print("Preparing OHLCV data from raw_data_shifted...")

# Pivot data to get OHLCV as columns with symbols as index
data_pivot = raw_data_shifted.set_index(['ts_event', 'symbol']).unstack('symbol')

# Extract OHLCV dataframes
O = data_pivot['open'].T
H = data_pivot['high'].T  
L = data_pivot['low'].T
C = data_pivot['close'].T
V = data_pivot['volume'].T

print(f"Data shape - O: {O.shape}, H: {H.shape}, L: {L.shape}, C: {C.shape}, V: {V.shape}")

# Base calculations
ret_cc = pct_change_safe(C)
dollarvol = C * V

# True Range & ATR
C_shift = C.shift(1)
TR = pd.concat([
    (H - L).abs(),
    (H - C_shift).abs(),
    (L - C_shift).abs()
], axis=0).groupby(level=0).max()
ATR14 = TR.rolling(14, min_periods=7).mean()

# Alpha factors (10 factors)
range_log = np.log(H.div(L)).replace([np.inf, -np.inf], np.nan)
logV = log_safe(V)
ADV60 = dollarvol.rolling(60, min_periods=20).mean()

factors_raw = {
    # No shift needed (available at market open)
    "alpha_overnight": O.div(C.shift(1)) - 1,
    "alpha_gap_rev": O.sub(C.shift(1)).div(C.shift(1)),
    
    # Shift needed (available after close)
    "alpha_intraday_rev": C.sub(O).div(O),
    "alpha_rangevol_rev": -0.5 * (range_log ** 2),
    "alpha_volsurprise": (logV - logV.rolling(60, min_periods=20).mean()) / 
                        logV.rolling(60, min_periods=20).std(),
    "alpha_turnover_jump": (dollarvol / ADV60).replace([np.inf, -np.inf], np.nan) - 1,
    "alpha_amihud_rev": -(ret_cc.abs() / dollarvol.replace(0, np.nan)).replace([np.inf, -np.inf], np.nan)
                       .rolling(20, min_periods=10).mean(),
    "alpha_clv": ((C - L) - (H - C)) / (H - L).replace(0, np.nan),
    "alpha_st_trend_5_20": (C.rolling(5, min_periods=3).mean() / 
                           C.rolling(20, min_periods=10).mean() - 1),
    "alpha_range_to_atr_rev": -(range_log / ATR14.replace(0, np.nan)).replace([np.inf, -np.inf], np.nan),
}

# Factors requiring shift (available after close)
NEED_SHIFT = {
    "alpha_intraday_rev",
    "alpha_rangevol_rev", 
    "alpha_volsurprise",
    "alpha_turnover_jump",
    "alpha_amihud_rev",
    "alpha_clv",
    "alpha_st_trend_5_20",
    "alpha_range_to_atr_rev",
}



Preparing OHLCV data from raw_data_shifted...
Data shape - O: (647, 1426), H: (647, 1426), L: (647, 1426), C: (647, 1426), V: (647, 1426)


  out = series.pct_change()


In [None]:
factor_dataframes = []

for name, df in factors_raw.items():
    # Apply shift if needed for investable signals
    investable = df.shift(1) if name in NEED_SHIFT else df
    investable = investable.replace([np.inf, -np.inf], np.nan)
    
    # Store for final dataframe
    factor_dataframes.append(investable)

# Create final dataframe with all alpha signals
ohlcv_long = raw_data_shifted[['symbol', 'ts_event']].copy()

# Add each alpha factor as a column
for i, (name, _) in enumerate(factors_raw.items()):
    factor_df = factor_dataframes[i]
    
    # Convert factor dataframe back to long format
    factor_long = factor_df.stack().reset_index()
    factor_long.columns = ['ts_event', 'symbol', name]
    
    # Merge with main dataframe
    ohlcv_long = ohlcv_long.merge(factor_long, on=['symbol', 'ts_event'], how='left')

# Create final signals dataframe
final_signals_df = ohlcv_long.copy()