In [7]:
import os
from os import path
import pandas as pd
import numpy as np
import sys
# Directory holding the per-stock price CSV exports, resolved against the
# directory the kernel was started in.
basepath = os.path.join(os.getcwd(), "export_cn_ex")

In [None]:
import os
import pandas as pd
import numpy as np

# Build the training panel: select symbols from the index-membership file,
# load each symbol's daily bars from `basepath` (defined in the first cell),
# and write one combined CSV indexed by (stock_code, date).

# csi500.txt is read as one tab-separated record per non-empty line:
# symbol, start_date, end_date.
with open("csi500.txt", "r") as f:
    lines = [line.strip() for line in f if line.strip()]

# Kept under its own name so the per-stock frames loaded below do not
# overwrite the membership table.
constituents = pd.DataFrame(
    [line.split('\t') for line in lines],
    columns=['symbol', 'start_date', 'end_date']
)
constituents['start_date'] = pd.to_datetime(constituents['start_date'])
constituents['end_date'] = pd.to_datetime(constituents['end_date'])

# Keeps only membership spells that lie entirely inside the window.
# NOTE(review): if the intent is "was a constituent at any time during the
# window", this should be an overlap test instead — confirm before changing,
# since it alters the stock universe.
mask = (constituents['start_date'] >= '2015-12-30') & (constituents['end_date'] <= '2020-01-01')
stock_codes = np.unique(constituents.loc[mask, 'symbol'].tolist())

print(len(stock_codes))

price_columns = ['date', 'open', 'high', 'low', 'close', 'volume', 'amount']
dfs = []
missing_files = []
for stock_code in stock_codes:
    # Price files are named after the first 8 characters of the symbol.
    code = stock_code[:8]
    fpath = os.path.join(basepath, f"{code}.csv")
    if not os.path.exists(fpath):
        missing_files.append(code)
        continue
    bars = pd.read_csv(fpath, parse_dates=['date'])[price_columns]
    # The price window starts before the membership window, leaving history
    # available for rolling computations.
    in_window = (bars['date'] >= '2015-06-30') & (bars['date'] <= '2020-01-31')
    # .loc + .assign builds a new frame rather than writing into a filtered
    # slice, which avoids pandas' SettingWithCopyWarning.
    dfs.append(bars.loc[in_window].assign(stock_code=code))

if missing_files:
    # Missing files are still skipped, but no longer silently.
    print(f"Skipped {len(missing_files)} symbols with no price file in {basepath}")

if not dfs:
    raise ValueError(f"No price files found in {basepath!r} for the selected symbols")

train_df = (
    pd.concat(dfs, ignore_index=True)
    .set_index(['stock_code', 'date'])
    .sort_index()
)

train_df.to_csv("train_data.csv")
print(train_df.head())

In [None]:
import os
import pandas as pd
import numpy as np

# Build the test panel: select symbols from the index-membership file,
# load each symbol's daily bars from `basepath` (defined in the first cell),
# and write one combined CSV indexed by (stock_code, date).

# csi500.txt is read as one tab-separated record per non-empty line:
# symbol, start_date, end_date.
with open("csi500.txt", "r") as f:
    lines = [line.strip() for line in f if line.strip()]

# Kept under its own name so the per-stock frames loaded below do not
# overwrite the membership table.
constituents = pd.DataFrame(
    [line.split('\t') for line in lines],
    columns=['symbol', 'start_date', 'end_date']
)
constituents['start_date'] = pd.to_datetime(constituents['start_date'])
constituents['end_date'] = pd.to_datetime(constituents['end_date'])

# Keeps only membership spells that lie entirely inside the window.
# NOTE(review): if the intent is "was a constituent at any time during the
# window", this should be an overlap test instead — confirm before changing,
# since it alters the stock universe.
mask = (constituents['start_date'] >= '2020-12-30') & (constituents['end_date'] <= '2024-01-01')
stock_codes = np.unique(constituents.loc[mask, 'symbol'].tolist())

print(len(stock_codes))

price_columns = ['date', 'open', 'high', 'low', 'close', 'volume', 'amount']
dfs = []
missing_files = []
for stock_code in stock_codes:
    # Price files are named after the first 8 characters of the symbol.
    code = stock_code[:8]
    fpath = os.path.join(basepath, f"{code}.csv")
    if not os.path.exists(fpath):
        missing_files.append(code)
        continue
    bars = pd.read_csv(fpath, parse_dates=['date'])[price_columns]
    # The price window starts before the membership window, leaving history
    # available for rolling computations.
    in_window = (bars['date'] >= '2020-06-30') & (bars['date'] <= '2024-01-31')
    # .loc + .assign builds a new frame rather than writing into a filtered
    # slice, which avoids pandas' SettingWithCopyWarning.
    dfs.append(bars.loc[in_window].assign(stock_code=code))

if missing_files:
    # Missing files are still skipped, but no longer silently.
    print(f"Skipped {len(missing_files)} symbols with no price file in {basepath}")

if not dfs:
    raise ValueError(f"No price files found in {basepath!r} for the selected symbols")

test_df = (
    pd.concat(dfs, ignore_index=True)
    .set_index(['stock_code', 'date'])
    .sort_index()
)

test_df.to_csv("test_data.csv")
print(test_df.head())

In [11]:
import pandas as pd

def heuristics_v2(df: pd.DataFrame) -> pd.Series:
    """Range-normalised one-bar return.

    Divides the close-to-close return over one row by the bar's high-low
    range expressed as a fraction of its midpoint. Only the ``close``,
    ``high`` and ``low`` columns are read.

    Args:
        df: Frame with ``close``, ``high`` and ``low`` columns; rows are
            assumed to be one instrument in chronological order (the
            function shifts by row position, not by date).

    Returns:
        Series aligned with ``df.index``. The first row is NaN because it
        has no previous close.

    NOTE(review): a later cell in this notebook defines another function
    with the same name; whichever cell ran last is the one in effect.
    """
    close, high, low = df['close'], df['high'], df['low']

    # Return relative to the previous row's close.
    one_bar_return = close / close.shift(1) - 1

    # High-low range as a fraction of the bar midpoint; the epsilon keeps
    # the denominator non-zero.
    midpoint = (high + low) / 2
    relative_range = (high - low) / (midpoint + 1e-7)

    # A second epsilon guards against bars where high == low.
    return one_bar_return / (relative_range + 1e-7)

In [1]:
import pandas as pd
import numpy as np

# Columns read by heuristics_v2 and its helpers.
_REQUIRED_COLUMNS = ('open', 'high', 'low', 'close', 'volume', 'amount')


def _asymmetric_volatility_momentum(data):
    """10-row momentum scaled by the smoothed upside/downside volatility ratio."""
    close = data['close']
    prev_close = close.shift(1)

    # Multi-timeframe momentum.
    mom_3 = close.pct_change(3)
    mom_10 = close.pct_change(10)
    mom_20 = close.pct_change(20)

    # Upside volatility: dispersion of the high vs. previous close, floored at 0.
    high_returns = (data['high'] / prev_close - 1).clip(lower=0)
    upside_vol = high_returns.rolling(window=10, min_periods=5).std()

    # Downside volatility: dispersion of the low vs. previous close, capped at 0.
    low_returns = (data['low'] / prev_close - 1).clip(upper=0)
    downside_vol = low_returns.rolling(window=10, min_periods=5).std()

    vol_ratio = (upside_vol + 1e-6) / (downside_vol + 1e-6)
    vol_ratio_smooth = vol_ratio.rolling(window=5, min_periods=3).mean()

    # The signal is switched off (multiplied by 0) unless all three horizons
    # agree in sign. Comparisons against NaN are False, so the mask is also 0
    # while mom_20 is still warming up.
    all_up = (mom_3 > 0) & (mom_10 > 0) & (mom_20 > 0)
    all_down = (mom_3 < 0) & (mom_10 < 0) & (mom_20 < 0)
    momentum_aligned = all_up | all_down

    return mom_10 * vol_ratio_smooth * momentum_aligned.astype(float)


def _volume_weighted_range_breakout(data):
    """Close relative to recent high-volume highs, scaled by range, on high-volume rows only."""
    daily_range = data['high'] - data['low']
    avg_range_20 = daily_range.rolling(window=20, min_periods=10).mean()
    normalized_range = daily_range / (avg_range_20 + 1e-6)

    # A row counts as high volume when it exceeds 1.5x the 20-row average.
    vol_ma_20 = data['volume'].rolling(window=20, min_periods=10).mean()
    high_volume = data['volume'] > (vol_ma_20 * 1.5)

    # Highest high among high-volume rows in the last 10 rows. min_periods=5
    # counts non-NaN observations, so this is NaN unless at least 5 of those
    # 10 rows were high volume.
    # NOTE(review): the window includes the current row, so on a high-volume
    # row the close is compared with a max that already contains that row's
    # own high — confirm this is intended.
    high_volume_highs = data['high'].where(high_volume).rolling(window=10, min_periods=5).max()
    breakout_strength = (data['close'] - high_volume_highs) / (high_volume_highs + 1e-6)

    return breakout_strength * normalized_range * high_volume.astype(float)


def _intraday_efficiency_momentum(data):
    """Deviation of the (close-high)/(high-open) ratio from its 5-row mean, masked by sign agreement."""
    morning_strength = (data['high'] - data['open']) / (data['open'] + 1e-6)
    afternoon_efficiency = (data['close'] - data['high']) / (data['high'] + 1e-6)
    efficiency_ratio = afternoon_efficiency / (np.abs(morning_strength) + 1e-6)

    # NOTE(review): for well-formed bars (open <= high and close <= high)
    # morning_strength >= 0 and afternoon_efficiency <= 0, so neither branch
    # of this mask can be True and the component would only ever contribute
    # 0 or NaN — confirm the intended definition.
    both_positive = (morning_strength > 0) & (afternoon_efficiency > 0)
    both_negative = (morning_strength < 0) & (afternoon_efficiency < 0)
    same_direction = both_positive | both_negative

    efficiency_ma_5 = efficiency_ratio.rolling(window=5, min_periods=3).mean()
    efficiency_momentum = efficiency_ratio - efficiency_ma_5

    return efficiency_momentum * same_direction.astype(float)


def _liquidity_adjusted_gap_reversal(data):
    """Signed-square of the negated opening gap, divided by a normalised amount/volume ratio."""
    prev_close = data['close'].shift(1)
    gap_pct = (data['open'] - prev_close) / (prev_close + 1e-6)

    # NOTE(review): amount / volume is value traded per unit of volume
    # (presumably an average trade price — TODO confirm units); it is used
    # here as a "spread" proxy although it is not a bid-ask spread.
    effective_spread = data['amount'] / (data['volume'] + 1e-6)
    spread_ma = effective_spread.rolling(window=10, min_periods=5).mean()
    normalized_spread = effective_spread / (spread_ma + 1e-6)

    # -gap * |gap| keeps the sign opposite to the gap and squares its size.
    gap_reversal = -gap_pct * np.abs(gap_pct)

    return gap_reversal / (normalized_spread + 1e-6)


def _regime_adaptive_order_flow(data):
    """10-row signed volume sum, weighted by positive 3-row growth in true-range dispersion."""
    prev_close = data['close'].shift(1)
    true_range = np.maximum(data['high'] - data['low'],
                            np.maximum(np.abs(data['high'] - prev_close),
                                       np.abs(data['low'] - prev_close)))

    vol_regime = true_range.rolling(window=10, min_periods=5).std()
    regime_change = vol_regime.pct_change(3)

    # Volume counts as +volume when close > open, -volume when close < open,
    # and 0 when they are equal.
    buying_days = data['close'] > data['open']
    selling_days = data['close'] < data['open']
    signed_volume = data['volume'] * buying_days.astype(float) - data['volume'] * selling_days.astype(float)
    cumulative_flow = signed_volume.rolling(window=10, min_periods=5).sum()

    # Only rows where the dispersion measure is rising contribute.
    return cumulative_flow * regime_change * (regime_change > 0).astype(float)


def heuristics_v2(df):
    """
    Generate an alpha factor as the equal-weighted sum of five components,
    then z-score it against its own trailing 20-row window:
    - Asymmetric Volatility Momentum
    - Volume-Weighted Range Breakout
    - Intraday Efficiency Momentum
    - Liquidity-Adjusted Gap Reversal
    - Regime-Adaptive Order Flow

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``open``, ``high``, ``low``, ``close``, ``volume`` and
        ``amount`` columns. Rows are assumed to be a single instrument in
        chronological order; all shifts and rolling windows are positional.
        The frame is only read, never modified.

    Returns
    -------
    pandas.Series
        Factor values aligned with ``df.index``. A row is NaN whenever any
        component is NaN for it (warm-up rows, or rows with fewer than 5
        high-volume rows in the trailing 10) or when fewer than 10 valid
        factor values exist in the trailing 20 rows.

    Raises
    ------
    KeyError
        If any required column is missing.

    NOTE(review): the components are on different scales (the order-flow
    term is proportional to raw volume, the others are ratios), so equal
    0.2 weights do not imply equal influence on the sum.
    """
    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"heuristics_v2: input is missing required columns {missing}")

    asym_momentum = _asymmetric_volatility_momentum(df)
    range_breakout = _volume_weighted_range_breakout(df)
    intraday_momentum = _intraday_efficiency_momentum(df)
    liquidity_reversal = _liquidity_adjusted_gap_reversal(df)
    regime_signal = _regime_adaptive_order_flow(df)

    # Final factor combination with equal weights.
    factor = (0.2 * asym_momentum +
              0.2 * range_breakout +
              0.2 * intraday_momentum +
              0.2 * liquidity_reversal +
              0.2 * regime_signal)

    # Rolling z-score of the combined factor.
    trailing = factor.rolling(window=20, min_periods=10)
    factor_normalized = (factor - trailing.mean()) / (trailing.std() + 1e-6)

    return factor_normalized

In [5]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import pearsonr


# Load the merged training panel written by the data-preparation cell.
# stock_code is read as text so purely numeric codes keep their exact form.
train_df = (
    pd.read_csv("train_data.csv", parse_dates=['date'], dtype={'stock_code': str})
    .set_index(['stock_code', 'date'])
    .sort_index()
)

# Compute the factor separately for each stock. Each call returns a Series
# carrying the group's own (stock_code, date) index, so concatenating the
# pieces lines up with train_df directly. `heuristics_v2` must already be
# defined by an earlier cell.
train_df['factor'] = pd.concat(
    [heuristics_v2(group) for _, group in train_df.groupby(level='stock_code')]
)

# Forward return over FORWARD_HORIZON rows, computed within each stock.
# NOTE(review): the column name (and the original comment) say "6d", but the
# shift has always been 20 rows. The name is kept so anything reading this
# column keeps working — confirm which horizon is intended.
FORWARD_HORIZON = 20
train_df['future_return_6d'] = (
    train_df.groupby('stock_code')['close'].shift(-FORWARD_HORIZON) / train_df['close'] - 1
)

# Evaluation window.
start_date = pd.Timestamp('2016-01-01')
end_date = pd.Timestamp('2020-01-01')
row_dates = train_df.index.get_level_values('date')
all_dates = row_dates.unique()
all_dates = all_dates[(all_dates >= start_date) & (all_dates <= end_date)]

# One pass over the dates in the window instead of a cross-section lookup
# per date.
window_df = train_df[(row_dates >= start_date) & (row_dates <= end_date)]
ic_values = []

for date, daily in window_df.groupby(level='date'):
    factors = daily['factor']
    returns = daily['future_return_6d']
    # np.isfinite is False for NaN as well as +/-inf.
    mask = np.isfinite(factors) & np.isfinite(returns)
    # Require at least 10 stocks for the cross-sectional correlation.
    if mask.sum() >= 10:
        ic, _ = pearsonr(factors[mask], returns[mask])
        # pearsonr yields NaN when one side is constant; skip those dates.
        if not np.isnan(ic):
            ic_values.append(ic)

# Without any usable date the mean is undefined; report NaN explicitly.
mean_ic = np.mean(ic_values) if ic_values else np.nan
print(f"Mean IC: {mean_ic:.10f}")

Mean IC: -0.0462177474


In [4]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import pearsonr


# Load the merged test panel written by the data-preparation cell.
# stock_code is read as text so purely numeric codes keep their exact form.
test_df = (
    pd.read_csv("test_data.csv", parse_dates=['date'], dtype={'stock_code': str})
    .set_index(['stock_code', 'date'])
    .sort_index()
)

# Compute the factor separately for each stock. Each call returns a Series
# carrying the group's own (stock_code, date) index, so concatenating the
# pieces lines up with test_df directly. `heuristics_v2` must already be
# defined by an earlier cell.
test_df['factor'] = pd.concat(
    [heuristics_v2(group) for _, group in test_df.groupby(level='stock_code')]
)

# Forward return over FORWARD_HORIZON rows, computed within each stock.
# NOTE(review): the column name (and the original comment) say "6d", but the
# shift has always been 20 rows. The name is kept so anything reading this
# column keeps working — confirm which horizon is intended.
FORWARD_HORIZON = 20
test_df['future_return_6d'] = (
    test_df.groupby('stock_code')['close'].shift(-FORWARD_HORIZON) / test_df['close'] - 1
)

# Evaluation window.
start_date = pd.Timestamp('2021-01-01')
end_date = pd.Timestamp('2024-01-01')
row_dates = test_df.index.get_level_values('date')
all_dates = row_dates.unique()
all_dates = all_dates[(all_dates >= start_date) & (all_dates <= end_date)]

# One pass over the dates in the window instead of a cross-section lookup
# per date.
window_df = test_df[(row_dates >= start_date) & (row_dates <= end_date)]
ic_values = []

for date, daily in window_df.groupby(level='date'):
    factors = daily['factor']
    returns = daily['future_return_6d']
    # np.isfinite is False for NaN as well as +/-inf.
    mask = np.isfinite(factors) & np.isfinite(returns)
    # Require at least 10 stocks for the cross-sectional correlation.
    if mask.sum() >= 10:
        ic, _ = pearsonr(factors[mask], returns[mask])
        # pearsonr yields NaN when one side is constant; skip those dates.
        if not np.isnan(ic):
            ic_values.append(ic)

# Without any usable date the mean is undefined; report NaN explicitly.
mean_ic = np.mean(ic_values) if ic_values else np.nan
print(f"Mean IC: {mean_ic:.10f}")

Mean IC: -0.2266687174
