In [None]:
import os
from os import path

basepath = path.join(os.getcwd(), "export_cn_ex")

In [None]:
import os
import pandas as pd
import numpy as np

with open("csi300.txt", "r") as f:
    lines = [line.strip() for line in f if line.strip()]

df = pd.DataFrame(
    [line.split('\t') for line in lines],
    columns=['symbol', 'start_date', 'end_date']
)

df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])

mask = (df['start_date'] >= '2015-12-30') & (df['end_date'] <= '2020-01-01')
stock_codes = np.unique(df[mask]['symbol'].tolist())

print(len(stock_codes))

dfs = []
for stock_code in stock_codes:
    fname = f"{stock_code[:8]}.csv"
    fpath = os.path.join(basepath, fname)
    if os.path.exists(fpath):
        df = pd.read_csv(fpath, parse_dates=['date'])[['date','open','high','low','close','volume','amount']]
        df = df[(df['date'] >= '2015-06-30') & (df['date'] <= '2020-01-31')]
        df['stock_code'] = stock_code[:8]
        dfs.append(df)

train_df = pd.concat(dfs, ignore_index=True)
train_df.set_index(['stock_code', 'date'], inplace=True)
train_df.sort_index(inplace=True)

train_df.to_csv("train_data.csv")
print(train_df.head())

In [None]:
import os
import pandas as pd
import numpy as np

with open("csi300.txt", "r") as f:
    lines = [line.strip() for line in f if line.strip()]

df = pd.DataFrame(
    [line.split('\t') for line in lines],
    columns=['symbol', 'start_date', 'end_date']
)

df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])

mask = (df['start_date'] >= '2020-12-30') & (df['end_date'] <= '2024-01-01')
stock_codes = np.unique(df[mask]['symbol'].tolist())

print(len(stock_codes))

dfs = []
for stock_code in stock_codes:
    fname = f"{stock_code[:8]}.csv"
    fpath = os.path.join(basepath, fname)
    if os.path.exists(fpath):
        df = pd.read_csv(fpath, parse_dates=['date'])[['date','open','high','low','close','volume','amount']]
        df = df[(df['date'] >= '2020-06-30') & (df['date'] <= '2024-01-31')]
        df['stock_code'] = stock_code[:8]
        dfs.append(df)

test_df = pd.concat(dfs, ignore_index=True)
test_df.set_index(['stock_code', 'date'], inplace=True)
test_df.sort_index(inplace=True)

test_df.to_csv("test_data.csv")
print(test_df.head())

In [4]:
import pandas as pd

def heuristics_v2(df: pd.DataFrame) -> pd.Series:
    # Momentum-based volume-weighted price reversal factor
    # Combines price momentum with volume confirmation and mean reversion tendencies
    price_change = (df['close'] - df['open']) / (df['open'] + 1e-7)
    volume_weight = df['volume'] / (df['volume'].rolling(window=5).mean() + 1e-7)
    high_low_range = (df['high'] - df['low']) / (df['close'].shift(1) + 1e-7)
    
    # Factor captures momentum acceleration with volume confirmation
    # Positive when strong price moves are supported by above-average volume
    # Negative when price moves lack volume support or show reversal patterns
    factor = price_change * volume_weight * (1 - high_low_range)
    
    # Apply smoothing to reduce noise while preserving signal
    smoothed_factor = factor.rolling(window=3, center=True).mean()
    
    return smoothed_factor

In [4]:
import pandas as pd
import numpy as np

def heuristics_v2(df):
    # Copy dataframe to avoid modifying original
    data = df.copy()
    
    # Calculate basic components
    data['prev_close'] = data['close'].shift(1)
    data['prev_high'] = data['high'].shift(1)
    data['prev_low'] = data['low'].shift(1)
    data['prev_volume'] = data['volume'].shift(1)
    
    # Multi-Frequency Fractal Reversal
    # High-frequency reversal
    high_freq_reversal = ((data['close'] - data['open']) / (data['high'] - data['low'] + 1e-8) * 
                         (data['volume'] / (data['prev_volume'] + 1e-8)) * 
                         (np.abs(data['open'] - data['prev_close']) / (data['prev_close'] + 1e-8)) * 
                         np.sign(data['close'] - data['open']) * -1)
    
    # Medium-frequency reversal
    data['close_diff_5'] = data['close'].diff()
    data['abs_close_diff_5'] = np.abs(data['close_diff_5'])
    data['rolling_sum_abs_5'] = data['abs_close_diff_5'].rolling(window=5, min_periods=1).sum()
    medium_freq_reversal = (((data['close'] - data['close'].shift(5)) / (data['rolling_sum_abs_5'] + 1e-8)) * 
                           ((data['high'] - data['low']) / (np.abs(data['open'] - data['prev_close']) + 1e-8)) * 
                           np.sign(data['close'] - data['open']) * -1)
    
    # Low-frequency reversal
    data['close_diff_20'] = data['close'].diff()
    data['abs_close_diff_20'] = np.abs(data['close_diff_20'])
    data['rolling_sum_abs_20'] = data['abs_close_diff_20'].rolling(window=20, min_periods=1).sum()
    low_freq_reversal = (((data['close'] - data['close'].shift(20)) / (data['rolling_sum_abs_20'] + 1e-8)) * 
                        (np.abs(data['open'] - data['prev_close']) / (np.abs(data['close'] - data['close'].shift(20)) + 1e-8)) * 
                        (data['volume'] / (data['amount'] + 1e-8)) * 
                        np.sign(data['close'] - data['open']))
    
    # Bid-Ask Fractal Reversal
    # Bid reversal
    bid_reversal = (((data['close'] - data['low']) / (data['high'] - data['low'] + 1e-8)) * 
                   (np.abs(data['open'] - data['prev_close']) / (data['prev_close'] + 1e-8)) * 
                   (data['volume'] / (data['amount'] + 1e-8)) * 
                   np.sign(data['close'] - data['open']) * -1)
    
    # Ask reversal
    ask_reversal = (((data['high'] - data['close']) / (data['high'] - data['low'] + 1e-8)) * 
                   (np.abs(data['open'] - data['prev_close']) / (data['prev_close'] + 1e-8)) * 
                   (data['volume'] / (data['amount'] + 1e-8)) * 
                   np.sign(data['close'] - data['open']) * -1)
    
    # Spread reversal
    data['prev_range'] = data['high'].shift(1) - data['low'].shift(1)
    spread_reversal = ((np.abs((data['high'] - data['low']) - data['prev_range']) / (data['prev_range'] + 1e-8)) * 
                      (np.abs(data['open'] - data['prev_close']) / (data['high'] - data['low'] + 1e-8)) * 
                      (data['volume'] / (data['prev_volume'] + 1e-8)) * 
                      np.sign(data['close'] - data['open']) * -1)
    
    # Volume-Fractal Integration
    # Core reversal
    core_reversal = ((np.abs(data['open'] - data['prev_close']) / (data['prev_close'] + 1e-8)) * 
                    np.sign(data['close'] - data['open']) * 
                    (data['volume'] / (data['amount'] + 1e-8)) * 
                    np.sign(data['close'] - data['open']) * -1)
    
    # Volume momentum
    volume_momentum = (((data['close'] - data['prev_close']) * 
                       (data['volume'] / (data['amount'] + 1e-8)) * 
                       (np.abs(data['open'] - data['prev_close']) / (data['prev_close'] + 1e-8)) * 
                       np.sign(data['close'] - data['open']) * -1))
    
    # Microstructure breakout
    microstructure_breakout = (core_reversal * 
                              ((data['close'] - data['open']) / (data['high'] - data['low'] + 1e-8)) * 
                              (data['volume'] / (data['amount'] + 1e-8)) * 
                              np.sign(data['close'] - data['open']) * -1)
    
    # Price-Volume Divergence
    # Momentum acceleration
    data['close_diff_1'] = data['close'].diff()
    data['close_diff_2'] = data['close'].diff(2)
    data['close_diff_3'] = data['close'].diff(3)
    momentum_acceleration = (((data['close_diff_1'] / (data['close_diff_2'].shift(1) + 1e-8)) - 
                            (data['close_diff_2'].shift(1) / (data['close_diff_3'].shift(2) + 1e-8))) * 
                           np.sign(data['close'] - data['open']))
    
    # Volume divergence
    volume_divergence = (((data['close_diff_1'] / (data['prev_close'] + 1e-8)) - 
                         ((data['volume'] - data['prev_volume']) / (data['prev_volume'] + 1e-8))) * 
                        (np.abs(data['open'] - data['prev_close']) / (data['high'] - data['low'] + 1e-8)))
    
    # Reversal Regime Classification
    # Frequency regimes
    freq_regime = ((high_freq_reversal > 0.6) & 
                   (medium_freq_reversal > 0.5) & 
                   (low_freq_reversal < -0.3))
    
    # Bid-ask regimes
    bid_ask_regime = ((bid_reversal > 0.5) & 
                      (ask_reversal > 0.5) & 
                      (spread_reversal < 0.2))
    
    # Volume regimes
    volume_regime = ((volume_momentum > 0) & (core_reversal > 0.1))
    
    # Hierarchical Reversal Factor
    strong_reversal = freq_regime & bid_ask_regime & volume_regime
    moderate_reversal = ((freq_regime | bid_ask_regime) & volume_regime)
    weak_reversal = volume_regime | (core_reversal > 0)
    weak_continuation = (volume_momentum < 0) | (core_reversal < 0)
    moderate_continuation = (bid_reversal < 0.5) & (ask_reversal < 0.5)
    strong_continuation = freq_regime & bid_ask_regime & (volume_momentum < 0)
    
    # Assign hierarchical scores
    factor = pd.Series(index=data.index, dtype=float)
    factor[strong_reversal] = 1.0
    factor[moderate_reversal] = 0.7
    factor[weak_reversal] = 0.3
    factor[weak_continuation] = -0.3
    factor[moderate_continuation] = -0.7
    factor[strong_continuation] = -1.0
    
    # For neutral cases, use weighted average of components
    neutral_mask = factor.isna()
    if neutral_mask.any():
        weighted_components = (
            high_freq_reversal * 0.15 +
            medium_freq_reversal * 0.15 +
            low_freq_reversal * 0.15 +
            bid_reversal * 0.1 +
            ask_reversal * 0.1 +
            spread_reversal * 0.05 +
            core_reversal * 0.1 +
            volume_momentum * 0.1 +
            microstructure_breakout * 0.05 +
            momentum_acceleration * 0.025 +
            volume_divergence * 0.025
        )
        factor[neutral_mask] = weighted_components[neutral_mask]
    
    return factor.fillna(0)

In [12]:
object_n = 5

In [13]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import pearsonr
import warnings

warnings.filterwarnings('ignore')


# 读取合并后的数据
train_df = pd.read_csv("train_data.csv", parse_dates=['date'])
train_df.set_index(['stock_code', 'date'], inplace=True)
train_df.sort_index(inplace=True)

# 计算每只股票的因子值
train_df['factor'] = train_df.groupby('stock_code').apply(lambda g: heuristics_v2(g)).reset_index(level=0, drop=True)

# 计算未来6日收益率
train_df['future_return'] = train_df.groupby('stock_code')['close'].shift(-object_n) / train_df['close'] - 1

# 取所有日期
start_date = pd.Timestamp('2016-01-01')
end_date = pd.Timestamp('2020-01-01')
all_dates = train_df.index.get_level_values('date').unique()
all_dates = all_dates[(all_dates >= start_date) & (all_dates <= end_date)]
ic_values = []

for date in all_dates:
    daily = train_df.xs(date, level='date')
    factors = daily['factor']
    returns = daily['future_return']
    mask = factors.notna() & returns.notna() & np.isfinite(factors) & np.isfinite(returns)
    print(f"Date: {date.date()}, Valid Samples: {mask.sum()}")
    if mask.sum() >= 10:
        ic, _ = pearsonr(factors[mask], returns[mask])
        if not np.isnan(ic):
            ic_values.append(ic)

mean_ic = np.mean(ic_values)
print(f"Mean IC: {mean_ic:.10f}")

Date: 2016-01-04, Valid Samples: 414
Date: 2016-01-05, Valid Samples: 414
Date: 2016-01-06, Valid Samples: 414
Date: 2016-01-07, Valid Samples: 414
Date: 2016-01-08, Valid Samples: 414
Date: 2016-01-11, Valid Samples: 414
Date: 2016-01-12, Valid Samples: 414
Date: 2016-01-13, Valid Samples: 414
Date: 2016-01-14, Valid Samples: 414
Date: 2016-01-15, Valid Samples: 414
Date: 2016-01-18, Valid Samples: 414
Date: 2016-01-19, Valid Samples: 414
Date: 2016-01-20, Valid Samples: 414
Date: 2016-01-21, Valid Samples: 414
Date: 2016-01-22, Valid Samples: 414
Date: 2016-01-25, Valid Samples: 414
Date: 2016-01-26, Valid Samples: 414
Date: 2016-01-27, Valid Samples: 414
Date: 2016-01-28, Valid Samples: 414
Date: 2016-01-29, Valid Samples: 414
Date: 2016-02-01, Valid Samples: 414
Date: 2016-02-02, Valid Samples: 414
Date: 2016-02-03, Valid Samples: 414
Date: 2016-02-04, Valid Samples: 414
Date: 2016-02-05, Valid Samples: 414
Date: 2016-02-15, Valid Samples: 414
Date: 2016-02-16, Valid Samples: 414
D

In [14]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import pearsonr
import warnings

warnings.filterwarnings('ignore')


# 读取合并后的数据
test_df = pd.read_csv("test_data.csv", parse_dates=['date'])
test_df.set_index(['stock_code', 'date'], inplace=True)
test_df.sort_index(inplace=True)

# 计算每只股票的因子值
test_df['factor'] = test_df.groupby('stock_code').apply(lambda g: heuristics_v2(g)).reset_index(level=0, drop=True)

# 计算未来6日收益率
test_df['future_return'] = test_df.groupby('stock_code')['close'].shift(-object_n) / test_df['close'] - 1

# 取所有日期
start_date = pd.Timestamp('2021-01-01')
end_date = pd.Timestamp('2024-01-01')
all_dates = test_df.index.get_level_values('date').unique()
all_dates = all_dates[(all_dates >= start_date) & (all_dates <= end_date)]
ic_values = []

for date in all_dates:
    daily = test_df.xs(date, level='date')
    factors = daily['factor']
    returns = daily['future_return']
    mask = factors.notna() & returns.notna() & np.isfinite(factors) & np.isfinite(returns)
    print(f"Date: {date.date()}, Valid Samples: {mask.sum()}")
    if mask.sum() >= 10:
        ic, _ = pearsonr(factors[mask], returns[mask])
        if not np.isnan(ic):
            ic_values.append(ic)

mean_ic = np.mean(ic_values)
print(f"Mean IC: {mean_ic:.10f}")

Date: 2021-01-04, Valid Samples: 387
Date: 2021-01-05, Valid Samples: 387
Date: 2021-01-06, Valid Samples: 387
Date: 2021-01-07, Valid Samples: 387
Date: 2021-01-08, Valid Samples: 387
Date: 2021-01-11, Valid Samples: 387
Date: 2021-01-12, Valid Samples: 387
Date: 2021-01-13, Valid Samples: 387
Date: 2021-01-14, Valid Samples: 387
Date: 2021-01-15, Valid Samples: 387
Date: 2021-01-18, Valid Samples: 387
Date: 2021-01-19, Valid Samples: 387
Date: 2021-01-20, Valid Samples: 387
Date: 2021-01-21, Valid Samples: 387
Date: 2021-01-22, Valid Samples: 387
Date: 2021-01-25, Valid Samples: 387
Date: 2021-01-26, Valid Samples: 387
Date: 2021-01-27, Valid Samples: 387
Date: 2021-01-28, Valid Samples: 387
Date: 2021-01-29, Valid Samples: 387
Date: 2021-02-01, Valid Samples: 387
Date: 2021-02-02, Valid Samples: 387
Date: 2021-02-03, Valid Samples: 387
Date: 2021-02-04, Valid Samples: 387
Date: 2021-02-05, Valid Samples: 387
Date: 2021-02-08, Valid Samples: 387
Date: 2021-02-09, Valid Samples: 387
D