In [7]:
import os
from os import path
import pandas as pd
import numpy as np
import sys
# Directory holding the per-stock CSV exports, resolved against the current working directory.
basepath = os.path.join(os.getcwd(), "export_cn_ex")

In [None]:
import os
import pandas as pd
import numpy as np

# Build the training panel: pick index constituents from csi500.txt, load each
# constituent's price CSV from `basepath`, and write one combined file indexed
# by (stock_code, date).

# Each non-blank line of csi500.txt is "symbol<TAB>start_date<TAB>end_date".
with open("csi500.txt", "r") as f:
    records = [line.strip().split('\t') for line in f if line.strip()]

constituents = pd.DataFrame(records, columns=['symbol', 'start_date', 'end_date'])
constituents['start_date'] = pd.to_datetime(constituents['start_date'])
constituents['end_date'] = pd.to_datetime(constituents['end_date'])

# NOTE(review): this keeps only rows whose whole [start_date, end_date] interval
# lies inside the window. A symbol that joined before 2015-12-30, or whose
# interval ends after 2020-01-01, is dropped. If the intent is "was a member at
# some point during the window", an overlap test is needed instead - confirm.
mask = (constituents['start_date'] >= '2015-12-30') & (constituents['end_date'] <= '2020-01-01')

# Files are named after the first 8 characters of the symbol. Deduplicate AFTER
# truncating so two symbols sharing that prefix cannot load the same file twice
# (which would create duplicate (stock_code, date) rows).
stock_codes = np.unique(constituents.loc[mask, 'symbol'].str[:8].tolist())

print(len(stock_codes))

price_columns = ['date', 'open', 'high', 'low', 'close', 'volume', 'amount']
dfs = []
missing_codes = []
for stock_code in stock_codes:
    fpath = os.path.join(basepath, f"{stock_code}.csv")
    if not os.path.exists(fpath):
        missing_codes.append(stock_code)
        continue
    prices = pd.read_csv(fpath, parse_dates=['date'])[price_columns]
    # The date range is wider than the evaluation window on both sides
    # (2015-06-30 .. 2020-01-31).
    in_range = (prices['date'] >= '2015-06-30') & (prices['date'] <= '2020-01-31')
    # .assign returns a new frame, so the filtered slice is never mutated
    # (avoids SettingWithCopyWarning).
    dfs.append(prices.loc[in_range].assign(stock_code=stock_code))

# Surface skipped symbols instead of dropping them silently.
if missing_codes:
    print(f"Skipped {len(missing_codes)} of {len(stock_codes)} symbols with no CSV in {basepath}")

# pd.concat([]) fails with an opaque "No objects to concatenate"; say what is wrong.
if not dfs:
    raise FileNotFoundError(
        f"No price CSV found in {basepath} for any of the {len(stock_codes)} selected symbols"
    )

train_df = (
    pd.concat(dfs, ignore_index=True)
    .set_index(['stock_code', 'date'])
    .sort_index()
)

train_df.to_csv("train_data.csv")
print(train_df.head())

In [None]:
import os
import pandas as pd
import numpy as np

# Build the test panel: pick index constituents from csi500.txt, load each
# constituent's price CSV from `basepath`, and write one combined file indexed
# by (stock_code, date).

# Each non-blank line of csi500.txt is "symbol<TAB>start_date<TAB>end_date".
with open("csi500.txt", "r") as f:
    records = [line.strip().split('\t') for line in f if line.strip()]

constituents = pd.DataFrame(records, columns=['symbol', 'start_date', 'end_date'])
constituents['start_date'] = pd.to_datetime(constituents['start_date'])
constituents['end_date'] = pd.to_datetime(constituents['end_date'])

# NOTE(review): this keeps only rows whose whole [start_date, end_date] interval
# lies inside the window. A symbol that joined before 2020-12-30, or whose
# interval ends after 2024-01-01, is dropped. If the intent is "was a member at
# some point during the window", an overlap test is needed instead - confirm.
mask = (constituents['start_date'] >= '2020-12-30') & (constituents['end_date'] <= '2024-01-01')

# Files are named after the first 8 characters of the symbol. Deduplicate AFTER
# truncating so two symbols sharing that prefix cannot load the same file twice
# (which would create duplicate (stock_code, date) rows).
stock_codes = np.unique(constituents.loc[mask, 'symbol'].str[:8].tolist())

print(len(stock_codes))

price_columns = ['date', 'open', 'high', 'low', 'close', 'volume', 'amount']
dfs = []
missing_codes = []
for stock_code in stock_codes:
    fpath = os.path.join(basepath, f"{stock_code}.csv")
    if not os.path.exists(fpath):
        missing_codes.append(stock_code)
        continue
    prices = pd.read_csv(fpath, parse_dates=['date'])[price_columns]
    # The date range is wider than the evaluation window on both sides
    # (2020-06-30 .. 2024-01-31).
    in_range = (prices['date'] >= '2020-06-30') & (prices['date'] <= '2024-01-31')
    # .assign returns a new frame, so the filtered slice is never mutated
    # (avoids SettingWithCopyWarning).
    dfs.append(prices.loc[in_range].assign(stock_code=stock_code))

# Surface skipped symbols instead of dropping them silently.
if missing_codes:
    print(f"Skipped {len(missing_codes)} of {len(stock_codes)} symbols with no CSV in {basepath}")

# pd.concat([]) fails with an opaque "No objects to concatenate"; say what is wrong.
if not dfs:
    raise FileNotFoundError(
        f"No price CSV found in {basepath} for any of the {len(stock_codes)} selected symbols"
    )

test_df = (
    pd.concat(dfs, ignore_index=True)
    .set_index(['stock_code', 'date'])
    .sort_index()
)

test_df.to_csv("test_data.csv")
print(test_df.head())

In [11]:
import pandas as pd

def heuristics_v2(df: pd.DataFrame) -> pd.Series:
    """Return the one-row close-to-close return scaled by the relative high-low range.

    Only the ``close``, ``high`` and ``low`` columns are read. The return is
    taken against the previous row of ``df`` (a positional ``shift(1)``), so
    the first row of the result is NaN and the caller is responsible for
    passing rows in the order the shift should follow.

    Args:
        df: Frame with numeric ``close``, ``high`` and ``low`` columns.

    Returns:
        A Series on ``df``'s index holding
        ``return / ((high - low) / midpoint + 1e-7)``.
    """
    close, high, low = df['close'], df['high'], df['low']

    # Simple return relative to the previous row.
    one_step_return = close / close.shift(1) - 1

    # High-low range as a fraction of the high/low midpoint; the epsilon keeps
    # the denominator non-zero when the midpoint is 0.
    midpoint = (high + low) / 2 + 1e-7
    relative_range = (high - low) / midpoint

    # Second epsilon guards the division when high == low (zero range).
    return one_step_return / (relative_range + 1e-7)

In [12]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import pearsonr


# Evaluate heuristics_v2 on the training panel: mean of the per-date
# cross-sectional Pearson correlation between the factor and the forward
# 6-row return.

# Load the merged panel and index it by (stock_code, date).
train_df = (
    pd.read_csv("train_data.csv", parse_dates=['date'])
    .set_index(['stock_code', 'date'])
    .sort_index()
)

# Factor values, computed one stock at a time. group_keys=False keeps the
# original (stock_code, date) index on the result, so it aligns with train_df
# directly and does not depend on whether pandas prepends the group key.
train_df['factor'] = train_df.groupby('stock_code', group_keys=False).apply(heuristics_v2)

# Forward return: the close 6 rows ahead within the same stock, relative to the
# current close.
train_df['future_return_6d'] = train_df.groupby('stock_code')['close'].shift(-6) / train_df['close'] - 1

# Evaluation window (inclusive on both ends).
start_date = pd.Timestamp('2016-01-01')
end_date = pd.Timestamp('2020-01-01')
all_dates = train_df.index.get_level_values('date').unique()
all_dates = all_dates[(all_dates >= start_date) & (all_dates <= end_date)]

# Keep only rows inside the window whose factor and return are both finite
# (np.isfinite is False for NaN as well as +/-inf).
row_dates = train_df.index.get_level_values('date')
in_window = (row_dates >= start_date) & (row_dates <= end_date)
eval_df = train_df.loc[in_window, ['factor', 'future_return_6d']]
eval_df = eval_df[np.isfinite(eval_df).all(axis=1)]

# One pass over the dates instead of re-scanning the whole frame per date.
ic_values = []
for date, daily in eval_df.groupby(level='date'):
    # Require at least 10 usable stocks for a cross-section to count.
    if len(daily) >= 10:
        ic, _ = pearsonr(daily['factor'], daily['future_return_6d'])
        # pearsonr yields NaN when either input is constant; skip those dates.
        if not np.isnan(ic):
            ic_values.append(ic)

# np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead.
mean_ic = np.mean(ic_values) if ic_values else float('nan')
print(f"Mean IC: {mean_ic:.10f}")

Mean IC: 0.1174962949


In [13]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import pearsonr


# Evaluate heuristics_v2 on the test panel: mean of the per-date
# cross-sectional Pearson correlation between the factor and the forward
# 6-row return.

# Load the merged panel and index it by (stock_code, date).
test_df = (
    pd.read_csv("test_data.csv", parse_dates=['date'])
    .set_index(['stock_code', 'date'])
    .sort_index()
)

# Factor values, computed one stock at a time. group_keys=False keeps the
# original (stock_code, date) index on the result, so it aligns with test_df
# directly and does not depend on whether pandas prepends the group key.
test_df['factor'] = test_df.groupby('stock_code', group_keys=False).apply(heuristics_v2)

# Forward return: the close 6 rows ahead within the same stock, relative to the
# current close.
test_df['future_return_6d'] = test_df.groupby('stock_code')['close'].shift(-6) / test_df['close'] - 1

# Evaluation window (inclusive on both ends).
start_date = pd.Timestamp('2021-01-01')
end_date = pd.Timestamp('2024-01-01')
all_dates = test_df.index.get_level_values('date').unique()
all_dates = all_dates[(all_dates >= start_date) & (all_dates <= end_date)]

# Keep only rows inside the window whose factor and return are both finite
# (np.isfinite is False for NaN as well as +/-inf).
row_dates = test_df.index.get_level_values('date')
in_window = (row_dates >= start_date) & (row_dates <= end_date)
eval_df = test_df.loc[in_window, ['factor', 'future_return_6d']]
eval_df = eval_df[np.isfinite(eval_df).all(axis=1)]

# One pass over the dates instead of re-scanning the whole frame per date.
ic_values = []
for date, daily in eval_df.groupby(level='date'):
    # Require at least 10 usable stocks for a cross-section to count.
    if len(daily) >= 10:
        ic, _ = pearsonr(daily['factor'], daily['future_return_6d'])
        # pearsonr yields NaN when either input is constant; skip those dates.
        if not np.isnan(ic):
            ic_values.append(ic)

# np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead.
mean_ic = np.mean(ic_values) if ic_values else float('nan')
print(f"Mean IC: {mean_ic:.10f}")

Mean IC: 0.0215574314
