In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Apply the seaborn style first: its style parameters include
# 'font.sans-serif', so calling it after the font assignment below would
# overwrite the CJK font again.
sns.set_style('whitegrid')

# Use a CJK-capable font for the Chinese titles/labels in this notebook and
# render the minus sign with the ASCII hyphen instead of the Unicode minus.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False


In [None]:
# 从数据库加载数据
from alphahome.common.db_manager import DBManager
from datetime import datetime, timedelta
import os

# Initialise the database connection.
# `__file__` is not defined inside a notebook kernel, so fall back to the
# current working directory when it is missing. config.yml is looked up two
# directory levels above that folder, as before.
try:
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    notebook_dir = os.getcwd()
config_path = os.path.join(os.path.dirname(os.path.dirname(notebook_dir)), 'config.yml')
db = DBManager(config_path)

# Query window: the last 365 days up to now.
end_date = datetime.now()
start_date = end_date - timedelta(days=365)

# Load daily stock data.
sql = """
SELECT 
    trade_date as date,
    ts_code,
    close,
    open,
    high,
    low,
    vol as volume,
    amount,
    turnover_rate,
    pe,
    pb
FROM 
    stock_daily_basic
WHERE 
    trade_date BETWEEN %s AND %s
"""

df = db.execute_query(sql, (start_date, end_date))

# Basic preparation: parse dates and order rows by stock, then by date. The
# per-stock time-series calculations later in the notebook rely on this order.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['ts_code', 'date'])

print(f"数据加载完成。\n数据时间范围：{df['date'].min()} 至 {df['date'].max()}")
print(f"股票数量：{df['ts_code'].nunique()}")
print("\n数据预览：")
df.head()


In [None]:
def calculate_factors(df):
    """Compute momentum, reversal, volatility, turnover and valuation factors.

    All time-series statistics are computed per ``ts_code`` in the row order
    of ``df``; rows must therefore already be sorted by date within each
    stock.

    Args:
        df (pd.DataFrame): Must contain the columns ``ts_code``, ``close``,
            ``turnover_rate``, ``pe`` and ``pb``.

    Returns:
        pd.DataFrame: A copy of ``df`` (same index and row order) with the
        factor columns, ``daily_return`` and ``forward_return_5d`` appended.
    """
    result = df.copy()

    def by_stock(column):
        # Build the group-by on demand so columns added to ``result`` later
        # (e.g. ``daily_return``) are always picked up.
        return result.groupby('ts_code')[column]

    def rolling_stat(column, window, stat):
        # groupby(...).rolling(...) yields a (ts_code, row) MultiIndex that
        # cannot be assigned back to ``result``. transform() keeps the
        # original index, so values line up row by row.
        return by_stock(column).transform(
            lambda series: getattr(series.rolling(window), stat)()
        )

    # 1. Momentum: trailing 20- and 60-row price change.
    result['momentum_20d'] = by_stock('close').pct_change(20)
    result['momentum_60d'] = by_stock('close').pct_change(60)

    # 2. Reversal: negated trailing 5- and 10-row price change.
    result['reversal_5d'] = -by_stock('close').pct_change(5)
    result['reversal_10d'] = -by_stock('close').pct_change(10)

    # 3. Volatility family, all based on the 1-row return.
    result['daily_return'] = by_stock('close').pct_change(1)
    result['volatility_20d'] = rolling_stat('daily_return', 20, 'std')
    result['skewness_20d'] = rolling_stat('daily_return', 20, 'skew')
    result['kurtosis_20d'] = rolling_stat('daily_return', 20, 'kurt')

    # 4. Turnover: 20-row mean and standard deviation of the turnover rate.
    result['turnover_20d_mean'] = rolling_stat('turnover_rate', 20, 'mean')
    result['turnover_20d_std'] = rolling_stat('turnover_rate', 20, 'std')

    # 5. Valuation: earnings-to-price (1/PE) and book-to-price (1/PB).
    # A zero denominator becomes NaN rather than +/-inf, which would otherwise
    # distort quantiles and standard deviations downstream.
    result['ep_ratio'] = 1 / result['pe'].where(result['pe'] != 0)
    result['bp_ratio'] = 1 / result['pb'].where(result['pb'] != 0)

    # Forward 5-row return used as the prediction target:
    # close[t+5] / close[t] - 1. (pct_change(-5) would instead give
    # close[t] / close[t+5] - 1, which is not the forward return.)
    result['forward_return_5d'] = by_stock('close').shift(-5) / result['close'] - 1

    return result

# Build the factor table from the raw daily data.
df_with_factors = calculate_factors(df)

# A column is treated as a factor when its name contains one of these
# keywords. Note that 'turnover' also matches the raw `turnover_rate` column,
# while `skewness_20d` / `kurtosis_20d` match none of the keywords.
factor_keywords = ['momentum', 'reversal', 'volatility', 'turnover', 'ep_ratio', 'bp_ratio']
factor_columns = [
    col
    for col in df_with_factors.columns
    if any(keyword in col for keyword in factor_keywords)
]

print("计算得到的因子列表：")
for col in factor_columns:
    print(f"- {col}")

# Preview the identifier columns together with the selected factors.
print("\n因子数据预览：")
preview_columns = ['ts_code', 'date'] + factor_columns
df_with_factors[preview_columns].head()


In [None]:
def preprocess_factors(df, factor_cols, quantile_range=(0.01, 0.99)):
    """Winsorise, standardise and fill the given factor columns.

    Steps:
      1. Per ``date`` cross-section, clip each factor to the
         ``quantile_range`` quantiles of that day.
      2. Per ``date`` cross-section, convert the clipped values to z-scores.
         When the day's standard deviation is zero or undefined, the values
         are only demeaned so they stay on the same scale as other days.
      3. Forward-fill remaining gaps within each ``ts_code`` (in row order, so
         rows should be sorted by date within each stock), then replace
         whatever is still missing with 0.

    Args:
        df (pd.DataFrame): Must contain ``date``, ``ts_code`` and every
            column named in ``factor_cols``.
        factor_cols (list): Names of the factor columns to process.
        quantile_range (tuple): ``(lower, upper)`` quantiles used for
            clipping. Defaults to ``(0.01, 0.99)``.

    Returns:
        pd.DataFrame: A processed copy of ``df``; the input is not modified.
    """
    result = df.copy()
    factor_cols = list(factor_cols)
    if not factor_cols:
        # Nothing to process; avoid grouping on an empty column selection.
        return result

    lower_q, upper_q = quantile_range

    def cross_section_zscore(values):
        # NaN bounds (all-NaN day) leave the values unclipped.
        clipped = values.clip(lower=values.quantile(lower_q),
                              upper=values.quantile(upper_q))
        centered = clipped - clipped.mean()
        spread = clipped.std()
        # ``spread > 0`` is False for both 0 and NaN.
        return centered / spread if spread > 0 else centered

    # Group the untouched input by date; this replaces one boolean mask over
    # the whole frame per date.
    by_date = df.groupby('date')
    for factor in factor_cols:
        result[factor] = by_date[factor].transform(cross_section_zscore)

    # Forward-fill within each stock, then zero-fill the leading gaps.
    result[factor_cols] = result.groupby('ts_code')[factor_cols].ffill()
    result[factor_cols] = result[factor_cols].fillna(0)

    return result

# Winsorise / standardise / fill every selected factor.
df_processed = preprocess_factors(df_with_factors, factor_columns)

# Compare summary statistics before and after preprocessing for the first
# three factors only, to keep the output short.
print("处理前后的因子统计信息对比：")
for factor in factor_columns[:3]:
    print(f"\n因子: {factor}")
    for heading, frame in (("处理前:", df_with_factors), ("\n处理后:", df_processed)):
        print(heading)
        print(frame[factor].describe())


In [None]:
# 4.1 IC分析
def calculate_ic_metrics(df, factor_cols, forward_return_col='forward_return_5d',
                         min_stocks=30):
    """Compute rank-IC summary statistics for each factor.

    For every date, the IC is the Spearman correlation between the factor and
    the forward return across that day's stocks. Only rows where both values
    are present count towards ``min_stocks``. Days whose IC is undefined
    (e.g. a constant factor) are excluded from all statistics, so the
    positive and negative ratios share the same denominator as the mean.

    Args:
        df (pd.DataFrame): Must contain ``date``, ``forward_return_col`` and
            every column named in ``factor_cols``.
        factor_cols (list): Factor column names.
        forward_return_col (str): Name of the forward-return column.
        min_stocks (int): Minimum number of valid (factor, return) pairs a
            date needs in order to contribute an IC. Defaults to 30.

    Returns:
        pd.DataFrame: One row per factor that produced at least one IC, with
        columns ``factor``, ``IC_Mean``, ``IC_Std``, ``IC_IR``,
        ``IC_Positive_Ratio``, ``IC_Negative_Ratio`` and ``IC_Abs_Mean``
        (the absolute value of ``IC_Mean``). The columns are present even
        when no factor qualifies.
    """
    columns = ['factor', 'IC_Mean', 'IC_Std', 'IC_IR',
               'IC_Positive_Ratio', 'IC_Negative_Ratio', 'IC_Abs_Mean']
    factor_cols = list(factor_cols)
    daily_ic = {factor: [] for factor in factor_cols}

    # Split by date once instead of re-filtering the frame for every
    # (factor, date) combination.
    for _, date_data in df.groupby('date'):
        returns = date_data[forward_return_col]
        has_return = returns.notna()
        for factor in daily_ic:
            valid = has_return & date_data[factor].notna()
            if valid.sum() < min_stocks:
                continue
            ic = date_data.loc[valid, factor].corr(returns[valid], method='spearman')
            if pd.notna(ic):
                daily_ic[factor].append(ic)

    ic_stats = []
    for factor, values in daily_ic.items():
        if not values:
            continue
        ic_series = pd.Series(values, dtype=float)
        ic_mean = ic_series.mean()
        ic_std = ic_series.std()
        ic_stats.append({
            'factor': factor,
            'IC_Mean': ic_mean,
            'IC_Std': ic_std,
            # A zero or undefined dispersion (single observation) reports an
            # IR of 0, so the value is never inf or NaN.
            'IC_IR': ic_mean / ic_std if ic_std > 0 else 0,
            'IC_Positive_Ratio': (ic_series > 0).mean(),
            'IC_Negative_Ratio': (ic_series < 0).mean(),
            'IC_Abs_Mean': abs(ic_mean),
        })

    # Passing the columns explicitly keeps the schema stable for callers that
    # sort or plot by column name even when there are no rows.
    return pd.DataFrame(ic_stats, columns=columns)

# Compute the IC statistics for every selected factor.
ic_metrics = calculate_ic_metrics(df_processed, factor_columns)

# Rank factors by |mean IC|, strongest first.
ic_metrics_sorted = ic_metrics.sort_values('IC_Abs_Mean', ascending=False)

print("因子IC分析结果：")
print(ic_metrics_sorted)

# Side-by-side bar charts: mean IC on the left, IC information ratio on the
# right. Both panels share the same layout, so they are drawn in one loop.
ic_panels = [('IC_Mean', '因子IC均值'), ('IC_IR', '因子IC_IR')]

plt.figure(figsize=(12, 6))
for panel_no, (metric, panel_title) in enumerate(ic_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.bar(ic_metrics_sorted['factor'], ic_metrics_sorted[metric])
    plt.xticks(rotation=45)
    plt.title(panel_title)
    plt.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# 4.2 分层测试
def quantile_analysis(df, factor_cols, n_quantiles=5, forward_return_col='forward_return_5d'):
    """Run a quantile (layered) back-test for each factor.

    On every date with at least ``n_quantiles * 10`` rows that have both a
    factor value and a forward return, stocks are split into ``n_quantiles``
    equally sized buckets by factor value (``Q1`` = lowest) and the mean
    forward return of each bucket is recorded.

    Buckets are formed on ranks with ties broken by row order, so
    cross-sections with many identical factor values (e.g. zero-filled
    factors) do not make ``pd.qcut`` fail on duplicate bin edges.

    Args:
        df (pd.DataFrame): Must contain ``date``, ``forward_return_col`` and
            every column named in ``factor_cols``.
        factor_cols (list): Factor column names.
        n_quantiles (int): Number of buckets.
        forward_return_col (str): Name of the forward-return column.

    Returns:
        dict: ``{factor: {...}}`` for every factor with at least one usable
        date. Each inner dict holds ``avg_returns`` and ``sharpe`` (Series
        indexed ``Q1``..``Qn``), ``long_short_return`` (mean of the daily
        top-minus-bottom spread) and ``long_short_sharpe`` (that mean divided
        by the standard deviation of the daily spread; 0 when the deviation is
        zero or undefined). Ratios are per observation, not annualised.
    """
    labels = [f'Q{q}' for q in range(1, n_quantiles + 1)]
    min_stocks = n_quantiles * 10  # at least 10 stocks per bucket
    factor_cols = list(factor_cols)
    daily_records = {factor: [] for factor in factor_cols}

    # Split by date once and reuse the slice for every factor.
    for date, date_data in df.groupby('date'):
        for factor in daily_records:
            valid = date_data[[factor, forward_return_col]].dropna()
            if len(valid) < min_stocks:
                continue

            # Ranking with method='first' yields unique values, which
            # guarantees unique bin edges for qcut.
            ranks = valid[factor].rank(method='first')
            buckets = pd.qcut(ranks, q=n_quantiles, labels=labels)
            group_returns = valid[forward_return_col].groupby(buckets, observed=False).mean()

            record = {'date': date}
            record.update(group_returns.items())
            daily_records[factor].append(record)

    results = {}
    for factor, records in daily_records.items():
        if not records:
            continue

        # Rows are dates, columns are Q1..Qn in bucket order.
        returns_df = pd.DataFrame(records).set_index('date').reindex(columns=labels)

        avg_returns = returns_df.mean()
        std_returns = returns_df.std()
        sharpe = avg_returns / std_returns

        # Use the daily spread itself: the top and bottom buckets are
        # correlated, so adding their variances as if independent would
        # misstate the long-short volatility.
        spread = returns_df[labels[-1]] - returns_df[labels[0]]
        long_short = spread.mean()
        spread_std = spread.std()
        long_short_sharpe = long_short / spread_std if spread_std > 0 else 0

        results[factor] = {
            'avg_returns': avg_returns,
            'sharpe': sharpe,
            'long_short_return': long_short,
            'long_short_sharpe': long_short_sharpe
        }

    return results

# Run the quantile test on the first four factors only, so the results fit
# the 2x2 grid below.
quantile_results = quantile_analysis(df_processed, factor_columns[:4])

# One bar chart of mean return per quantile for each factor.
plt.figure(figsize=(15, 10))
for i, (factor, result) in enumerate(quantile_results.items(), 1):
    plt.subplot(2, 2, i)

    # Derive the bar positions from the data instead of assuming 5 quantiles.
    avg_returns = result['avg_returns']
    plt.bar(range(1, len(avg_returns) + 1), avg_returns.values)
    plt.title(f'{factor}\n多空收益: {result["long_short_return"]:.4f}, Sharpe: {result["long_short_sharpe"]:.2f}')
    plt.xlabel('分位数')
    plt.ylabel('平均收益率')
    plt.grid(True)

plt.tight_layout()
plt.show()

# Print the detailed numbers behind the charts.
print("\n分层测试详细结果：")
for factor, result in quantile_results.items():
    print(f"\n{factor}:")
    print("分位数收益率：")
    # The index labels already read 'Q1', 'Q2', ...; prefixing another 'Q'
    # would print 'QQ1'.
    for q, ret in result['avg_returns'].items():
        print(f"{q}: {ret:.4f}")
    print(f"多空组合收益率: {result['long_short_return']:.4f}")
    print(f"多空组合Sharpe: {result['long_short_sharpe']:.2f}")


In [None]:
# 4.3 因子间相关性分析
def analyze_factor_correlation(df, factor_cols):
    """Plot the factor correlation heatmap and list strongly related pairs.

    The Spearman correlation is computed over all rows of ``df`` at once. The
    matrix is drawn as an annotated heatmap, and every factor pair whose
    absolute correlation exceeds 0.7 is printed.

    Args:
        df (pd.DataFrame): Frame containing the factor columns.
        factor_cols (list): Factor column names.
    """
    rank_corr = df[factor_cols].corr(method='spearman')

    # Annotated heatmap: two-decimal labels, diverging palette centred on 0,
    # square cells.
    plt.figure(figsize=(12, 10))
    sns.heatmap(rank_corr, annot=True, fmt='.2f', cmap='coolwarm', center=0, square=True)
    plt.title('因子间相关性矩阵')
    plt.tight_layout()
    plt.show()

    # Walk the upper triangle only, so each pair is reported once.
    n_factors = len(factor_cols)
    flagged_pairs = [
        {
            'factor1': factor_cols[row],
            'factor2': factor_cols[col],
            'correlation': rank_corr.iloc[row, col]
        }
        for row in range(n_factors)
        for col in range(row + 1, n_factors)
        if abs(rank_corr.iloc[row, col]) > 0.7
    ]

    if not flagged_pairs:
        print("\n没有发现高度相关的因子对。")
        return

    print("\n高相关因子对（|相关系数| > 0.7）：")
    for pair in flagged_pairs:
        print(f"{pair['factor1']} - {pair['factor2']}: {pair['correlation']:.3f}")

# Run the correlation analysis on the pre-processed factors.
analyze_factor_correlation(df_processed, factor_columns)
