In [1]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import tushare as ts
from tqdm import tqdm
import os
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr



输入数据处理:

按 ts_code 和 trade_date 对数据排序，以确保时间序列处理正确。
通过 groupby 按股票代码分组。
计算均线 (Volume Moving Average):

滚动窗口平均值 (rolling(window)) 用于计算指定窗口期的成交量均线。
指数加权移动平均值 (ewm) 用于计算更灵敏的成交量变化。
风险指标 (Risk Factors):

annualized_vol_{window} 使用滚动窗口计算年度化波动率。
动量指标 (Momentum Factors):

结合成交量均值 (vol_mean_20) 和百分比变化均值 (pct_chg_mean_20) 计算动量因子。
真实波动范围 (ATR):

使用 high, low, 和 pre_close 的最大变化计算波动范围。
滚动窗口计算 ATR（Average True Range）。

In [2]:
def construct_factors(df):
    df = df.sort_values(by=["ts_code", "trade_date"])
    for window in [5,10, 20, 60, 120]:
        df[f"vol_avg_{window}"] = df.groupby("ts_code")["vol"].transform(lambda x: x.rolling(window=window).mean())
    df["vol_ema_5"] = df.groupby("ts_code")["vol"].transform(lambda x: x.ewm(span=5).mean())
    df["turnover_10_to_120"] = df["vol_avg_10"] / df["vol_avg_120"]
    df["turnover_5_to_120"] = df["vol_avg_5"] / df["vol_avg_120"]
    # Risk 
    for window in [20, 60, 120]:
        df[f"annualized_vol_{window}"] = (
            df.groupby("ts_code")["pct_chg"]
            .transform(lambda x: x.rolling(window=window).var() * np.sqrt(252))
        )
    # Momentum
    df["vol_mean_20"] = df.groupby("ts_code")["vol"].transform(lambda x: x.rolling(window=20).mean())
    df["pct_chg_mean_20"] = df.groupby("ts_code")["pct_chg"].transform(lambda x: x.rolling(window=20).mean())
    df["momentum"] = (df["vol"] / df["vol_mean_20"]) * df["pct_chg_mean_20"]
    # ATR
    df["true_range"] = df[["high", "low", "pre_close"]].apply(
        lambda x: max(x["high"] - x["low"], abs(x["high"] - x["pre_close"]), abs(x["low"] - x["pre_close"])), axis=1
    )
    for window in [6, 14]:
        df[f"atr_{window}"] = df.groupby("ts_code")["true_range"].transform(lambda x: x.rolling(window=window).mean())
    return df

In [3]:
input_data = pd.read_csv('data/filtered_data.csv').sort_values(by=["ts_code"])
input_data

Unnamed: 0.1,Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,amount
0,726,000001.SZ,2021-01-04,19.10,19.10,18.44,18.60,19.34,-0.74,-3.8263,1554216.43,2891682.312
480,246,000001.SZ,2022-12-26,12.99,13.04,12.71,12.77,12.98,-0.21,-1.6179,797119.87,1021903.963
481,245,000001.SZ,2022-12-27,12.87,13.22,12.87,13.11,12.77,0.34,2.6625,886004.12,1160090.119
482,244,000001.SZ,2022-12-28,13.16,13.38,13.00,13.14,13.11,0.03,0.2288,791191.98,1042402.080
483,243,000001.SZ,2022-12-29,13.07,13.13,12.85,13.03,13.14,-0.11,-0.8371,666890.09,865144.967
...,...,...,...,...,...,...,...,...,...,...,...,...
3444459,3443742,873833.BJ,2023-11-08,16.51,20.18,16.51,16.66,10.00,6.66,66.6000,131602.39,243945.769
3444460,3443741,873833.BJ,2023-11-09,14.44,14.99,12.99,13.04,16.66,-3.62,-21.7287,78489.05,107106.137
3444461,3443740,873833.BJ,2023-11-10,12.45,13.95,12.45,13.10,13.04,0.06,0.4601,52486.12,70047.275
3444446,3443755,873833.BJ,2022-10-24,6.65,7.50,6.65,7.50,8.00,-0.50,-6.2500,60.00,44.150


In [5]:
factor_data = construct_factors(input_data)

0                  NaN
1                  NaN
2                  NaN
3                  NaN
4          1618034.434
              ...     
3444492      27356.694
3444493      24886.204
3444494      22648.518
3444495      20072.748
3444496      22366.128
Name: vol_avg_5, Length: 3444497, dtype: float64

In [7]:
factor_data = factor_data.dropna()
print(factor_data.columns)
factor_data

Index(['Unnamed: 0', 'ts_code', 'trade_date', 'open', 'high', 'low', 'close',
       'pre_close', 'change', 'pct_chg', 'vol', 'amount', 'vol_avg_5',
       'vol_avg_10', 'vol_avg_20', 'vol_avg_60', 'vol_avg_120', 'vol_ema_5',
       'turnover_10_to_120', 'turnover_5_to_120', 'annualized_vol_20',
       'annualized_vol_60', 'annualized_vol_120', 'vol_mean_20',
       'pct_chg_mean_20', 'momentum', 'true_range', 'atr_6', 'atr_14'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,pct_chg,...,turnover_5_to_120,annualized_vol_20,annualized_vol_60,annualized_vol_120,vol_mean_20,pct_chg_mean_20,momentum,true_range,atr_6,atr_14
119,607,000001.SZ,2021-07-02,22.72,22.74,21.76,21.81,23.20,-1.39,-5.9914,...,0.816482,86.226949,66.451580,112.570860,623439.8305,-0.403315,-0.692642,1.44,0.875000,0.755000
120,606,000001.SZ,2021-07-05,21.69,22.15,21.09,22.06,21.81,0.25,1.1463,...,0.871769,76.963657,66.606563,110.577824,628569.9240,-0.507970,-0.694465,1.06,0.961667,0.785714
121,605,000001.SZ,2021-07-06,22.08,23.00,21.85,22.78,22.06,0.72,3.2638,...,0.823862,87.913456,69.294126,110.979136,645772.0820,-0.295880,-0.362859,1.15,1.001667,0.840000
122,604,000001.SZ,2021-07-07,22.68,23.05,22.50,22.55,22.78,-0.23,-1.0097,...,0.849379,86.889713,69.571157,103.755043,640715.4295,-0.393690,-0.288145,0.55,0.961667,0.831429
123,603,000001.SZ,2021-07-08,22.78,22.78,21.20,21.51,22.55,-1.04,-4.6120,...,1.025093,100.018376,75.051359,106.411128,690876.8280,-0.648750,-1.368214,1.58,1.126667,0.880714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3444183,3443257,873593.BJ,2023-12-25,38.51,39.99,38.17,39.26,38.55,0.71,1.8418,...,1.211946,508.030994,675.621742,558.923698,41052.0510,-0.752735,-0.380989,1.82,3.203333,3.168571
3444184,3443256,873593.BJ,2023-12-26,39.17,39.49,37.56,38.10,39.26,-1.16,-2.9547,...,1.207117,511.884120,678.267356,559.720570,38534.5285,-0.853810,-0.507291,1.93,3.116667,2.985000
3444185,3443255,873593.BJ,2023-12-27,38.10,38.28,36.30,36.83,38.10,-1.27,-3.3333,...,1.205317,510.553967,677.648998,560.798787,36743.4820,-1.105040,-0.762521,1.98,3.095000,2.997857
3444186,3443254,873593.BJ,2023-12-28,36.70,37.65,36.60,36.98,36.83,0.15,0.4073,...,0.927682,495.378857,676.065517,546.405826,34395.6985,-1.253095,-0.693392,1.05,3.021667,2.945714


fetch_financial_data 函数:
使用 fina_indicator API 获取指定股票 (ts_code) 和时间范围的财务数据。
包括处理 API 错误的异常捕获机制。

In [46]:
#financial (basic factors)
def fetch_financial_data(ts_code, start_date, end_date):
    try:
        financial_data = pro.fina_indicator(ts_code=ts_code, start_date=start_date, end_date=end_date)
        return financial_data
    except Exception as e:
        print(f"Error fetching financial data for {ts_code}: {e}")
        return pd.DataFrame() 
load_dotenv()
token = os.getenv("API_TOKEN_TUSHARE")
print(token)
ts.set_token(token)
pro = ts.pro_api()
unique_ts_codes = factor_data['ts_code'].unique()
start_date = factor_data['trade_date'].min()
end_date = factor_data['trade_date'].max() 
print(start_date, end_date)

aa1720cccc1acb18523702e914ca4352428811e10fdf9b01af9d7c0c
2021-07-02 2023-12-29


In [16]:
financial_data_all = pd.DataFrame()
for ts_code in tqdm(unique_ts_codes):
    financial_data = fetch_financial_data(ts_code, start_date, end_date)
    financial_data_all = pd.concat([financial_data_all, financial_data], ignore_index=True)
financial_data_all

  financial_data_all = pd.concat([financial_data_all, financial_data], ignore_index=True)
100%|██████████| 5170/5170 [1:05:23<00:00,  1.32it/s]


Unnamed: 0,ts_code,ann_date,end_date,eps,dt_eps,total_revenue_ps,revenue_ps,capital_rese_ps,surplus_rese_ps,undist_profit_ps,...,ocf_yoy,roe_yoy,bps_yoy,assets_yoy,eqt_yoy,tr_yoy,or_yoy,q_sales_yoy,q_op_qoq,equity_yoy
0,000001.SZ,20230309,20221231,2.20,2.20,9.2701,9.2701,4.1645,0.5556,9.6004,...,169.8230,13.9585,12.100,8.1305,9.9209,6.2061,6.2061,-1.3343,-39.0204,9.9209
1,000001.SZ,20230309,20221231,2.20,2.20,9.2701,9.2701,4.1645,0.5556,9.6004,...,169.8230,13.9585,12.100,8.1305,9.9209,6.2061,6.2061,-1.3343,-39.0204,9.9209
2,000001.SZ,20221025,20220930,1.78,1.78,7.1249,7.1249,4.1645,0.5556,9.4648,...,173.5390,15.0206,9.200,5.5625,7.5701,8.7074,8.7074,8.7815,57.7661,9.3931
3,000001.SZ,20221025,20220930,1.78,1.78,7.1249,7.1249,4.1645,0.5556,9.4648,...,173.5390,15.0206,9.200,5.5625,7.5701,8.7074,8.7074,8.7815,57.7661,9.3931
4,000001.SZ,20220818,20220630,1.03,1.03,4.7420,4.7420,4.1645,0.5556,8.7263,...,229.7580,14.9811,5.100,3.8078,4.2104,8.6703,8.6703,6.8148,-28.8800,9.2539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81016,873593.BJ,20211029,20210930,1.52,1.52,4.6794,4.6794,0.6975,0.0660,2.0937,...,83.9318,,64.988,50.2894,64.9859,40.6356,40.6356,26.8506,-15.7982,
81017,873593.BJ,20210828,20210630,1.08,1.08,3.2431,3.2431,0.6975,0.0660,1.6513,...,139.9919,-1.9453,46.150,32.0467,46.0721,47.7460,47.7460,,-10.8038,62.5101
81018,873593.BJ,20210828,20210630,1.08,1.08,3.2431,3.2431,0.6975,0.0660,1.6513,...,139.9919,-1.9453,46.150,32.0467,46.0721,47.7460,47.7460,,-10.8038,62.5101
81019,873593.BJ,20220428,20210331,,,,,,,,...,,,,,,,,,500.1134,


In [17]:
financial_data_all.to_csv('data/allfindata.csv')

日期格式化:
使用 pd.to_datetime 将 ann_date 和 end_date 转换为 datetime64[ns] 类型。
数据去重:
基于 ts_code 和 ann_date 删除重复行，保留第一条记录。
增长率计算:
revenue_growth: 使用 pct_change 方法按年度计算营业收入增长率。

In [4]:
financial_data = pd.read_csv('data/allfindata.csv')
financial_data = financial_data.dropna()
financial_data['ann_date'] = pd.to_datetime(financial_data['ann_date'], format='%Y%m%d')
financial_data['end_date'] = pd.to_datetime(financial_data['end_date'], format='%Y%m%d')
financial_data = financial_data.sort_values(by=['ts_code', 'ann_date']).drop(labels='Unnamed: 0', axis=1)
financial_data = financial_data[[
    'ts_code', 'ann_date', 'end_date',
    'revenue_ps', 'assets_yoy', 'ocf_yoy', 'ebt_yoy', 'netprofit_yoy',
    'op_income', 'gross_margin', 'ebit', 'profit_dedt', 'bps', 'ocfps', 
    'netprofit_margin', 'quick_ratio', 'current_ratio', 'op_of_gr', 
    'roe', 'eps'
]]
financial_data = financial_data.drop_duplicates(subset=['ts_code', 'ann_date'], keep='first')
financial_data = financial_data.reset_index(drop=True)
financial_data = financial_data.sort_values(by=['ts_code', 'ann_date'])
financial_data['revenue_growth'] = financial_data.groupby('ts_code')['revenue_ps'].pct_change(periods=1) * 100
financial_data.to_csv('data/findata.csv')
financial_data

Unnamed: 0,ts_code,ann_date,end_date,revenue_ps,assets_yoy,ocf_yoy,ebt_yoy,netprofit_yoy,op_income,gross_margin,...,profit_dedt,bps,ocfps,netprofit_margin,quick_ratio,current_ratio,op_of_gr,roe,eps,revenue_growth
0,000002.SZ,2021-08-30,2021-06-30,14.3841,4.5939,-70.0010,-16.5199,-11.6776,1.911667e+10,3.833571e+10,...,1.072729e+10,19.0971,0.5837,9.6783,0.4232,1.1821,13.1575,4.9496,0.9500,
1,000002.SZ,2022-03-31,2021-12-31,38.9491,3.7161,-92.2668,-34.4561,-45.7455,4.589366e+10,9.882064e+10,...,2.238178e+10,20.2964,0.3538,8.4076,0.4001,1.2202,11.6014,9.7832,1.9400,170.778846
2,000002.SZ,2023-03-31,2022-12-31,43.3197,-9.3629,-33.1305,0.3132,0.4162,4.789558e+10,9.851908e+10,...,1.976210e+10,20.8664,0.2365,7.4530,0.4716,1.3132,10.3221,9.4508,1.9500,11.221312
3,000006.SZ,2022-04-16,2021-12-31,2.2878,52.9083,-2213.1579,-30.8269,-37.3507,5.596526e+08,1.341350e+09,...,5.324058e+08,5.6634,-2.8701,17.9206,0.3737,2.1175,24.3595,7.2177,0.4013,
4,000006.SZ,2023-04-28,2022-12-31,2.7406,11.8555,159.7591,-2.7009,-22.5352,6.553849e+08,1.429311e+09,...,4.178751e+08,5.8532,1.7152,12.1099,0.4935,2.0251,19.7860,5.3979,0.3108,19.791940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6293,871981.BJ,2023-04-22,2022-12-31,5.0644,4.2976,-21.3011,-50.4278,-33.4402,1.832008e+07,7.315201e+07,...,2.742244e+07,6.8507,0.8225,11.2572,1.2504,1.6827,7.3016,8.5988,0.5700,
6294,872190.BJ,2022-12-09,2022-06-30,23.4999,-1.3504,22.7611,-28.1889,-29.9581,3.219659e+07,1.326890e+08,...,2.578603e+07,9.6550,-7.1800,2.2806,0.6134,1.5942,2.8819,5.9737,0.5535,
6295,873152.BJ,2022-10-26,2022-09-30,4.5947,-24.2998,-1901.7839,36.9893,42.0245,1.639243e+07,4.080085e+07,...,1.545894e+07,2.3495,-0.7292,8.9331,1.7128,2.5809,7.7707,19.1899,0.4100,
6296,873169.BJ,2022-03-25,2021-06-30,1.6034,30.5473,-67.4732,-22.4457,-22.2671,1.506949e+07,2.478579e+07,...,1.309360e+07,2.2394,0.1000,14.2434,2.2271,3.2292,16.3738,11.7657,0.2300,


In [22]:
financial_data = pd.read_csv('data/findata.csv')

In [25]:
#for each stock fetch total shares 
load_dotenv()
token = os.getenv("API_TOKEN_TUSHARE")
ts.set_token(token)
pro = ts.pro_api()
unique_ts_codes = financial_data['ts_code'].unique()
total_shares_list = []
for ts_code in tqdm(unique_ts_codes):
    share_data = pro.daily_basic(
        ts_code=ts_code,
        start_date='20210101',  # Use a wide range to cover all possible dates
        end_date='20240101',
        fields='ts_code,trade_date,total_share'
    )
    total_shares_list.append(share_data)

total_shares_data = pd.concat(total_shares_list, ignore_index=True)

100%|██████████| 2264/2264 [33:05<00:00,  1.14it/s]


In [27]:
total_shares_data

Unnamed: 0,ts_code,trade_date,total_share
0,000002.SZ,20231229,1.193071e+06
1,000002.SZ,20231228,1.193071e+06
2,000002.SZ,20231227,1.193071e+06
3,000002.SZ,20231226,1.193071e+06
4,000002.SZ,20231225,1.193071e+06
...,...,...,...
1583425,873223.BJ,20220615,1.516000e+04
1583426,873223.BJ,20220614,1.516000e+04
1583427,873223.BJ,20220613,1.516000e+04
1583428,873223.BJ,20220610,1.516000e+04


In [37]:
# Ensure trade_date is in datetime format
financial_data = pd.read_csv('data/findata.csv')
financial_data['trade_date'] = financial_data['end_date']
financial_data['trade_date'] = pd.to_datetime(financial_data['trade_date'])
total_shares_data['trade_date'] = pd.to_datetime(total_shares_data['trade_date'])

# Find common ts_code values
common_ts_codes = set(financial_data['ts_code']).intersection(set(total_shares_data['ts_code']))

# Filter both DataFrames for common ts_code values
financial_data = financial_data[financial_data['ts_code'].isin(common_ts_codes)]
total_shares_data = total_shares_data[total_shares_data['ts_code'].isin(common_ts_codes)]

# Ensure both DataFrames are sorted correctly
financial_data = financial_data.sort_values(by=['ts_code', 'trade_date'])
total_shares_data = total_shares_data.sort_values(by=['ts_code', 'trade_date'])

def find_closest_share(row, total_shares_data):
    # Filter total_shares_data for the same stock
    stock_data = total_shares_data[total_shares_data['ts_code'] == row['ts_code']]
    # Find the date difference and get the closest date
    stock_data['date_diff'] = (stock_data['trade_date'] - row['trade_date']).abs()
    closest_row = stock_data.loc[stock_data['date_diff'].idxmin()]
    return closest_row['total_share']

financial_data['total_share'] = financial_data.apply(
    lambda row: find_closest_share(row, total_shares_data), axis=1
)
financial_data

Unnamed: 0.1,Unnamed: 0,ts_code,ann_date,end_date,revenue_ps,assets_yoy,ocf_yoy,ebt_yoy,netprofit_yoy,op_income,...,netprofit_margin,quick_ratio,current_ratio,op_of_gr,roe,eps,revenue_growth,trade_date,year,total_share
0,0,000002.SZ,2021-08-30,2021-06-30,14.3841,4.5939,-70.0010,-16.5199,-11.6776,1.911667e+10,...,9.6783,0.4232,1.1821,13.1575,4.9496,0.9500,,2021-06-30,2021,1.161773e+06
1,1,000002.SZ,2022-03-31,2021-12-31,38.9491,3.7161,-92.2668,-34.4561,-45.7455,4.589366e+10,...,8.4076,0.4001,1.2202,11.6014,9.7832,1.9400,170.778846,2021-12-31,2021,1.162538e+06
2,2,000002.SZ,2023-03-31,2022-12-31,43.3197,-9.3629,-33.1305,0.3132,0.4162,4.789558e+10,...,7.4530,0.4716,1.3132,10.3221,9.4508,1.9500,11.221312,2022-12-31,2022,1.163071e+06
3,3,000006.SZ,2022-04-16,2021-12-31,2.2878,52.9083,-2213.1579,-30.8269,-37.3507,5.596526e+08,...,17.9206,0.3737,2.1175,24.3595,7.2177,0.4013,,2021-12-31,2021,1.349995e+05
4,4,000006.SZ,2023-04-28,2022-12-31,2.7406,11.8555,159.7591,-2.7009,-22.5352,6.553849e+08,...,12.1099,0.4935,2.0251,19.7860,5.3979,0.3108,19.791940,2022-12-31,2022,1.349995e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6293,6293,871981.BJ,2023-04-22,2022-12-31,5.0644,4.2976,-21.3011,-50.4278,-33.4402,1.832008e+07,...,11.2572,1.2504,1.6827,7.3016,8.5988,0.5700,,2022-12-31,2022,7.646800e+03
6294,6294,872190.BJ,2022-12-09,2022-06-30,23.4999,-1.3504,22.7611,-28.1889,-29.9581,3.219659e+07,...,2.2806,0.6134,1.5942,2.8819,5.9737,0.5535,,2022-06-30,2022,6.250000e+03
6295,6295,873152.BJ,2022-10-26,2022-09-30,4.5947,-24.2998,-1901.7839,36.9893,42.0245,1.639243e+07,...,8.9331,1.7128,2.5809,7.7707,19.1899,0.4100,,2022-09-30,2022,7.610800e+03
6296,6296,873169.BJ,2022-03-25,2021-06-30,1.6034,30.5473,-67.4732,-22.4457,-22.2671,1.506949e+07,...,14.2434,2.2271,3.2292,16.3738,11.7657,0.2300,,2021-06-30,2021,7.855000e+03


In [38]:
financial_data_shares= financial_data.drop('Unnamed: 0',axis=1)

In [40]:
financial_data_shares['market_value'] = financial_data_shares['bps'] * financial_data_shares['total_share']
financial_data_shares.to_csv('data/findatawshares.csv')
financial_data_shares

Unnamed: 0,ts_code,ann_date,end_date,revenue_ps,assets_yoy,ocf_yoy,ebt_yoy,netprofit_yoy,op_income,gross_margin,...,quick_ratio,current_ratio,op_of_gr,roe,eps,revenue_growth,trade_date,year,total_share,market_value
0,000002.SZ,2021-08-30,2021-06-30,14.3841,4.5939,-70.0010,-16.5199,-11.6776,1.911667e+10,3.833571e+10,...,0.4232,1.1821,13.1575,4.9496,0.9500,,2021-06-30,2021,1.161773e+06,2.218650e+07
1,000002.SZ,2022-03-31,2021-12-31,38.9491,3.7161,-92.2668,-34.4561,-45.7455,4.589366e+10,9.882064e+10,...,0.4001,1.2202,11.6014,9.7832,1.9400,170.778846,2021-12-31,2021,1.162538e+06,2.359534e+07
2,000002.SZ,2023-03-31,2022-12-31,43.3197,-9.3629,-33.1305,0.3132,0.4162,4.789558e+10,9.851908e+10,...,0.4716,1.3132,10.3221,9.4508,1.9500,11.221312,2022-12-31,2022,1.163071e+06,2.426910e+07
3,000006.SZ,2022-04-16,2021-12-31,2.2878,52.9083,-2213.1579,-30.8269,-37.3507,5.596526e+08,1.341350e+09,...,0.3737,2.1175,24.3595,7.2177,0.4013,,2021-12-31,2021,1.349995e+05,7.645562e+05
4,000006.SZ,2023-04-28,2022-12-31,2.7406,11.8555,159.7591,-2.7009,-22.5352,6.553849e+08,1.429311e+09,...,0.4935,2.0251,19.7860,5.3979,0.3108,19.791940,2022-12-31,2022,1.349995e+05,7.901791e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6293,871981.BJ,2023-04-22,2022-12-31,5.0644,4.2976,-21.3011,-50.4278,-33.4402,1.832008e+07,7.315201e+07,...,1.2504,1.6827,7.3016,8.5988,0.5700,,2022-12-31,2022,7.646800e+03,5.238593e+04
6294,872190.BJ,2022-12-09,2022-06-30,23.4999,-1.3504,22.7611,-28.1889,-29.9581,3.219659e+07,1.326890e+08,...,0.6134,1.5942,2.8819,5.9737,0.5535,,2022-06-30,2022,6.250000e+03,6.034375e+04
6295,873152.BJ,2022-10-26,2022-09-30,4.5947,-24.2998,-1901.7839,36.9893,42.0245,1.639243e+07,4.080085e+07,...,1.7128,2.5809,7.7707,19.1899,0.4100,,2022-09-30,2022,7.610800e+03,1.788157e+04
6296,873169.BJ,2022-03-25,2021-06-30,1.6034,30.5473,-67.4732,-22.4457,-22.2671,1.506949e+07,2.478579e+07,...,2.2271,3.2292,16.3738,11.7657,0.2300,,2021-06-30,2021,7.855000e+03,1.759049e+04


结果展示：
以下是每个因子对应的结果数据中的列名：

成长因子
营业收入增长率(TTM): revenue_growth
总资产增长率(TTM): assets_yoy
经营活动产生的现金流量净额增长率(TTM): ocf_yoy
利润总额增长率(TTM): ebt_yoy
净利润增长率(TTM): netprofit_yoy
基础因子
营业总收入(TTM): revenue_ps
营业利润(TTM): op_income
毛利(TTM): gross_margin
息税前利润: ebit
净利润(TTM): profit_dedt
市值: market_value
现金流市值比: cash_flow_to_mv
营收市值比: revenue_to_mv
质量因子
销售净利率: netprofit_margin
速动比率: quick_ratio
流动比率(单季度): current_ratio
营业利润率: op_of_gr
权益回报率(TTM): roe
情绪因子
10日平均换手率: vol_avg_10
20日平均换手率: vol_avg_20
60日平均换手率: vol_avg_60
120日平均换手率: vol_avg_120
成交量的5日指数移动平均: vol_ema_5
10日平均换手率 / 120日平均换手率: turnover_10_to_120
5日平均换手率 / 120日平均换手率: turnover_5_to_120
风险因子
20日年化收益方差: annualized_vol_20
60日年化收益方差: annualized_vol_60
120日年化收益方差: annualized_vol_120
动量因子
动量因子: momentum
每股因子
每股营业总收入: revenue_ps
每股收益(TTM): eps

In [6]:
factor_data = pd.read_csv('data/sent_factor.csv')
factor_data['trade_date'] = pd.to_datetime(factor_data['trade_date'])
factor_data = factor_data.sort_values(by=['ts_code', 'trade_date'])
factor_data

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,...,turnover_5_to_120,annualized_vol_20,annualized_vol_60,annualized_vol_120,vol_mean_20,pct_chg_mean_20,momentum,true_range,atr_6,atr_14
0,119,607,000001.SZ,2021-07-02,22.72,22.74,21.76,21.81,23.20,-1.39,...,0.816482,86.226949,66.451580,112.570860,623439.8305,-0.403315,-0.692642,1.44,0.875000,0.755000
1,120,606,000001.SZ,2021-07-05,21.69,22.15,21.09,22.06,21.81,0.25,...,0.871769,76.963657,66.606563,110.577824,628569.9240,-0.507970,-0.694465,1.06,0.961667,0.785714
2,121,605,000001.SZ,2021-07-06,22.08,23.00,21.85,22.78,22.06,0.72,...,0.823862,87.913456,69.294126,110.979136,645772.0820,-0.295880,-0.362859,1.15,1.001667,0.840000
3,122,604,000001.SZ,2021-07-07,22.68,23.05,22.50,22.55,22.78,-0.23,...,0.849379,86.889713,69.571157,103.755043,640715.4295,-0.393690,-0.288145,0.55,0.961667,0.831429
4,123,603,000001.SZ,2021-07-08,22.78,22.78,21.20,21.51,22.55,-1.04,...,1.025093,100.018376,75.051359,106.411128,690876.8280,-0.648750,-1.368214,1.58,1.126667,0.880714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2820860,3444183,3443257,873593.BJ,2023-12-25,38.51,39.99,38.17,39.26,38.55,0.71,...,1.211946,508.030994,675.621742,558.923698,41052.0510,-0.752735,-0.380989,1.82,3.203333,3.168571
2820861,3444184,3443256,873593.BJ,2023-12-26,39.17,39.49,37.56,38.10,39.26,-1.16,...,1.207117,511.884120,678.267356,559.720570,38534.5285,-0.853810,-0.507291,1.93,3.116667,2.985000
2820862,3444185,3443255,873593.BJ,2023-12-27,38.10,38.28,36.30,36.83,38.10,-1.27,...,1.205317,510.553967,677.648998,560.798787,36743.4820,-1.105040,-0.762521,1.98,3.095000,2.997857
2820863,3444186,3443254,873593.BJ,2023-12-28,36.70,37.65,36.60,36.98,36.83,0.15,...,0.927682,495.378857,676.065517,546.405826,34395.6985,-1.253095,-0.693392,1.05,3.021667,2.945714


In [46]:
#examine financial data
financial_data = financial_data_shares
financial_data

Unnamed: 0,ts_code,ann_date,end_date,revenue_ps,assets_yoy,ocf_yoy,ebt_yoy,netprofit_yoy,op_income,gross_margin,...,quick_ratio,current_ratio,op_of_gr,roe,eps,revenue_growth,trade_date,year,total_share,market_value
0,000002.SZ,2021-08-30,2021-06-30,14.3841,4.5939,-70.0010,-16.5199,-11.6776,1.911667e+10,3.833571e+10,...,0.4232,1.1821,13.1575,4.9496,0.9500,,2021-06-30,2021,1.161773e+06,2.218650e+07
1,000002.SZ,2022-03-31,2021-12-31,38.9491,3.7161,-92.2668,-34.4561,-45.7455,4.589366e+10,9.882064e+10,...,0.4001,1.2202,11.6014,9.7832,1.9400,170.778846,2021-12-31,2021,1.162538e+06,2.359534e+07
2,000002.SZ,2023-03-31,2022-12-31,43.3197,-9.3629,-33.1305,0.3132,0.4162,4.789558e+10,9.851908e+10,...,0.4716,1.3132,10.3221,9.4508,1.9500,11.221312,2022-12-31,2022,1.163071e+06,2.426910e+07
3,000006.SZ,2022-04-16,2021-12-31,2.2878,52.9083,-2213.1579,-30.8269,-37.3507,5.596526e+08,1.341350e+09,...,0.3737,2.1175,24.3595,7.2177,0.4013,,2021-12-31,2021,1.349995e+05,7.645562e+05
4,000006.SZ,2023-04-28,2022-12-31,2.7406,11.8555,159.7591,-2.7009,-22.5352,6.553849e+08,1.429311e+09,...,0.4935,2.0251,19.7860,5.3979,0.3108,19.791940,2022-12-31,2022,1.349995e+05,7.901791e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6293,871981.BJ,2023-04-22,2022-12-31,5.0644,4.2976,-21.3011,-50.4278,-33.4402,1.832008e+07,7.315201e+07,...,1.2504,1.6827,7.3016,8.5988,0.5700,,2022-12-31,2022,7.646800e+03,5.238593e+04
6294,872190.BJ,2022-12-09,2022-06-30,23.4999,-1.3504,22.7611,-28.1889,-29.9581,3.219659e+07,1.326890e+08,...,0.6134,1.5942,2.8819,5.9737,0.5535,,2022-06-30,2022,6.250000e+03,6.034375e+04
6295,873152.BJ,2022-10-26,2022-09-30,4.5947,-24.2998,-1901.7839,36.9893,42.0245,1.639243e+07,4.080085e+07,...,1.7128,2.5809,7.7707,19.1899,0.4100,,2022-09-30,2022,7.610800e+03,1.788157e+04
6296,873169.BJ,2022-03-25,2021-06-30,1.6034,30.5473,-67.4732,-22.4457,-22.2671,1.506949e+07,2.478579e+07,...,2.2271,3.2292,16.3738,11.7657,0.2300,,2021-06-30,2021,7.855000e+03,1.759049e+04


In [47]:
#merge financial and stock data 
financial_data = financial_data.dropna()
financial_data['trade_date'] = financial_data['end_date']
factor_data['trade_date'] = pd.to_datetime(factor_data['trade_date'])
financial_data['trade_date'] = pd.to_datetime(financial_data['trade_date'])

factor_data['year'] = factor_data['trade_date'].dt.year
financial_data['year'] = financial_data['trade_date'].dt.year
merged_data = pd.merge(
    factor_data,
    financial_data,
    on=['ts_code', 'year'],
    how='inner',  
    suffixes=('_factor', '_financial')  
)

merged_data = merged_data.drop(columns=['year'])
merged_data = merged_data.drop(['Unnamed: 0.1','Unnamed: 0'], axis=1)
merged_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  financial_data['trade_date'] = financial_data['end_date']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  financial_data['trade_date'] = pd.to_datetime(financial_data['trade_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  financial_data['year'] = financial_data['trade_date'].dt.year


Unnamed: 0,ts_code,trade_date_factor,open,high,low,close,pre_close,change,pct_chg,vol,...,netprofit_margin,quick_ratio,current_ratio,op_of_gr,roe,eps,revenue_growth,trade_date_financial,total_share,market_value
0,000002.SZ,2021-07-02,24.78,24.78,24.04,24.05,24.98,-0.93,-3.7230,887290.38,...,8.4076,0.4001,1.2202,11.6014,9.7832,1.94,170.778846,2021-12-31,1.162538e+06,2.359534e+07
1,000002.SZ,2021-07-05,24.06,24.09,23.55,23.82,24.05,-0.23,-0.9563,605991.53,...,8.4076,0.4001,1.2202,11.6014,9.7832,1.94,170.778846,2021-12-31,1.162538e+06,2.359534e+07
2,000002.SZ,2021-07-06,23.90,24.75,23.76,24.48,23.82,0.66,2.7708,844134.79,...,8.4076,0.4001,1.2202,11.6014,9.7832,1.94,170.778846,2021-12-31,1.162538e+06,2.359534e+07
3,000002.SZ,2021-07-07,24.20,24.48,23.97,24.04,24.48,-0.44,-1.7974,598540.74,...,8.4076,0.4001,1.2202,11.6014,9.7832,1.94,170.778846,2021-12-31,1.162538e+06,2.359534e+07
4,000002.SZ,2021-07-08,24.15,24.25,23.71,24.05,24.04,0.01,0.0416,589155.23,...,8.4076,0.4001,1.2202,11.6014,9.7832,1.94,170.778846,2021-12-31,1.162538e+06,2.359534e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832414,871642.BJ,2022-12-28,7.40,7.51,7.40,7.48,7.41,0.07,0.9447,720.51,...,17.5794,1.1179,1.4081,16.8165,11.3906,0.28,131.310680,2022-12-31,1.036703e+04,2.616327e+04
832415,871642.BJ,2022-12-29,7.59,7.59,7.42,7.52,7.48,0.04,0.5348,502.34,...,6.0679,1.1952,1.5364,5.5286,1.1733,0.03,-44.272798,2022-06-30,1.036703e+04,2.357047e+04
832416,871642.BJ,2022-12-29,7.59,7.59,7.42,7.52,7.48,0.04,0.5348,502.34,...,17.5794,1.1179,1.4081,16.8165,11.3906,0.28,131.310680,2022-12-31,1.036703e+04,2.616327e+04
832417,871642.BJ,2022-12-30,7.70,7.70,7.41,7.49,7.52,-0.03,-0.3989,325.18,...,6.0679,1.1952,1.5364,5.5286,1.1733,0.03,-44.272798,2022-06-30,1.036703e+04,2.357047e+04


因子标准化、MAD中位数绝对偏差法去极值、市值中性化报告

1. 因子标准化 (Factor Standardization)

方法论:
因子标准化旨在对因子数据进行归一化处理，将不同因子的值转换为具有相同量纲的数据，便于比较和分析。常用方法是 z-score 标准化

实现:
在代码中，通过以下逻辑实现标准化：

for col in factor_columns:
    df[col] = (df[col] - df[col].mean()) / df[col].std()

上述代码逐列对因子值进行标准化，计算其均值和标准差，最终生成均值为 0、标准差为 1 的标准化因子值。

结果:
因子标准化后，各因子值的分布得以调整，消除了不同因子因数值量级不同带来的影响，便于模型训练和分析。

2. MAD 中位数绝对偏差法去极值 (MAD Outlier Removal)

方法论:
MAD (Median Absolute Deviation) 是一种基于中位数的稳健去极值方法，适用于去除数据中的异常值。相比均值和标准差，MAD 更不易受到极端值的影响。
实现:
以下代码实现了 MAD 去极值：

def mad_outlier_removal(df, factor_columns, threshold=3):
    for col in factor_columns:
        median = df[col].median()
        mad = (df[col] - median).abs().median()
        upper_limit = median + threshold * mad
        lower_limit = median - threshold * mad
        df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
    return df

上述逻辑逐列计算 MAD，并将超出范围的值裁剪为设定的上下限。

结果:
去极值后，因子数据中的异常值被平滑处理，使得数据分布更加稳定，减少了异常值对模型的干扰。

3. 市值中性化 (Market Value Neutralization)

方法论:
市值中性化是通过线性回归消除因子值与市值之间的相关性，使因子值独立于市值的影响。
实现:
以下代码实现了市值中性化：

def market_value_neutralize(df, factor_columns, mv_column):
    for col in factor_columns:
        X = df[mv_column].values.reshape(-1, 1)  # 市值作为自变量
        y = df[col].values  # 因子值作为因变量
        model = LinearRegression()
        model.fit(X, y)
        # 计算残差 (因子中性化)
        df[col] = y - model.predict(X)
    return df

上述代码通过 LinearRegression 模型拟合每个因子的市值依赖性，并生成去除市值影响的因子值。
结果:
完成市值中性化后，因子值中不再包含市值相关的线性影响，因子对目标变量的解释力更加纯粹。

In [11]:
# 因子标准化 (Z-score 标准化)
def standardize_factors(df, factor_columns):
    for col in factor_columns:
        df[col] = (df[col] - df[col].mean()) / df[col].std()
    return df
#MAD 中位数绝对偏差法去极值
def mad_outlier_removal(df, factor_columns, threshold=3):
    for col in factor_columns:
        median = df[col].median()
        mad = (df[col] - median).abs().median()
        upper_limit = median + threshold * mad
        lower_limit = median - threshold * mad
        df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
    return df
# 市值中性化
def market_value_neutralize(df, factor_columns, mv_column):
    for col in factor_columns:
        X = df[mv_column].values.reshape(-1, 1) 
        y = df[col].values 
        model = LinearRegression()
        model.fit(X, y)
        # 计算残差 (因子中性化)
        df[col] = y - model.predict(X)
    return df
def filter_valid_factors(df, factor_columns):
    # Find the intersection of DataFrame columns and the factor list
    valid_columns = [col for col in factor_columns if col in df.columns]
    return valid_columns

In [48]:
factor_names = [
    "revenue_growth", "assets_yoy", "ocf_yoy", "ebt_yoy", "netprofit_yoy",
    "revenue_ps", "op_income", "gross_margin", "ebit", "profit_dedt",
    "market_value", "cash_flow_to_mv", "revenue_to_mv", "netprofit_margin",
    "quick_ratio", "current_ratio", "op_of_gr", "roe", "vol_avg_10",
    "vol_avg_20", "vol_avg_60", "vol_avg_120", "vol_ema_5",
    "turnover_10_to_120", "turnover_5_to_120", "annualized_vol_20",
    "annualized_vol_60", "annualized_vol_120", "momentum", "eps"
]
valid_columns = filter_valid_factors(merged_data, factor_names)
factor_data_normalized = mad_outlier_removal(merged_data, valid_columns, threshold=3)
factor_data_normalized = market_value_neutralize(factor_data_normalized, valid_columns, mv_column='market_value')
factor_data_normalized = standardize_factors(factor_data_normalized, valid_columns)
factor_data_normalized

Unnamed: 0,ts_code,trade_date_factor,open,high,low,close,pre_close,change,pct_chg,vol,...,netprofit_margin,quick_ratio,current_ratio,op_of_gr,roe,eps,revenue_growth,trade_date_financial,total_share,market_value
0,000002.SZ,2021-07-02,24.78,24.78,24.04,24.05,24.98,-0.93,-3.7230,887290.38,...,0.173591,-1.219510,-0.077487,0.350925,0.342513,1.184919,1.498343,2021-12-31,1.162538e+06,1.546594
1,000002.SZ,2021-07-05,24.06,24.09,23.55,23.82,24.05,-0.23,-0.9563,605991.53,...,0.173591,-1.219510,-0.077487,0.350925,0.342513,1.184919,1.498343,2021-12-31,1.162538e+06,1.546594
2,000002.SZ,2021-07-06,23.90,24.75,23.76,24.48,23.82,0.66,2.7708,844134.79,...,0.173591,-1.219510,-0.077487,0.350925,0.342513,1.184919,1.498343,2021-12-31,1.162538e+06,1.546594
3,000002.SZ,2021-07-07,24.20,24.48,23.97,24.04,24.48,-0.44,-1.7974,598540.74,...,0.173591,-1.219510,-0.077487,0.350925,0.342513,1.184919,1.498343,2021-12-31,1.162538e+06,1.546594
4,000002.SZ,2021-07-08,24.15,24.25,23.71,24.05,24.04,0.01,0.0416,589155.23,...,0.173591,-1.219510,-0.077487,0.350925,0.342513,1.184919,1.498343,2021-12-31,1.162538e+06,1.546594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832414,871642.BJ,2022-12-28,7.40,7.51,7.40,7.48,7.41,0.07,0.9447,720.51,...,2.269195,0.358833,0.157296,1.960412,1.557308,0.731828,1.012685,2022-12-31,1.036703e+04,-1.217777
832415,871642.BJ,2022-12-29,7.59,7.59,7.42,7.52,7.48,0.04,0.5348,502.34,...,0.809610,0.553504,0.429806,0.642430,0.136419,0.039418,-1.045321,2022-06-30,1.036703e+04,-1.227028
832416,871642.BJ,2022-12-29,7.59,7.59,7.42,7.52,7.48,0.04,0.5348,502.34,...,2.269195,0.358833,0.157296,1.960412,1.557308,0.731828,1.012685,2022-12-31,1.036703e+04,-1.217777
832417,871642.BJ,2022-12-30,7.70,7.70,7.41,7.49,7.52,-0.03,-0.3989,325.18,...,0.809610,0.553504,0.429806,0.642430,0.136419,0.039418,-1.045321,2022-06-30,1.036703e+04,-1.227028


In [49]:
factor_data_normalized.to_csv('data/normalized.csv')

因子IC、Rank_IC与IR计算报告
1. 概念解释
IC（信息系数）：
IC 是用来衡量因子值与未来收益之间的线性关系的指标。简单来说，它反映了一个因子是否能够有效预测未来的收益。如果 IC 是正的，说明因子的预测方向与实际收益方向一致；如果是负的，则说明预测方向相反。

Rank_IC（排名信息系数）：
Rank_IC 是衡量因子值排名与未来收益排名之间关系的指标。它关注的是因子值的排序是否与实际收益的排序一致。相比 IC，Rank_IC 更不容易受到异常值的影响。

IR（信息比率）：
IR 是用来衡量因子预测能力稳定性的重要指标。它反映了一个因子在不同时间段的预测能力是否稳定。如果 IR 高，说明因子在预测未来收益时的表现比较稳定；如果 IR 低，说明因子的预测能力可能时好时坏。

2. 计算步骤
第一步：准备数据
我们准备了一组因子数据和未来收益率数据。因子数据包括多个变量，比如营业收入增长率、总资产增长率等。未来收益率数据则表示不同时间周期内的股票收益，比如未来1天、未来5天、未来20天的收益率。

第二步：分组计算 IC 和 Rank_IC
我们按照时间（交易日期）对数据进行分组，然后计算每个时间段内的因子值和对应的未来收益之间的关系。

IC 测量的是因子值与未来收益之间的线性关系。
Rank_IC 测量的是因子值的排序与未来收益排序之间的关系。
第三步：计算 IR
IR 是基于每个因子的 IC 值计算得出的。我们统计每个因子的 IC 在所有时间段内的平均值，同时评估这些 IC 是否稳定。如果因子在每个时间段的表现都比较一致，那么它的 IR 值就会比较高。

3. 结果分析
IC 和 Rank_IC 的结果：

IC 的大小直接反映了因子预测未来收益的能力。正值表示预测效果好，负值表示预测方向错误。
Rank_IC 的大小则更多反映因子排序的有效性。数值越高，说明因子的排序越准确。
例如，营业收入增长率因子在某些时间段表现为正 IC，说明它在这些时间段具有较好的预测能力。
IR 的结果：

IR 值越高，说明因子的预测能力越稳定。
如果一个因子的 IC 均值高，但波动也很大（即不稳定），那么它的 IR 会比较低，这意味着该因子可能在不同时间段的表现差异较大。
通过 IR，我们可以筛选出稳定性较好的因子用于后续分析或投资策略。
4. 总结
IC 和 Rank_IC 是衡量因子预测能力的基础指标， 它们帮助我们判断哪些因子能够有效预测未来收益。
IR 是一个更综合的指标， 它结合了因子的预测能力和稳定性，帮助我们优先选择高效且稳定的因子。
通过本次分析，我们不仅获得了每个因子的预测表现，还能依据这些指标更科学地优化投资策略。

In [3]:
#读取因子数据
factor_data_normalized = pd.read_csv('data/normalized.csv')

In [4]:
#展示因子数据
factor_data_normalized.columns

Index(['Unnamed: 0', 'ts_code', 'trade_date_factor', 'open', 'high', 'low',
       'close', 'pre_close', 'change', 'pct_chg', 'vol', 'amount', 'vol_avg_5',
       'vol_avg_10', 'vol_avg_20', 'vol_avg_60', 'vol_avg_120', 'vol_ema_5',
       'turnover_10_to_120', 'turnover_5_to_120', 'annualized_vol_20',
       'annualized_vol_60', 'annualized_vol_120', 'vol_mean_20',
       'pct_chg_mean_20', 'momentum', 'true_range', 'atr_6', 'atr_14',
       'ann_date', 'end_date', 'revenue_ps', 'assets_yoy', 'ocf_yoy',
       'ebt_yoy', 'netprofit_yoy', 'op_income', 'gross_margin', 'ebit',
       'profit_dedt', 'bps', 'ocfps', 'netprofit_margin', 'quick_ratio',
       'current_ratio', 'op_of_gr', 'roe', 'eps', 'revenue_growth',
       'trade_date_financial', 'total_share', 'market_value'],
      dtype='object')

In [5]:
#计算未来收益
# Define the return periods (e.g., 1-day, 5-day, 20-day returns)
return_periods = [1, 5, 20]
# Calculate future returns for each period
for period in return_periods:
    column_name = f'future_return_{period}d'
    factor_data_normalized[column_name] = factor_data_normalized.groupby('ts_code')['close'].shift(-period) / factor_data_normalized['close'] - 1
factor_data_normalized

Unnamed: 0.1,Unnamed: 0,ts_code,trade_date_factor,open,high,low,close,pre_close,change,pct_chg,...,op_of_gr,roe,eps,revenue_growth,trade_date_financial,total_share,market_value,future_return_1d,future_return_5d,future_return_20d
0,0,000002.SZ,2021-07-02,24.78,24.78,24.04,24.05,24.98,-0.93,-3.7230,...,0.350925,0.342513,1.184919,1.498343,2021-12-31,1.162538e+06,1.546594,-0.009563,0.000416,-0.140956
1,1,000002.SZ,2021-07-05,24.06,24.09,23.55,23.82,24.05,-0.23,-0.9563,...,0.350925,0.342513,1.184919,1.498343,2021-12-31,1.162538e+06,1.546594,0.027708,0.000000,-0.122586
2,2,000002.SZ,2021-07-06,23.90,24.75,23.76,24.48,23.82,0.66,2.7708,...,0.350925,0.342513,1.184919,1.498343,2021-12-31,1.162538e+06,1.546594,-0.017974,-0.026144,-0.147059
3,3,000002.SZ,2021-07-07,24.20,24.48,23.97,24.04,24.48,-0.44,-1.7974,...,0.350925,0.342513,1.184919,1.498343,2021-12-31,1.162538e+06,1.546594,0.000416,-0.034526,-0.145175
4,4,000002.SZ,2021-07-08,24.15,24.25,23.71,24.05,24.04,0.01,0.0416,...,0.350925,0.342513,1.184919,1.498343,2021-12-31,1.162538e+06,1.546594,0.000416,-0.022037,-0.127235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832414,832414,871642.BJ,2022-12-28,7.40,7.51,7.40,7.48,7.41,0.07,0.9447,...,1.960412,1.557308,0.731828,1.012685,2022-12-31,1.036703e+04,-1.217777,0.005348,,
832415,832415,871642.BJ,2022-12-29,7.59,7.59,7.42,7.52,7.48,0.04,0.5348,...,0.642430,0.136419,0.039418,-1.045321,2022-06-30,1.036703e+04,-1.227028,0.000000,,
832416,832416,871642.BJ,2022-12-29,7.59,7.59,7.42,7.52,7.48,0.04,0.5348,...,1.960412,1.557308,0.731828,1.012685,2022-12-31,1.036703e+04,-1.217777,-0.003989,,
832417,832417,871642.BJ,2022-12-30,7.70,7.70,7.41,7.49,7.52,-0.03,-0.3989,...,0.642430,0.136419,0.039418,-1.045321,2022-06-30,1.036703e+04,-1.227028,0.000000,,


In [6]:
#计算IC和Rank_IC函数
def calculate_ic(data, factor_col, return_col):
    # 计算IC和Rank_IC
    data = data.dropna(axis=0)
    # Pearson 相关系数 (IC)
    ic = data[[factor_col, return_col]].corr().iloc[0, 1]
    # Spearman 相关系数 (Rank_IC)
    rank_ic = spearmanr(data[factor_col], data[return_col])[0]
    return ic, rank_ic

In [7]:
# 全部因子名称
# List of factors
factors = [
    'revenue_growth', 'assets_yoy', 'ocf_yoy', 'ebt_yoy', 'netprofit_yoy',
    'revenue_ps', 'op_income', 'gross_margin', 'ebit', 'profit_dedt',
    'market_value', 'netprofit_margin', 'quick_ratio', 'current_ratio',
    'op_of_gr', 'roe', 'vol_avg_10', 'vol_avg_20', 'vol_avg_60',
    'vol_avg_120', 'vol_ema_5', 'turnover_10_to_120', 'turnover_5_to_120',
    'annualized_vol_20', 'annualized_vol_60', 'annualized_vol_120',
    'momentum', 'eps'
]
future_return_cols = ['future_return_1d', 'future_return_5d', 'future_return_20d']

In [8]:
# Initialize a results list
results = []
# Loop through each factor and calculate IC and Rank_IC for each timeframe
for factor in tqdm(factors):
    for future_return in future_return_cols:
            grouped_results = []
            # Group by trade_date_factor
            for date, group in factor_data_normalized.groupby('trade_date_factor'):
                group = group[[factor, future_return]].dropna()  
                if len(group) > 1: 
                    # Calculate IC and Rank_IC using the provided function
                    ic, rank_ic = calculate_ic(group, factor, future_return)
                    # Store the results
                    grouped_results.append({
                        'trade_date': date,
                        'Factor': factor,
                        'Future_Return': future_return,
                        'IC': ic,
                        'Rank_IC': rank_ic
                    })
                # Append results to the main list
                results.extend(grouped_results)
ic_rank_ic_results = pd.DataFrame(results)
ic_rank_ic_results

  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(data[factor_col], data[return_col])[0]
  rank_ic = spearmanr(da

Unnamed: 0,trade_date,Factor,Future_Return,IC,Rank_IC
0,2021-07-02,revenue_growth,future_return_1d,0.053582,0.073929
1,2021-07-02,revenue_growth,future_return_1d,0.053582,0.073929
2,2021-07-05,revenue_growth,future_return_1d,-0.011547,-0.020001
3,2021-07-02,revenue_growth,future_return_1d,0.053582,0.073929
4,2021-07-05,revenue_growth,future_return_1d,-0.011547,-0.020001
...,...,...,...,...,...
5639895,2022-12-12,eps,future_return_20d,-0.076925,-0.077297
5639896,2022-12-13,eps,future_return_20d,0.036395,0.084874
5639897,2022-12-14,eps,future_return_20d,0.037488,0.072258
5639898,2022-12-15,eps,future_return_20d,0.044156,0.079289


In [9]:
# 计算 IR
ir_results = []
for (factor, future_return), group in ic_rank_ic_results.groupby(['Factor', 'Future_Return']):
    mean_ic = group['IC'].mean()
    std_ic = group['IC'].std()
    ir = mean_ic / std_ic if std_ic != 0 else None  
    ir_results.append({
        'Factor': factor,
        'Future_Return': future_return,
        'Mean_IC': mean_ic,
        'Std_IC': std_ic,
        'IR': ir
    })
# Convert results to a DataFrame
ir_results_df = pd.DataFrame(ir_results)
ir_results_df

Unnamed: 0,Factor,Future_Return,Mean_IC,Std_IC,IR
0,annualized_vol_120,future_return_1d,-0.005409,0.142298,-0.038012
1,annualized_vol_120,future_return_20d,-0.059723,0.151001,-0.395513
2,annualized_vol_120,future_return_5d,-0.022912,0.149591,-0.153161
3,annualized_vol_20,future_return_1d,-0.001826,0.144454,-0.012640
4,annualized_vol_20,future_return_20d,-0.064210,0.133180,-0.482129
...,...,...,...,...,...
79,vol_avg_60,future_return_20d,-0.031646,0.089872,-0.352124
80,vol_avg_60,future_return_5d,-0.009788,0.097950,-0.099932
81,vol_ema_5,future_return_1d,-0.001822,0.109739,-0.016605
82,vol_ema_5,future_return_20d,-0.038957,0.103464,-0.376528


In [10]:
ic_ir_df = pd.merge(ic_rank_ic_results, ir_results_df, how='left', on=['Factor', 'Future_Return'])
ic_ir_df
#结果展示

Unnamed: 0,trade_date,Factor,Future_Return,IC,Rank_IC,Mean_IC,Std_IC,IR
0,2021-07-02,revenue_growth,future_return_1d,0.053582,0.073929,0.006938,0.161213,0.043034
1,2021-07-02,revenue_growth,future_return_1d,0.053582,0.073929,0.006938,0.161213,0.043034
2,2021-07-05,revenue_growth,future_return_1d,-0.011547,-0.020001,0.006938,0.161213,0.043034
3,2021-07-02,revenue_growth,future_return_1d,0.053582,0.073929,0.006938,0.161213,0.043034
4,2021-07-05,revenue_growth,future_return_1d,-0.011547,-0.020001,0.006938,0.161213,0.043034
...,...,...,...,...,...,...,...,...
5639895,2022-12-12,eps,future_return_20d,-0.076925,-0.077297,0.009401,0.090348,0.104055
5639896,2022-12-13,eps,future_return_20d,0.036395,0.084874,0.009401,0.090348,0.104055
5639897,2022-12-14,eps,future_return_20d,0.037488,0.072258,0.009401,0.090348,0.104055
5639898,2022-12-15,eps,future_return_20d,0.044156,0.079289,0.009401,0.090348,0.104055


In [11]:
ic_ir_df.to_csv('data/ic_ir_df.csv')

IC_Rank加权因子得分计算报告

1. 背景与概念

在量化投资中，我们通常需要综合多个因子的预测能力，为每只股票计算一个最终的得分。这一过程通常称为因子得分计算。IC_Rank加权是一种常见的加权方法，它基于因子排序能力（Rank_IC）来分配权重。

IC_Rank加权的核心理念：

Rank_IC（排名信息系数）：衡量因子值排名与未来收益排名之间的相关性。数值越高，因子的排序能力越强。

加权原则：Rank_IC 越高的因子，其权重越大；Rank_IC 越低的因子，其权重越小。

目标：通过因子的 Rank_IC 对每只股票的因子值加权，得到综合得分。

第一步：准备数据

因子值数据（factor_data）：包括每只股票的多个因子值，按时间和股票编号排列。

Rank_IC 数据（rank_ic_data）：每个因子的 Rank_IC 值，表示因子的排序能力。

第二步：权重归一化

Rank_IC 值需要归一化处理，使得所有因子的权重总和为 1。

第三步：分组计算得分

按时间（如 trade_date）对 factor_data 分组，分别处理每个时间段内的数据。

对每个时间段内的股票，按照公式计算综合得分

In [2]:
def normalize_weights(rank_ic_data, type):
    # Normalize Rank_IC values so that the sum equals 1
    rank_ic_data['Weight'] = rank_ic_data[type] / rank_ic_data[type].sum()
    return rank_ic_data[['Factor', 'Weight']]
def calculate_ic_rank_weighted_scores(factor_data, rank_ic_data, method = 'Rank_IC'):
    weights = normalize_weights(rank_ic_data, method)
    weighted_scores = []
    for date, group in tqdm(factor_data.groupby('trade_date_factor')):
        group = group.set_index('ts_code')  
        score_data = pd.DataFrame(index=group.index) 
        for factor in weights['Factor'].unique():
            if factor in group.columns:
                weight = weights.loc[weights['Factor'] == factor, 'Weight'].values[0]
                score_data[factor] = group[factor] * weight
        score_data['IC_Rank_Score'] = score_data.sum(axis=1)
        score_data['trade_date'] = date  
        weighted_scores.append(score_data.reset_index())

    weighted_scores_df = pd.concat(weighted_scores, ignore_index=True)
    return weighted_scores_df[['ts_code', 'trade_date', 'IC_Rank_Score']]

3. 结果分析

通过计算，每只股票在每个时间段都得到了一个综合得分，代表其在多个因子综合作用下的表现

得分解读：

得分越高的股票，说明在多个因子的加权作用下，其未来表现可能越好。

得分较低的股票，则可能在因子综合评价中处于劣势。

In [6]:
ic_ir_df = pd.read_csv('data/ic_ir_df.csv')
ic_ir_df['IC_IR_weight'] = ic_ir_df['Mean_IC'] * ic_ir_df['IR']
factor_data_normalized = pd.read_csv('data/normalized.csv')

In [7]:
weighted_scores = calculate_ic_rank_weighted_scores(factor_data_normalized, ic_ir_df, method='IC_IR_weight')
weighted_scores

100%|██████████| 366/366 [31:31<00:00,  5.17s/it]  


Unnamed: 0,ts_code,trade_date,IC_Rank_Score
0,000002.SZ,2021-07-02,-3.126902e-08
1,000008.SZ,2021-07-02,-6.291505e-07
2,000009.SZ,2021-07-02,3.981148e-07
3,000012.SZ,2021-07-02,4.923310e-07
4,000016.SZ,2021-07-02,1.709334e-07
...,...,...,...
832414,838030.BJ,2022-12-30,-1.981457e-07
832415,838030.BJ,2022-12-30,-1.527890e-07
832416,838810.BJ,2022-12-30,7.066044e-07
832417,871642.BJ,2022-12-30,-1.655591e-07


In [13]:
weighted_scores_IC_rank = pd.read_csv('data/weighted_scores_rank_IC.csv')
weighted_scores_IC = pd.read_csv('data/weighted_scores_IC.csv')
weighted_scores_IC_IR = pd.read_csv('data/weighted_scores_IC_IR.csv')
weighted_scores_agg = pd.merge(left=weighted_scores_IC_rank, right=weighted_scores_IC, how='inner', on=['ts_code', 'trade_date'])
weighted_scores_agg = pd.merge(left=weighted_scores_agg, right=weighted_scores_IC, how='inner', on=['ts_code', 'trade_date'])
weighted_scores_agg = weighted_scores_agg.reset_index(drop=True).drop(labels=['Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 0'], axis=1)
weighted_scores_agg

Unnamed: 0,ts_code,trade_date,IC_Rank_Score_x,IC_Rank_Score_y,IC_Rank_Score
0,000002.SZ,2021-07-02,0.000009,0.000029,0.000029
1,000008.SZ,2021-07-02,0.000005,0.000097,0.000097
2,000009.SZ,2021-07-02,-0.000018,-0.000119,-0.000119
3,000012.SZ,2021-07-02,-0.000012,-0.000078,-0.000078
4,000016.SZ,2021-07-02,-0.000004,0.000013,0.000013
...,...,...,...,...,...
2548960,871642.BJ,2022-12-30,0.000022,0.000078,0.000078
2548961,871642.BJ,2022-12-30,0.000017,0.000111,0.000111
2548962,871642.BJ,2022-12-30,0.000017,0.000111,0.000078
2548963,871642.BJ,2022-12-30,0.000017,0.000078,0.000111


In [None]:
weighted_scores_agg.to_csv('data/weighted_scores_agg.csv')