In [345]:
import numpy as np 
import pandas as pd
from statsmodels import api as sm
from scipy import stats

___

## Risk Proxy

In [346]:
proxy6 = pd.read_csv("./Data_Submission/6_proxies_window30month.csv").drop(columns="Unnamed: 0")
proxy6["time"] = pd.to_datetime(proxy6["time"]).dt.to_period("M")
# 改列名
proxy6.rename(columns={"time": "date", 'id': 'asset'}, inplace=True)
proxy6 = proxy6.sort_values(by=["asset", "date"]).reset_index(drop=True)
# age 取倒数
proxy6["age"] = 1 / proxy6["age"]
# 删掉asset列的后缀
proxy6["asset"] = proxy6["asset"].str.slice(0, 6)
proxy6.head(20)

Unnamed: 0,date,asset,beta_MKT,residual,ivol,ret,retvol,age,disp,CFVOL
0,2014-01,1,1.399288,0.027778,0.061489,-0.069388,0.113328,0.003676,0.000419,
1,2014-02,1,1.403706,-0.076547,0.063209,-0.023684,0.113281,0.003663,0.000487,
2,2014-03,1,1.464743,-0.068598,0.06456,-0.032345,0.113047,0.00365,0.000767,
3,2014-04,1,1.477887,0.027113,0.064676,0.034355,0.112855,0.003636,0.000708,
4,2014-05,1,1.468453,0.007723,0.064564,0.032316,0.111651,0.003623,0.000708,
5,2014-06,1,1.542292,0.019366,0.064616,0.048693,0.111841,0.00361,0.000708,
6,2014-07,1,1.510558,-0.101499,0.067427,0.096872,0.11246,0.003597,0.00057,
7,2014-08,1,1.560131,-0.029834,0.067642,-0.057038,0.113178,0.003584,0.000619,0.003055
8,2014-09,1,1.473321,-0.062953,0.068511,-0.010732,0.111907,0.003571,0.000619,0.003068
9,2014-10,1,1.499607,0.003433,0.068469,0.087771,0.11249,0.003559,0.000379,0.003081


In [347]:
# 筛出date在2014-01到2024-01之间的数据
proxy6 = proxy6[(proxy6["date"] >= "2014-01") & (proxy6["date"] < "2024-01")]

proxy6

Unnamed: 0,date,asset,beta_MKT,residual,ivol,ret,retvol,age,disp,CFVOL
0,2014-01,000001,1.399288,0.027778,0.061489,-0.069388,0.113328,0.003676,0.000419,
1,2014-02,000001,1.403706,-0.076547,0.063209,-0.023684,0.113281,0.003663,0.000487,
2,2014-03,000001,1.464743,-0.068598,0.064560,-0.032345,0.113047,0.003650,0.000767,
3,2014-04,000001,1.477887,0.027113,0.064676,0.034355,0.112855,0.003636,0.000708,
4,2014-05,000001,1.468453,0.007723,0.064564,0.032316,0.111651,0.003623,0.000708,
...,...,...,...,...,...,...,...,...,...,...
495172,2023-08,T00018,-0.000013,0.000005,0.000006,0.000000,0.000000,,,
495173,2023-09,T00018,-0.000019,0.000002,0.000006,0.000000,0.000000,,,
495174,2023-10,T00018,-0.000015,-0.000003,0.000006,0.000000,0.000000,,,
495175,2023-11,T00018,-0.000009,-0.000006,0.000007,0.000000,0.000000,,,


# cgo

In [348]:
cgo = pd.read_csv("./Data_Submission/monthly_cgo.csv").drop(columns=['Unnamed: 0', 'date'])
cgo.rename(columns={"month": "date"}, inplace=True)

# asset 列补齐6位
cgo["asset"] = cgo["asset"].apply(lambda x: str(x).zfill(6))
# 建立 time 
cgo["date"] = pd.to_datetime(cgo["date"]).dt.to_period("M")
cgo = cgo.sort_values(by=["asset", "date"]).reset_index(drop=True)

cgo.head(100)

Unnamed: 0,asset,price,ref_price,cgo,date,turnover
0,000006,134.50423,151.621274,-0.082665,2014-01,0.185758
1,000006,132.34970,149.597034,-0.058902,2014-02,0.192694
2,000006,167.74555,146.789898,-0.045868,2014-03,0.422786
3,000006,149.27815,154.473968,-0.018014,2014-04,0.503996
4,000006,144.96909,153.527388,-0.015897,2014-05,0.119384
...,...,...,...,...,...,...
95,000006,168.93588,189.302830,-0.141027,2021-12,0.229979
96,000006,162.49662,186.462227,-0.091509,2022-01,0.149080
97,000006,165.52686,184.952797,-0.077893,2022-02,0.101913
98,000006,174.99636,181.783827,-0.116091,2022-03,0.181121


In [349]:
cgo = cgo[(cgo["date"] >= "2014-01") & (cgo["date"] < "2024-01")]

cgo

Unnamed: 0,asset,price,ref_price,cgo,date,turnover
0,000006,134.50423,151.621274,-0.082665,2014-01,0.185758
1,000006,132.34970,149.597034,-0.058902,2014-02,0.192694
2,000006,167.74555,146.789898,-0.045868,2014-03,0.422786
3,000006,149.27815,154.473968,-0.018014,2014-04,0.503996
4,000006,144.96909,153.527388,-0.015897,2014-05,0.119384
...,...,...,...,...,...,...
151315,689009,30.63000,36.345720,-0.153832,2023-08,0.123550
151316,689009,34.83000,35.851665,-0.022580,2023-09,0.151428
151317,689009,32.76000,35.738279,-0.078403,2023-10,0.064330
151318,689009,35.28000,35.497298,-0.062475,2023-11,0.167780


# 筛选

In [350]:
csi500_mask = pd.read_pickle("./Data_Submission/csi500_mask_monthly.pkl")

# 转换为长表
csi500_long = csi500_mask.stack().reset_index()
csi500_long.columns = ['date', 'stock', 'mask']

csi500_long["date"] = pd.to_datetime(csi500_long["date"]).dt.to_period("M")
csi500_long["stock"] = csi500_long["stock"].str.slice(0, 6)

csi500_long


Unnamed: 0,date,stock,mask
0,2007-01,000005,False
1,2007-01,000006,True
2,2007-01,000008,False
3,2007-01,000009,False
4,2007-01,000012,False
...,...,...,...
361013,2024-10,688777,True
361014,2024-10,688778,True
361015,2024-10,688779,True
361016,2024-10,688819,True


In [351]:
# 只保留 mask == True 的行
csi500_components = csi500_long[csi500_long['mask']].copy()

# 丢弃 mask 列（因为全是 True）
csi500_components = csi500_components[['date', 'stock']].reset_index(drop=True)
# rename
csi500_components.rename(columns={'stock': 'asset'}, inplace=True)
csi500_components = csi500_components.sort_values(by=['asset', 'date']).reset_index(drop=True)
csi500_components = csi500_components[(csi500_components["date"] >= "2014-01") & (csi500_components["date"] < "2024-01")]

csi500_components

Unnamed: 0,date,asset
120,2014-01,000006
121,2014-02,000006
122,2014-03,000006
123,2014-04,000006
124,2014-05,000006
...,...,...
106985,2023-08,689009
106986,2023-09,689009
106987,2023-10,689009
106988,2023-11,689009


In [None]:
untradable = pd.read_pickle('./Data_Submission/UNTRADABLE.pkl').reset_index()
blacklist = pd.read_pickle('./Data_Submission/BLACKLIST.pkl').reset_index()
trading_calendar = pd.read_csv('./Data_Submission/stock_calendar_2008_2024.csv')

# 合并两个黑名单
combined = pd.concat([blacklist, untradable], ignore_index=True)

# 按 (date, asset) 去重（保留唯一组合）
combined_dedup = combined.drop_duplicates(subset=['date', 'asset']).reset_index(drop=True)

# 排序
combined_dedup = combined_dedup.sort_values(by=['asset', 'date']).reset_index(drop=True)
trading_calendar = trading_calendar.sort_values(by=['date']).reset_index(drop=True)

# asset列只保留前6位
combined_dedup['asset'] = combined_dedup['asset'].str[:6]

trading_calendar['date'] = pd.to_datetime(trading_calendar['date'])
combined_dedup['date'] = pd.to_datetime(combined_dedup['date'])
combined_dedup = combined_dedup[(combined_dedup['date']>="2014-01-01") & (combined_dedup['date']<'2024-01-01')].reset_index(drop=True)
trading_calendar = trading_calendar[(trading_calendar['date']>="2014-01-01") & (trading_calendar['date']<'2024-01-01')].reset_index(drop=True)

# 找到 trading_calendar 中每个月的最后一个交易日
monthly_last = (
    trading_calendar
    .groupby(trading_calendar['date'].dt.to_period('M'))['date']
    .max()
    .reset_index(drop=True)
)

# 筛选 combined_dedup，只保留这些月末交易日的数据
black = combined_dedup[combined_dedup['date'].isin(monthly_last)].sort_values(by=['date', 'asset']).reset_index(drop=True)

black

Unnamed: 0,date,asset
0,2014-01-30,000017
1,2014-01-30,000018
2,2014-01-30,000035
3,2014-01-30,000037
4,2014-01-30,000045
...,...,...
64044,2023-12-29,873679
64045,2023-12-29,873693
64046,2023-12-29,873703
64047,2023-12-29,873726


In [353]:
# black to month
blacklist_month = black.copy()
blacklist_month['date'] = blacklist_month['date'].dt.to_period('M')

# 执行 merge 并标记
merged = csi500_components.merge(
    blacklist_month,
    on=['date', 'asset'],
    how='left',
    indicator=True
)

# 被保留的（不在黑名单中）
clean_csi500 = merged[merged['_merge'] == 'left_only'].drop(columns='_merge')

# 被剔除的（在黑名单中）
removed = merged[merged['_merge'] == 'both'].drop(columns='_merge')

In [354]:
clean_csi500

Unnamed: 0,date,asset
0,2014-01,000006
1,2014-02,000006
2,2014-03,000006
3,2014-04,000006
4,2014-05,000006
...,...,...
59995,2023-08,689009
59996,2023-09,689009
59997,2023-10,689009
59998,2023-11,689009


## LOGBM & LOGME & MOM
___

In [355]:
lmm_data = pd.read_csv("./Data_Submission/LOGBM_LOGME_MOM.csv")
lmm_data["date"] = pd.to_datetime(lmm_data["date"]).dt.to_period("M")
lmm_data = lmm_data.sort_values(by=["asset", "date"]).reset_index(drop=True)
lmm_data["asset"] = lmm_data["asset"].str.slice(0, 6)
lmm_data = lmm_data[(lmm_data["date"] >= "2014-01") & (lmm_data["date"] < "2024-01")]

lmm_data

Unnamed: 0,date,asset,LOGBM,LOGME,mom_minus1_0,mom_minus12_minus1,mom_minus36_minus12,mom_minus11_minus2
0,2014-01,000006,-0.364087,22.498130,-0.113590,-0.044386,0.333161,0.191247
1,2014-02,000006,-0.347977,22.481982,-0.016018,-0.100907,0.236880,0.174344
2,2014-03,000006,-0.575377,22.707912,0.253488,0.024275,-0.016162,0.064608
3,2014-04,000006,-0.471066,22.602345,-0.100186,0.313097,-0.029359,-0.078152
4,2014-05,000006,-0.441797,22.573055,-0.028866,0.039759,0.215648,0.265258
...,...,...,...,...,...,...,...,...
151315,2023-08,689009,-1.520869,23.898672,-0.039390,-0.210084,,-0.038110
151316,2023-09,689009,-1.904254,23.941480,0.042502,-0.127904,,0.086875
151317,2023-10,689009,-1.848140,23.885864,-0.058570,0.088437,,0.000899
151318,2023-11,689009,-1.857547,23.895274,0.009454,-0.017675,-0.585856,0.142342


表格合成
___

In [356]:
# inner merge
data_panel = pd.merge(clean_csi500, proxy6, on=['date', 'asset'], how='left')
data_panel = pd.merge(data_panel, cgo, on=['date', 'asset'], how='left')
data_panel = pd.merge(data_panel, lmm_data, on=['date', 'asset'], how='left')
data_panel = data_panel.sort_values(by=['date', 'asset']).reset_index(drop=True)

data_panel

Unnamed: 0,date,asset,beta_MKT,residual,ivol,ret,retvol,age,disp,CFVOL,price,ref_price,cgo,turnover,LOGBM,LOGME,mom_minus1_0,mom_minus12_minus1,mom_minus36_minus12,mom_minus11_minus2
0,2014-01,000006,1.590473,-0.059596,0.069884,-0.113590,0.115292,0.003846,,0.064005,134.50423,151.621274,-0.082665,0.185758,-0.364087,22.498130,-0.113590,-0.044386,0.333161,0.191247
1,2014-01,000021,1.656603,0.206591,0.076926,0.263258,0.133417,0.004202,,0.090787,89.51140,74.851397,0.082632,0.226100,-0.719351,23.007004,0.263258,0.132255,-0.546931,0.183111
2,2014-01,000028,1.060526,0.040834,0.092545,0.142117,0.122377,0.004098,,0.032958,201.94872,141.584955,0.293832,0.137941,-1.971884,23.447015,0.142117,0.316231,0.268081,0.182717
3,2014-01,000030,1.012152,-0.071794,0.211851,-0.034539,0.230610,0.004115,,0.082261,12.30939,13.128609,-0.066552,0.276862,-0.732993,22.754480,-0.034539,-0.337691,0.012128,-0.333333
4,2014-01,000031,1.515985,-0.018262,0.054016,-0.043127,0.102956,0.004132,,0.030274,48.29065,54.824986,-0.148251,0.047347,-0.294459,22.585600,-0.043127,-0.139791,-0.282083,-0.126026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57398,2023-12,688772,,,,0.039188,0.177714,0.040000,0.021115,0.037765,22.27412,20.516041,0.024410,0.387591,-1.303239,23.929748,0.039188,0.141908,,-0.090473
57399,2023-12,688778,,,,-0.005519,0.144525,0.037037,0.011731,0.055941,56.24916,64.883019,-0.201379,0.183250,-0.678592,23.537438,-0.005519,-0.276210,,-0.320315
57400,2023-12,688779,,,,-0.021448,0.107004,0.037037,0.018505,0.087274,7.49710,10.312308,-0.426306,0.392223,-0.639694,23.368255,-0.021448,-0.478536,,-0.467899
57401,2023-12,688819,0.706309,-0.051722,,-0.056757,0.133183,0.029412,0.014327,0.072635,29.31600,34.554888,-0.182941,0.358491,-0.605681,24.024313,-0.056757,-0.180683,,-0.172940


In [357]:
monthly_ret = pd.read_pickle("./Data_Submission/monthly_returns.pkl")

# 转换为长表
monthly_ret = monthly_ret.stack().reset_index()
monthly_ret.columns = ['date', 'stock', 'return']

monthly_ret["date"] = pd.to_datetime(monthly_ret["date"]).dt.to_period("M")
monthly_ret["stock"] = monthly_ret["stock"].str.slice(0, 6)

# sort based on date and stock，然后把未来一期的return对齐到当前行
monthly_ret = monthly_ret.sort_values(by=['stock', 'date']).reset_index(drop=True)
monthly_ret['future_return'] = monthly_ret.groupby('stock')['return'].shift(-1)

monthly_ret = monthly_ret[(monthly_ret["date"] >= "2014-01") & (monthly_ret["date"] < "2024-01")]

monthly_ret

Unnamed: 0,date,stock,return,future_return
227,2014-01,000001,-0.069388,-0.023684
228,2014-02,000001,-0.023684,-0.032345
229,2014-03,000001,-0.032345,0.034355
230,2014-04,000001,0.034355,0.032316
231,2014-05,000001,0.032316,0.048693
...,...,...,...,...
801892,2023-08,T00018,0.000000,0.000000
801893,2023-09,T00018,0.000000,0.000000
801894,2023-10,T00018,0.000000,0.000000
801895,2023-11,T00018,0.000000,0.000000


In [358]:
data_panel = pd.merge(data_panel, monthly_ret[['date', 'stock', 'future_return']].rename(columns={'stock': 'asset'}), on=['date', 'asset'], how='left')

data_panel = data_panel.sort_values(by=['date', 'asset']).reset_index(drop=True)

data_panel

Unnamed: 0,date,asset,beta_MKT,residual,ivol,ret,retvol,age,disp,CFVOL,...,ref_price,cgo,turnover,LOGBM,LOGME,mom_minus1_0,mom_minus12_minus1,mom_minus36_minus12,mom_minus11_minus2,future_return
0,2014-01,000006,1.590473,-0.059596,0.069884,-0.113590,0.115292,0.003846,,0.064005,...,151.621274,-0.082665,0.185758,-0.364087,22.498130,-0.113590,-0.044386,0.333161,0.191247,-0.016018
1,2014-01,000021,1.656603,0.206591,0.076926,0.263258,0.133417,0.004202,,0.090787,...,74.851397,0.082632,0.226100,-0.719351,23.007004,0.263258,0.132255,-0.546931,0.183111,-0.077961
2,2014-01,000028,1.060526,0.040834,0.092545,0.142117,0.122377,0.004098,,0.032958,...,141.584955,0.293832,0.137941,-1.971884,23.447015,0.142117,0.316231,0.268081,0.182717,-0.078858
3,2014-01,000030,1.012152,-0.071794,0.211851,-0.034539,0.230610,0.004115,,0.082261,...,13.128609,-0.066552,0.276862,-0.732993,22.754480,-0.034539,-0.337691,0.012128,-0.333333,0.255537
4,2014-01,000031,1.515985,-0.018262,0.054016,-0.043127,0.102956,0.004132,,0.030274,...,54.824986,-0.148251,0.047347,-0.294459,22.585600,-0.043127,-0.139791,-0.282083,-0.126026,-0.090141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57398,2023-12,688772,,,,0.039188,0.177714,0.040000,0.021115,0.037765,...,20.516041,0.024410,0.387591,-1.303239,23.929748,0.039188,0.141908,,-0.090473,-0.390277
57399,2023-12,688778,,,,-0.005519,0.144525,0.037037,0.011731,0.055941,...,64.883019,-0.201379,0.183250,-0.678592,23.537438,-0.005519,-0.276210,,-0.320315,-0.240666
57400,2023-12,688779,,,,-0.021448,0.107004,0.037037,0.018505,0.087274,...,10.312308,-0.426306,0.392223,-0.639694,23.368255,-0.021448,-0.478536,,-0.467899,-0.230137
57401,2023-12,688819,0.706309,-0.051722,,-0.056757,0.133183,0.029412,0.014327,0.072635,...,34.554888,-0.182941,0.358491,-0.605681,24.024313,-0.056757,-0.180683,,-0.172940,-0.116046


___

## Score

In [359]:
def grouped_qcut(series, q=101):
    out = pd.Series(np.nan, index=series.index)
    valid = series.dropna()
    out.loc[valid.index] = pd.qcut(valid, q, labels=False, duplicates='raise')
    return out
# 假设高 LOGBM 是好股票（被低估），那么它应该有低的 Mispricing Score
data_panel['rank_LOGBM'] = data_panel.groupby('date')['LOGBM'].transform(
    lambda s: 100 - grouped_qcut(s, 101) # 反向
)
# 动量同理，如果是 Winners (高动量) 预期收益高，则应对应低 Mispricing
data_panel['rank_Mom_minus11_minus_2'] = data_panel.groupby('date')['mom_minus11_minus2'].transform(
    lambda s: 100 - grouped_qcut(s, 101)
)
data_panel['score'] = (data_panel['rank_LOGBM'] + data_panel['rank_Mom_minus11_minus_2'])/2

data_panel

Unnamed: 0,date,asset,beta_MKT,residual,ivol,ret,retvol,age,disp,CFVOL,...,LOGBM,LOGME,mom_minus1_0,mom_minus12_minus1,mom_minus36_minus12,mom_minus11_minus2,future_return,rank_LOGBM,rank_Mom_minus11_minus_2,score
0,2014-01,000006,1.590473,-0.059596,0.069884,-0.113590,0.115292,0.003846,,0.064005,...,-0.364087,22.498130,-0.113590,-0.044386,0.333161,0.191247,-0.016018,19.0,32.0,25.5
1,2014-01,000021,1.656603,0.206591,0.076926,0.263258,0.133417,0.004202,,0.090787,...,-0.719351,23.007004,0.263258,0.132255,-0.546931,0.183111,-0.077961,40.0,33.0,36.5
2,2014-01,000028,1.060526,0.040834,0.092545,0.142117,0.122377,0.004098,,0.032958,...,-1.971884,23.447015,0.142117,0.316231,0.268081,0.182717,-0.078858,96.0,33.0,64.5
3,2014-01,000030,1.012152,-0.071794,0.211851,-0.034539,0.230610,0.004115,,0.082261,...,-0.732993,22.754480,-0.034539,-0.337691,0.012128,-0.333333,0.255537,42.0,99.0,70.5
4,2014-01,000031,1.515985,-0.018262,0.054016,-0.043127,0.102956,0.004132,,0.030274,...,-0.294459,22.585600,-0.043127,-0.139791,-0.282083,-0.126026,-0.090141,16.0,73.0,44.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57398,2023-12,688772,,,,0.039188,0.177714,0.040000,0.021115,0.037765,...,-1.303239,23.929748,0.039188,0.141908,,-0.090473,-0.390277,79.0,50.0,64.5
57399,2023-12,688778,,,,-0.005519,0.144525,0.037037,0.011731,0.055941,...,-0.678592,23.537438,-0.005519,-0.276210,,-0.320315,-0.240666,47.0,85.0,66.0
57400,2023-12,688779,,,,-0.021448,0.107004,0.037037,0.018505,0.087274,...,-0.639694,23.368255,-0.021448,-0.478536,,-0.467899,-0.230137,46.0,97.0,71.5
57401,2023-12,688819,0.706309,-0.051722,,-0.056757,0.133183,0.029412,0.014327,0.072635,...,-0.605681,24.024313,-0.056757,-0.180683,,-0.172940,-0.116046,45.0,64.0,54.5


___

## 归一化

In [360]:
key_elements = ['date', 'asset', 'beta_MKT', 'ivol', 'retvol', 'age', 'disp', 'CFVOL', 'LOGBM', 'LOGME', 'mom_minus1_0', 'mom_minus12_minus1', 'mom_minus36_minus12', 'mom_minus11_minus2', 'cgo', 'turnover', 'future_return', 'score']
data_panel = data_panel[key_elements]
data_panel = data_panel.sort_values(by=['date', 'asset']).reset_index(drop=True)

data_panel

Unnamed: 0,date,asset,beta_MKT,ivol,retvol,age,disp,CFVOL,LOGBM,LOGME,mom_minus1_0,mom_minus12_minus1,mom_minus36_minus12,mom_minus11_minus2,cgo,turnover,future_return,score
0,2014-01,000006,1.590473,0.069884,0.115292,0.003846,,0.064005,-0.364087,22.498130,-0.113590,-0.044386,0.333161,0.191247,-0.082665,0.185758,-0.016018,25.5
1,2014-01,000021,1.656603,0.076926,0.133417,0.004202,,0.090787,-0.719351,23.007004,0.263258,0.132255,-0.546931,0.183111,0.082632,0.226100,-0.077961,36.5
2,2014-01,000028,1.060526,0.092545,0.122377,0.004098,,0.032958,-1.971884,23.447015,0.142117,0.316231,0.268081,0.182717,0.293832,0.137941,-0.078858,64.5
3,2014-01,000030,1.012152,0.211851,0.230610,0.004115,,0.082261,-0.732993,22.754480,-0.034539,-0.337691,0.012128,-0.333333,-0.066552,0.276862,0.255537,70.5
4,2014-01,000031,1.515985,0.054016,0.102956,0.004132,,0.030274,-0.294459,22.585600,-0.043127,-0.139791,-0.282083,-0.126026,-0.148251,0.047347,-0.090141,44.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57398,2023-12,688772,,,0.177714,0.040000,0.021115,0.037765,-1.303239,23.929748,0.039188,0.141908,,-0.090473,0.024410,0.387591,-0.390277,64.5
57399,2023-12,688778,,,0.144525,0.037037,0.011731,0.055941,-0.678592,23.537438,-0.005519,-0.276210,,-0.320315,-0.201379,0.183250,-0.240666,66.0
57400,2023-12,688779,,,0.107004,0.037037,0.018505,0.087274,-0.639694,23.368255,-0.021448,-0.478536,,-0.467899,-0.426306,0.392223,-0.230137,71.5
57401,2023-12,688819,0.706309,,0.133183,0.029412,0.014327,0.072635,-0.605681,24.024313,-0.056757,-0.180683,,-0.172940,-0.182941,0.358491,-0.116046,54.5


In [361]:
data_panel.isna().sum()

date                      0
asset                     0
beta_MKT               2319
ivol                   7541
retvol                    0
age                       0
disp                   2502
CFVOL                  2706
LOGBM                     7
LOGME                     1
mom_minus1_0              0
mom_minus12_minus1        0
mom_minus36_minus12    3272
mom_minus11_minus2        0
cgo                     273
turnover                  3
future_return             0
score                     7
dtype: int64

In [362]:
from scipy.stats.mstats import winsorize

# 定义要处理的列
cols_to_process = [
    'beta_MKT', 'ivol', 'retvol', 'age', 'disp', 'CFVOL', 'LOGBM', 'LOGME',
    'mom_minus1_0', 'mom_minus12_minus1', 'mom_minus36_minus12', 
    'mom_minus11_minus2', 'turnover','cgo', 'score'
]

# 1. Winsorize：1% - 99% 每个截面（按 date）
def winsorize_series(series, limits=(0.01, 0.01)):
    # winsorize 会自动忽略 NaN
    return winsorize(series, limits=limits, nan_policy='omit')

# 应用 winsorize（按 date 分组）
for col in cols_to_process:
    data_panel[col] = data_panel.groupby('date')[col].transform(
        lambda x: winsorize_series(x, limits=(0.01, 0.01))
    )

# 2. 截面标准化（Z-score）：跳过 NaN
def zscore_series(series):
    return (series - series.mean()) / series.std()

for col in cols_to_process:
    data_panel[col] = data_panel.groupby('date')[col].transform(zscore_series)

data_panel

Unnamed: 0,date,asset,beta_MKT,ivol,retvol,age,disp,CFVOL,LOGBM,LOGME,mom_minus1_0,mom_minus12_minus1,mom_minus36_minus12,mom_minus11_minus2,cgo,turnover,future_return,score
0,2014-01,000006,0.714892,-0.385213,-0.253707,-0.698990,,-0.195242,0.852785,-0.290561,-1.010538,-0.434441,1.405009,0.228706,-0.437893,-0.381935,-0.016018,-1.735858
1,2014-01,000021,0.902618,-0.210387,0.403084,-0.672945,,0.330226,0.302500,0.840230,2.201666,0.015993,-1.109247,0.205026,0.858371,-0.227697,-0.077961,-0.956005
2,2014-01,000028,-0.789488,0.177394,0.003037,-0.682054,,-0.804418,-1.637611,1.817995,1.169076,0.485133,1.219088,0.203879,2.514612,-0.564752,-0.078858,1.029076
3,2014-01,000030,-0.926812,3.139430,3.924975,-0.680567,,0.162950,0.281369,0.279086,-0.336719,-1.182369,0.487879,-1.298161,-0.311534,-0.033620,0.255537,1.454450
4,2014-01,000031,0.503439,-0.779173,-0.700685,-0.679068,,-0.857062,0.960635,-0.096190,-0.409915,-0.677725,-0.352625,-0.694764,-0.952216,-0.911117,-0.090141,-0.388839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57398,2023-12,688772,,,1.188984,2.479040,1.459955,-0.483142,-0.825818,0.268728,1.004084,0.500160,,-0.035467,1.150507,0.641422,-0.390277,0.647214
57399,2023-12,688778,,,0.525038,2.227011,0.561135,0.007475,0.054129,-0.952300,0.288576,-0.910079,,-0.952374,-0.640407,-0.292542,-0.240666,0.714458
57400,2023-12,688779,,,-0.225561,2.227011,1.209956,0.853260,0.108926,-1.478866,0.033653,-1.592488,,-1.541132,-2.424496,0.662592,-0.230137,0.961017
57401,2023-12,688819,-0.840286,,0.298146,1.578404,0.809786,0.458108,0.156839,0.563051,-0.531444,-0.587884,,-0.364451,-0.494163,0.508415,-0.116046,0.198923


In [363]:
data_panel.to_pickle("./Data_Submission/final_data_panel_proxies_window30month.pkl")

___