In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# 1. 基础配置

In [2]:
root_path = Path('../')
BM_path = root_path / 'data' / 'value_market.h5'
csi500_path = root_path / 'data' / 'csi500_mask_monthly.pkl'
mcap_path = root_path / 'data' / 'mcap.pkl'

backtest = pd.to_datetime('2014-01-01')
end = pd.to_datetime('2024-01-01')

# 2. 构建指数掩码与多层索引
# - 依据 CSI500 月度掩码筛出样本股票；
# - 生成 (date, asset) 的 MultiIndex 方便后续对齐。

In [3]:
mask = pd.read_pickle(csi500_path)
mask_slice = mask.loc[backtest:end] # 选取时间区间

tickers = mask_slice.columns[mask_slice.any(axis=0)].tolist()   # 只要这一段时间内曾经为 True 的股票就纳入
tickers = sorted(tickers)
tickers = list(set(tickers))
mask = mask[tickers]
print(len(tickers))

dates = pd.date_range(start=backtest, end=end, freq='ME')
multi_idx = pd.MultiIndex.from_product(
    [dates, tickers],
    names=['date', 'asset']
)
multi_idx

1261


MultiIndex([('2014-01-31', '000417.SZ'),
            ('2014-01-31', '600021.SH'),
            ('2014-01-31', '600348.SH'),
            ('2014-01-31', '000049.SZ'),
            ('2014-01-31', '600141.SH'),
            ('2014-01-31', '603568.SH'),
            ('2014-01-31', '000685.SZ'),
            ('2014-01-31', '688063.SH'),
            ('2014-01-31', '600096.SH'),
            ('2014-01-31', '002624.SZ'),
            ...
            ('2023-12-31', '600429.SH'),
            ('2023-12-31', '300308.SZ'),
            ('2023-12-31', '000683.SZ'),
            ('2023-12-31', '600549.SH'),
            ('2023-12-31', '000572.SZ'),
            ('2023-12-31', '600352.SH'),
            ('2023-12-31', '002292.SZ'),
            ('2023-12-31', '600392.SH'),
            ('2023-12-31', '000735.SZ'),
            ('2023-12-31', '600067.SH')],
           names=['date', 'asset'], length=151320)

# 3. 读取并处理 BM 数据
# - 读取 HDF 中的 BM（账面市值比）；
# - 月末频率对齐，计算对数值 LOGBM；
# - 重建与 `multi_idx` 同步的索引。


In [4]:
BM = pd.read_hdf(BM_path, key='data')
BM = BM[['BM']]
BM = BM.loc[backtest:end]
BM = BM.reset_index()
BM.columns = ['date', 'asset', 'BM']
BM

Unnamed: 0,date,asset,BM
0,2014-01-02,000001.SZ,0.946701
1,2014-01-02,000002.SZ,0.775675
2,2014-01-02,000004.SZ,0.078996
3,2014-01-02,000005.SZ,0.286862
4,2014-01-02,000006.SZ,0.571755
...,...,...,...
8898869,2023-12-29,688799.SH,0.363888
8898870,2023-12-29,688800.SH,0.305521
8898871,2023-12-29,688819.SH,0.545703
8898872,2023-12-29,688981.SH,0.339432


In [5]:
BM = (
    BM.set_index('date')
      .groupby('asset')['BM']     # 只对 BM 这一列做 resample
      .resample('ME')
      .last()
      .reset_index()
)
BM

Unnamed: 0,asset,date,BM
0,000001.SZ,2014-01-31,0.874508
1,000001.SZ,2014-02-28,0.895656
2,000001.SZ,2014-03-31,1.093016
3,000001.SZ,2014-04-30,1.105950
4,000001.SZ,2014-05-31,1.071352
...,...,...,...
440824,689009.SH,2023-08-31,0.218522
440825,689009.SH,2023-09-30,0.148934
440826,689009.SH,2023-10-31,0.157530
440827,689009.SH,2023-11-30,0.156055


In [6]:
BM = BM.set_index(['date', 'asset']).sort_index(level=[0, 1])

BM['LOGBM'] = np.log(BM['BM'])
BM = BM[['LOGBM']]
BM = BM.reindex(index=multi_idx)
BM

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,Unnamed: 1_level_0,LOGBM
date,asset,Unnamed: 2_level_1
2014-01-31,000417.SZ,-0.403396
2014-01-31,600021.SH,-0.188718
2014-01-31,600348.SH,-0.028393
2014-01-31,000049.SZ,-2.983736
2014-01-31,600141.SH,-0.390284
...,...,...
2023-12-31,600352.SH,0.169366
2023-12-31,002292.SZ,-1.335133
2023-12-31,600392.SH,-0.704384
2023-12-31,000735.SZ,-0.469816


# 4. 读取并处理市值数据

In [7]:
mcap = pd.read_pickle(mcap_path)
mcap = mcap.stack(dropna=False).to_frame(name='mcap')
mcap = mcap.loc[backtest:end]
mcap = mcap[mcap.index.get_level_values('asset').isin(tickers)]
mcap['LOGME'] = np.log(mcap['mcap'])
mcap = mcap[['LOGME']]
mcap = mcap.reindex(index=multi_idx)
mcap

  mcap = mcap.stack(dropna=False).to_frame(name='mcap')


Unnamed: 0_level_0,Unnamed: 1_level_0,LOGME
date,asset,Unnamed: 2_level_1
2014-01-31,000417.SZ,22.204540
2014-01-31,600021.SH,22.981338
2014-01-31,600348.SH,23.390907
2014-01-31,000049.SZ,22.902458
2014-01-31,600141.SH,22.347917
...,...,...
2023-12-31,600352.SH,24.031177
2023-12-31,002292.SZ,23.283483
2023-12-31,600392.SH,23.604921
2023-12-31,000735.SZ,22.617015


# 5. 合并与导出
# - 拼接 LOGBM 与 LOGME；
# - 保存为 `BM_mcap.pkl`。

In [8]:
data = pd.concat([BM, mcap], axis=1)
data

Unnamed: 0_level_0,Unnamed: 1_level_0,LOGBM,LOGME
date,asset,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01-31,000417.SZ,-0.403396,22.204540
2014-01-31,600021.SH,-0.188718,22.981338
2014-01-31,600348.SH,-0.028393,23.390907
2014-01-31,000049.SZ,-2.983736,22.902458
2014-01-31,600141.SH,-0.390284,22.347917
...,...,...,...
2023-12-31,600352.SH,0.169366,24.031177
2023-12-31,002292.SZ,-1.335133,23.283483
2023-12-31,600392.SH,-0.704384,23.604921
2023-12-31,000735.SZ,-0.469816,22.617015


In [9]:
data.to_pickle(root_path / 'data' / 'BM_mcap.pkl')