In [1]:
# !conda activate Investor-Sentiment
# !conda install -c conda-forge statsmodels

# 情绪投资策略(2014-2022年全A股市场)

In [2]:
%%time
# from tqdm import tqdm
import pandas as pd
import numpy as np
# from statsmodels.regression.rolling import RollingOLS

CPU times: user 1.66 s, sys: 470 ms, total: 2.13 s
Wall time: 470 ms


## 1.提取用于计算的面板数据

In [3]:
%%time
import os

if not os.path.exists('../DataSets/TEMP_PANEL_FINAL.parquet'):
    from sqlalchemy import create_engine

    ENGINE = create_engine('mysql+mysqlconnector://root:1111@localhost:3306')
    (pd.read_sql_table('TEMP_PANEL_FINAL', ENGINE,
                       schema='COLIN_PANEL',
                       columns=['ts_code', 'trade_date', 'pct_chg', 'total_mv',
                                'riskfree_return', 'index_return', 'img_neg', 'tex_neg']
                       )
     .astype(dtype={'ts_code': 'category', 'trade_date': 'category'})
     .to_parquet('../DataSets/TEMP_PANEL_FINAL.parquet'))


CPU times: user 509 µs, sys: 629 µs, total: 1.14 ms
Wall time: 708 µs


In [4]:
%%time
df_panel = pd.read_parquet('../DataSets/TEMP_PANEL_FINAL.parquet')
# df_panel=df_panel.iloc[:10000, :]
df_panel

CPU times: user 363 ms, sys: 244 ms, total: 607 ms
Wall time: 282 ms


Unnamed: 0,ts_code,trade_date,pct_chg,total_mv,riskfree_return,index_return,img_neg,tex_neg
0,000001.SZ,20140102,-0.1641,10025400.0,5.910,-0.3454,0.000000,0.000000
1,000001.SZ,20140103,-2.4524,9779450.0,6.201,-1.3436,0.000000,0.333333
2,000001.SZ,20140106,-2.1804,9566320.0,6.475,-2.2762,0.285714,0.142857
3,000001.SZ,20140107,-0.3428,9533530.0,5.925,-0.0284,0.000000,0.333333
4,000001.SZ,20140108,1.1192,9640100.0,5.650,0.1747,0.333333,0.000000
...,...,...,...,...,...,...,...,...
7134605,873527.BJ,20221122,-2.0496,54518.8,1.947,0.0118,,
7134606,873527.BJ,20221123,0.1101,54578.8,1.949,0.1049,,
7134607,873527.BJ,20221124,0.0000,54578.8,1.940,-0.4431,,
7134608,873527.BJ,20221125,-0.2200,54458.7,1.924,0.5049,,


## 2.构造分组高低组合

#### 2.1 计算面板数据的异质波动率IDVOL

In [5]:
%%time
from statsmodels.regression.rolling import RollingOLS


def roll_idvol(df_code: pd.DataFrame, ols_window: int, var_ma: int) -> pd.DataFrame:
    """
    滚动OLS回归求异质波动率
    """
    try:
        # 定义回归变量 CAPM回归: (rm-rf)=a+b*(RM-rf)
        df_ols = pd.DataFrame()
        df_ols['Y'] = df_code['pct_chg'] - df_code['riskfree_return']
        df_ols['const'] = 1  #带截距项回归
        df_ols['X'] = df_code['index_return'] - df_code['riskfree_return']

        # 估计参数
        model = RollingOLS(endog=df_ols['Y'].values, exog=df_ols[['const', 'X']], window=ols_window)
        df_para = model.fit().params

        # 预测残差
        df_para['residual'] = df_para['const'] + df_ols['X']*df_para['X'] - df_ols['Y']

        #计算月波动率
        df_para['idvol'] = df_para['residual'].rolling(var_ma).apply(lambda x: np.var(x, ddof=1))

        return pd.concat([df_code[['ts_code', 'trade_date', 'pct_chg', 'total_mv',
                                   'index_return', 'img_neg', 'tex_neg']], df_para[['idvol']]],
                         axis=1)

    except Exception as e:
        print(e)
        return pd.DataFrame()


# 多线程分组计算
if not os.path.exists('../DataSets/TEMP_PANEL_FINAL_IDVOL.parquet'):
    from pandarallel import pandarallel

    pandarallel.initialize(progress_bar=True)
    df_out = (df_panel.groupby('ts_code')
              [['ts_code', 'trade_date', 'pct_chg', 'total_mv',
                'riskfree_return', 'index_return', 'img_neg', 'tex_neg']]
              .parallel_apply(lambda x: roll_idvol(x, 5, 30)))

    # 保存异质波动率计算结果
    df_out.to_parquet('../DataSets/TEMP_PANEL_FINAL_IDVOL.parquet')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=625), Label(value='0 / 625'))), HB…

index 4 is out of bounds for axis 0 with size 2
index 4 is out of bounds for axis 0 with size 1
index 4 is out of bounds for axis 0 with size 2
index 4 is out of bounds for axis 0 with size 3
CPU times: user 14.9 s, sys: 3.85 s, total: 18.7 s
Wall time: 9min 44s


### 2.2 按照异质波动率分组