# 情绪投资策略(2014-2022年全A股市场)

## 数据准备

### 1.1 加载面板数据

In [1]:
%%time
import sys
import os
import numpy as np
import cudf
import pandas as pd

sys.path.append('/home/ubuntu/notebook/Investor-Sentiment')


def load_data():
    # 数据集:个股K线面板数据,个股基本面数据
    data_list = ['ASHARE_BAR_PANEL.parquet', 'ASHARE_BASIC_PANEL.parquet']
    if not set(data_list).issubset(os.listdir('./DataSets/')):
        from loader.findata_loader import DownLoader
        DownLoader(MAX_CORE=10).load_data()


load_data()

CPU times: user 1.3 s, sys: 788 ms, total: 2.09 s
Wall time: 1.55 s


In [2]:
%%time

def extract_panel():
    # 个股K线数据
    df_bar = (cudf.read_parquet('./DataSets/ASHARE_BAR_PANEL.parquet', columns=['trade_date', 'ts_code', 'pct_chg'])
              .rename(columns={'pct_chg': 'share_return'}))

    # 个股基本面数据
    df_basic = cudf.read_parquet('./DataSets/ASHARE_BASIC_PANEL.parquet', columns=['trade_date', 'ts_code', 'total_mv'])

    # 合并
    df_p = cudf.concat([df_bar, df_basic], join="left", axis=1, sort=True)

    # 压缩数据
    df_p.index.levels[1].astype('category', inplace=True)
    return df_p


def extract_time_series():
    # 股指数据
    from utils.sql import DB
    db_loader = DB()
    df_share_index = (
        pd.read_sql_table('399300.SZ', db_loader.ENGINE, 'FIN_DAILY_INDEX', columns=['trade_date', 'pct_chg'])
        .astype(dtype={'trade_date': 'uint32'}).set_index('trade_date')
        .rename(columns={'pct_chg': 'shareindex_return'}))

    # shibor数据
    df_shibor = (pd.read_sql_table('SHIBOR', db_loader.ENGINE, 'FIN_DAILY_INDEX', columns=['trade_date', '3m'])
                 .astype(dtype={'trade_date': 'uint32'}).set_index('trade_date')
                 .rename(columns={'3m': 'riskfree_return'})/360)

    # 情绪数据
    df_sent = pd.concat([pd.read_sql_table('IMG_SENT', db_loader.ENGINE, 'SENT_DAILY').astype(dtype={'trade_date': 'uint32'})
                        .set_index('trade_date').rename(columns={'neg_index': 'img_neg'}),
                         pd.read_sql_table('TEX_SENT', db_loader.ENGINE, 'SENT_DAILY').astype(dtype={'trade_date': 'uint32'})
                        .set_index('trade_date').rename(columns={'neg_index': 'tex_neg'})], axis=1)
    return (
        cudf.from_pandas(pd.concat([df_share_index, df_shibor], join="inner", axis=1, sort=True)),
        pd.concat([df_sent, df_share_index, df_shibor], join="inner", axis=1, sort=True))


def extract_merge():
    df_p = extract_panel()
    df_t, _ = extract_time_series()
    df_m = cudf.merge(left=df_p.reset_index(), right=df_t.reset_index(),
                      left_on='trade_date', right_on='trade_date', how="left",
                      sort=True)
    return df_m.set_index(['trade_date', 'ts_code']).sort_index(ascending=[True, True])


df_panel = extract_merge()
_, df_time_series = extract_time_series()

CPU times: user 2.04 s, sys: 1.24 s, total: 3.28 s
Wall time: 3.29 s


### 1.2 筛选和清洗数据

In [3]:
df_panel = df_panel[df_panel.index.get_level_values('trade_date') >= 20140101].to_pandas()
df_panel

Unnamed: 0_level_0,Unnamed: 1_level_0,share_return,total_mv,shareindex_return,riskfree_return
trade_date,ts_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20140102,000001.SZ,-0.1641,1.002537e+07,-0.3454,0.015460
20140102,000002.SZ,-0.4972,8.799966e+06,-0.3454,0.015460
20140102,000004.SZ,1.3734,9.917646e+04,-0.3454,0.015460
20140102,000005.SZ,-0.4000,2.276691e+05,-0.3454,0.015460
20140102,000006.SZ,-1.2164,6.574476e+05,-0.3454,0.015460
...,...,...,...,...,...
20221130,872374.BJ,-1.7259,,0.1199,0.006092
20221201,301290.SZ,-8.7349,4.856403e+05,1.0831,0.006103
20221201,301311.SZ,12.7436,5.414400e+05,1.0831,0.006103
20221201,870199.BJ,-3.1447,1.498420e+05,1.0831,0.006103


## 2.构造截面异质波动率与市值高低组合

#### 2.1 计算面板数据的异质波动率IDVOL

In [4]:
%%time
from statsmodels.regression.rolling import RollingOLS
# 多线程分组计算
from pandarallel import pandarallel


def roll_idvol(df_code: pd.DataFrame, ols_window: int, var_ma: int) -> pd.DataFrame:
    """
    滚动OLS回归求异质波动率
    """
    try:
        # 索引
        # 估计参数
        model_ols = RollingOLS(endog=df_code[['Y']], exog=df_code[['CONST', 'X']], window=ols_window, )
        df_para = model_ols.fit().params.rename(columns={'CONST': 'Alpha', 'X': 'Beta'})
        # 预测残差 已经对齐了
        df_con = pd.concat([df_code, df_para], axis=1, join='inner')
        df_con['Residual'] = df_con['Alpha'] + df_con['Beta']*df_con['X'] - df_con['Y']
        # 计算月波动率
        df_con['Idvol'] = df_con['Residual'].rolling(var_ma).apply(lambda x: np.var(x, ddof=1))
        return df_con[['share_return', 'total_mv', 'Idvol']]

    except Exception as e:
        print(e)
        return pd.DataFrame(columns=['trade_date', 'ts_code']).set_index(['trade_date', 'ts_code'])


def cal_panel_ols():
    # 定义回归变量 CAPM回归: (rm-rf)=a+b*(RM-rf)
    df_panel['Y'] = df_panel['share_return'] - df_panel['riskfree_return']
    df_panel['CONST'] = 1  # 带截距项回归
    df_panel['X'] = df_panel['shareindex_return'] - df_panel['riskfree_return']

    # 多线程加速
    pandarallel.initialize(progress_bar=True)
    df_out = (df_panel.groupby(level=['ts_code'])[['share_return', 'total_mv', 'Y', 'CONST', 'X']]
              .parallel_apply(lambda x: roll_idvol(x, 5, 30)).droplevel(2))
    # 保存
    df_out.to_parquet('./DataSets/ASHARE_OLS_PANEL.parquet', engine='pyarrow', index=True)


#
if not os.path.exists('./DataSets/ASHARE_OLS_PANEL.parquet'):
    cal_panel_ols()

CPU times: user 275 ms, sys: 15.5 ms, total: 290 ms
Wall time: 289 ms


### 2.2 按照异质波动率分组

上面的面板数据计算完成后,从这里开始运行

In [10]:
import pandas as pd
import numpy as np
import cudf

QUANTILE = 0.5
df_ols_panel = cudf.read_parquet('./DataSets/ASHARE_OLS_PANEL.parquet').reset_index().set_index(['trade_date', 'ts_code']).sort_index()
df_ols_panel

Unnamed: 0_level_0,Unnamed: 1_level_0,share_return,total_mv,Idvol
trade_date,ts_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20140102,000001.SZ,-0.1641,1.002537e+07,
20140102,000002.SZ,-0.4972,8.799966e+06,
20140102,000004.SZ,1.3734,9.917646e+04,
20140102,000005.SZ,-0.4000,2.276691e+05,
20140102,000006.SZ,-1.2164,6.574476e+05,
...,...,...,...,...
20221128,873122.BJ,-2.0158,1.259199e+05,19.27363078
20221128,873169.BJ,-1.5198,5.443271e+04,1.319910083
20221128,873223.BJ,-0.2660,5.898188e+04,0.730762806
20221128,873339.BJ,-0.5093,1.290394e+05,


In [11]:
%%time

# 分组
df_ols_panel['idvol_top'] = df_ols_panel['Idvol'].groupby(level=['trade_date']).transform(lambda x: x.quantile(QUANTILE))
df_ols_panel['idvol_group'] = np.where(df_ols_panel['Idvol'].to_pandas() >= df_ols_panel['idvol_top'].to_pandas(), "HIGH", "LOW")
df_ols_panel = df_ols_panel.reset_index().set_index(['trade_date', 'idvol_group', 'ts_code']).sort_index()

# 求组中市值加权系数,并求回报
df_ols_panel['mv_ratio'] = (df_ols_panel['total_mv']/df_ols_panel
                            .groupby(level=['trade_date', 'idvol_group'])['total_mv'].transform('sum'))
# 求组中回报
df_ols_panel['idvol_vw_ratio'] = df_ols_panel['mv_ratio']*df_ols_panel['share_return']
df_ols_panel['idvol_group_return'] = (df_ols_panel.groupby(level=['trade_date', 'idvol_group'])['idvol_vw_ratio'].transform('sum'))

df_ols_panel

CPU times: user 3.07 s, sys: 987 ms, total: 4.05 s
Wall time: 4.06 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,share_return,total_mv,Idvol,idvol_top,mv_ratio,idvol_vw_ratio,idvol_group_return
trade_date,idvol_group,ts_code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20140102,LOW,000001.SZ,-0.1641,1.002537e+07,,,0.003834,-0.000629,0.104007
20140102,LOW,000002.SZ,-0.4972,8.799966e+06,,,0.003365,-0.001673,0.104007
20140102,LOW,000004.SZ,1.3734,9.917646e+04,,,0.000038,0.000052,0.104007
20140102,LOW,000005.SZ,-0.4000,2.276691e+05,,,0.000087,-0.000035,0.104007
20140102,LOW,000006.SZ,-1.2164,6.574476e+05,,,0.000251,-0.000306,0.104007
...,...,...,...,...,...,...,...,...,...
20221128,LOW,872925.BJ,-0.3987,7.386323e+04,1.366920583,2.944717594,0.000013,-0.000005,-0.769940
20221128,LOW,873169.BJ,-1.5198,5.443271e+04,1.319910083,2.944717594,0.000010,-0.000015,-0.769940
20221128,LOW,873223.BJ,-0.2660,5.898188e+04,0.730762806,2.944717594,0.000011,-0.000003,-0.769940
20221128,LOW,873339.BJ,-0.5093,1.290394e+05,,2.944717594,0.000023,-0.000012,-0.769940


In [12]:
# 提取分组数据
df_ols_panel = df_ols_panel[['idvol_group_return']].to_pandas().droplevel('ts_code')
df_ols_panel = df_ols_panel[~df_ols_panel.index.duplicated(keep='last')]
# 转为时间序列数据
df_series = (df_ols_panel.reset_index()
             .pivot(index='trade_date', columns='idvol_group', values='idvol_group_return'))
# 合并其他时间序列数据
df_series = df_series.join(df_time_series, how='inner').dropna(axis=0)
df_series

Unnamed: 0_level_0,HIGH,LOW,img_neg,tex_neg,shareindex_return,riskfree_return
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20140225,-4.016731,-1.827917,0.000000,0.000000,-2.5559,0.015440
20140226,0.985354,0.193332,0.666667,0.000000,0.2547,0.015393
20140227,-0.781264,0.026462,0.000000,0.000000,-0.4297,0.015378
20140228,1.174221,0.582117,0.333333,0.666667,1.1542,0.015358
20140303,1.989484,0.722860,0.333333,0.333333,0.5231,0.015331
...,...,...,...,...,...,...
20220621,-0.810936,0.125217,0.238095,0.285714,-0.1124,0.005556
20220622,-0.807400,-1.345733,0.227273,0.227273,-1.2702,0.005556
20220623,2.629145,1.459178,0.363636,0.363636,1.7153,0.005556
20220624,1.323591,0.901009,0.200000,0.300000,1.1716,0.005556


## 3.按照观测窗口构造投资策略

In [8]:
def cal_return(df, MA):
    df[f'img_neg_m{MA}'] = (df['img_neg'].rolling(MA).mean())

    # 历史均值
    df['is_ma_img'] = (df['img_neg'] >= df[f'img_neg_m{MA}'])
    df['is_ma_img'] = df['is_ma_img'].shift(1)

    # 高于均值投资
    df['img_return'] = np.where(df['is_ma_img'], -1*(df['is_ma_img']*df['HIGH']), df['index_return'])

    # 换算
    df.dropna(axis=0, inplace=True)

    df['mv_csi300'] = (df['index_return'] + 100)/100
    df['mv_img'] = (df['img_return'] + 100)/100

    df['mv_csi300'] = df['mv_csi300'].cumprod(axis=0)
    df['mv_img'] = df['mv_img'].cumprod(axis=0)

    return df.rename(columns={'mv_img': f'mv_img_{MA}'})


def start():
    df_in = df_series
    for i in [5]:
        df_in = cal_return(df_in, i)
    df_in = df_in[[i for i in df_in.columns if 'mv_' in i]]
