# 情绪投资策略(2014-2022年全A股市场)

## 1.数据准备

In [1]:
import sys
import os
import numpy as np
import cudf  #CUDA计算
import pandas as pd

sys.path.append('/home/ubuntu/notebook/Investor-Sentiment')
sys.path.append('/usr/local/stata17/utilities')
from pystata import config  #Stata
from statsmodels.regression.rolling import RollingOLS  #滚动回归
from pandarallel import pandarallel  #多线程groupby Apply

config.init('mp')


  ___  ____  ____  ____  ____ ©
 /__    /   ____/   /   ____/      17.0
___/   /   /___/   /   /___/       MP—Parallel Edition

 Statistics and Data Science       Copyright 1985-2021 StataCorp LLC
                                   StataCorp
                                   4905 Lakeway Drive
                                   College Station, Texas 77845 USA
                                   800-STATA-PC        https://www.stata.com
                                   979-696-4600        stata@stata.com

Stata license: Single-user 8-core , expiring  1 Jan 2025
Serial number: 501709301094
  Licensed to: Colin's Stata
               Love U

Notes:
      1. Unicode is supported; see help unicode_advice.
      2. More than 2 billion observations are allowed; see help obs_advice.
      3. Maximum number of variables is set to 5,000; see help set_maxvar.


### 1.1 下载和合并面板数据

In [2]:
%%time

# 下载数据
def load_data():
    # 数据集:个股K线面板数据,个股基本面数据
    data_list = ['ASHARE_BAR_PANEL.parquet', 'ASHARE_BASIC_PANEL.parquet']
    from loader.findata_loader import DownLoader
    if not set(data_list).issubset(os.listdir('./DataSets/')): DownLoader(MAX_CORE=10).load_data()


load_data()

CPU times: user 72.5 ms, sys: 10.6 ms, total: 83.2 ms
Wall time: 82.1 ms


In [10]:
%%time

# 提取面板数据
def extract_panel():
    # 个股K线数据
    df_bar = (
        cudf.read_parquet('./DataSets/ASHARE_BAR_PANEL.parquet', columns=['trade_date', 'ts_code', 'pct_chg'])
        .rename(columns={'pct_chg': 'share_return'})
    )

    # 个股基本面数据
    df_basic = cudf.read_parquet('./DataSets/ASHARE_BASIC_PANEL.parquet', columns=['trade_date', 'ts_code', 'total_mv'])

    # 合并
    df_p = cudf.concat([df_bar, df_basic], join="left", axis=1, sort=True)

    # 压缩数据
    df_p.index.levels[1].astype('category', inplace=True)

    return df_p


# 提取时间序列数据
def extract_time_series():
    # 股指数据
    from utils.sql import DB
    db_loader = DB()
    df_share_index = (
        pd.read_sql_table('399300.SZ', db_loader.ENGINE, 'FIN_DAILY_INDEX', columns=['trade_date', 'pct_chg'])
        .astype(dtype={'trade_date': 'uint32'}).set_index('trade_date').rename(columns={'pct_chg': 'shareindex_return'})
    )

    # shibor数据
    df_shibor = (
            pd.read_sql_table('SHIBOR', db_loader.ENGINE, 'FIN_DAILY_INDEX', columns=['trade_date', '3m'])
            .astype(dtype={'trade_date': 'uint32'}).set_index('trade_date').rename(columns={'3m': 'riskfree_return'})/360
    )

    # 情绪数据
    df_sent = pd.concat(
        [pd.read_sql_table('IMG_SENT', db_loader.ENGINE, 'SENT_DAILY').astype(dtype={'trade_date': 'uint32'})
         .set_index('trade_date').rename(columns={'neg_index': 'img_neg'}),
         pd.read_sql_table('TEX_SENT', db_loader.ENGINE, 'SENT_DAILY').astype(dtype={'trade_date': 'uint32'})
         .set_index('trade_date').rename(columns={'neg_index': 'tex_neg'})
         ], axis=1
    )

    return (cudf.from_pandas(pd.concat([df_share_index, df_shibor], join="inner", axis=1, sort=True)),
            pd.concat([df_sent, df_share_index, df_shibor], join="inner", axis=1, sort=True))


# 合并数据
def extract_merge():
    df_p = extract_panel()
    df_t, _ = extract_time_series()
    df_m = cudf.merge(
        left=df_p.reset_index(), right=df_t.reset_index(), left_on='trade_date', right_on='trade_date', how="left", sort=True
    )
    return df_m.set_index(['trade_date', 'ts_code']).sort_index(ascending=[True, True])


df_panel = extract_merge()
_, df_time_series = extract_time_series()

CPU times: user 1.01 s, sys: 852 ms, total: 1.86 s
Wall time: 1.87 s


### 1.2 筛选和清洗数据

In [11]:
df_panel = df_panel[df_panel.index.get_level_values('trade_date') >= 20140101]
df_panel

Unnamed: 0_level_0,Unnamed: 1_level_0,share_return,total_mv,shareindex_return,riskfree_return
trade_date,ts_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20140102,000001.SZ,-0.1641,10025372.09,-0.3454,0.015460
20140102,000002.SZ,-0.4972,8799966.25,-0.3454,0.015460
20140102,000004.SZ,1.3734,99176.4638,-0.3454,0.015460
20140102,000005.SZ,-0.4000,227669.0681,-0.3454,0.015460
20140102,000006.SZ,-1.2164,657447.5874,-0.3454,0.015460
...,...,...,...,...,...
20221130,872374.BJ,-1.7259,,0.1199,0.006092
20221201,301290.SZ,-8.7349,485640.3232,1.0831,0.006103
20221201,301311.SZ,12.7436,541440.0,1.0831,0.006103
20221201,870199.BJ,-3.1447,149842.0,1.0831,0.006103


## 2.构造截面异质波动率与市值高低组合

#### 2.1 计算面板数据的异质波动率IDVOL

In [29]:
%%time

# 滚动OLS回归求异质波动率
def roll_idvol(df_code: pd.DataFrame, ols_window: int, var_ma: int) -> pd.DataFrame:
    try:
        # 估计参数
        model_ols = RollingOLS(endog=df_code[['Y']].to_pandas(), exog=df_code[['CONST', 'X']].to_pandas(), window=ols_window)
        df_para = model_ols.fit().params.rename(columns={'CONST': 'Alpha', 'X': 'Beta'})

        # 预测残差 已经对齐了
        df_con = cudf.concat([df_code, cudf.from_pandas(df_para)], axis=1, join='inner')
        df_con['Residual'] = df_con['Alpha'] + df_con['Beta']*df_con['X'] - df_con['Y']

        # 计算月波动率
        df_con['Idvol'] = df_con['Residual'].rolling(var_ma).var(ddof=1)
        return df_con[['share_return', 'total_mv', 'Idvol']]

    except IndexError:
        return cudf.DataFrame(columns=['trade_date', 'ts_code']).set_index(['trade_date', 'ts_code'])


# 分组计算
def cal_panel_ols():
    # 定义回归变量 CAPM回归: (rm-rf)=a+b*(RM-rf)
    df_panel['Y'] = df_panel['share_return'] - df_panel['riskfree_return']
    df_panel['CONST'] = 1  # 带截距项回归
    df_panel['X'] = df_panel['shareindex_return'] - df_panel['riskfree_return']

    # 多线程加速
    # pandarallel.initialize(progress_bar=True)
    df_out = (
        df_panel.groupby(level=['ts_code'])[['share_return', 'total_mv', 'Y', 'CONST', 'X']]
        .apply(lambda x: roll_idvol(x, 5, 30))
    )
    return df_out
    # 保存


#
cal_panel_ols()



CPU times: user 3min 9s, sys: 1min, total: 4min 9s
Wall time: 2min 7s


Unnamed: 0_level_0,Unnamed: 1_level_0,share_return,total_mv,Idvol
trade_date,ts_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20140102,000001.SZ,-0.1641,1.002537e+07,
20140103,000001.SZ,-2.4524,9.779451e+06,
20140106,000001.SZ,-2.1804,9.566320e+06,
20140107,000001.SZ,-0.3428,9.533530e+06,
20140108,000001.SZ,1.1192,9.640096e+06,
...,...,...,...,...
20151028,603998.SH,-3.0001,5.224250e+05,3.805418467
20151029,603998.SH,1.0850,5.280943e+05,3.111689124
20151030,603998.SH,1.3155,5.350392e+05,3.173017989
20151102,603998.SH,1.2185,5.415589e+05,3.141213307


In [None]:
%%time

# 滚动OLS回归求异质波动率
def roll_idvol(df_code: pd.DataFrame, ols_window: int, var_ma: int) -> pd.DataFrame:
    try:
        # 估计参数
        model_ols = RollingOLS(endog=df_code[['Y']], exog=df_code[['CONST', 'X']], window=ols_window)
        df_para = model_ols.fit().params.rename(columns={'CONST': 'Alpha', 'X': 'Beta'})

        # 预测残差 已经对齐了
        df_con = pd.concat([df_code, df_para], axis=1, join='inner')
        df_con['Residual'] = df_con['Alpha'] + df_con['Beta']*df_con['X'] - df_con['Y']

        # 计算月波动率
        df_con['Idvol'] = df_con['Residual'].rolling(var_ma).apply(lambda x: np.var(x, ddof=1))
        return df_con[['share_return', 'total_mv', 'Idvol']]

    except Exception as e:
        print(e)
        return pd.DataFrame(columns=['trade_date', 'ts_code']).set_index(['trade_date', 'ts_code'])


# 分组计算
def cal_panel_ols():
    # 定义回归变量 CAPM回归: (rm-rf)=a+b*(RM-rf)
    df_panel['Y'] = df_panel['share_return'] - df_panel['riskfree_return']
    df_panel['CONST'] = 1  # 带截距项回归
    df_panel['X'] = df_panel['shareindex_return'] - df_panel['riskfree_return']

    # 多线程加速
    pandarallel.initialize(progress_bar=True)
    df_out = (
        df_panel.groupby(level=['ts_code'])[['share_return', 'total_mv', 'Y', 'CONST', 'X']]
        .parallel_apply(lambda x: roll_idvol(x, 5, 30)).droplevel(2)
    )

    # 保存
    df_out.to_parquet('./DataSets/ASHARE_OLS_PANEL.parquet', engine='pyarrow', index=True)


#
if not os.path.exists('./DataSets/ASHARE_OLS_PANEL.parquet'):
    cal_panel_ols()

### 2.2 面板数据异质波动率分组

上面的面板数据计算完成后,从这里开始运行

In [31]:
%%time
QUANTILE = 0.5
df_ols_panel = cudf.read_parquet('./DataSets/ASHARE_OLS_PANEL.parquet').reset_index().set_index(['trade_date', 'ts_code']).sort_index()
df_ols_panel

CPU times: user 123 ms, sys: 87.3 ms, total: 210 ms
Wall time: 209 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,share_return,total_mv,Idvol
trade_date,ts_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20140102,000001.SZ,-0.1641,1.002537e+07,
20140102,000002.SZ,-0.4972,8.799966e+06,
20140102,000004.SZ,1.3734,9.917646e+04,
20140102,000005.SZ,-0.4000,2.276691e+05,
20140102,000006.SZ,-1.2164,6.574476e+05,
...,...,...,...,...
20221128,873122.BJ,-2.0158,1.259199e+05,19.27363078
20221128,873169.BJ,-1.5198,5.443271e+04,1.319910083
20221128,873223.BJ,-0.2660,5.898188e+04,0.730762806
20221128,873339.BJ,-0.5093,1.290394e+05,


In [32]:
%%time

# 分组
df_ols_panel['idvol_top'] = df_ols_panel['Idvol'].groupby(level=['trade_date']).transform(lambda x: x.quantile(QUANTILE))
df_ols_panel['idvol_group'] = np.where(df_ols_panel['Idvol'].to_pandas() >= df_ols_panel['idvol_top'].to_pandas(), "HIGH", "LOW")
df_ols_panel = df_ols_panel.reset_index().set_index(['trade_date', 'idvol_group', 'ts_code']).sort_index()

# 求组中市值加权系数,并求回报
df_ols_panel['mv_ratio'] = (
        df_ols_panel['total_mv']/df_ols_panel.groupby(level=['trade_date', 'idvol_group'])['total_mv'].transform('sum'))

# 求组中回报
df_ols_panel['idvol_vw_ratio'] = df_ols_panel['mv_ratio']*df_ols_panel['share_return']
df_ols_panel['idvol_group_return'] = (df_ols_panel.groupby(level=['trade_date', 'idvol_group'])['idvol_vw_ratio'].transform('sum'))
df_ols_panel

CPU times: user 3.08 s, sys: 985 ms, total: 4.07 s
Wall time: 4.07 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,share_return,total_mv,Idvol,idvol_top,mv_ratio,idvol_vw_ratio,idvol_group_return
trade_date,idvol_group,ts_code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20140102,LOW,000001.SZ,-0.1641,1.002537e+07,,,0.003834,-0.000629,0.104007
20140102,LOW,000002.SZ,-0.4972,8.799966e+06,,,0.003365,-0.001673,0.104007
20140102,LOW,000004.SZ,1.3734,9.917646e+04,,,0.000038,0.000052,0.104007
20140102,LOW,000005.SZ,-0.4000,2.276691e+05,,,0.000087,-0.000035,0.104007
20140102,LOW,000006.SZ,-1.2164,6.574476e+05,,,0.000251,-0.000306,0.104007
...,...,...,...,...,...,...,...,...,...
20221128,LOW,872925.BJ,-0.3987,7.386323e+04,1.366920583,2.944717594,0.000013,-0.000005,-0.769940
20221128,LOW,873169.BJ,-1.5198,5.443271e+04,1.319910083,2.944717594,0.000010,-0.000015,-0.769940
20221128,LOW,873223.BJ,-0.2660,5.898188e+04,0.730762806,2.944717594,0.000011,-0.000003,-0.769940
20221128,LOW,873339.BJ,-0.5093,1.290394e+05,,2.944717594,0.000023,-0.000012,-0.769940


In [None]:
# 提取分组数据
df_ols_panel = df_ols_panel[['idvol_group_return']].to_pandas().droplevel('ts_code')
df_ols_panel = df_ols_panel[~df_ols_panel.index.duplicated(keep='last')]
# 转为时间序列数据
df_series = (df_ols_panel.reset_index().pivot(index='trade_date', columns='idvol_group', values='idvol_group_return'))
# 合并其他时间序列数据
df_series = df_series.join(df_time_series, how='inner').dropna(axis=0).reset_index()
df_series

## 3.VAR模型分析

### 3.1 回归前数据处理

In [5]:
# 增加平方项
def add_square_column(df, square_column: list): return pd.concat([df, df[square_column].pow(2).add_suffix('_s')], axis=1)


# 增加日期虚拟变量
def add_dummy_column(df, dummy_column: str):
    df_weekday = pd.get_dummies(pd.to_datetime(df[dummy_column], format='%Y%m%d').dt.weekday, prefix='weekday', drop_first=True)
    df_month = pd.get_dummies(pd.to_datetime(df[dummy_column], format='%Y%m%d').dt.month, prefix='month', drop_first=True)
    return pd.concat([df, df_weekday, df_month], axis=1)


df_series_ols = add_dummy_column(add_square_column(df_series, ['shareindex_return']), 'trade_date')
df_series_ols

NameError: name 'df_series' is not defined

### 3.2 回归结果

In [None]:
%%stata -d df_series_ols -force
sum

In [3]:
%%stata?

In [4]:
%%stata -d df_series_ols -force  -ret DICTIONARY
# @formatter:off
ge time = _n
tsset time
var shareindex_return img_neg shareindex_return_s, lags(1/5) exog(month_*weekday_*)
# @formatter:on

NameError: name 'df_series_ols' is not defined

In [None]:
DICTIONARY

## 4.按照观测窗口构造投资策略

In [None]:
def cal_return(df, ma):
    df[f'img_neg_m{ma}'] = (df['img_neg'].rolling(ma).mean())

    # 历史均值
    df['sell_signal'] = df['img_neg'] >= df[f'img_neg_m{ma}']
    df['sell_signal'] = df['sell_signal'].shift(1)

    # 高于均值投资
    df['img_return'] = np.where(df['sell_signal'], -1*(df['sell_signal']*df['HIGH']), df['shareindex_return'])

    # 去掉空行
    df.dropna(axis=0, inplace=True)

    # 换算
    df['mv_shareindex'] = ((df['shareindex_return'] + 100)/100)
    df['mv_img'] = ((df['img_return'] + 100)/100)
    df['mv_shareindex'] = df['mv_shareindex'].cumprod(axis=0)
    df['mv_img'] = df['mv_img'].cumprod(axis=0)

    return df.rename(columns={'mv_img': f'mv_img_{ma}'})


def start():
    df_in = df_series
    for i in [5, ]:
        df_in = cal_return(df_in, i)

    return df_in

# start()