# 情绪投资策略(2014-2022年全A股市场)

## 数据准备

### 1.1 加载数据

In [1]:
import os
import sys

# Add the project directory to the import path (presumably so that the
# ``loader`` package used below can be imported from this notebook).
sys.path.append('/home/ubuntu/notebook/Investor-Sentiment')

# Required datasets: per-stock daily bar panel and per-stock fundamentals panel.
data_list = ['ASHARE_BAR_PANEL.parquet', 'ASHARE_BASIC_PANEL.parquet']
present_files = set(os.listdir('./DataSets/'))
if any(fname not in present_files for fname in data_list):
    # At least one required file is missing locally: run the project's downloader.
    from loader.findata_loader import DownLoader

    DownLoader(MAX_CORE=10).load_data()



In [3]:
%%time
import cudf

# Read only the three columns needed here, then index by (trade_date, ts_code).
bar_columns = ['trade_date', 'ts_code', 'pct_chg']
df = cudf.read_parquet('./DataSets/ASHARE_BAR_PANEL.parquet', columns=bar_columns)
df = df.set_index(['trade_date', 'ts_code'])
df

CPU times: user 78.4 ms, sys: 57.1 ms, total: 136 ms
Wall time: 134 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,pct_chg
trade_date,ts_code,Unnamed: 2_level_1
19970626,000001.SZ,
19970627,000001.SZ,
19970702,000001.SZ,
19970703,000001.SZ,
19970704,000001.SZ,
...,...,...
20221122,873527.BJ,-2.0496
20221123,873527.BJ,0.1101
20221124,873527.BJ,0.0
20221125,873527.BJ,-0.22


## 1.提取用于计算的面板数据

In [None]:
%%time
# from tqdm import tqdm
import pandas as pd
import tqdm
import numpy as np
# from statsmodels.regression.rolling import RollingOLS

In [None]:
%%time
import os

# Build the local parquet cache of the panel from MySQL, but only when the
# cache file does not exist yet.
if not os.path.exists('../DataSets/TEMP_PANEL_FINAL.parquet'):
    from sqlalchemy import create_engine

    # The connection URL can be supplied through the SENTIMENT_DB_URL
    # environment variable so credentials need not live in the notebook; the
    # previously hard-coded URL is kept as the fallback so existing setups
    # keep working unchanged.
    ENGINE = create_engine(
        os.environ.get('SENTIMENT_DB_URL', 'mysql+mysqlconnector://root:1111@localhost:3306'))
    (pd.read_sql_table('TEMP_PANEL_FINAL', ENGINE,
                       schema='COLIN_PANEL',
                       columns=['ts_code', 'trade_date', 'pct_chg', 'total_mv',
                                'riskfree_return', 'index_return', 'img_neg', 'tex_neg']
                       )
     # Store the two key columns as categoricals before writing the cache.
     .astype(dtype={'ts_code': 'category', 'trade_date': 'category'})
     .to_parquet('../DataSets/TEMP_PANEL_FINAL.parquet'))


In [None]:
%%time
import pandas as pd

# Load the cached panel written by the previous cell and display it.
PANEL_CACHE_PATH = '../DataSets/TEMP_PANEL_FINAL.parquet'
df_panel = pd.read_parquet(PANEL_CACHE_PATH)
df_panel

## 2.构造截面异质波动率与市值高低组合

### 2.1 计算面板数据的异质波动率IDVOL

In [None]:
%%time
from statsmodels.regression.rolling import RollingOLS
import os
import pandas as pd
import numpy as np


def roll_idvol(df_code: pd.DataFrame, ols_window: int, var_ma: int) -> pd.DataFrame:
    """
    Compute idiosyncratic volatility (IDVOL) for one stock via rolling CAPM OLS.

    The excess stock return ``pct_chg - riskfree_return`` is regressed, with an
    intercept, on the excess index return ``index_return - riskfree_return``
    using ``RollingOLS``. Each row's residual is evaluated with the rolling
    coefficients reported for that same row, and ``idvol`` is the rolling
    sample variance (``ddof=1``) of those residuals.

    Parameters
    ----------
    df_code : pd.DataFrame
        Rows of a single stock. Must contain the columns ``ts_code``,
        ``trade_date``, ``pct_chg``, ``total_mv``, ``riskfree_return``,
        ``index_return``, ``img_neg`` and ``tex_neg``.
        NOTE(review): rows are presumably in chronological order; nothing
        here sorts them, so confirm at the call site.
    ols_window : int
        Number of rows in each rolling regression window.
    var_ma : int
        Number of residuals in each rolling variance window.

    Returns
    -------
    pd.DataFrame
        The pass-through columns (all of the above except ``riskfree_return``)
        plus ``idvol``. On any failure the exception message is printed and an
        empty DataFrame is returned, so one bad stock does not abort a
        grouped / parallel apply.
    """
    try:
        # Regression variables for the CAPM form: (r_i - rf) = a + b * (r_m - rf).
        df_ols = pd.DataFrame()
        df_ols['Y'] = df_code['pct_chg'] - df_code['riskfree_return']
        df_ols['const'] = 1  # intercept term
        df_ols['X'] = df_code['index_return'] - df_code['riskfree_return']

        # Rolling parameter estimates (columns: const, X).
        model = RollingOLS(endog=df_ols['Y'].values, exog=df_ols[['const', 'X']], window=ols_window)
        df_para = model.fit().params

        # Residual = actual - fitted. (The sign does not affect the variance.)
        fitted = df_para['const'] + df_ols['X'] * df_para['X']
        residual = df_ols['Y'] - fitted

        # Built-in rolling sample variance instead of a per-window Python lambda.
        idvol = residual.rolling(var_ma).var(ddof=1)

        return pd.concat([df_code[['ts_code', 'trade_date', 'pct_chg', 'total_mv',
                                   'index_return', 'img_neg', 'tex_neg']],
                          idvol.to_frame('idvol')],
                         axis=1)

    except Exception as e:
        # Deliberate best-effort: report the problem and return an empty frame.
        print(e)
        return pd.DataFrame()


# Compute IDVOL for every stock and cache the result, but only when the cache
# file does not exist yet.
if not os.path.exists('../DataSets/TEMP_PANEL_FINAL_IDVOL.parquet'):
    # Parallel per-stock computation.
    from pandarallel import pandarallel

    # Scale the risk-free return on a derived frame instead of overwriting
    # df_panel in place: re-running this cell after a failure would otherwise
    # divide the same column by 360 a second time.
    # NOTE(review): 360 presumably converts an annualised rate to a daily one — confirm.
    panel_scaled_rf = df_panel.assign(riskfree_return=df_panel['riskfree_return'] / 360)

    pandarallel.initialize(progress_bar=True)
    df_out = (panel_scaled_rf.groupby('ts_code')
              [['ts_code', 'trade_date', 'pct_chg', 'total_mv',
                'riskfree_return', 'index_return', 'img_neg', 'tex_neg']]
              # OLS window of 5 rows, residual-variance window of 30 rows.
              .parallel_apply(lambda x: roll_idvol(x, 5, 30)))

    # Save the IDVOL results.
    df_out.to_parquet('../DataSets/TEMP_PANEL_FINAL_IDVOL.parquet')

### 2.2 按照异质波动率分组

上面的面板数据计算完成后,从这里开始运行

In [None]:
%%time
import pandas as pd
import numpy as np

QUANTILE = 0.4

# Load the cached IDVOL results. The stored frame carries ts_code both as a
# column and as an index level (next to an unnamed level that reset_index
# exposes as 'level_1'), so drop the column, flatten the index, discard
# 'level_1' and re-index by (trade_date, ts_code).
idvol_raw = pd.read_parquet('../DataSets/TEMP_PANEL_FINAL_IDVOL.parquet')
idvol_flat = idvol_raw.drop(columns='ts_code').reset_index().drop(columns='level_1')
df_panel = idvol_flat.set_index(['trade_date', 'ts_code']).sort_index()

# Grouping: per-date QUANTILE cutoff of idvol, broadcast back to every row.
idvol_by_date = df_panel['idvol'].groupby(level=['trade_date'])
df_panel['idvol_top'] = idvol_by_date.transform(lambda s: s.quantile(QUANTILE))
# NOTE(review): a NaN idvol compares False here, so such rows are labelled
# 'LOW' — confirm this is intended.
is_high = df_panel['idvol'] >= df_panel['idvol_top']
df_panel['idvol_group'] = np.where(is_high, 'HIGH', "LOW")
df_panel['idvol_group'] = df_panel['idvol_group'].astype('category')
df_panel = df_panel.reset_index().set_index(['trade_date', 'idvol_group', 'ts_code']).sort_index()

# Market-value weight of each stock within its (trade_date, idvol_group) cell.
# The builtin sum is wrapped in a lambda on purpose: it propagates NaN, unlike
# the pandas 'sum' reducer, which skips missing values.
group_levels = ['trade_date', 'idvol_group']
group_total_mv = (df_panel.groupby(level=group_levels)['total_mv']
                  .transform(lambda s: sum(s)))
df_panel['mv_ratio'] = df_panel['total_mv'] / group_total_mv

# Value-weighted group return: sum of weight * pct_chg within each cell.
df_panel['idvol_vw_ratio'] = df_panel['mv_ratio'] * df_panel['pct_chg']
df_panel['idvol_group_return'] = (df_panel.groupby(level=group_levels)['idvol_vw_ratio']
                                  .transform(lambda s: sum(s)))
df_panel

In [None]:
# Extract group-level data: keep the group return and the two sentiment
# columns, then retain a single row (the last one) per (trade_date, idvol_group).
group_level_cols = ['idvol_group_return', 'img_neg', 'tex_neg']
df_panel = df_panel[group_level_cols].reset_index().set_index(['trade_date', 'idvol_group'])
is_repeated_key = df_panel.index.duplicated(keep='last')
df_panel = df_panel[~is_repeated_key].reset_index()
df_panel

In [None]:
# Convert to time-series form: one row per trade_date, one column per idvol
# group, with trade_date cast to str before it becomes the index.
group_returns_wide = df_panel.pivot(index='trade_date', columns='idvol_group',
                                    values='idvol_group_return')
df_series = group_returns_wide.reset_index().astype(dtype={'trade_date': 'str'})
df_series = df_series.set_index('trade_date')
df_series

In [None]:
# Join the other (index-level) data onto the group-return series.
import os

from sqlalchemy import create_engine

# The connection URL can be supplied through the SENTIMENT_DB_URL environment
# variable so credentials need not live in the notebook; the previously
# hard-coded URL is kept as the fallback so existing setups keep working.
# `os` is imported here because this part of the notebook may be run in a
# fresh session without the earlier cells.
ENGINE = create_engine(
    os.environ.get('SENTIMENT_DB_URL', 'mysql+mysqlconnector://root:1111@localhost:3306'))
df_index = (pd.read_sql_table('TEMP_MERGE_INDEX', ENGINE, schema='FIN_DAILY_INDEX')
            .set_index('trade_date').sort_index())
# Left-join on the trade_date index, then drop every row that has any missing value.
df_new = df_series.join(df_index).dropna(axis=0)
df_new

## 3.按照观测窗口构造投资策略

In [None]:
def cal_return(df, MA):
    """
    Build cumulative value curves for the sentiment-timing strategy.

    A signal is on for a row when the *previous* row's ``img_neg`` was at or
    above its ``MA``-row rolling mean. When the signal is on the strategy
    return is minus the ``HIGH`` return; otherwise it is ``index_return``.
    Returns are treated as percentages and compounded via ``(r + 100) / 100``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``img_neg``, ``HIGH`` and ``index_return``.
        The frame is not modified.
    MA : int
        Rolling-mean window (in rows) for ``img_neg``.

    Returns
    -------
    pd.DataFrame
        A new frame with rows containing any missing value removed and the
        added columns ``img_neg_m{MA}``, ``is_ma_img``, ``img_return``,
        ``mv_csi300`` (compounded index return) and ``mv_img_{MA}``
        (compounded strategy return).
    """
    # Work on a private copy so the caller's frame is neither extended with
    # helper columns nor shortened by the dropna below.
    df = df.copy()

    ma_col = f'img_neg_m{MA}'
    df[ma_col] = df['img_neg'].rolling(MA).mean()

    # Signal versus the historical mean, lagged one row so each row trades on
    # the previous row's observation. The first row becomes NaN after the shift.
    df['is_ma_img'] = (df['img_neg'] >= df[ma_col]).shift(1)

    # Signal on -> minus the HIGH return, otherwise the index return.
    # .eq(True) yields a plain boolean mask (NaN -> False), which keeps the
    # resulting return column float instead of object dtype; the NaN-signal
    # row is removed by the dropna below either way.
    signal_on = df['is_ma_img'].eq(True)
    df['img_return'] = np.where(signal_on, -1 * df['HIGH'], df['index_return'])

    # Drop warm-up rows and any other row with missing values (in-place on
    # the private copy only).
    df.dropna(axis=0, inplace=True)

    # Percentage returns -> growth factors -> cumulative value.
    df['mv_csi300'] = ((df['index_return'] + 100) / 100).cumprod(axis=0)
    df['mv_img'] = ((df['img_return'] + 100) / 100).cumprod(axis=0)

    return df.rename(columns={'mv_img': f'mv_img_{MA}'})


# Run the strategy for each observation window, feeding each result into the
# next call, then keep only the cumulative-value ('mv_') columns.
df_in = df_new
for window in [5]:
    df_in = cal_return(df_in, window)
mv_columns = [col for col in df_in.columns if 'mv_' in col]
df_in = df_in[mv_columns]
df_in

In [None]:
# Write the cumulative-value curves to CSV.
INVEST_CSV_PATH = '../DataSets/invest.csv'
df_in.to_csv(INVEST_CSV_PATH)