# 情绪对股票市场的影响

## 1.数据准备
风险偏好
事件

In [1]:
import sys
import os
import numpy as np
import cudf  # CUDA (GPU) computation
import pandas as pd

sys.path.append('/home/ubuntu/notebook/Investor-Sentiment')
sys.path.append('/usr/local/stata17/utilities')
from pystata import config  # Stata
from statsmodels.regression.rolling import RollingOLS  # rolling regression
from statsmodels.regression.linear_model import OLS  # OLS regression
from pandarallel import pandarallel  # parallel groupby-apply

# Start the embedded Stata session; 'mp' selects the edition string passed to pystata.
config.init('mp')

# ------------------------------ Dataset path ----------------------------------#
DATASETS_PATH = './DataSets/'


  ___  ____  ____  ____  ____ ©
 /__    /   ____/   /   ____/      17.0
___/   /   /___/   /   /___/       MP—Parallel Edition

 Statistics and Data Science       Copyright 1985-2021 StataCorp LLC
                                   StataCorp
                                   4905 Lakeway Drive
                                   College Station, Texas 77845 USA
                                   800-STATA-PC        https://www.stata.com
                                   979-696-4600        stata@stata.com

Stata license: Single-user 8-core , expiring  1 Jan 2025
Serial number: 501709301094
  Licensed to: Colin's Stata
               Love U

Notes:
      1. Unicode is supported; see help unicode_advice.
      2. More than 2 billion observations are allowed; see help obs_advice.
      3. Maximum number of variables is set to 5,000; see help set_maxvar.


### 1.1 下载和合并面板数据

In [2]:
# Download the datasets when they are not cached locally
def load_data():
    """Make sure the two panel parquet files exist under ``DATASETS_PATH``.

    Looks for the per-stock bar (K-line) panel and the per-stock fundamentals
    panel. When either file is missing -- including the case where the dataset
    directory itself does not exist yet -- the project ``DownLoader`` is asked
    to load the data. Nothing is returned.
    """
    # Datasets: per-stock K-line panel, per-stock fundamentals panel
    data_list = ['ASHARE_BAR_PANEL.parquet', 'ASHARE_BASIC_PANEL.parquet']

    # Per-file existence checks do not raise when the directory is absent,
    # unlike os.listdir() on a missing path.
    if all(os.path.exists(os.path.join(DATASETS_PATH, name)) for name in data_list):
        return

    # Imported lazily: the project loader is only needed when a download is required.
    from loader.findata_loader import DownLoader
    DownLoader(MAX_CORE=10).load_data()


load_data()

其他时间序列数据源

In [3]:
# Time-series source: consensus forecast for the index
def get_time_con_idx() -> pd.DataFrame:
    """Build the daily consensus-forecast series for index code '000300'.

    Reads ``CON_FORECAST_IDX.parquet`` with cudf, keeps rows from 20131231
    onward whose forecast year equals the calendar year of the forecast date,
    derives an implied price (``CON_PE * CON_EPS``) and its row-over-row
    percentage change.

    Returns:
        A pandas frame indexed by ``trade_date`` (YYYYMMDD integer) with the
        columns ``con_return`` and ``CON_PE``.
    """
    forecast = cudf.read_parquet(DATASETS_PATH + 'CON_FORECAST_IDX.parquet')

    # Research sample: a single index
    forecast = forecast[forecast['INDEX_CODE'] == '000300']

    # Dates become YYYYMMDD integers so they can be compared and joined numerically
    forecast['CON_DATE'] = forecast['CON_DATE'].dt.strftime('%Y%m%d')
    forecast = forecast.astype(dtype={'CON_DATE': 'uint32', 'CON_YEAR': 'uint32'})

    # Sample period
    in_sample = forecast['CON_DATE'] >= 20131231
    forecast = forecast[in_sample]

    # Short-horizon forecasts only: forecast year == year of the forecast date
    same_year = (forecast['CON_DATE'] // 10000) == forecast['CON_YEAR']
    forecast = forecast[same_year]

    # Implied price and its percentage return relative to the previous row
    forecast['CON_PRICE'] = forecast['CON_PE'] * forecast['CON_EPS']
    previous_price = forecast['CON_PRICE'].shift(1)
    forecast['con_return'] = ((forecast['CON_PRICE'] / previous_price) - 1) * 100

    # Keep only what downstream code uses, keyed the same way as the other series
    result = forecast[['CON_DATE', 'con_return', 'CON_PE']]
    result = result.rename(columns={'CON_DATE': 'trade_date'})
    result = result.set_index('trade_date')
    return result.to_pandas()


get_time_con_idx()

Unnamed: 0_level_0,con_return,CON_PE
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1
20131231,,8.5859
20140102,-0.498829,7.5674
20140103,-1.154334,7.4791
20140106,-1.516669,7.3722
20140107,-0.208883,7.3652
...,...,...
20221024,-2.524273,9.7854
20221025,-0.187077,9.7775
20221026,0.495724,9.8452
20221027,-0.554767,9.8273


提取

In [4]:
# Extract the panel data
def extract_panel():
    """Load the per-stock bar and fundamentals panels and combine them column-wise.

    Reads ``pct_chg`` (renamed to ``share_return``) from the bar panel and
    ``total_mv`` from the fundamentals panel, then concatenates the two cudf
    frames along axis 1.

    Returns:
        The combined cudf frame.

    NOTE(review): the code below accesses ``df_p.index.levels``, so it assumes
    the parquet files restore a MultiIndex -- confirm against the files.
    """
    # Per-stock K-line (bar) data
    df_bar = (
            cudf.read_parquet('./DataSets/ASHARE_BAR_PANEL.parquet', columns=['trade_date', 'ts_code', 'pct_chg'])
            .rename(columns={'pct_chg': 'share_return'})
    )

    # Per-stock fundamentals data
    df_basic = cudf.read_parquet('./DataSets/ASHARE_BASIC_PANEL.parquet', columns=['trade_date', 'ts_code', 'total_mv'])

    # Combine
    # NOTE(review): join="left" is not one of the documented concat join modes
    # ('inner' / 'outer'); confirm how the installed cudf version treats it.
    df_p = cudf.concat([df_bar, df_basic], join="left", axis=1, sort=True)

    # Compress the data
    # NOTE(review): the result of astype() is not assigned anywhere; confirm
    # that this call actually changes df_p in the installed cudf version.
    df_p.index.levels[1].astype('category', inplace=True)

    return df_p


# Extract the time-series data
def extract_time_series():
    """Load the daily time-series inputs and join them on ``trade_date``.

    Sources: the '399300.SZ' index return and the SHIBOR '3m' rate (both from
    schema ``FIN_DAILY_INDEX``), the image and text sentiment tables (schema
    ``SENT_DAILY``) and the consensus series from ``get_time_con_idx``.

    Returns:
        A 2-tuple:
        * cudf frame: index return + risk-free rate, inner-joined;
        * pandas frame: sentiment + index return + risk-free rate + consensus
          columns, inner-joined.
    """
    from utils.sql import DB
    db_loader = DB()

    def read_daily(table, schema, **read_kwargs):
        # Shared pattern: read one table, cast trade_date to uint32, index by it.
        frame = pd.read_sql_table(table, db_loader.ENGINE, schema, **read_kwargs)
        return frame.astype(dtype={'trade_date': 'uint32'}).set_index('trade_date')

    # Stock index data
    df_share_index = (
        read_daily('399300.SZ', 'FIN_DAILY_INDEX', columns=['trade_date', 'pct_chg'])
        .rename(columns={'pct_chg': 'shareindex_return'})
    )

    # SHIBOR data, scaled by 1/360
    # (presumably converting an annualised rate to a daily one -- confirm)
    df_shibor = (
        read_daily('SHIBOR', 'FIN_DAILY_INDEX', columns=['trade_date', '3m'])
        .rename(columns={'3m': 'riskfree_return'})
    ) / 360

    # Sentiment data: image-based and text-based negative indices side by side
    img_sent = read_daily('IMG_SENT', 'SENT_DAILY').rename(columns={'neg_index': 'img_neg'})
    tex_sent = read_daily('TEX_SENT', 'SENT_DAILY').rename(columns={'neg_index': 'tex_neg'})
    df_sent = pd.concat([img_sent, tex_sent], axis=1)

    # Consensus forecast data
    df_con_idx = get_time_con_idx()

    market = pd.concat([df_share_index, df_shibor], join="inner", axis=1, sort=True)
    everything = pd.concat([df_sent, df_share_index, df_shibor, df_con_idx], join="inner", axis=1, sort=True)
    return cudf.from_pandas(market), everything


# Merge the panel with the market time series
def extract_merge():
    """Left-join the market series onto the stock panel by ``trade_date``.

    Returns:
        A cudf frame indexed by (``trade_date``, ``ts_code``), sorted ascending
        on both levels.
    """
    panel = extract_panel().reset_index()
    market, _unused = extract_time_series()
    merged = cudf.merge(
        left=panel,
        right=market.reset_index(),
        left_on='trade_date',
        right_on='trade_date',
        how="left",
        sort=True,
    )
    merged = merged.set_index(['trade_date', 'ts_code'])
    return merged.sort_index(ascending=[True, True])


df_panel = extract_merge()
_, df_time_series = extract_time_series()

# Sample filter: keep trade dates from 20140101 onward, then move to pandas
df_panel = df_panel[df_panel.index.get_level_values('trade_date') >= 20140101].to_pandas()
df_panel

Unnamed: 0_level_0,Unnamed: 1_level_0,share_return,total_mv,shareindex_return,riskfree_return
trade_date,ts_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20140102,000001.SZ,-0.1641,1.002537e+07,-0.3454,0.015460
20140102,000002.SZ,-0.4972,8.799966e+06,-0.3454,0.015460
20140102,000004.SZ,1.3734,9.917646e+04,-0.3454,0.015460
20140102,000005.SZ,-0.4000,2.276691e+05,-0.3454,0.015460
20140102,000006.SZ,-1.2164,6.574476e+05,-0.3454,0.015460
...,...,...,...,...,...
20221130,872374.BJ,-1.7259,,0.1199,0.006092
20221201,301290.SZ,-8.7349,4.856403e+05,1.0831,0.006103
20221201,301311.SZ,12.7436,5.414400e+05,1.0831,0.006103
20221201,870199.BJ,-3.1447,1.498420e+05,1.0831,0.006103


## 2.构造截面异质波动率与市值高低组合

### 2.1 计算面板数据的异质波动率IDVOL

In [5]:
# Rolling OLS regression used to obtain idiosyncratic volatility
def roll_idvol(df_code: pd.DataFrame, ols_window: int, var_ma: int) -> pd.DataFrame:
    """Estimate idiosyncratic volatility for one group of rows.

    Fits ``Y = Alpha * CONST + Beta * X`` with a rolling OLS, computes the
    residual of each row under the coefficients estimated for that row, and
    takes the rolling sample variance of those residuals.

    Args:
        df_code: frame with the columns ``share_return``, ``total_mv``, ``Y``,
            ``CONST`` and ``X``.
        ols_window: window length of the rolling regression.
        var_ma: window length of the rolling residual variance.

    Returns:
        ``share_return``, ``total_mv`` and ``Idvol`` on the input index. When
        the computation raises ``IndexError`` or ``ValueError``, an empty frame
        indexed by (``trade_date``, ``ts_code``) is returned instead so that a
        grouped apply can continue.
    """
    try:
        # Estimate the rolling coefficients
        model_ols = RollingOLS(endog=df_code[['Y']], exog=df_code[['CONST', 'X']], window=ols_window)
        df_para = model_ols.fit().params.rename(columns={'CONST': 'Alpha', 'X': 'Beta'})

        # Both frames share the same index, so the inner join lines them up row by row
        df_con = pd.concat([df_code, df_para], axis=1, join='inner')

        # Residual = actual - fitted. The previous fitted - actual form only
        # differed in sign, which the variance below does not see.
        df_con['Residual'] = df_con['Y'] - (df_con['Alpha'] + df_con['Beta'] * df_con['X'])

        # Rolling sample variance of the residuals
        df_con['Idvol'] = df_con['Residual'].rolling(var_ma).var(ddof=1)
        return df_con[['share_return', 'total_mv', 'Idvol']]
    except (IndexError, ValueError):
        # Best effort: a group that cannot be estimated contributes no rows.
        return pd.DataFrame(columns=['trade_date', 'ts_code']).set_index(['trade_date', 'ts_code'])


# Per-stock computation
def cal_panel_ols():
    """Run ``roll_idvol`` for every ``ts_code`` group and save the result.

    Side effects: adds the regression columns ``Y``, ``CONST`` and ``X`` to the
    module-level ``df_panel`` and writes ``ASHARE_OLS_PANEL.parquet`` under
    ``DATASETS_PATH``. Nothing is returned.
    """
    # Regression variables, CAPM form: (r_i - r_f) = a + b * (r_m - r_f)
    risk_free = df_panel['riskfree_return']
    df_panel['Y'] = df_panel['share_return'] - risk_free
    df_panel['CONST'] = 1  # regression with an intercept
    df_panel['X'] = df_panel['shareindex_return'] - risk_free

    # Parallelise the groupby-apply
    pandarallel.initialize(progress_bar=True)
    regression_cols = ['share_return', 'total_mv', 'Y', 'CONST', 'X']
    per_stock = df_panel.groupby(level=['ts_code'])[regression_cols]
    df_out = per_stock.parallel_apply(lambda x: roll_idvol(x, 5, 30))
    df_out = df_out.droplevel(2)

    # Persist
    target = f'{DATASETS_PATH}ASHARE_OLS_PANEL.parquet'
    df_out.to_parquet(target, engine='pyarrow', index=True)


# Compute the rolling regression only when no saved result exists yet
if not os.path.exists(f'{DATASETS_PATH}ASHARE_OLS_PANEL.parquet'): cal_panel_ols()

# Load the rolling-regression panel, re-index it by (trade_date, ts_code)
# and rename Idvol -> idvol
df_ols_panel = (
        cudf.read_parquet(f'{DATASETS_PATH}ASHARE_OLS_PANEL.parquet').reset_index().set_index(['trade_date', 'ts_code']).sort_index()
        .rename(columns={'Idvol': 'idvol'})
)
df_ols_panel

Unnamed: 0_level_0,Unnamed: 1_level_0,share_return,total_mv,idvol
trade_date,ts_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20140102,000001.SZ,-0.1641,1.002537e+07,
20140102,000002.SZ,-0.4972,8.799966e+06,
20140102,000004.SZ,1.3734,9.917646e+04,
20140102,000005.SZ,-0.4000,2.276691e+05,
20140102,000006.SZ,-1.2164,6.574476e+05,
...,...,...,...,...
20221128,873122.BJ,-2.0158,1.259199e+05,19.27363078
20221128,873169.BJ,-1.5198,5.443271e+04,1.319910083
20221128,873223.BJ,-0.2660,5.898188e+04,0.730762806
20221128,873339.BJ,-0.5093,1.290394e+05,


### 2.2 面板数据分组

In [None]:
# Threshold parameter: quantile used to split each cross-section
QUANTILE = 0.5


# Split the panel into groups
def group_ols_panel(df, group_col: str):
    """Split each trade date into a high and a low group and compute group returns.

    For every ``trade_date`` the ``QUANTILE`` quantile of ``group_col`` is used
    as the cut-off. Rows at or above it are labelled ``{group_col}_high``, the
    rest ``{group_col}_low``. Within each (date, group) the return is the sum
    of ``share_return`` weighted by each row's share of the group's ``total_mv``.

    :param df: panel indexed by (``trade_date``, ``ts_code``) with the columns
        ``group_col``, ``total_mv`` and ``share_return``; ``.to_pandas()`` is
        called on its columns, so a cudf frame is expected
    :param group_col: column to split on
    :return: frame indexed by (``trade_date``, ``ts_code``) with the added
        ``{group_col}_split``, ``{group_col}_group``, ``{group_col}_mv_ratio``,
        ``{group_col}_vw_return`` and ``{group_col}_group_return`` columns

    NOTE(review): the first two assignments write onto the frame that was
    passed in, before ``df`` is rebound -- confirm callers expect that.
    """
    # Cut-off per trade date
    df[f'{group_col}_split'] = df[group_col].groupby(level=['trade_date']).transform(lambda x: x.quantile(QUANTILE))
    # NOTE(review): a NaN in group_col compares False against the cut-off, so
    # such rows end up labelled "_low" -- confirm this is intended.
    df[f'{group_col}_group'] = np.where(df[group_col].to_pandas() >= df[f'{group_col}_split'].to_pandas(),
                                        f'{group_col}_high', f'{group_col}_low')
    # Re-index so the group label is an index level
    df = df.reset_index().set_index(['trade_date', f'{group_col}_group', 'ts_code']).sort_index()

    # Market-value weight of each row within its (date, group)
    df[f'{group_col}_mv_ratio'] = df['total_mv'] / df.groupby(level=['trade_date', f'{group_col}_group'])['total_mv'].transform('sum')

    # Value-weighted group return
    df[f'{group_col}_vw_return'] = df[f'{group_col}_mv_ratio'] * df['share_return']
    df[f'{group_col}_group_return'] = (df.groupby(level=['trade_date', f'{group_col}_group'])[f'{group_col}_vw_return']
                                       .transform('sum'))

    # Move the group label back from the index to a column
    return df.reset_index(f'{group_col}_group')


# Group by each column in turn, then reshape the result into time-series form
def group_cols(df, columns: list):
    """Compute high/low group returns for several split columns as one date-indexed frame.

    Calls ``group_ols_panel`` once per entry of ``columns`` (each call receives
    the output of the previous one), keeps the group label and group return
    columns, and pivots each ``{col}_group_return`` so that every group label
    becomes a column.

    :param df: panel accepted by ``group_ols_panel``
    :param columns: names of the columns to split on
    :return: cudf frame with one column per group label
    """
    # Working reference used for grouping (no copy is made here)
    df_temp = df

    # Value-weighted group return for every split column
    for col in columns: df_temp = group_ols_panel(df_temp, col)

    # Keep only the group labels and group returns
    df_temp = df_temp[[i + '_group' for i in columns] + [i + '_group_return' for i in columns]]

    # Keep one row per unique combination of group labels (2^N per date)
    df_time_panel = (df_temp.reset_index().groupby(['trade_date'] + [i + '_group' for i in columns]).first())

    # Convert to time-series form: one pivot per split column, concatenated column-wise
    # NOTE(review): join="left" is not one of the documented concat join modes
    # ('inner' / 'outer'); confirm how the installed cudf version treats it.
    df_time = cudf.DataFrame()
    for col in columns: df_time = cudf.concat([df_time,
                                               df_time_panel.groupby(level=['trade_date', f'{col}_group']).first().reset_index()
                                              .pivot(index='trade_date', columns=f'{col}_group', values=f'{col}_group_return')],
                                              join="left", axis=1, sort=True)
    # High minus low (disabled)
    # for col in columns: df_time[f'{col}_mid'] = df_time[f'{col}_high'] - df_time[f'{col}_low']

    return df_time


df_group_time = group_cols(df_ols_panel, ['total_mv', 'idvol'])
df_group_time

In [None]:
# Join the group returns with the other time-series data, drop rows containing
# NaN and drop the risk-free column. img_neg and tex_neg are moved into the
# index (presumably so later column-wise transforms skip them -- confirm).
df_series_join = (cudf.concat([cudf.from_pandas(df_time_series), df_group_time], join="left", axis=1, sort=True).dropna(axis=0)
                  .reset_index().set_index(['trade_date', 'img_neg', 'tex_neg']).drop(columns='riskfree_return').to_pandas())


# Add the residual consensus return
def add_residual(df):
    """Append ``con_residual_return`` to *df*.

    ``con_return`` is regressed on a constant and ``shareindex_return`` with a
    5-row rolling OLS; the new column is ``con_return`` minus the value fitted
    from each row's own coefficients. Rows containing any NaN are dropped from
    the returned frame.
    """
    reg = df[['con_return', 'shareindex_return']].copy()
    reg['CONST'] = 1

    rolling_fit = RollingOLS(endog=reg[['con_return']], exog=reg[['CONST', 'shareindex_return']], window=5).fit()
    coef = rolling_fit.params

    fitted = coef['CONST'] * reg['CONST'] + reg['shareindex_return'] * coef['shareindex_return']
    reg['con_residual_return'] = reg['con_return'] - fitted

    combined = pd.concat([df, reg[['con_residual_return']]], axis=1)
    return combined.dropna(axis=0)


#
df_series_join = add_residual(df_series_join)
df_series_join

## 3. VAR模型分析

### 3.1 回归前数据处理

In [None]:
# Add squared terms
def add_square_column(df):
    """Return *df* with a squared copy of every column appended under a ``_s`` suffix."""
    squared = (df ** 2).add_suffix('_s')
    return pd.concat([df, squared], axis=1)


# Add calendar dummy variables
def add_dummy_column(df, column: str):
    """Append weekday and month dummy columns derived from a YYYYMMDD date column.

    The first category of each set is dropped, so the dummies are named
    ``weekday_<n>`` / ``month_<n>`` for every category except the smallest one
    present in the data.

    :param df: input frame
    :param column: name of the column holding dates in ``%Y%m%d`` form
    :return: *df* with the dummy columns concatenated on the right
    """
    parsed = pd.to_datetime(df[column], format='%Y%m%d')
    weekday_dummies = pd.get_dummies(parsed.dt.weekday, prefix='weekday', drop_first=True)
    month_dummies = pd.get_dummies(parsed.dt.month, prefix='month', drop_first=True)
    return pd.concat([df, weekday_dummies, month_dummies], axis=1)


# Regression-ready data: squared terms added, index moved back to columns,
# weekday/month dummies attached
df_series_ols = add_dummy_column(add_square_column(df_series_join).reset_index(), 'trade_date')
df_series_ols
# @formatter:off

### 3.2 回归结果

In [None]:
%%stata -d df_series_ols -force
// Descriptive statistics (N, sd, mean, median, min, max) for the sentiment,
// return and high/low group columns, saved through logout to Outputs/Table_Sum
logout, save(Outputs/Table_Sum)  replace: ///
tabstat *_neg *_return *_high *_low, s(N sd mean p50 min max ) f(%12.4f) c(s)

#### 3.2.1 不同群体

In [None]:
%%stata -d df_series_ols -force
// Time setting: use the row number as the time index
ge time = _n
tsset time
est clear

// VAR regressions: each listed series is temporarily renamed to "return"
// (and its squared term to "return_s"), estimated, then renamed back
// NOTE(review): the var command lists return twice and return_s is never
// used after the rename - confirm the intended specification
foreach var in shareindex_return con_residual_return   {
    rename(`var' `var'_s) (return return_s)

    eststo: qui var return img_neg return, lags(1/5) exog(month_* weekday_*)
    estadd local Month "Yes", replace
    estadd local Weekday "Yes", replace

    rename(return return_s) (`var' `var'_s)
}

// Output: lagged img_neg coefficients in the return equation
// NOTE(review): the title is the same string used for the table in
// section 3.2.2 - confirm it is the intended title for this table
esttab , keep(return:L*.img_neg) ///
star(* 0.1 ** 0.05 *** 0.01) ///
stats( Month Weekday  r2_1 N, fmt(%3s %3s %12.4f %12.0f)) b(%12.4f) ///
title("Table1 Arbitrage Limit")

#### 3.2.2 套利限制

In [None]:
%%stata -d df_series_ols -force
// Time setting: use the row number as the time index
ge time = _n
tsset time
est clear

// VAR regressions: each group-return series is temporarily renamed to
// "return" (and its squared term to "return_s"), estimated, then renamed back
// NOTE(review): the var command lists return twice and return_s is never
// used after the rename - confirm the intended specification
foreach var in total_mv_high total_mv_low  idvol_high idvol_low {
    rename(`var' `var'_s) (return return_s)

    eststo: qui var return img_neg return, lags(1/5) exog(month_* weekday_*)
    estadd local Month "Yes", replace
    estadd local Weekday "Yes", replace

    rename(return return_s) (`var' `var'_s)
}

// Output: lagged img_neg coefficients in the return equation, with columns
// grouped as market value (high, low) and idiosyncratic volatility (high, low)
esttab , keep(return:L*.img_neg) ///
star(* 0.1 ** 0.05 *** 0.01) ///
stats( Month Weekday  r2_1 N, fmt(%3s %3s %12.4f %12.0f)) b(%12.4f) ///
title("Table1 Arbitrage Limit") mtitle("HIGH" "LOW" "HIGH" "LOW")  nogap ///
mgroups("Market Value" "Idiosyncratic Volatility", pattern(1 0 1 0) ) showtabs

## 4. 按照观测窗口构造投资策略(暂时不做)

In [None]:
def cal_return(df, ma, short_col='HIGH'):
    """Back-test a sentiment-timing rule against simply holding the index.

    ``img_neg`` is compared with its ``ma``-row rolling mean. When the previous
    row's value was at or above that mean (sell signal), the current row earns
    the negated ``short_col`` return; otherwise it earns ``shareindex_return``.
    Percentage returns are then compounded into cumulative value paths.

    Args:
        df: frame with the columns ``img_neg``, ``shareindex_return`` and
            ``short_col``; returns are in percent. The frame is not modified.
        ma: window length of the rolling mean of ``img_neg``.
        short_col: column whose return is taken with a negative sign on
            sell-signal rows. Defaults to ``'HIGH'``, the name that used to be
            hard-coded.

    Returns:
        A new frame without NaN rows, extended with ``img_neg_m{ma}``,
        ``sell_signal``, ``img_return``, ``mv_shareindex`` and ``mv_img_{ma}``.
    """
    # Work on a copy so the caller's frame keeps its rows and columns.
    out = df.copy()

    ma_col = f'img_neg_m{ma}'
    out[ma_col] = out['img_neg'].rolling(ma).mean()

    # Signal relative to the historical mean, applied one row later
    above_mean = out['img_neg'] >= out[ma_col]
    out['sell_signal'] = above_mean.shift(1)

    # Boolean mask for np.where: the first row has no previous signal and is
    # treated as "no sell" here; it is removed by the dropna below anyway.
    sell_mask = above_mean.shift(1, fill_value=False).to_numpy(dtype=bool)
    out['img_return'] = np.where(sell_mask, -out[short_col], out['shareindex_return'])

    # Remove rows with missing values (warm-up rows of the rolling mean, etc.)
    out = out.dropna(axis=0).copy()

    # Convert percentage returns to growth factors and compound them
    out['mv_shareindex'] = ((out['shareindex_return'] + 100) / 100).cumprod(axis=0)
    out['mv_img'] = ((out['img_return'] + 100) / 100).cumprod(axis=0)

    return out.rename(columns={'mv_img': f'mv_img_{ma}'})


def start():
    """Run ``cal_return`` for each window length in the list (currently only 5).

    Each call receives the frame returned by the previous one; the final frame
    is returned.

    NOTE(review): ``df_series`` is not defined anywhere in the visible part of
    this file, so calling this function as written would raise ``NameError``
    unless it is defined elsewhere -- confirm which frame is meant.
    """
    df_in = df_series
    for i in [5, ]:
        df_in = cal_return(df_in, i)

    return df_in

# start()