# 媒体情绪专题

## 1. 数据准备
风险偏好
事件

In [1]:
import sys
import os
import numpy as np
import cudf  # GPU (CUDA) DataFrame library
import pandas as pd

# Make the project's own packages and the Stata utilities importable
sys.path.append('/home/ubuntu/notebook/Investor-Sentiment')
sys.path.append('/usr/local/stata17/utilities')

# Multi-core groupby/apply
# NOTE(review): `pandarallel` is used in cal_panel_ols() below, but this import is
# commented out — re-enable it before running that cell.
# from pandarallel import pandarallel

# Database and data loaders (project-local modules)
from utils.sql import DB
from loader.findata_loader import DownLoader
from loader.findata_loader import Loader

# Statistics tools
# NOTE(review): `RollingOLS` is used in roll_idvol() below, but this import is
# commented out — re-enable it before running that cell.
# from statsmodels.regression.rolling import RollingOLS  # rolling regression
# from statsmodels.regression.linear_model import OLS  # OLS regression
# Stata
from pystata import config

# Initialise pystata with the 'mp' edition (the banner printed below reports MP—Parallel Edition)
config.init('mp')

# ------------------------------ dataset path ----------------------------------#
DATASETS_PATH = '/data/DataSets/investor_sentiment/'


  ___  ____  ____  ____  ____ ©
 /__    /   ____/   /   ____/      17.0
___/   /   /___/   /   /___/       MP—Parallel Edition

 Statistics and Data Science       Copyright 1985-2021 StataCorp LLC
                                   StataCorp
                                   4905 Lakeway Drive
                                   College Station, Texas 77845 USA
                                   800-STATA-PC        https://www.stata.com
                                   979-696-4600        stata@stata.com

Stata license: Single-user 8-core , expiring  1 Jan 2025
Serial number: 501709301094
  Licensed to: Colin's Stata
               Love U

Notes:
      1. Unicode is supported; see help unicode_advice.
      2. More than 2 billion observations are allowed; see help obs_advice.
      3. Maximum number of variables is set to 5,000; see help set_maxvar.


### 1.1 下载和合并面板数据

In [2]:
# Datasets: per-stock K-line (candlestick) panel data and per-stock fundamentals.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so the
# download may not have completed in the saved run — confirm before relying on it.
DownLoader().load_data()

# Loader instance used by the later cells for the `data_loader.get_*()` calls
data_loader = Loader()

  0%|          | 0/5039 [00:00<?, ?it/s]
  0%|          | 0/5039 [00:00<?, ?it/s][A
100%|██████████| 5039/5039 [00:00<00:00, 31654.72it/s][A


KeyboardInterrupt: 

其他时间序列数据源

#### 指数盈利预测面板数据

In [None]:
# Index earnings-forecast panel (per the heading above); the result is not assigned here.
data_loader.get_conidx_panel()

#### A股基本面面板数据

In [None]:
# A-share fundamentals panel (per the heading above); the result is not assigned here.
data_loader.get_ashare_panel()

#### 指数基本面面板数据

In [None]:
# Index fundamentals panel (per the heading above); the result is not assigned here.
data_loader.get_index_panel()

#### 时间序列数据

In [None]:
# Time-series data (per the heading above); the result is not assigned here.
data_loader.get_time_series()

In [None]:
# @formatter:on

## 3. 截面效应分析

### 3.1 计算面板数据的异质波动率

提取用于回归的数据

In [None]:
# Panel used for the cross-sectional regressions; cal_panel_ols() below loads it again itself.
data_loader.get_cross_panel_reg()

In [None]:
# Idiosyncratic volatility from a rolling OLS regression
def roll_idvol(df_code: pd.DataFrame, ols_window: int, var_ma: int) -> pd.DataFrame:
    """Compute the rolling residual variance of a rolling regression of Y on X.

    ``Y`` is regressed on ``CONST`` and ``X`` with ``RollingOLS`` over
    ``ols_window`` rows. The gap between the fitted value and ``Y`` is then
    fed into a rolling sample variance over ``var_ma`` rows.

    NOTE(review): ``RollingOLS`` must be in scope; its import at the top of
    the notebook is commented out.

    :param df_code: frame with columns ``Y``, ``CONST``, ``X``,
        ``share_return`` and ``total_mv``
    :param ols_window: window length passed to ``RollingOLS``
    :param var_ma: window length of the rolling variance
    :return: the ``share_return``, ``total_mv`` and ``Idvol`` columns, or an
        empty frame indexed by (``trade_date``, ``ts_code``) when the
        computation raises ``IndexError`` or ``ValueError``
    """
    try:
        # Rolling estimates of the intercept and the slope
        rolling_model = RollingOLS(endog=df_code[['Y']], exog=df_code[['CONST', 'X']], window=ols_window)
        coefs = rolling_model.fit().params
        coefs = coefs.rename(columns={'CONST': 'Alpha', 'X': 'Beta'})

        # The inner join on the index lines the coefficients up with the inputs
        merged = pd.concat([df_code, coefs], axis=1, join='inner')

        # Fitted value minus actual value; the sign does not affect the variance below
        fitted = merged['Alpha'] + merged['Beta'] * merged['X']
        merged['Residual'] = fitted - merged['Y']

        # Rolling sample variance (ddof=1) of the residual
        merged['Idvol'] = merged['Residual'].rolling(var_ma).var(ddof=1)
        return merged[['share_return', 'total_mv', 'Idvol']]
    except (IndexError, ValueError):
        # Fall back to an empty frame that still carries the expected index names
        empty = pd.DataFrame(columns=['trade_date', 'ts_code'])
        return empty.set_index(['trade_date', 'ts_code'])


# Per-stock computation
def cal_panel_ols():
    """Run ``roll_idvol`` for every ``ts_code`` and save the result as parquet.

    Loads the regression panel from ``data_loader``, builds the CAPM-style
    regression variables (excess stock return on excess index return, with an
    intercept), applies ``roll_idvol`` per stock through pandarallel, and
    writes ``ASHARE_OLS_PANEL.parquet`` under ``DATASETS_PATH``.

    NOTE(review): ``pandarallel`` must be in scope; its import at the top of
    the notebook is commented out.
    """
    panel = data_loader.get_cross_panel_reg()

    # Regression variables: (r_i - r_f) = a + b * (R_M - r_f)
    panel['Y'] = panel['share_return'] - panel['riskfree_return']
    panel['CONST'] = 1  # intercept term
    panel['X'] = panel['shareindex_return'] - panel['riskfree_return']

    # Parallel apply over the per-stock groups
    pandarallel.initialize(progress_bar=True)
    reg_columns = ['share_return', 'total_mv', 'Y', 'CONST', 'X']
    per_stock = panel.groupby(level=['ts_code'])[reg_columns]
    result = per_stock.parallel_apply(lambda x: roll_idvol(x, 5, 30)).droplevel(2)

    # Persist
    result.to_parquet(f'{DATASETS_PATH}ASHARE_OLS_PANEL.parquet', engine='pyarrow', index=True)


# Run the rolling regression only when the cached parquet file does not exist yet
if not os.path.exists(f'{DATASETS_PATH}ASHARE_OLS_PANEL.parquet'): cal_panel_ols()

# Load the rolling-regression panel with cuDF, index it by (trade_date, ts_code),
# sort it, and rename 'Idvol' to lower-case 'idvol'
df_ols_panel = (
        cudf.read_parquet(f'{DATASETS_PATH}ASHARE_OLS_PANEL.parquet').reset_index().set_index(['trade_date', 'ts_code']).sort_index()
        .rename(columns={'Idvol': 'idvol'})
)
df_ols_panel

### 3.2 面板数据分组

In [None]:
# Quantile that separates the "high" group from the "low" group
QUANTILE = 0.5


# Split the panel into high/low groups on one variable
def group_ols_panel(df, group_col: str):
    """Label each row high/low on ``group_col`` and add value-weighted group returns.

    For every ``trade_date`` the ``QUANTILE`` quantile of ``group_col`` is the
    cutoff: rows at or above it are labelled ``<group_col>_high``, the rest
    ``<group_col>_low``. Within each (date, group) the ``total_mv`` share of
    each row weights its ``share_return``, and the weighted returns are summed
    into ``<group_col>_group_return``.

    The ``<group_col>_split`` and ``<group_col>_group`` columns are written to
    the frame that is passed in.

    :param df: panel indexed by (``trade_date``, ``ts_code``) with ``total_mv``,
        ``share_return`` and ``group_col`` columns; must support ``.to_pandas()``
    :param group_col: name of the grouping variable
    :return: frame indexed by (``trade_date``, ``ts_code``) with the group label
        restored as a column
    """
    split_col = f'{group_col}_split'
    label_col = f'{group_col}_group'

    # Per-date cutoff, then the high/low label
    by_date = df[group_col].groupby(level=['trade_date'])
    df[split_col] = by_date.transform(lambda x: x.quantile(QUANTILE))
    at_or_above = df[group_col].to_pandas() >= df[split_col].to_pandas()
    df[label_col] = np.where(at_or_above, f'{group_col}_high', f'{group_col}_low')

    # Put the group label into the index
    regrouped = df.reset_index().set_index(['trade_date', label_col, 'ts_code']).sort_index()
    group_levels = ['trade_date', label_col]

    # Market-value weight of each row inside its (date, group)
    group_mv = regrouped.groupby(level=group_levels)['total_mv'].transform('sum')
    regrouped[f'{group_col}_mv_ratio'] = regrouped['total_mv'] / group_mv

    # Value-weighted return of each (date, group)
    regrouped[f'{group_col}_vw_return'] = regrouped[f'{group_col}_mv_ratio'] * regrouped['share_return']
    weighted = regrouped.groupby(level=group_levels)[f'{group_col}_vw_return']
    regrouped[f'{group_col}_group_return'] = weighted.transform('sum')

    # Move the group label back out of the index
    return regrouped.reset_index(label_col)


# Group on each variable in turn, then reshape into a time series
def group_cols(df, columns: list):
    """Apply ``group_ols_panel`` for every column and pivot the group returns by date.

    :param df: panel passed to ``group_ols_panel`` (it receives the added
        split/group columns of the first grouping variable)
    :param columns: names of the grouping variables
    :return: cuDF frame indexed by ``trade_date`` with one column per group
        label (``<col>_high`` / ``<col>_low``) holding that group's return
    """
    label_cols = [name + '_group' for name in columns]
    return_cols = [name + '_group_return' for name in columns]

    # Value-weighted group returns, one grouping variable after another
    grouped = df
    for name in columns:
        grouped = group_ols_panel(grouped, name)

    # Keep only the labels and the group returns
    grouped = grouped[label_cols + return_cols]

    # One row per date and label combination (2^N combinations per date)
    combos = grouped.reset_index().groupby(['trade_date'] + label_cols).first()

    # Pivot each variable's groups into columns and join them side by side
    wide = cudf.DataFrame()
    for name in columns:
        per_group = combos.groupby(level=['trade_date', f'{name}_group']).first().reset_index()
        pivoted = per_group.pivot(index='trade_date', columns=f'{name}_group', values=f'{name}_group_return')
        wide = cudf.concat([wide, pivoted], join="left", axis=1, sort=True)

    # High-minus-low spread (currently disabled):
    # for col in columns: df_time[f'{col}_mid'] = df_time[f'{col}_high'] - df_time[f'{col}_low']

    return wide


# Group returns by market value ('total_mv') and idiosyncratic volatility ('idvol')
df_group_time = group_cols(df_ols_panel, ['total_mv', 'idvol'])
df_group_time

## 4. VAR模型分析

### 4.1 回归前数据处理

In [None]:
# Add the market indices used in the regressions.
# Market index code -> name of the return column it contributes
_INDEX_RETURN_COLUMNS = {
    '000001.SH': 'idx_000001',
    '000300.SH': 'idx_000300',
    '000016.SH': 'idx_000016',
    '399300.SZ': 'idx_399300',
}


def _index_return_frame(index_returns, ts_code: str, column: str):
    """Return one index's ``shareindex_return`` as a cuDF frame with column ``column``.

    :param index_returns: pandas frame holding ``shareindex_return`` with a
        ``ts_code`` index level
    :param ts_code: index code to select
    :param column: output column name
    """
    selected = (index_returns.query(f"ts_code == '{ts_code}'")
                .reset_index('ts_code', drop=True)
                .rename(columns={'shareindex_return': column}))
    return cudf.from_pandas(selected)


# The loader methods live on `data_loader` (as in the earlier cells); calling them
# as bare names raised NameError. The index panel is loaded and converted once
# instead of once per index.
_index_returns = data_loader.get_index_panel()[['shareindex_return']].to_pandas()

df_time_join = (
        cudf.concat([data_loader.get_time_series().drop(columns='riskfree_return'), df_group_time]
                    + [_index_return_frame(_index_returns, code, column)
                       for code, column in _INDEX_RETURN_COLUMNS.items()],
                    sort=True, axis=1
                    ).dropna(axis=0).to_pandas()
)
df_time_join

In [None]:
# Append squared terms
def add_square_column(df):
    """Return ``df`` with a squared copy of every column appended, suffixed ``_s``."""
    squared = df.pow(2).add_suffix('_s')
    return pd.concat([df, squared], axis=1)


# Append weekday and month dummy variables
def add_dummy_column(df, column: str):
    """Return ``df`` with weekday and month dummies derived from a ``%Y%m%d`` date column.

    The first category of each set is dropped (``drop_first=True``).

    :param df: input frame
    :param column: name of the column holding dates in ``%Y%m%d`` format
    :return: ``df`` followed by the ``weekday_*`` and then the ``month_*`` columns
    """
    dates = pd.to_datetime(df[column], format='%Y%m%d')
    weekday_dummies = pd.get_dummies(dates.dt.weekday, prefix='weekday', drop_first=True)
    month_dummies = pd.get_dummies(dates.dt.month, prefix='month', drop_first=True)
    return pd.concat([df, weekday_dummies, month_dummies], axis=1)


# Final regression data: squared terms are added first, then the index is reset so
# 'trade_date' becomes a column from which the weekday/month dummies are derived
df_series_ols = add_dummy_column(add_square_column(df_time_join).reset_index(), 'trade_date')
df_series_ols


### 4.2 回归结果

In [None]:
# @formatter:off

In [None]:
%%stata -d df_series_ols -force
/*{*/
//Descriptive statistics (N, sd, mean, median, min, max) of the *_neg, idx_*,
//*_high and *_low variables, saved through logout to Outputs/Table_Sum
logout, save(Outputs/Table_Sum)  replace: ///
tabstat *_neg idx_* *_high *_low, s(N sd mean p50 min max ) f(%12.4f) c(s)

#### 4.2.1 主要股票市场

In [None]:
%%stata -d df_series_ols -force -nogr

//Time-series setup: use the observation number as the time index
ge time = _n
tsset time
est clear

//VAR regressions, one per market index
foreach var in idx_000001 idx_000300 idx_000016 idx_399300    {

    //Temporarily rename the index return and its square to common names
    rename(`var' `var'_s) (return return_s)
    //VAR of return, img_neg and squared return with lags 1-5;
    //month and weekday dummies enter as exogenous variables
    eststo: qui var return img_neg return_s, lags(1/5) exog(month_* weekday_*)
    estadd local Month "Yes", replace
    estadd local Weekday "Yes", replace

    //Plots: orthogonalized impulse response of return to img_neg, steps 0-5
    irf creat var, set(Outputs/`var'_img ,replace) step(5)
    irf graph oirf, impulse(img_neg) response(return) lstep(0) ustep(5) name(`var'_img,replace)  ///
    byopts(note("")) byopts(legend(off)) xtitle(, size(small) margin(zero)) ///
    ysc(r(-0.15,0.15)) yline(0) ylabel(#2) ytitle(return, size(small) margin(zero)) scheme(sj)

    //Restore the original variable names
    rename(return return_s) (`var' `var'_s)
}
/*{*/
//Output: coefficients of the lagged img_neg terms in the return equation
esttab , keep(return:L*.img_neg) ///
star(* 0.1 ** 0.05 *** 0.01) ///
stats( Month Weekday  r2_1 N, fmt(%3s %3s %12.4f %12.0f)) b(%12.4f) ///
title("Table1 Main Market") mtitle("000001.SH" "000300.SH" "000016.SH" "399300.SZ")  nogap

脉冲响应曲线

In [None]:
%%stata
//Combine the four per-index impulse-response graphs into one figure with common axes
graph combine idx_000001_img idx_000016_img idx_000300_img idx_399300_img, ///
xcommon ycommon name(combine_img, replace) scheme(sj)

#### 4.2.2 套利限制

In [None]:
%%stata -d df_series_ols -force

//Time-series setup: use the observation number as the time index
ge time = _n
tsset time
est clear

//VAR regressions, one per high/low group return (market value, idiosyncratic volatility)
foreach var in total_mv_high total_mv_low  idvol_high idvol_low {
    //Temporarily rename the group return and its square to common names
    rename(`var' `var'_s) (return return_s)

    //VAR of return, img_neg and squared return with lags 1-5;
    //month and weekday dummies enter as exogenous variables
    eststo: qui var return img_neg return_s, lags(1/5) exog(month_* weekday_*)
    estadd local Month "Yes", replace
    estadd local Weekday "Yes", replace

    //Restore the original variable names
    rename(return return_s) (`var' `var'_s)
}

//Output: coefficients of the lagged img_neg terms in the return equation
//NOTE(review): the title reuses "Table1", same as the main-market table - confirm the numbering
esttab , keep(return:L*.img_neg) ///
star(* 0.1 ** 0.05 *** 0.01) ///
stats( Month Weekday  r2_1 N, fmt(%3s %3s %12.4f %12.0f)) b(%12.4f) ///
title("Table1 Arbitrage Limit") mtitle("HIGH" "LOW" "HIGH" "LOW")  nogap ///
mgroups("Market Value" "Idiosyncratic Volatility", pattern(1 0 1 0) ) showtabs

## 5. 按照观测窗口构造投资策略(暂时不做)

In [None]:
def ma_strategy(df, factor_list, window_list):
    """
    Back-test a moving-average signal that shorts the index after high factor readings.

    For every (window, factor) pair the factor is compared with its own
    ``window``-row rolling mean. When the factor is at or above that mean, the
    next row's ``close_chg`` is taken with the opposite sign (short); otherwise
    it is taken unchanged. ``close_chg`` is treated as a percentage
    (``(close_chg + 100) / 100`` is the growth factor).

    ``df`` is modified in place: intermediate and result columns are added and
    every row containing a missing value is dropped.

    :param df: frame with ``trade_date``, ``ts_code``, ``close_chg`` and one
        column per entry of ``factor_list``
    :param factor_list: names of the candidate signal factors
    :param window_list: candidate moving-average window lengths
    :return: ``trade_date``, ``ts_code``, the benchmark columns
        (``mv_shareindex``, ``mdd_shareindex``, ``sharp_shareindex``) and, per
        (factor, window), the ``*_mv_ma_*``, ``*_mdd_ma_*``, ``*_sharp_ma_*``
        and ``*_kama_ma_*`` columns
    """
    out_list = []
    # Loop over windows and factors
    for w in window_list:
        for f in factor_list:
            ma_col = f'{f}_ma_{w}'
            signal_col = f'{f}_sell_signal_ma_{w}'
            return_col = f'{f}_return_ma_{w}'
            mdd_col = f'{f}_mdd_ma_{w}'

            # Rolling mean of the factor itself (it used to be computed from
            # 'img_neg' for every factor, so all signals shared one baseline)
            df[ma_col] = df[f].rolling(w).mean()

            # Sell signal, lagged one row so each row uses only earlier information
            sell_today = df[f] >= df[ma_col]
            df[signal_col] = sell_today.shift(1)

            # Position for each row: -1 (short) after a sell signal, +1 otherwise,
            # NaN on the first row where no lagged signal exists. Kept numeric so the
            # return column stays float instead of becoming object dtype.
            position = (1.0 - 2.0 * sell_today.astype(float)).shift(1)
            df[return_col] = position * df['close_chg']
            strategy_return = df[return_col]

            # Cumulative value of the strategy
            df[f'{f}_mv_ma_{w}'] = ((strategy_return + 100) / 100).cumprod(axis=0)

            # NOTE(review): labelled "max drawdown", but this is the running minimum
            # of the per-row return, not a drawdown of the cumulative value — confirm intent
            df[mdd_col] = strategy_return.cummin(axis=0)

            # Sharpe ratio, annualised with 250 periods
            df[f'{f}_sharp_ma_{w}'] = strategy_return.mean() / strategy_return.std(ddof=0) * np.sqrt(250)

            # Calmar-style ratio: annualised mean return over the minimum of the column above
            df[f'{f}_kama_ma_{w}'] = (strategy_return.mean() * 250) / (df[mdd_col].min())

            # Columns to return
            out_list += [f'{f}_mv_ma_{w}', mdd_col, f'{f}_sharp_ma_{w}', f'{f}_kama_ma_{w}']

    # Drop rows with missing values (the moving-average warm-up rows)
    df.dropna(axis=0, inplace=True)

    # Benchmark: holding the index over the remaining rows
    df['mv_shareindex'] = ((df['close_chg'] + 100) / 100).cumprod(axis=0)
    df['mdd_shareindex'] = df['close_chg'].cummin(axis=0)
    df['sharp_shareindex'] = (df['close_chg'].mean()) / df['close_chg'].std(ddof=0) * np.sqrt(250)

    return df[['trade_date', 'ts_code', 'mv_shareindex', 'mdd_shareindex', 'sharp_shareindex'] + out_list]


# Run the MA strategy for four factors and four window lengths.
# NOTE(review): `df_data` is not defined in any visible cell — confirm where it comes from.
ma_strategy(df_data, ['img_neg', 'tex_neg', 'SENT_INDEX', 'SENT_INDEX_R'], [5, 10, 15, 20])
