# 投资者情绪专题
使用所有的数据进行加总

## 数据准备

In [1]:
import sys
import os
import cudf  #CUDA计算
import pandas as pd

# Extra import locations: the project's own packages and the Stata directory that
# `pystata` is imported from. Each path is added only if it is missing, so
# re-running this cell does not keep appending duplicate entries to sys.path.
for _extra_path in (
        '/home/ubuntu/notebook/Investor-Sentiment',
        '/usr/local/stata17/utilities',
):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

# Stata
from pystata import config  # noqa

config.init('mp')
# ------------------------------ Dataset path ------------------------------ #
# The dataset root can be overridden with the INVESTOR_SENTIMENT_DATASETS environment
# variable; the default is the original location. os.path.join(..., '') guarantees
# the trailing separator, because later cells build file paths by plain string
# concatenation (DATASETS_PATH + '<file>').
DATASETS_PATH = os.path.join(
        os.environ.get('INVESTOR_SENTIMENT_DATASETS', '/data/DataSets/investor_sentiment/'),
        '',
)


  ___  ____  ____  ____  ____ ©
 /__    /   ____/   /   ____/      17.0
___/   /   /___/   /   /___/       MP—Parallel Edition

 Statistics and Data Science       Copyright 1985-2021 StataCorp LLC
                                   StataCorp
                                   4905 Lakeway Drive
                                   College Station, Texas 77845 USA
                                   800-STATA-PC        https://www.stata.com
                                   979-696-4600        stata@stata.com

Stata license: Single-user 8-core , expiring  1 Jan 2025
Serial number: 501709301094
  Licensed to: Colin's Stata
               Love U

Notes:
      1. Unicode is supported; see help unicode_advice.
      2. More than 2 billion observations are allowed; see help obs_advice.
      3. Maximum number of variables is set to 5,000; see help set_maxvar.


In [2]:
# Read only the columns used by the later cells from FORUM_SENT_TRANS.parquet into a
# cuDF frame, then keep the rows whose PostDate is 20131231 or later.
forum_columns = [
        'PostDate', 'Stockcode', 'PostSource', 'PositiveSentIndexA', 'PositiveSentIndexB',
        'TotalPosts', 'AvgReadings', 'AvgComments', 'AvgPositThumbUps',
]
forum_path = DATASETS_PATH + 'FORUM_SENT_TRANS.parquet'
df = cudf.read_parquet(forum_path, columns=forum_columns).query("PostDate>=20131231")

In [3]:
# Preview the filtered posts ordered by (PostDate, Stockcode). The frame is displayed
# directly as a cuDF object: the notebook only renders the first and last rows, so
# copying every row to host memory with .to_pandas() beforehand is unnecessary.
df.set_index(['PostDate', 'Stockcode']).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,PostSource,PositiveSentIndexB,TotalPosts,AvgReadings,AvgComments,AvgPositThumbUps
PostDate,Stockcode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20131231,000030.SZ,1,,0,,,
20131231,000030.SZ,1,,0,,,
20131231,000034.SZ,1,,0,,,
20131231,000034.SZ,1,,0,,,
20131231,000043.SZ,1,,0,,,
...,...,...,...,...,...,...,...
20211016,900948.SH,1,,0,,,
20211016,900952.SH,1,,0,,,
20211016,900953.SH,1,,0,,,
20211016,900955.SH,1,,0,,,


## 指标计算

计算加权影响力：先按（PostDate, Stockcode）分组汇总各项指标的当日总量，再用每条记录的指标值除以对应的当日总量得到权重因子，最后用发帖量权重因子对 PositiveSentIndexA 加权。

In [4]:
# Every weight is a row's share of its (PostDate, Stockcode) group total.
group_keys = ['PostDate', 'Stockcode']

# (raw metric column, per-group total column, per-row share column)
weight_specs = [
    ('TotalPosts', 'DayTotalPosts', 'DayTotalPostsFactor'),
    ('AvgReadings', 'DayAvgReadings', 'DayAvgReadingsFactor'),
    ('AvgComments', 'DayAvgComments', 'DayAvgCommentsFactor'),
    ('AvgPositThumbUps', 'DayAvgPositThumbUps', 'DayAvgPositThumbUpsFactor'),
]

# Daily totals: sum of each metric over all rows sharing the same date and stock,
# broadcast back onto every row of the group.
for raw_col, total_col, _ in weight_specs:
    df[total_col] = df.groupby(group_keys)[raw_col].transform('sum')

# Weighting factors: the row's value divided by its group's daily total.
# NOTE(review): a group whose total is 0 makes this 0/0, so the factor comes out
# missing (see the displayed rows with DayTotalPosts == 0) - confirm that downstream
# steps expect missing values there rather than 0.
for raw_col, total_col, factor_col in weight_specs:
    df[factor_col] = df[raw_col] / df[total_col]

# Weighted sentiment factor: index A scaled by the row's share of the day's posts.
df['PositiveSentIndexAFactor'] = df['DayTotalPostsFactor'] * df['PositiveSentIndexA']

df.set_index(group_keys).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,PostSource,PositiveSentIndexB,TotalPosts,AvgReadings,AvgComments,AvgPositThumbUps,DayTotalPosts,DayAvgReadings,DayAvgComments,DayAvgPositThumbUps,DayTotalPostsFactor,DayAvgReadingsFactor,DayAvgCommentsFactor,DayAvgPositThumbUpsFactor
PostDate,Stockcode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20131231,000030.SZ,1,,0,,,,0,,,,,,,
20131231,000030.SZ,1,,0,,,,0,,,,,,,
20131231,000034.SZ,1,,0,,,,0,,,,,,,
20131231,000034.SZ,1,,0,,,,0,,,,,,,
20131231,000043.SZ,1,,0,,,,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20211016,900948.SH,1,,0,,,,0,,,,,,,
20211016,900952.SH,1,,0,,,,0,,,,,,,
20211016,900953.SH,1,,0,,,,0,,,,,,,
20211016,900955.SH,1,,0,,,,0,,,,,,,
