# 金融市场情绪专题
PCA

In [1]:
import sys
import os
import cudf
import pandas as pd

# ------------------------------数据集路径----------------------------------#
# /usr/local/miniconda3/envs/Rapids/bin/python
DATASETS_PATH = '/data/DataSets/BW_INDEX_CSMAR/BW_5_CSMAR/'

In [2]:
os.listdir(DATASETS_PATH)

['QX_STOCKRATE[DES][csv].txt',
 'QX_TRM1.csv',
 'QX_FUNDDISCOUNTPREMIUM[DES][csv].txt',
 'QX_DIVIDENDPREMIUM.csv',
 'QX_FUNDDISCOUNTPREMIUM1.csv',
 'QX_FUNDDISCOUNTPREMIUM.csv',
 'QX_DIVIDENDPREMIUM[DES][csv].txt',
 'QX_DIVIDENDPREMIUM.parquet',
 'QX_FUNDDISCOUNTPREMIUM.parquet',
 'QX_TRM.csv',
 'QX_TRM[DES][csv].txt',
 'QX_STOCKRATE.csv',
 'QX_IPO[DES][csv].txt',
 'QX_IPO.csv',
 'QX_IPO.parquet']

### 股利溢价-年
SgnYear [统计年度] - YYYY
Dnum [分红公司数目] - 指当年发生了现金分红的上市公司数量
NDnum [未分红公司数目] - 指当年没有现金分红的上市公司数量
DMB [分红公司平均市值账面比] - 计算公式为：所有分红的公司的市值账面比的平均数
NDMB [未分红公司平均市值账面比] - 计算公式为：所有未分红的公司的市值账面比的平均数
PDND [股利溢价] - 计算公式为：分红公司平均市值账面比-未分红公司平均市值账面比
LogPDND [对数股利溢价] - 计算公式为：股利溢价的对数

In [3]:
def cal_pd():
    df = cudf.read_parquet(f'{DATASETS_PATH}QX_DIVIDENDPREMIUM.parquet')
    return df[['LogPDND']]


df_pd = cal_pd()
df_pd

Unnamed: 0_level_0,LogPDND
SgnYear,Unnamed: 1_level_1
1991,-0.9005
1992,-0.272
1993,-0.0592
1994,-0.1019
1995,-0.0237
1996,-0.0648
1997,-0.0639
1998,-0.0479
1999,-0.0426
2000,-0.0986


### 股票融资占股票债券融资比例-年

In [4]:
# pd.read_csv(f'{DATASETS_PATH}QX_STOCKRATE.csv')

### 基金折溢价率CEFD-面板
Symbol [基金代码] -
TradingDate [交易日期] -
CategoryID [基金类别] - S0601=股票型基金;S0602=债券型基金;S0603=货币型基金;S0604=混合型基金;S0605=FOF;S0606=股指期货型基金;S0699=其他
FundTypeID [基金运作方式] - S0501=契约型开放式;S0502=契约型封闭式
IsETF [是否ETF] - 1=是；2=否
IsLOF [是否LOF] - 1=是；2=否
IsIndexFund [是否指数基金] - 1=是；2=否
IsInnovative [是否创新型基金] - 1=是；2=否
NAV [基金份额净值] -
AccumulativeNAV [基金份额累计净值] -
ClosePrice [收盘价] -
CovertRate [溢价率(%)] - 计算公式为：（收盘价-基金份额净值）*101/基金份额净值
TurnoverRate [换手率] -

In [5]:

def cal_cefd():
    """
    计算封闭式基金溢价
    """

    '''
    保留 封闭式基金
    剔除 LOF基金,创新型基金,股票指数基金
    '''

    df = (
            cudf.from_pandas(
                    cudf.read_parquet(f'{DATASETS_PATH}QX_FUNDDISCOUNTPREMIUM.parquet').sort_index().to_pandas()
                    .query(" FundTypeID=='S0502'& IsLOF==2 & IsInnovative==2 & IsIndexFund==2")
            )
    )

    '''
    CovertRate 平均加权
    '''
    df['DayAvgCR'] = df.groupby(level=['TradingDate'])['CovertRate'].transform('mean')

    '''
    时间序列
    '''
    df = df.groupby(level=['TradingDate']).first()[['DayAvgCR']]

    return df.rename(columns={'DayAvgCR': 'CEFD'})


df_cefd = cal_cefd()
df_cefd

Unnamed: 0_level_0,CEFD
TradingDate,Unnamed: 1_level_1
19980430,101.500000
19980630,60.272500
19980731,41.500000
19980831,28.418000
19980930,31.054000
...,...
20221216,4.303333
20221219,1.501667
20221220,0.239091
20221221,0.583000


### 市场换手率-面板数据
TradingDate [交易日期] -
MarketType [市场类型] - 1=上证A股市场；2=上证B股市场；4=深证A股市场；8=深证B股市场；6=主板；5=沪深A股市场；7=中小板；10=沪深B股市场；15=沪深AB股市场；16=创业板；20=深证A股和创业板；21=沪深A股和创业板；31=沪深AB股和创业板；32=科创板；33=上证A股和科创板；37=沪深A股和科创板；47=沪深AB股和科创板；53=沪深A股和创业板和科创板；63=沪深AB股和创业板和科创板；64=北证A股市场；69=沪深京A股市场；79=沪深京AB股市场；85=沪深京A股和创业板；95=沪深京AB股和创业板；101=沪深京A股和科创板；111=沪深京AB股和科创板；117=沪深京A股和创业板和科创板；127=沪深京AB股和创业板和科创板；
Parameter [参数值] - 字段说明见说明书“附录 参数说明”
Unit [参数单位] -
TurnoverRate1 [换手率(总股本)] - 计算公式为：交易量/总股本
TurnoverRate2 [换手率(流通股本)] - 计算公式为：交易量/流通股本

In [6]:
# pd.read_csv(f'{DATASETS_PATH}QX_TRM.csv')

### IPO股票信息-面板数据
Symbol [股票代码] -
ListedDate [上市日期] -
ListedYear [上市年份] -
ABSymbol [AB股交叉码] - 指公司同时发行了A和B股的情况
HSymbol [H股交叉码] - 指的是公司同时发行了H股的情况
EstablishDate [公司成立日期] -
IpoDate [首次招股日期] -
CompanyListedDate [公司上市日期] -
IsIPO [是否首次发行] - A=是；B=否
TotalShares [实际发行总量] -
IssuePrice [发行价格] -
CurrencyCode [发行价格币种] -
ClosePrice [上市首日收盘价] -
ReturnRate [上市首日回报率] -
TurnoverRate1 [上市首日换手率(总股本)] -
TurnoverRate2 [上市首日换手率(流通股本)] -
PE [上市首日市盈率] -
PB [上市首日市净率] -

In [7]:
def cal_ipo():
    """
    计算IPO信息
    """
    df = cudf.read_parquet(f'{DATASETS_PATH}QX_IPO.parquet')

    '''
    计算平均首日收益
    '''
    df['DayAvgRR'] = df.groupby(level=['ListedDate'])['ReturnRate'].transform('mean')
    df['DayTotalN'] = df.groupby(level=['ListedDate'])['ReturnRate'].transform('count')
    df = df.groupby(level=['ListedDate']).first()[['DayAvgRR', 'DayTotalN']]

    return df.rename(columns={'DayAvgCR': 'RIPO', 'DayTotalN': 'NIPO'})


df_ipo = cal_ipo()
df_ipo

Unnamed: 0_level_0,DayAvgRR,NIPO
ListedDate,Unnamed: 1_level_1,Unnamed: 2_level_1
19901210,0.803,1
19901219,48.89698229,7
19910114,14.98,1
19910129,13.58,1
19910403,0.225,1
...,...,...
20221216,0.1284755,2
20221221,0.156567,1
20221222,0.157636,2
20221223,,0


## 合成

In [8]:
df_pca = df_ipo.join(df_cefd)
df_pca.sort_index()

Unnamed: 0,DayAvgRR,NIPO,CEFD
19901210,0.803,1,
19901219,48.89698229,7,
19910114,14.98,1,
19910129,13.58,1,
19910403,0.225,1,
...,...,...,...
20221216,0.1284755,2,4.303333333
20221221,0.156567,1,0.583
20221222,0.157636,2,0.293333333
20221223,,0,
