In [None]:
"""
对于自定函数生成的Barra的描述因子（而非来自joinquant的数据），
进行数据的格式整理，加入行业因子，涨跌幅，市值几个因子，
调整格式为组合优化函数所需要的格式，
每个月对该次分析所需要的历史数据输出csv文件保存
"""
import os
import pandas as pd
import numpy as np
import akshare as ak
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import winsorize
import datetime
import math
import matplotlib.pyplot
import alphalens
import copy

# 数据导入和整理

In [4]:
"""
对保存的描述因子csv文件读取
每个csv文件文件名为描述因子名.csv，列为stock，行为date
"""
for root, dirs, files in os.walk('dataset/barra factor'):
    pass
files_list = [x[:-4] for x in files]
dataset_dict = {}

for key in files_list : 
    print(key)
    dataset_dict[key] = pd.read_csv('dataset/barra factor/' + key + '.csv', low_memory=False).set_index('date')
    
"""
将日涨跌幅和市值也作为因子纳入因子dataframe的dict中去
"""    
for key in ['涨跌幅(%)', '总市值(元)']:
    x = pd.read_csv('dataset/后复权数据-分类/' + key + '.csv', low_memory=False, encoding='gbk')
    x = x.rename(columns = {"日期": "date"}).set_index('date')
    x_columns = x.columns.to_list()
    for i in range(len(x_columns)) : 
        if x_columns[i][:1] == '6' : 
            x_columns[i] = x_columns[i][:6] + '.XSHG'
        else : 
            x_columns[i] = x_columns[i][:6] + '.XSHE'
    x.columns = x_columns
    if key == '涨跌幅(%)': key = 'ret'
    if key == '总市值(元)': key = 'capital'
    print(key)
    dataset_dict[key] = x

beta
blev
btop
cetop
cmra
dastd
dtoa
egrlf
egro
egrsf
epfwd
etop
hsigma
lncap
mlev
nlsize
rstr
sgro
stoa


KeyError: "None of ['date'] are in the columns"

In [None]:
"""
由于涨跌幅，市值和Barra因子来自不同的平台，这里对它们的股票和日期取交集，
"""
# 列取交集
stks_sec = set()
for key in dataset_dict.keys():
    if len(stks_sec) == 0 : 
        stks_sec = set(dataset_dict[key].columns)
    else: 
        stks_sec = set(dataset_dict[key].columns) & stks_sec
stks_sec = sorted(list(stks_sec))
for key in dataset_dict.keys():
    dataset_dict[key] = dataset_dict[key][stks_sec]
    
# 行取交集
dates = set()
for key in dataset_dict.keys():
    if len(dates) == 0 : 
        dates = set(dataset_dict[key].index)
    dates = dates & set(dataset_dict[key].index)
    print(len(dates))
dates = sorted(list(dates))
for key in dataset_dict.keys():
    dataset_dict[key] = dataset_dict[key].loc[dates,:]

# 数据去极值，标准化，中性化函数

In [9]:
def winsorize(factor_df):
    """
    对因子值做去极值操作
    :param factor_df: 因子值 (pandas.Dataframe类型),index为datetime, colunms为股票代码。
                      形如:
                                   AAPL	　　　     BA	　　　CMG	　　   DAL	      LULU	　　
                        date
                        2016-06-24	0.165260	0.002198	0.085632	-0.078074	0.173832
                        2016-06-27	0.165537	0.003583	0.063299	-0.048674	0.180890
                        2016-06-28	0.135215	0.010403	0.059038	-0.034879	0.111691
                        2016-06-29	0.068774	0.019848	0.058476	-0.049971	0.042805
                        2016-06-30	0.039431	0.012271	0.037432	-0.027272	0.010902
    :return:去极值后的因子值(pandas.Dataframe类型),index为datetime, colunms为股票代码。
    """

    def winsorize_series(se):
        q = se.quantile([0.025, 0.975])
        if isinstance(q, pd.Series) and len(q) == 2:
            se[se < q.iloc[0]] = q.iloc[0]
            se[se > q.iloc[1]] = q.iloc[1]
        return se

    def handle(rows):
        return winsorize_series(rows[1])

    result = pd.DataFrame(list(map(handle, factor_df.iterrows())), factor_df.index)

    return result

In [10]:
def standardize(factor_df):
    """
    对因子值做z-score标准化
    :param factor_df: 因子值 (pandas.Dataframe类型),index为datetime, colunms为股票代码。
                      形如:
                                   AAPL	　　　     BA	　　　CMG	　　   DAL	      LULU	　　
                        date
                        2016-06-24	0.165260	0.002198	0.085632	-0.078074	0.173832
                        2016-06-27	0.165537	0.003583	0.063299	-0.048674	0.180890
                        2016-06-28	0.135215	0.010403	0.059038	-0.034879	0.111691
                        2016-06-29	0.068774	0.019848	0.058476	-0.049971	0.042805
                        2016-06-30	0.039431	0.012271	0.037432	-0.027272	0.010902
    :return:z-score标准化后的因子值(pandas.Dataframe类型),index为datetime, colunms为股票代码。
    """


    def standardize_series(se):
        se_std = se.std()
        se_mean = se.mean()
        return (se - se_mean) / se_std

    def handle(rows):
        return standardize_series(rows[1])

    result = pd.DataFrame(list(map(handle, factor_df.iterrows())), factor_df.index)

    return result

In [11]:
"""
将股票分类到其所属的不同行业下面，
获得datafarme：res
列为申万一级行业代码，行为0，1，2...，value为各个行业的成分股代码，
有些行业成分股较少时，多出的行对应的valu为空值
"""

sector_list = pd.read_csv('D:/Python/Flies/Guanyun/barra/dataset/sector_list.csv',encoding='gbk',dtype=str)
stock_list = pd.read_csv('D:/Python/Flies/Guanyun/A股数据/申万一级行业成份/' + '801740' +'.csv',dtype=str)['stock_code'].to_list()

sector_content_df = pd.read_csv('D:/Python/Flies/Guanyun/barra/dataset/sector_content_df.csv').set_index('date')
import re
p = re.compile(r'(\d+.[A-Z]+)')
def findall_apply_func(y):
    def findall_apply_inner_func(df_str,y) : 
        return p.findall(df_str)
    return y.apply(findall_apply_inner_func, args=(y,))
sector_content_df = sector_content_df.apply(findall_apply_func,axis=0)
sector_content_df = sector_content_df.iloc[-1,:]
res = pd.DataFrame()
for sec in sector_content_df.index : 
    res = pd.concat([res,pd.DataFrame(sector_content_df[sec])],axis=1)
res.columns = sector_content_df.index


In [13]:
# 获取行业分类
def get_industry_class(symbols):
    """
    获取行业分类信息
    :param symbols: 一组股票代码(list),形式为通用标准(编码.交易所 如["000001.XSHE","600000.XSHG"])
    :return: sina的行业分类信息。(pandas.Dataframe) index为行业分类编号(1-49);columns为股票代码;值为0/1,分别表示属于该行业/不属于该行业
    """
    sector_list = pd.read_csv('D:/Python/Flies/Guanyun/barra/dataset/sector_list.csv',encoding='gbk',dtype=str)
    frame = pd.DataFrame(0,index = sector_content_df.index, columns = symbols)
    for symbol in symbols:
        this_class = res[res == symbol].dropna(axis=1,how="all").columns[0]
        frame.loc[this_class, symbol] = 1
    return frame

In [14]:
from sklearn import linear_model
from scipy.stats import rankdata
# 行业、市值中性化 - 对Dataframe数据
def neutralize( factor_df, factorIsMV = True):
    """
    对因子做行业、市值中性化
    :param factor_df: 因子值 (pandas.Dataframe类型),index为datetime, colunms为股票代码。
                      形如:
                                   AAPL	　　　     BA	　　　CMG	　　   DAL	      LULU	　　
                        date
                        2016-06-24	0.165260	0.002198	0.085632	-0.078074	0.173832
                        2016-06-27	0.165537	0.003583	0.063299	-0.048674	0.180890
                        2016-06-28	0.135215	0.010403	0.059038	-0.034879	0.111691
                        2016-06-29	0.068774	0.019848	0.058476	-0.049971	0.042805
                        2016-06-30	0.039431	0.012271	0.037432	-0.027272	0.010902
    :param factorIsMV: 待中性化的因子是否是市值类因子(bool)。是则为True,默认为False
    :return: 中性化后的因子值(pandas.Dataframe类型),index为datetime, colunms为股票代码。
    """

#     # 剔除有过多无效数据的个股   # 回看固定时间
#     empty_data = pd.isnull(factor_df).sum()
#     pools = empty_data[empty_data < len(factor_df) * 0.1].index  # 保留空值比例低于0.1的股票
#     factor_df = factor_df.loc[:, pools]

    # 剔除过多值为空的截面
    factor_df = factor_df.dropna(thresh = len(factor_df.columns) * 0.5) # 保留空值比例低于0.9的截面

    # 获取行业分类信息
#     X = get_industry_class(pools)
    X = get_industry_class(factor_df.columns.to_list())
    nfactors = len(X.index)

    # 获取对数流动市值，并去极值、标准化。市值类因子不需进行这一步
    if not factorIsMV:
        x1 = standardize(winsorize(read_LFLO(pools, factor_df.index[0], factor_df.index[-1])))
        nfactors += 1

    result = []
    # 逐个截面进行回归，留残差作为中性化后的因子值
    for i in factor_df.index:
        if not factorIsMV:
            DataAll = pd.concat([X.T, x1.loc[i], factor_df.loc[i]], axis=1)
        else:
            DataAll = pd.concat([X.T, factor_df.loc[i]], axis=1)
        # 剔除截面中值含空的股票
        DataAll = DataAll.dropna()
        DataAll.columns = list(range(0, nfactors + 1))
        regr = linear_model.LinearRegression(fit_intercept=False)
        regr.fit(np.matrix(DataAll.iloc[:, 0:nfactors]), np.transpose(np.matrix(DataAll.iloc[:, nfactors])))
        residuals = np.transpose(np.matrix(DataAll.iloc[:, nfactors])) -regr.predict(np.matrix(DataAll.iloc[:, 0:nfactors]))
        residuals = pd.DataFrame(data=residuals, index=np.transpose(np.matrix(DataAll.index.values)))
        residuals.index = DataAll.index.values
        residuals.columns = [i]
        result.append(residuals)

    result = pd.DataFrame(pd.concat(result, axis=1).T)

    return result


# 数据处理

In [15]:
sec_df = pd.DataFrame(index=dates, columns=stks_sec)
for stock in sec_df.columns:
    sec_df[stock] = res[res == stock].dropna(how = 'all', axis = 1).columns[0]
dataset_dict['industry'] = sec_df

In [27]:
for t in range(71, len(stock_list_df_final.index)) : 
    print(stock_list_df_final.index[t])
    dataset_dict_500 = {}
    dates_index = dates.index(stock_list_df_final.index[t])
    dates_500 = dates[dates_index-505:dates_index-1]
    stks_500 = stock_list_df_final.iloc[t,:].to_list()
    while np.nan in stks_500:
        stks_500.remove(np.nan)

    for key in dataset_dict.keys() : 
        if key in files_list : 
            dataset_dict_500[key] = neutralize(standardize(winsorize(dataset_dict[key].loc[dates_500, stks_500])), factorIsMV = True)
        else : 
            dataset_dict_500[key] = dataset_dict[key].loc[dates_500, stks_500]

    dataset_stks_df = pd.DataFrame(columns = ['date','stocknames'] + list(dataset_dict_500.keys()))
    for stocknames in stks_500:
        dataset_stks_df = dataset_stks_df.append([{'stocknames':stocknames}], ignore_index=True)



    dataset_df_sum = pd.DataFrame()
    interval = 50
    for i in range(len(dates_500)) : 
        tmp_df = copy.deepcopy(dataset_stks_df)
        tmp_df = tmp_df.set_index('stocknames')
        tmp_df['date'] = dates_500[i]
        for key in dataset_dict_500.keys() : 
            try:
                tmp_df[key] = pd.DataFrame(dataset_dict_500[key].loc[dates_500[i]]).sort_index()
            except:
                pass
        tmp_df['stocknames'] = tmp_df.index
        tmp_df = tmp_df.reset_index(drop=True)        

        if len(dataset_df_sum) == 0 : 
            dataset_df_sum = copy.deepcopy(tmp_df)
        else: 
            dataset_df_sum = pd.concat([dataset_df_sum, tmp_df],axis=0)

    print('====================================================')

    dataset_df_sum['size'] = 1.0 * dataset_df_sum['lncap']
    dataset_df_sum['beta'] = 1.0 * dataset_df_sum['beta']
    dataset_df_sum['momentum'] = 1.0 * dataset_df_sum['rstr']
    dataset_df_sum['residual_volatility'] = 0.74 * dataset_df_sum['dastd'] + 0.16 * dataset_df_sum['cmra'] + 0.10 * dataset_df_sum['hsigma']
    dataset_df_sum['non_linear_size'] = 1.0 * dataset_df_sum['nlsize']
    dataset_df_sum['book_to_price_ratio'] = 1.0 * dataset_df_sum['btop']
    dataset_df_sum['liquidity'] = 0.35 * dataset_df_sum['stom'] + 0.35 * dataset_df_sum['stoq'] + 0.30 * dataset_df_sum['stoa']
    dataset_df_sum['earnings_yield'] = 0.68 * dataset_df_sum['epfwd'] + 0.21 * dataset_df_sum['cetop'] + 0.11 * dataset_df_sum['etop']
    dataset_df_sum['growth'] = 0.18 * dataset_df_sum['egrlf'] + 0.11 * dataset_df_sum['egrsf'] + 0.24 * dataset_df_sum['egro'] + 0.47 * dataset_df_sum['sgro']
    dataset_df_sum['leverage'] = 0.38 * dataset_df_sum['mlev'] + 0.35 * dataset_df_sum['dtoa'] + 0.27 * dataset_df_sum['blev']
    dataset_df_sum = dataset_df_sum.drop([ 'blev', 'btop', 'cetop', 'cmra', 'dastd', 'dtoa',
           'egrlf', 'egro', 'egrsf', 'epfwd', 'etop', 'hsigma', 'lncap', 'mlev',
           'nlsize', 'rstr', 'sgro', 'stoa', 'stom', 'stoq'],axis=1)
    dataset_df_sum = dataset_df_sum[[
        'date', 'stocknames', 'capital', 'ret', 'industry', 'size', 'beta',
        'momentum', 'residual_volatility', 'non_linear_size',
        'book_to_price_ratio', 'liquidity', 'earnings_yield', 'growth', 'leverage'
    ]]    

    dataset_df_sum.to_csv('dataset/barra processing/total data/' + 'barra_data_' + str(t) + '.csv')


2015-06-08
2015-07-08
2015-08-06
2015-09-08
2015-10-14
2015-11-12
2015-12-11
2016-01-12
2016-02-17
2016-03-17
2016-04-18
2016-05-18
2016-06-20
2016-07-19
2016-08-17
2016-09-19
2016-10-25
2016-11-23
2016-12-22
2017-01-23
2017-02-28
2017-03-29
2017-05-02
2017-06-02
2017-07-03
2017-08-01
2017-08-30
2017-09-28
2017-11-03
2017-12-04
2018-01-03
2018-02-01
2018-03-09
2018-04-11
2018-05-14
2018-06-12
2018-07-12
2018-08-10
2018-09-10
2018-10-17
2018-11-15
2018-12-14
2019-01-16
2019-02-21
2019-03-22
2019-04-23
2019-05-27
2019-06-26
2019-07-25
2019-08-23
2019-09-24
2019-10-30
2019-11-28
2019-12-27
2020-02-05
2020-03-05
2020-04-03
2020-05-08
2020-06-08
2020-07-09
2020-08-07
2020-09-07
2020-10-14
2020-11-12
2020-12-11
2021-01-12
2021-02-10
2021-03-18
2021-04-19
2021-05-21


ValueError: '2021-05-21' is not in list