In [1]:
import os
# Every relative path used further down in this notebook ('total_data_jqdata/',
# 'sector_list/sector_list.csv', 'new_vmatrix/') is resolved against this
# working directory.
# NOTE(review): hard-coded absolute path on a local drive -- adjust it before
# running the notebook on another machine.
os.chdir('D:/Python/Flies/Guanyun/barra/dataset/barra processing/')

import pandas as pd 
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression
from mfm_func_file import *

In [2]:
import warnings
# Escalate every RuntimeWarning to an exception, so the code that triggers one
# stops with a traceback instead of continuing silently.
warnings.simplefilter('error', RuntimeWarning)

# code

In [3]:
def dropna_of_factor(df, factor, val):
    """
    Drop every stock that has too many missing values in one column.

    Rows are grouped by the 'stocknames' column and the missing entries of
    `factor` are counted per stock. Only rows belonging to stocks whose count
    is strictly below `val` are kept.

    Parameters
    ----------
    df : DataFrame with a 'stocknames' column and a `factor` column.
    factor : name of the column whose missing values are counted.
    val : exclusive upper bound on the per-stock missing count.

    Returns
    -------
    A filtered DataFrame (original row order and index labels are kept).
    """
    def _count_missing(column):
        return column.isnull().sum()

    missing_per_stock = df.groupby('stocknames').agg({factor: _count_missing})[factor]
    below_limit = missing_per_stock < val
    stocks_to_keep = missing_per_stock[below_limit].index
    keep_row = df['stocknames'].isin(stocks_to_keep)
    return df[keep_row]
def nan_processing(data):
    """
    Clean the missing values of a stacked (date x stock) factor table.

    Steps
    -----
    1. Drop stocks with 20 or more missing 'ret' or 'size' rows (per the
       original note: stocks suspended too often or listed too late).
    2. Fill the remaining gaps:
       * 'ret'     -> 0
       * 'capital' -> mean of the same stock
       * any other numeric column -> mean of the same (date, industry) group
    3. Drop every stock that still has a missing value in any column (e.g.
       when a whole industry lacks a factor on some date).

    Parameters
    ----------
    data : DataFrame with at least the columns 'stocknames', 'date',
        'industry', 'ret' and 'size' (and 'capital' if it is to be filled).

    Returns
    -------
    A new DataFrame; the frame passed in is not modified.
    """
    # 1. Remove stocks with too many missing returns / sizes.
    data = dropna_of_factor(data, 'ret', 20)
    data = dropna_of_factor(data, 'size', 20)

    # The filtered frame is a selection of the caller's frame; work on an
    # explicit copy so the column assignments below are unambiguous and never
    # touch the caller's data.
    data = data.copy()

    # 2. Fill the remaining missing values column by column.
    group_keys = ('date', 'industry', 'stocknames')
    na_counts = data.isna().sum()
    for factor in na_counts[na_counts > 0].index:
        if factor == 'ret':
            # Missing daily returns are treated as 0.
            data[factor] = data[factor].fillna(0)
        elif factor == 'capital':
            # Missing market cap: mean of the same stock.
            stock_mean = data.groupby('stocknames')[factor].transform('mean')
            data[factor] = data[factor].fillna(stock_mean)
        elif factor not in group_keys and pd.api.types.is_numeric_dtype(data[factor]):
            # Missing factor exposure: mean of the same industry on that date.
            # fillna only touches the gaps, so existing values are preserved
            # even on rows whose group key is itself missing.
            industry_mean = data.groupby(['date', 'industry'])[factor].transform('mean')
            data[factor] = data[factor].fillna(industry_mean)
        # Grouping keys and non-numeric columns cannot be mean-filled; any gap
        # left in them is handled by step 3.

    # 3. Drop every stock that still has a missing value anywhere. One pass
    # over the frame replaces a separate group-by per column.
    incomplete_stocks = data.loc[data.isna().any(axis=1), 'stocknames'].unique()
    return data[~data['stocknames'].isin(incomplete_stocks)]

In [10]:
# Input files are expected to be named like 'barra_data_1030.csv'.
DATA_DIR = 'total_data_jqdata/'
OUTPUT_DIR = 'new_vmatrix/'

# Process only the CSV files that sit directly inside DATA_DIR. Listing the
# directory (instead of keeping whatever the last os.walk iteration produced)
# stays correct when the folder contains sub-directories such as
# '.ipynb_checkpoints', and fails with a clear FileNotFoundError when the
# folder is missing. Sorting makes the processing order deterministic.
files = sorted(
    name for name in os.listdir(DATA_DIR)
    if name.lower().endswith('.csv') and os.path.isfile(os.path.join(DATA_DIR, name))
)

# The sector table is the same for every input file, so it is read once.
# open() is kept (rather than handing the path to read_csv) so the file is
# decoded with the platform default encoding exactly as before; the with-block
# makes sure the handle is closed.
with open('sector_list/sector_list.csv') as sector_file:
    industry_info = pd.read_csv(sector_file)

# Make sure the output folder exists before the first result is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for fill_name in files:
    #### Load data
    # The first CSV column is discarded -- presumably a saved index; confirm.
    data = pd.read_csv(DATA_DIR + fill_name).iloc[:, 1:]

    data = nan_processing(data)
    data = data.reset_index(drop=True)

    #### Industry data: one 0/1 dummy column per industry code
    industry = np.array([1*(data.industry.values == x) for x in industry_info.code.values]).T
    industry = pd.DataFrame(industry, columns = list(industry_info.industry_names.values))
    data = pd.concat([data.loc[:,['date', 'stocknames', 'capital', 'ret']], industry,
                      data.loc[:, ['size', 'beta', 'momentum','residual_volatility', 'non_linear_size', 'book_to_price_ratio',
                               'liquidity', 'earnings_yield', 'growth', 'leverage']]], axis = 1)
    # One industry dummy is removed from the design matrix.
    data = data.drop(['国防军工'],axis=1)

    # NOTE(review): 27 and 10 are presumably the number of industry and style
    # factor columns (10 style columns are selected above) -- confirm against MFM.
    model = MFM(data, 27, 10)
    (factor_ret, specific_ret, R2) = model.reg_by_time()
    print('\n<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<共同因子协方差矩阵>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
    nw_cov_factor =model.Newey_West_by_time(q = 2, tao = 90, content = 'factor_ret')   # Newey-West adjustment
    model.eigen_risk_adj_by_time(M = 1000, scale_coef = 1.2)                            # eigen-risk adjustment
    vr_cov_ls_fac, lamb_fac = model.vol_regime_adj_by_time(tao = 42)                    # volatility-regime adjustment

    print('\n<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<特异风险矩阵计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
    nw_cov_ls_spc = model.Newey_West_by_time(q = 5, tao = 90, content = 'specific_ret')  # Newey-West adjustment

    dates = pd.to_datetime(data.date.values)                    # all dates
    sorted_dates = pd.to_datetime(np.sort(pd.unique(dates)))    # unique dates, ascending
    T = len(sorted_dates)                                       # number of periods

    spec_2 = []   # structural-model output per period
    spec_3 = []   # Bayesian-shrunk output per period
    for i in range(T) :
        if len(nw_cov_ls_spc[i]) == 0 :
            # No Newey-West estimate for this period: keep empty placeholders
            # so both lists stay aligned with sorted_dates.
            spec_2.append(pd.DataFrame())
            spec_3.append(pd.DataFrame())
            continue
        # The 'date' column is compared as a 'YYYY-MM-DD' string.
        today_data = data[data['date'] == sorted_dates[i].strftime("%Y-%m-%d")]
        str_mod = structural_model(i, today_data, nw_cov_ls_spc[i], h = 252)
        spec_2.append(str_mod)
        bys_data = pd.merge(str_mod.rename('vol'), today_data[['stocknames','capital']],
                            how='inner', left_index=True, right_on='stocknames').set_index('stocknames')
        bys_shrk = bayes_shrink(bys_data['vol'], bys_data['capital'], ngroup = 10, q = 1)
        spec_3.append(bys_shrk)

    vr_cov_ls, lamb = model.vol_regime_adj_by_time(tao = 42, h = 252, content = 'specific_ret', data_cov = spec_3)  # volatility-regime adjustment

    print('\n<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<最小波动组合计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')

    # Exposure matrix of the latest date, with a constant 'country' column,
    # ordered like the factor covariance matrix.
    X_df = data[data['date'] == sorted_dates[-1].strftime("%Y-%m-%d")].set_index('stocknames')
    X_df['country'] = 1
    X_df = X_df.loc[:,vr_cov_ls_fac[0].index]
    # Plain ndarrays instead of the deprecated np.matrix; `@` gives the same product.
    X = np.asarray(X_df)
    F_df = vr_cov_ls_fac[0]
    F = np.asarray(F_df)
    # NOTE(review): element 0 of both adjusted lists is combined with the
    # exposures of the latest date -- confirm the ordering the
    # vol_regime_adj_by_time results use.
    delta = np.diag(vr_cov_ls[0])
    V = X @ F @ X.T + delta
    V_df = pd.DataFrame(V, columns = vr_cov_ls[0].index.to_list(), index = vr_cov_ls[0].index.to_list())

    # NOTE(review): columns from position 28 on are presumably the style
    # factors (after 'country' and the industry dummies) -- confirm.
    omega_MV = min_vol_fac_port(V_df, X_df, X_df.columns[28:])
    omega_MV.to_csv(OUTPUT_DIR + sorted_dates[-1].strftime("%Y-%m-%d") + '.csv')

Cross Section Regression, Date: 2018-07-09, 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<共同因子协方差矩阵>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>




<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<特异风险矩阵计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<最小波动组合计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Cross Section Regression, Date: 2018-08-07, 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<共同因子协方差矩阵>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>




<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<特异风险矩阵计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<最小波动组合计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Cross Section Regression, Date: 2018-09-05, 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<共同因子协方差矩阵>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>




<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<特异风险矩阵计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<最小波动组合计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Cross Section Regression, Date: 2018-10-12, 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<共同因子协方差矩阵>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>




<<<<<<<<<<<<<<<<<<<<<


<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<最小波动组合计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Cross Section Regression, Date: 2020-12-08, 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<共同因子协方差矩阵>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>




<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<特异风险矩阵计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<最小波动组合计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Cross Section Regression, Date: 2021-01-07, 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<共同因子协方差矩阵>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>




<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<特异风险矩阵计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<最小波动组合计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Cross Section Regression, Date: 2021-02-05, 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<共同因子协方差矩阵>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>




<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<特异风险矩阵计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<最小波动组合计算>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Cross Section Regression, Date: 2021-03-15, 
<<<<<<<<<<<<<<<<<<<<<<<<<

RuntimeWarning: invalid value encountered in double_scalars