In [1]:
"""
生成Barra因子所需的各个描述因子
数据来自wind和聚宽两个渠道，其中一致预期类数据有大量缺失，效果不好
"""

import pandas as pd
import numpy as np
import akshare as ak
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import winsorize
import datetime
import math
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [7]:
"""
数据导入和整理
"""

# File stems of the input tables; each one is read below from
# 'dataset/data_of_factor_need/<key>.csv'.
dataset_keys = [
    '涨跌幅(%)',
    'R_f',
    'R_m',
    'market_cap',  # total market capitalisation
    '换手率(%)',
    '市净率',
    '市盈率',
    'total_liability',
    'total_assets',
    'equities_parent_company_owners',  # total equity attributable to owners of the parent company
    'preferred_shares_equity',
    'total_non_current_liability',
    'A股流通市值(元)',
    'FY1',
    'FY3',
    'FY12',
    'pcf_ratio',
    'operating_revenue',
    '总股本(股)',
    'eps',
    'circulating_market_cap',
    'net_profit',
    '总市值(元)',
    '收盘价(元)',
]

# Read every table into a dict keyed by file stem, using the 'date' column as index.
dataset_dict = {
    key: pd.read_csv('dataset/data_of_factor_need/' + key + '.csv', low_memory=False).set_index('date')
    for key in dataset_keys
}
    
# Read the trading calendar (dropping its first 252 rows) and, for every date in
# the working range, look up the trading date 252 rows earlier.
trading_date_list = pd.read_csv('dataset/trading_date.csv').iloc[252:,:] 
dates = trading_date_list.loc[252*2:3000,'date']
def last_year_func(x):
    """Return the entry of ``dates`` whose index label is 252 less than that of ``x``.

    Returns ``np.nan`` when ``x`` is not in ``dates`` or when the earlier label
    is not present in ``dates``.
    """
    index_x = dates[dates.values==x].index
    try:
        return dates[index_x - 252].values[0]
    except (KeyError, IndexError):
        # KeyError: the label 252 rows back is outside `dates`;
        # IndexError: `x` was not found, so the selection is empty.
        # Anything else is a real error and is no longer swallowed.
        return np.nan
dates_last = dates.apply(last_year_func)  

# Stock universe: intersection of the columns available in the Wind-sourced
# and jqdata-sourced tables, in sorted order.
stks = sorted(
    set(dataset_dict['涨跌幅(%)'].columns)
    & set(dataset_dict['FY3'].columns)
    & set(dataset_dict['pcf_ratio'].columns)
)
# Keep only the trading dates that also appear in the pcf_ratio table.
# `dates` must remain a pandas Series (not a plain list): the factor functions
# below rely on `dates.values`, on integer index arithmetic and on assigning
# `dates_i.index`. A fresh 0..n-1 index keeps that arithmetic contiguous.
dates = pd.Series(sorted(set(dates) & set(dataset_dict['pcf_ratio'].index)), name='date')

## Beta

In [7]:
def beta(stks, dates):
    """Exponentially weighted market beta of each stock, per date.

    For every date from the 253rd entry of ``dates`` onward, the excess
    returns of ``stks`` are regressed on the excess market return over the
    trailing 252 rows, using weights with a 63-row half-life.

    Parameters
    ----------
    stks : list
        Column labels to select from the return table.
    dates : sequence or pandas.Series
        Ordered trading dates.

    Returns
    -------
    pandas.DataFrame
        One row per date, one column per stock.
    """
    T = 252          # regression window length, in rows
    half_day = 63    # half-life of the exponential weights, in rows
    weights = 0.5 ** (np.arange(T - 1, -1, -1) / half_day)
    weights = weights / weights.sum()

    # Work on plain positions so the function accepts a list as well as a
    # Series and does not depend on the Series' index labels.
    date_values = list(dates)
    position = {day: pos for pos, day in enumerate(date_values)}

    def beta_inner(date):
        # T rows ending at `date` (label slicing with .loc is inclusive).
        date_last = date_values[position[date] - (T - 1)]

        stks_data = dataset_dict['涨跌幅(%)'].loc[date_last:date, stks].sort_index(axis=0, ascending=True)
        rf_data = dataset_dict['R_f'].loc[date_last:date, 'r_f_daily'].sort_index(axis=0, ascending=True)
        rm_data = dataset_dict['R_m'].loc[date_last:date, 'market'].sort_index(axis=0, ascending=True)

        y = stks_data.sub(rf_data, axis="index")
        x = rm_data.sub(rf_data, axis="index")

        x_dev = x - x.mean()
        # Demean each stock by its own time-series mean. The previous code
        # subtracted y.mean(axis=1), i.e. the cross-sectional mean of each day.
        y_dev = y - y.mean()

        # mean * len equals the plain sum for a column without missing values;
        # with missing values it rescales the partial sum to the full window.
        sum_w_x_y = y_dev.multiply(x_dev, axis=0).multiply(weights, axis=0).mean() * len(y)
        sum_w_x2 = (x_dev ** 2).multiply(weights, axis=0).sum()
        return sum_w_x_y / sum_w_x2

    targets = date_values[T:]
    dates_i = pd.Series(targets, index=pd.Index(targets, name=getattr(dates, "name", "date")))
    return dates_i.apply(beta_inner)

beta(stks, dates).to_csv('dataset/barra_factor/' + 'beta' + '.csv')

## Size

In [5]:
def lncap(stks, dates):
    """Natural logarithm of the '总市值(元)' table for the given dates and stocks."""
    total_cap = dataset_dict['总市值(元)']
    selected = total_cap.loc[dates, stks]
    return np.log(selected)

lncap(stks, dates).to_csv('dataset/barra_factor/' + 'lncap' + '.csv')

## Residual Volatility

$$dastd = \frac{1}{n} \sum_{t=1}^{n}{w_t {(r_{et} - \overline{r_e})}^2}$$
$$ r_{et} = r_t - r_{ft} $$

In [7]:
def dastd(dates):
    """Exponentially weighted variance of daily excess returns, per date.

    For every date from the 253rd entry of ``dates`` onward, the squared
    deviations of each stock's excess return from its window mean are
    averaged with an exponentially weighted mean (42-row half-life) over the
    trailing 252 rows, and the last value is divided by ``T - L``.

    Parameters
    ----------
    dates : sequence or pandas.Series
        Ordered trading dates.

    Returns
    -------
    pandas.DataFrame
        One row per date, one column per stock in the global ``stks``.
    """
    T = 252        # window length, in rows
    L = 0          # lag
    half_day = 42  # half-life of the exponential weights, in rows

    # Work on plain positions so a list is accepted as well as a Series.
    date_values = list(dates)
    position = {day: pos for pos, day in enumerate(date_values)}

    def dastd_inner(date):
        # T rows ending at `date` (.loc slicing is inclusive). The previous
        # code stepped back T rows and therefore read T + 1 rows.
        date_last = date_values[position[date] - (T - 1)]

        stks_data = dataset_dict['涨跌幅(%)'].loc[date_last:date, stks].sort_index(axis=0, ascending=True)
        rf_data = dataset_dict['R_f'].loc[date_last:date, 'r_f_daily'].sort_index(axis=0, ascending=True)

        ret = stks_data.sub(rf_data, axis="index")
        # Per-stock mean. np.mean(DataFrame) reduces over both axes (a single
        # scalar) on pandas >= 2.0, so call the DataFrame method explicitly.
        ret_mean = ret.mean()

        squared_dev = (ret - ret_mean) ** 2
        return squared_dev.ewm(halflife=half_day, adjust=False).mean().iloc[-1] / (T - L)

    targets = date_values[T:]
    dates_i = pd.Series(targets, index=pd.Index(targets, name=getattr(dates, "name", "date")))
    return dates_i.apply(dastd_inner)
(dastd(dates)*10000).to_csv('dataset/barra_factor/' + 'dastd' + '.csv')

$$ Z(T) = \sum_{t=1}^{T}{[ln(1+r_t)-ln(1+r_{ft})]} $$
$$cmra = ln(1+Z_{max})-ln(1+Z_{min})$$

In [212]:
def cmra(stks, dates) : 
    """Cumulative range of log excess returns over the trailing 12 blocks of 21 rows.

    For every date from the 253rd entry of ``dates`` onward, the 252 rows
    preceding the date (the date itself is excluded) are split into 12 blocks
    of 21 rows. Each block's log excess return is summed, the block sums are
    accumulated from the most recent block backwards to give Z, and the result
    is ``ln(1 + max Z) - ln(1 + min Z)`` per stock.

    Parameters
    ----------
    stks : list
        Column labels to select from the return table.
    dates : sequence or pandas.Series
        Ordered trading dates.

    Returns
    -------
    pandas.DataFrame
        One row per date, one column per stock.
    """
    stks_chg_data = pd.concat([dataset_dict['R_f']['r_f_daily'], dataset_dict['涨跌幅(%)'][stks]],axis=1).sort_index()
    # Row offsets of the 13 block boundaries: 252, 231, ..., 21, 0 rows back.
    month_offsets = np.array([252 - 21 * month for month in range(13)])
    date_values = list(dates)[252:]

    def inner(date):
        index_loc = stks_chg_data.index.get_loc(date)
        bounds = index_loc - month_offsets
        if bounds[0] < 0:
            # Not enough history: negative positions would wrap around in iloc.
            return pd.Series(np.nan, index=stks_chg_data.columns[1:])

        block_sums = []
        for i in range(12):
            window = stks_chg_data.iloc[bounds[i]:bounds[i + 1], :]
            # Column 0 is the risk-free rate, the remaining columns are stocks.
            log_excess = np.log(1 + window.iloc[:, 1:]).sub(np.log(1 + window.iloc[:, 0]), axis=0)
            # Drop stocks missing more than 30% of the block (e.g. trading halts).
            log_excess.loc[:, log_excess.isnull().sum() / len(log_excess) > 0.3] = np.nan
            block_sums.append(log_excess.mean() * len(window))

        blocks = pd.DataFrame(block_sums).reset_index(drop=True)
        # Require that more than half of the 12 blocks are available
        # (the stock must have been listed for over six months).
        blocks.loc[:, blocks.isnull().sum() / len(blocks) > 0.5] = np.nan
        # Reverse cumulative sum: accumulate from the most recent block backwards.
        z_values = blocks.iloc[::-1, :].cumsum().iloc[::-1, :]
        # Column-wise extrema. np.max/np.min on a DataFrame reduce over both
        # axes on pandas >= 2.0, which would give one scalar for all stocks.
        return np.log(1 + z_values.max()) - np.log(1 + z_values.min())

    dates_i = pd.Series(date_values, index=pd.Index(date_values, name=getattr(dates, "name", "date")))
    return dates_i.apply(inner)
res = cmra(stks, dates)
res.to_csv('dataset/barra_factor/' + 'cmra' + '.csv')

2012-02-23 504
2012-02-24 505
2012-02-27 506
2012-02-28 507
2012-02-29 508
2012-03-01 509
2012-03-02 510
2012-03-05 511
2012-03-06 512
2012-03-07 513
2012-03-08 514
2012-03-09 515
2012-03-12 516
2012-03-13 517
2012-03-14 518
2012-03-15 519
2012-03-16 520
2012-03-19 521
2012-03-20 522
2012-03-21 523
2012-03-22 524
2012-03-23 525
2012-03-26 526
2012-03-27 527
2012-03-28 528
2012-03-29 529
2012-03-30 530
2012-04-05 531
2012-04-06 532
2012-04-09 533
2012-04-10 534
2012-04-11 535
2012-04-12 536
2012-04-13 537
2012-04-16 538
2012-04-17 539
2012-04-18 540
2012-04-19 541
2012-04-20 542
2012-04-23 543
2012-04-24 544
2012-04-25 545
2012-04-26 546
2012-04-27 547
2012-05-02 548
2012-05-03 549
2012-05-04 550
2012-05-07 551
2012-05-08 552
2012-05-09 553
2012-05-10 554
2012-05-11 555
2012-05-14 556
2012-05-15 557
2012-05-16 558
2012-05-17 559
2012-05-18 560
2012-05-21 561
2012-05-22 562
2012-05-23 563
2012-05-24 564
2012-05-25 565
2012-05-28 566
2012-05-29 567
2012-05-30 568
2012-05-31 569
2012-06-01

2014-05-27 1049
2014-05-28 1050
2014-05-29 1051
2014-05-30 1052
2014-06-03 1053
2014-06-04 1054
2014-06-05 1055
2014-06-06 1056
2014-06-09 1057
2014-06-10 1058
2014-06-11 1059
2014-06-12 1060
2014-06-13 1061
2014-06-16 1062
2014-06-17 1063
2014-06-18 1064
2014-06-19 1065
2014-06-20 1066
2014-06-23 1067
2014-06-24 1068
2014-06-25 1069
2014-06-26 1070
2014-06-27 1071
2014-06-30 1072
2014-07-01 1073
2014-07-02 1074
2014-07-03 1075
2014-07-04 1076
2014-07-07 1077
2014-07-08 1078
2014-07-09 1079
2014-07-10 1080
2014-07-11 1081
2014-07-14 1082
2014-07-15 1083
2014-07-16 1084
2014-07-17 1085
2014-07-18 1086
2014-07-21 1087
2014-07-22 1088
2014-07-23 1089
2014-07-24 1090
2014-07-25 1091
2014-07-28 1092
2014-07-29 1093
2014-07-30 1094
2014-07-31 1095
2014-08-01 1096
2014-08-04 1097
2014-08-05 1098
2014-08-06 1099
2014-08-07 1100
2014-08-08 1101
2014-08-11 1102
2014-08-12 1103
2014-08-13 1104
2014-08-14 1105
2014-08-15 1106
2014-08-18 1107
2014-08-19 1108
2014-08-20 1109
2014-08-21 1110
2014-08-

2016-07-01 1563
2016-07-04 1564
2016-07-05 1565
2016-07-06 1566
2016-07-07 1567
2016-07-08 1568
2016-07-11 1569
2016-07-12 1570
2016-07-13 1571
2016-07-14 1572
2016-07-15 1573
2016-07-18 1574
2016-07-19 1575
2016-07-20 1576
2016-07-21 1577
2016-07-22 1578
2016-07-25 1579
2016-07-26 1580
2016-07-27 1581
2016-07-28 1582
2016-07-29 1583
2016-08-01 1584
2016-08-02 1585
2016-08-03 1586
2016-08-04 1587
2016-08-05 1588
2016-08-08 1589
2016-08-09 1590
2016-08-10 1591
2016-08-11 1592
2016-08-12 1593
2016-08-15 1594
2016-08-16 1595
2016-08-17 1596
2016-08-18 1597
2016-08-19 1598
2016-08-22 1599
2016-08-23 1600
2016-08-24 1601
2016-08-25 1602
2016-08-26 1603
2016-08-29 1604
2016-08-30 1605
2016-08-31 1606
2016-09-01 1607
2016-09-02 1608
2016-09-05 1609
2016-09-06 1610
2016-09-07 1611
2016-09-08 1612
2016-09-09 1613
2016-09-12 1614
2016-09-13 1615
2016-09-14 1616
2016-09-19 1617
2016-09-20 1618
2016-09-21 1619
2016-09-22 1620
2016-09-23 1621
2016-09-26 1622
2016-09-27 1623
2016-09-28 1624
2016-09-

2018-08-08 2077
2018-08-09 2078
2018-08-10 2079
2018-08-13 2080
2018-08-14 2081
2018-08-15 2082
2018-08-16 2083
2018-08-17 2084
2018-08-20 2085
2018-08-21 2086
2018-08-22 2087
2018-08-23 2088
2018-08-24 2089
2018-08-27 2090
2018-08-28 2091
2018-08-29 2092
2018-08-30 2093
2018-08-31 2094
2018-09-03 2095
2018-09-04 2096
2018-09-05 2097
2018-09-06 2098
2018-09-07 2099
2018-09-10 2100
2018-09-11 2101
2018-09-12 2102
2018-09-13 2103
2018-09-14 2104
2018-09-17 2105
2018-09-18 2106
2018-09-19 2107
2018-09-20 2108
2018-09-21 2109
2018-09-25 2110
2018-09-26 2111
2018-09-27 2112
2018-09-28 2113
2018-10-08 2114
2018-10-09 2115
2018-10-10 2116
2018-10-11 2117
2018-10-12 2118
2018-10-15 2119
2018-10-16 2120
2018-10-17 2121
2018-10-18 2122
2018-10-19 2123
2018-10-22 2124
2018-10-23 2125
2018-10-24 2126
2018-10-25 2127
2018-10-26 2128
2018-10-29 2129
2018-10-30 2130
2018-10-31 2131
2018-11-01 2132
2018-11-02 2133
2018-11-05 2134
2018-11-06 2135
2018-11-07 2136
2018-11-08 2137
2018-11-09 2138
2018-11-

2020-09-17 2591
2020-09-18 2592
2020-09-21 2593
2020-09-22 2594
2020-09-23 2595
2020-09-24 2596
2020-09-25 2597
2020-09-28 2598
2020-09-29 2599
2020-09-30 2600
2020-10-09 2601
2020-10-12 2602
2020-10-13 2603
2020-10-14 2604
2020-10-15 2605
2020-10-16 2606
2020-10-19 2607
2020-10-20 2608
2020-10-21 2609
2020-10-22 2610
2020-10-23 2611
2020-10-26 2612
2020-10-27 2613
2020-10-28 2614
2020-10-29 2615
2020-10-30 2616
2020-11-02 2617
2020-11-03 2618
2020-11-04 2619
2020-11-05 2620
2020-11-06 2621
2020-11-09 2622
2020-11-10 2623
2020-11-11 2624
2020-11-12 2625
2020-11-13 2626
2020-11-16 2627
2020-11-17 2628
2020-11-18 2629
2020-11-19 2630
2020-11-20 2631
2020-11-23 2632
2020-11-24 2633
2020-11-25 2634
2020-11-26 2635
2020-11-27 2636
2020-11-30 2637
2020-12-01 2638
2020-12-02 2639
2020-12-03 2640
2020-12-04 2641
2020-12-07 2642
2020-12-08 2643
2020-12-09 2644
2020-12-10 2645
2020-12-11 2646
2020-12-14 2647
2020-12-15 2648
2020-12-16 2649
2020-12-17 2650
2020-12-18 2651
2020-12-21 2652
2020-12-

$$ hsigma = std(e_t) $$

In [9]:
def hsigma(stks, dates):
    """Standard deviation of the residuals of the beta regression, per date.

    For every date from the 253rd entry of ``dates`` onward, each stock's
    excess return is regressed on the excess market return over the trailing
    252 rows. The slope uses exponentially weighted moments (63-row
    half-life); the result is the sample standard deviation of the residuals.

    Parameters
    ----------
    stks : list
        Column labels to select from the return table.
    dates : sequence or pandas.Series
        Ordered trading dates.

    Returns
    -------
    pandas.DataFrame
        One row per date, one column per stock.
    """
    T = 252        # window length, in rows
    half_day = 63  # half-life of the exponential weights, in rows

    # Work on plain positions so a list is accepted as well as a Series.
    date_values = list(dates)
    position = {day: pos for pos, day in enumerate(date_values)}

    def hsigma_inner(date):
        # T rows ending at `date` (.loc slicing is inclusive). The previous
        # code stepped back T rows and therefore read T + 1 rows.
        date_last = date_values[position[date] - (T - 1)]

        stks_data = dataset_dict['涨跌幅(%)'].loc[date_last:date, stks].sort_index(axis=0, ascending=True)
        rf_data = dataset_dict['R_f'].loc[date_last:date, 'r_f_daily'].sort_index(axis=0, ascending=True)
        rm_data = dataset_dict['R_m'].loc[date_last:date, 'market'].sort_index(axis=0, ascending=True)

        y = stks_data.sub(rf_data, axis="index")
        x = rm_data.sub(rf_data, axis="index")

        # Per-stock mean. np.mean(DataFrame) reduces over both axes on
        # pandas >= 2.0, so call the DataFrame method explicitly.
        y_mean = y.mean()
        x_mean = x.mean()

        cov = (y - y_mean).multiply(x - x_mean, axis="index")
        doc = (x - x_mean) ** 2

        a = cov.ewm(halflife=half_day, adjust=False).mean().iloc[-1]
        b = doc.ewm(halflife=half_day, adjust=False).mean().iloc[-1]
        beta_value = a / b
        alpha_value = y_mean - beta_value * x_mean

        # fitted[t, i] = alpha_i + beta_i * x_t
        fitted = pd.DataFrame(
            np.outer(x.values, beta_value.values),
            index=x.index,
            columns=beta_value.index,
        ).add(alpha_value, axis=1)
        ei = y.sub(fitted)
        return ei.std()

    targets = date_values[T:]
    dates_i = pd.Series(targets, index=pd.Index(targets, name=getattr(dates, "name", "date")))
    return dates_i.apply(hsigma_inner)

hsigma(stks, dates).to_csv('dataset/barra_factor/' + 'hsigma' + '.csv')

## Momentum

$$rstr =  \sum_{t=L}^{T+L}{w_t ln(1+r_t)} - \sum_{t=L}^{T+L}{w_t ln(1+r_{ft})}$$

In [76]:
def rstr(dates):
    """Relative strength: weighted sum of log excess returns, lagged 21 rows.

    Implements ``sum_t w_t * [ln(1 + r_t) - ln(1 + r_ft)]`` over a 504-row
    window that ends 21 rows before each date, with exponentially decaying
    weights (126-row half-life, not normalised).

    The previous code omitted the ``ln(1 + .)`` transform and took a mean
    instead of the sum, so values from this version are on a different scale.

    Parameters
    ----------
    dates : sequence or pandas.Series
        Ordered trading dates.

    Returns
    -------
    pandas.DataFrame
        One row per date (from entry 525 onward), one column per stock in the
        global ``stks``. A stock with no data in the window is NaN.
    """
    T = 504         # window length, in rows
    L = 21          # lag between the window end and the evaluation date
    half_day = 126  # half-life of the exponential weights, in rows
    weights = 0.5 ** (np.arange(T - 1, -1, -1) / half_day)

    # Work on plain positions so a list is accepted as well as a Series.
    date_values = list(dates)
    position = {day: pos for pos, day in enumerate(date_values)}

    def rstr_inner(date):
        pos = position[date]
        window_end = date_values[pos - L]
        window_start = date_values[pos - L - T + 1]
        rt = dataset_dict['涨跌幅(%)'].loc[window_start:window_end, stks].sort_index(axis=0, ascending=True)
        rft = dataset_dict['R_f'].loc[window_start:window_end, 'r_f_daily'].sort_index(axis=0, ascending=True)

        # Align the risk-free series to the stock rows so the weights line up.
        log_excess = np.log1p(rt).sub(np.log1p(rft.reindex(rt.index)), axis=0)
        # Missing days are skipped; min_count keeps all-missing stocks as NaN.
        return log_excess.multiply(weights, axis=0).sum(min_count=1)

    targets = date_values[T + L:]
    dates_i = pd.Series(targets, index=pd.Index(targets, name=getattr(dates, "name", "date")))
    return dates_i.apply(rstr_inner)
res_df = rstr(dates)
res_df.to_csv('dataset/barra_factor/' + 'rstr' + '.csv')


## Liquidity

$$stom=ln(\sum^{21}_{t=1}{\frac{V_t}{S_t}})$$

In [11]:
# Load the GBK-encoded turnover table and index it by date
# (the source column '日期' is renamed to 'date').
stks_stom_data = (
    pd.read_csv('dataset/后复权数据-分类/换手率(%).csv', encoding='gbk')
    .rename(columns={"日期": "date"})
    .set_index('date')
)
# Relabel each column as its first six characters plus an exchange suffix:
# '.XSHG' when the label starts with '6', '.XSHE' otherwise.
stks_stom_data_columns = [
    label[:6] + ('.XSHG' if label[:1] == '6' else '.XSHE')
    for label in stks_stom_data.columns.to_list()
]
stks_stom_data.columns = stks_stom_data_columns
# Restrict to the stock universe and order rows by date.
stks_stom_data = stks_stom_data[stks].sort_index()

In [12]:
def stom(date):
    """Log of the turnover summed over the trailing 21 rows, per date.

    Parameters
    ----------
    date : sequence or pandas.Series
        Ordered trading dates. The previous body ignored this argument and
        read the global ``dates`` instead.

    Returns
    -------
    pandas.DataFrame
        One row per date (from entry 22 onward), one column per stock in the
        global ``stks``. Zero total turnover yields ``-inf``.
    """
    T = 21  # window length, in rows

    # Work on plain positions so a list is accepted as well as a Series.
    date_values = list(date)
    position = {day: pos for pos, day in enumerate(date_values)}
    turnover = dataset_dict['换手率(%)']

    def stom_inner(day):
        # T rows ending at `day` (.loc slicing is inclusive). The previous
        # code stepped back T rows and therefore summed T + 1 rows.
        date_last = date_values[position[day] - (T - 1)]
        window = turnover.loc[date_last:day, stks]
        return np.log(window.sum())

    targets = date_values[T:]
    dates_i = pd.Series(targets, index=pd.Index(targets, name=getattr(date, "name", "date")))
    return dates_i.apply(stom_inner)

stom_data = stom(dates)
# log(0) produces -inf for stocks without turnover; store those as missing.
stom_data = stom_data.replace([np.inf, -np.inf], np.nan)
stom_data.to_csv('dataset/barra_factor/' + 'stom' + '.csv')

$$stoq=ln(\frac{1}{3}\sum^{3}_{t=1}{exp(stom_t)})$$

In [13]:
    # '000001.SZ'
    T = 21 * 3
    L = 0
    dates = pd.Series(stom_data.index.to_list())
    dates_i = dates[T: ]
    def stoq_inner(date):
        index_x = dates[dates.values==date].index
        date_last = dates[index_x - T].values[0]
        stks_stom_data = stom_data.loc[date_last:date,stks]
        
        return np.log(stks_stom_data.sum()/3)
    dates_i.index = dates_i
    aa = dates_i.apply(stoq_inner)    
    return aa
stoq_df = stoq()
stoq_df = stoq_df.replace([np.inf, -np.inf], np.nan)
stoq_df.to_csv('dataset/barra_factor/' + 'stoq' + '.csv')

$$stoa=ln(\frac{1}{12}\sum^{12}_{t=1}{exp(stom_t)})$$

In [14]:
def stoa():
    """Annual turnover: ``ln(1/12 * sum_{t=1..12} exp(stom_t))``.

    ``stom_t`` are the values of the global ``stom_data`` at the current row
    and at 21, 42, ..., 231 rows earlier, i.e. twelve consecutive 21-row
    windows.

    The previous body summed every daily ``stom`` (log) value in the window
    instead of averaging ``exp(stom)`` over the twelve monthly observations.

    Returns
    -------
    pandas.DataFrame
        One row per date (from entry 253 of ``stom_data`` onward), one column
        per stock in the global ``stks``.
    """
    months = 12
    step = 21           # rows per monthly window
    T = step * months

    date_values = stom_data.index.to_list()
    position = {day: pos for pos, day in enumerate(date_values)}

    def stoa_inner(date):
        pos = position[date]
        month_ends = [date_values[pos - step * k] for k in range(months)]
        monthly_stom = stom_data.loc[month_ends, stks]
        # The mean skips missing months, averaging over those available.
        return np.log(np.exp(monthly_stom).mean())

    targets = date_values[T:]
    dates_i = pd.Series(targets, index=targets)
    return dates_i.apply(stoa_inner)
stoa_df = stoa()
stoa_df = stoa_df.replace([np.inf, -np.inf], np.nan)
stoa_df.to_csv('dataset/barra_factor/' + 'stoa' + '.csv')

## Non Linear Size

In [7]:
def nlsize(stks, date):
    """Non-linear size: cubed log market cap orthogonalised against log market cap.

    For each date, ``ln(cap)**3`` is regressed on ``ln(cap)`` across stocks,
    weighted by each stock's share of total market cap, and the residual is
    returned.

    The previous body regressed the raw market cap on ``ln(cap)**3`` and
    returned that residual, and it ignored the ``date`` argument in favour of
    the global ``dates``.

    Parameters
    ----------
    stks : list
        Column labels to select from the market-cap table.
    date : sequence or pandas.Series
        Ordered trading dates.

    Returns
    -------
    pandas.DataFrame
        One row per date (from entry 253 onward), one column per stock;
        stocks without a valid market cap on a date are NaN.
    """
    date_values = list(date)
    cap = dataset_dict['总市值(元)'].loc[date_values, stks]
    # log(0) gives -inf, which the regression cannot handle; treat it as missing.
    size_factor = np.log(cap).replace([np.inf, -np.inf], np.nan)
    size_factor_3 = size_factor ** 3

    def ols_inner(day):
        sample = pd.DataFrame({
            'size': size_factor.loc[day],
            'size_cubed': size_factor_3.loc[day],
            'cap': cap.loc[day],
        }).dropna()
        if sample.empty:
            return pd.Series(np.nan, index=stks)
        cap_weights = sample['cap'] / sample['cap'].sum()
        WLS = LinearRegression()
        WLS.fit(sample[['size']], sample['size_cubed'], sample_weight=cap_weights)
        e_value = sample['size_cubed'] - WLS.predict(sample[['size']])
        # Re-expand to the full universe; stocks dropped above become NaN.
        return e_value.reindex(stks)

    targets = date_values[252:]
    dates_i = pd.Series(targets, index=pd.Index(targets, name=getattr(date, "name", "date")))
    return dates_i.apply(ols_inner)
    
nlsize(stks, dates).to_csv('dataset/barra_factor/' + 'nlsize' + '.csv')

## Book to price

In [5]:
def btop(stks, dates) : 
    """Book-to-price: reciprocal of (close * total shares / parent-company equity).

    Rows are ``dates[252:]``; negative results are replaced with NaN.
    """
    window = dates[252:]
    close = dataset_dict['收盘价(元)'].loc[window, stks]
    shares = dataset_dict['总股本(股)'].loc[window, stks]
    book_equity = dataset_dict['equities_parent_company_owners'].loc[window, stks]

    price_to_book = close * shares / book_equity
    res = 1 / price_to_book
    res[res < 0] = np.nan
    return res
btop(stks, dates).to_csv('dataset/barra_factor/' + 'btop' + '.csv')

## Earnings yield

In [18]:
def epfwd(): 
    """Ratio of the 'FY12' table to the 'market_cap' table over ``dates[252:]`` and ``stks``."""
    window = dates[252:]
    forecast = dataset_dict['FY12'].loc[window, stks]
    market_cap = dataset_dict['market_cap'].loc[window, stks]
    return forecast / market_cap

epfwd().to_csv('dataset/barra_factor/' + 'epfwd' + '.csv')

In [8]:
def cetop(): 
    """Cash-earnings-to-price: reciprocal of the 'pcf_ratio' table.

    Rows are ``dates[252:]``, columns are ``stks``. The previous body divided
    the price-to-cash-flow ratio by the share count before inverting, which
    yields ``shares / pcf_ratio`` rather than a yield; this version mirrors
    ``etop``, which is ``1 / 市盈率``.
    NOTE(review): assumes 'pcf_ratio' is price (or market cap) over cash flow
    - confirm against the data source.
    """
    window = dates[252:]
    return 1 / dataset_dict['pcf_ratio'].loc[window, stks]

cetop().to_csv('dataset/barra_factor/' + 'cetop' + '.csv')

In [20]:
def etop(): 
    """Reciprocal of the '市盈率' table over ``dates[252:]`` and ``stks``."""
    window = dates[252:]
    pe_ratio = dataset_dict['市盈率'].loc[window, stks]
    return 1 / pe_ratio

etop().to_csv('dataset/barra_factor/' + 'etop' + '.csv')

## Growth

In [21]:
def egrlf(): 
    """Ratio of the 'FY3' table to the 'net_profit' table over ``dates[252:]`` and ``stks``."""
    window = dates[252:]
    forecast = dataset_dict['FY3'].loc[window, stks]
    net_profit = dataset_dict['net_profit'].loc[window, stks]
    return forecast / net_profit
egrlf().to_csv('dataset/barra_factor/' + 'egrlf' + '.csv')

In [22]:
def egrsf(): 
    """Ratio of the 'FY1' table to the 'net_profit' table over ``dates[252:]`` and ``stks``."""
    window = dates[252:]
    forecast = dataset_dict['FY1'].loc[window, stks]
    net_profit = dataset_dict['net_profit'].loc[window, stks]
    return forecast / net_profit
egrsf().to_csv('dataset/barra_factor/' + 'egrsf' + '.csv')

In [23]:
def egro(dates): 
    """Trend slope of EPS over five observations spaced 252 rows apart.

    For each date, the 'eps' values at the current row and at 252, 504, 756
    and 1008 rows earlier are regressed on time (1 = oldest, 5 = current) and
    the OLS slope is returned per stock.

    Changes from the previous body:
    * dates with fewer than 1008 rows of history return NaN; the negative
      positions used to wrap around in ``iloc`` and pick rows from the end of
      the table;
    * observations are in chronological order, so rising EPS gives a positive
      slope (the time index used to be paired newest-first);
    * a stock needs all five observations, otherwise its slope is NaN.

    Parameters
    ----------
    dates : sequence or pandas.Series
        Ordered trading dates; rows are taken from entry 253 onward.

    Returns
    -------
    pandas.DataFrame
        One row per date, one column per stock in the global ``stks``.
    """
    step = 252     # rows between consecutive observations
    n_points = 5

    date_values = list(dates)[252:]
    data_df = dataset_dict['eps'].loc[date_values, stks]

    x = np.arange(1, n_points + 1, dtype=float)
    x_dev = x - x.mean()
    x_ss = (x_dev ** 2).sum()

    def ols_inner(date):
        index_loc = data_df.index.get_loc(date)
        # Oldest observation first.
        ols_list = [index_loc - step * k for k in range(n_points - 1, -1, -1)]
        if ols_list[0] < 0:
            return pd.Series(np.nan, index=data_df.columns)
        y = data_df.iloc[ols_list, :]
        # Per-stock mean (np.mean(DataFrame) is a scalar on pandas >= 2.0).
        y_dev = y - y.mean()
        return y_dev.multiply(x_dev, axis="index").sum(skipna=False) / x_ss

    dates_i = pd.Series(date_values, index=pd.Index(date_values, name=getattr(dates, "name", "date")))
    return dates_i.apply(ols_inner)
egro(dates).to_csv('dataset/barra_factor/' + 'egro' + '.csv')

In [24]:
def sgro(dates): 
    """Trend slope of revenue per share over five observations spaced 252 rows apart.

    Revenue per share is 'operating_revenue' divided by '总股本(股)'. For each
    date, its values at the current row and at 252, 504, 756 and 1008 rows
    earlier are regressed on time (1 = oldest, 5 = current) and the OLS slope
    is returned per stock.

    Changes from the previous body:
    * dates with fewer than 1008 rows of history return NaN; the negative
      positions used to wrap around in ``iloc`` and pick rows from the end of
      the table;
    * observations are in chronological order, so rising revenue gives a
      positive slope (the time index used to be paired newest-first);
    * a stock needs all five observations, otherwise its slope is NaN.

    Parameters
    ----------
    dates : sequence or pandas.Series
        Ordered trading dates; rows are taken from entry 253 onward.

    Returns
    -------
    pandas.DataFrame
        One row per date, one column per stock in the global ``stks``.
    """
    step = 252     # rows between consecutive observations
    n_points = 5

    date_values = list(dates)[252:]
    revenue = dataset_dict['operating_revenue'].loc[date_values, stks]
    shares = dataset_dict['总股本(股)'].loc[date_values, stks]
    data_df = revenue / shares

    x = np.arange(1, n_points + 1, dtype=float)
    x_dev = x - x.mean()
    x_ss = (x_dev ** 2).sum()

    def ols_inner(date):
        index_loc = data_df.index.get_loc(date)
        # Oldest observation first.
        ols_list = [index_loc - step * k for k in range(n_points - 1, -1, -1)]
        if ols_list[0] < 0:
            return pd.Series(np.nan, index=data_df.columns)
        y = data_df.iloc[ols_list, :]
        # Per-stock mean (np.mean(DataFrame) is a scalar on pandas >= 2.0).
        y_dev = y - y.mean()
        return y_dev.multiply(x_dev, axis="index").sum(skipna=False) / x_ss

    dates_i = pd.Series(date_values, index=pd.Index(date_values, name=getattr(dates, "name", "date")))
    return dates_i.apply(ols_inner)
sgro(dates).to_csv('dataset/barra_factor/' + 'sgro' + '.csv')

## Leverage

$$ mlev = \frac{me+pe+ld}{me}$$
mlev: 市场杠杆,me: 普通股市值,pe: 优先股账面价值,ld: 长期负债账面价值

In [8]:
def mlev(stks, dates):
    """Market leverage: (me + pe + ld) / me over ``dates[252:]`` and ``stks``.

    me is read from 'A股流通市值(元)', pe from 'preferred_shares_equity' and ld
    from 'total_non_current_liability'; missing pe and ld are treated as 0.
    """
    window = dates[252:]
    market_equity = dataset_dict['A股流通市值(元)'].loc[window, stks]
    preferred = dataset_dict['preferred_shares_equity'].loc[window, stks].fillna(0)
    long_term_debt = dataset_dict['total_non_current_liability'].loc[window, stks].fillna(0)
    return (market_equity + preferred + long_term_debt) / market_equity


mlev(stks, dates).to_csv('dataset/barra_factor/' + 'mlev' + '.csv')

$$ dtoa = \frac{td}{ta}$$
dtoa: 资产负债比,td: 总负债账面价值,ta: 总资产账面价值

In [26]:
def dtoa(stks, dates) : 
    """Debt-to-assets: 'total_liability' divided by 'total_assets' over ``dates[252:]`` and ``stks``."""
    window = dates[252:]
    liabilities = dataset_dict['total_liability'].loc[window, stks]
    assets = dataset_dict['total_assets'].loc[window, stks]
    return liabilities / assets
dtoa(stks, dates).to_csv('dataset/barra_factor/' + 'dtoa' + '.csv')

$$ blev = \frac{be+pe+ld}{be}$$
blev: 账面杠杆,be: 普通股账面价值,pe: 优先股账面价值,ld: 长期负债账面价值

In [27]:
def blev(stks, date) : 
    """Book leverage: (be + pe + ld) / be over ``date[252:]`` and ``stks``.

    be is read from 'equities_parent_company_owners', pe from
    'preferred_shares_equity' and ld from 'total_non_current_liability';
    missing pe and ld are treated as 0.

    Parameters
    ----------
    stks : list
        Column labels to select.
    date : sequence or pandas.Series
        Ordered trading dates. The previous body ignored this argument and
        read the global ``dates`` instead.
    """
    window = date[252:]
    book_equity = dataset_dict['equities_parent_company_owners'].loc[window, stks]
    preferred = dataset_dict['preferred_shares_equity'].loc[window, stks].fillna(0)
    long_term_debt = dataset_dict['total_non_current_liability'].loc[window, stks].fillna(0)
    return (book_equity + preferred + long_term_debt) / book_equity
blev_df = blev(stks, dates)
# Discard out-of-range values: negative or above 100. The previous condition
# joined the two tests with `&`, which can never be true, so nothing was removed.
blev_df[(blev_df<0)|(blev_df>100)] = np.nan
blev_df.to_csv('dataset/barra_factor/' + 'blev' + '.csv')