<font face="黑体" size=6>风格因子计算方法</font>

In [1]:
%pylab inline --no-import-all
from pathlib import Path
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas import Series
import statsmodels.api as sm
import datetime
import os
import math

def clean_extreme_value(arr_data, method='MAD'):
    """
    对数据进行去极值处理
    :param arr_data: np.array
        需要进行去极值的原始数据，数组的每一列各自进行去极值操作
    :param method: 去极值算法
    :return: np.array
        去极值处理后的数据
    """
    raw_data = arr_data.copy()
    m = np.median(raw_data, axis=0)     # 原始数据的中位数
    mad = np.median(np.fabs(raw_data - m), axis=0)
    fupper = m + mad * 5,2
    flower = m - mad * 5.2
    for k in range(raw_data.shape[1]):
        if method == 'MAD':
            raw_data[:, k][raw_data[:, k] > fupper[k]] = fupper[k]
            raw_data[:, k][raw_data[:, k] < flower[k]] = flower[k]
    return raw_data

def normalize_data(arr_data):
    """
    对数据进行标准化
    :param arr_data: np.array
        需要进行标准化处理的原始数据
    :return: np.array
        标准化处理后的数据
    """
    raw_data = arr_data.copy()
    u = np.mean(raw_data, axis=0)
    s = np.std(raw_data, axis=0)
    return (raw_data - u)/s

def _code_to_symbol(code):
    """
    生成本系统的证券代码symbol
    :param code:原始代码，如600000
    :return:
    """
    if len(code) != 6:
        return code
    else:
        return 'SH%s' % code if code[:1] in ['5', '6', '9'] else 'SZ%s' % code


def _code_to_index_symbol(code):
    """
    生成本系统的指数代码symbol
    :param code: 原始代码，如000001（上证综指）
    :return:
    """
    if len(code) != 6:
        return code
    else:
        return 'SZ%s' % code if code[:3] == '399' else 'SH%s' % code

def code_to_symbol(code, index=False):
    if not index:
        return _code_to_symbol(code)
    else:
        return _code_to_index_symbol(code)

def to_date(date_like):
    if isinstance(date_like, datetime.datetime) or isinstance(date_like, datetime.date):
        return date_like
    else:
        return datetime.datetime.strptime(date_like.replace('-', ''), '%Y%m%d')

def is_fin_report_date(date):
    """
    给定的日期是否为财务报告日期
    Parameters:
    --------
    :param date: datetime-like or str
        日期
    :return: bool
    """
    date = to_date(date)
    year = date.year
    report_dates = [datetime.datetime(year, 3, 31), datetime.datetime(year, 6, 30),
                    datetime.datetime(year, 9, 30), datetime.datetime(year, 12, 31)]
    if date in report_dates:
        return True
    else:
        return False
    
def get_fin_report_date(trading_day):
    """
    根据交易日日期返回最新财报的报告期日期
    Parameters:
    --------
    :param trading_day: datetime-like or str
        交易日期
    :return: datetime.datetime
        最新财报的报告期日期
    --------
        规则：5、6、7、8月采用年报或一季报数据（年报、一季报4月底全部公告完毕）
             9、10月采用中报数据（中报8月底全部公告完毕）
             11、12月及下一年1、2、3、4月采用三季报数据（三季报10月底全部公告完毕）
    """
    trading_day = to_date(trading_day)
    year = trading_day.year
    month = trading_day.month
    day = trading_day.day
    if month in (5,6,7,8):
        month = 3
        day = 31
    elif month in (9, 10):
        month = 6
        day = 30
    elif month in (11, 12):
        month = 9
        day = 30
    elif month in (1, 2, 3, 4):
        year -= 1
        month = 9
        day = 30
    return datetime.datetime(year, month, day)

FIN_BASIC_DATA_HEADER = ['ReportDate', 'BasicEPS', 'UnitNetAsset', 'UnitNetOperateCashFlow', 'MainOperateRevenue',
                         'MainOperateProfit', 'OperateProfit', 'InvestIncome', 'NonOperateNetIncome', 'TotalProfit',
                         'NetProfit', 'DeductedNetProfit', 'NetOperateCashFlow', 'CashEquivalentsChg', 'TotalAsset',
                         'CurrentAsset', 'TotalLiability', 'CurrentLiability', 'ShareHolderEquity', 'ROE']

FIN_SUMMARY_DATA_HEADER = ['ReportDate', 'OperatingIncome', 'OperatingCost', 'OperatingProfit', 'TotalProfit',
                           'IncomeTax', 'NetProfit', 'EarningsPerShare', 'Cash', 'AccountsReceivable', 'Inventories',
                           'TotalCurrentAssets', 'NetFixedAssets', 'TotalAssets', 'TotalCurrentLiabilities',
                           'TotalNonCurrentLiabilities', 'TotalLiabilities', 'TotalShareholderEquity',
                           'InitialCashAndCashEquivalentsBalance', 'NetCashFlowsFromOperatingActivities',
                           'NetCashFlowsFromInvestingActivities', 'NetCashFlowsFromFinancingActivities',
                           'NetIncreaseInCashAndCashEquivalents', 'FinalCashAndCashEquivalentsBalance']

def get_fin_basic_data(code, report_date):
    """
    读取个股最新的财务数据
    Parameter:
    --------
    :param code: str
        证券代码，如600000或SH600000
    :param report_date: datetime-like or str
        日期，格式：YYYY-MM-DD or YYYYMMDD
    :return: pd.Series
    --------
        0. ReportDate
        1. BasicEPS:基本每股收益（元）
        2. UnitNetAsset:每股净资产（元）
        3. UnitNetOperateCashFlow:每股经营活动净现金流（元）
        4. MainOperateRevenue:主营业务收入（万元）
        5. MainOperateProfit:主营业务利润（万元）
        6. OperateProfit:营业利润（万元）
        7. InvestIncome:投资收益（万元）
        8. NonOperateNetIncome:营业外收支净额（万元）
        9. TotalProfit:利润总额（万元）
        10. NetProfit:净利润（万元）
        11. DeductedNetPorfit:扣除非经常性损益后净利润（万元）
        12. NetOperateCashFlow:经营活动现金流净额（万元）
        13. CashEquivalentsChg:现金及现金等价物增加额（万元）
        14. TotalAsset:总资产（万元）
        15. CurrentAsset:流动资产（万元）
        16. TotalLiability:总负债（万元）
        17. CurrentLiability:流动负债（万元）
        18. ShareHolderEquity:归属母公司股东权益（万元）
        19. ROE:净资产收益率加权（%）
        读取失败，返回None
    """
    code = code_to_symbol(code)
    date = to_date(report_date)
    if not is_fin_report_date(date):
        return None
    fin_basic_data_path = os.path.join('/Volumes/DB/FactorDB/ElementaryFactor/fin_data/fin_data_basics', '%s.csv' % code)
    df_fin_basic_data = pd.read_csv(fin_basic_data_path, na_values='--', parse_dates=[0],
                                    names=FIN_BASIC_DATA_HEADER, header=0)
    fin_basic_data = df_fin_basic_data[df_fin_basic_data.ReportDate == date]
    if fin_basic_data.shape[0] == 0:
        return None
    else:
        return fin_basic_data.iloc[0]
    
def get_ttm_fin_basic_data(code, date):
    """
    读取个股最新ttm主要财务指标数据
    Parameters:
    --------
    :param code: str
        个股代码，如SH600000或600000
    :param date: datetime-like or str
        日期，格式YYYY-MM-DD or YYYYMMDD
    :return: pd.Series
    --------
    0. ReportDate: 报告期
    1. MainOperateRevenue: 主营业务收入（万元）
    2. MainOperateProfit: 主营业务利润（万元）
    3. OperateProfit: 营业利润（万元）
    4. InvestIncome: 投资收益（万元）
    5. NonOperateNetIncome: 营业外收益净额（万元）
    6. TotalProfit: 利润总额（万元）
    7. NetProfit: 净利润（万元）
    8. DeductedNetProfit: 扣除非经常性损益后净利润（万元）
    9. NetOperateCashFlow: 经营活动现金流净额（万元）
    读取失败，返回None
    """
    code = code_to_symbol(code)
    date = to_date(date)
    if date.month in (5, 6, 7, 8):
        date1 = datetime.datetime(date.year, 3, 31)
        date2 = datetime.datetime(date.year-1, 12, 31)
        date3 = datetime.datetime(date.year-1, 3, 31)
    elif date.month in (9, 10):
        date1 = datetime.datetime(date.year, 6, 30)
        date2 = datetime.datetime(date.year-1, 12, 31)
        date3 = datetime.datetime(date.year-1, 6, 30)
    elif date.month in (11, 12):
        date1 = datetime.datetime(date.year, 9, 30)
        date2 = datetime.datetime(date.year-1, 12, 31)
        date3 = datetime.datetime(date.year-1, 9, 30)
    else:
        date1 = datetime.datetime(date.year-1, 9, 30)
        date2 = datetime.datetime(date.year-2, 12, 31)
        date3 = datetime.datetime(date.year-2, 9, 30)
    fin_basic_data1 = get_fin_basic_data(code, date1)
    if fin_basic_data1 is None:
        return None
    fin_basic_data2 = get_fin_basic_data(code, date2)
    if fin_basic_data2 is None:
        return None
    fin_basic_data3 = get_fin_basic_data(code, date3)
    if fin_basic_data3 is None:
        return None
    ttm_fin_basic_data = Series()
    ttm_fin_basic_data['ReportDate'] = date1
    ttm_fin_basic_data['MainOperateRevenue'] = fin_basic_data1['MainOperateRevenue'] + fin_basic_data2['MainOperateRevenue'] - fin_basic_data3['MainOperateRevenue']
    ttm_fin_basic_data['MainOperateProfit'] = fin_basic_data1['MainOperateProfit'] + fin_basic_data2['MainOperateProfit'] - fin_basic_data3['MainOperateProfit']
    ttm_fin_basic_data['OperateProfit'] = fin_basic_data1['OperateProfit'] + fin_basic_data2['OperateProfit'] - fin_basic_data3['OperateProfit']
    ttm_fin_basic_data['InvestIncome'] = fin_basic_data1['InvestIncome'] + fin_basic_data2['InvestIncome'] - fin_basic_data3['InvestIncome']
    ttm_fin_basic_data['NonOperateNetIncome'] = fin_basic_data1['NonOperateNetIncome'] + fin_basic_data2['NonOperateNetIncome'] - fin_basic_data3['NonOperateNetIncome']
    ttm_fin_basic_data['TotalProfit'] = fin_basic_data1['TotalProfit'] + fin_basic_data2['TotalProfit'] - fin_basic_data3['TotalProfit']
    ttm_fin_basic_data['NetProfit'] = fin_basic_data1['NetProfit'] + fin_basic_data2['NetProfit'] - fin_basic_data3['NetProfit']
    ttm_fin_basic_data['DeductedNetProfit'] = fin_basic_data1['DeductedNetProfit'] + fin_basic_data2['DeductedNetProfit'] - fin_basic_data3['DeductedNetProfit']
    ttm_fin_basic_data['NetOperateCashFlow'] = fin_basic_data1['NetOperateCashFlow'] + fin_basic_data2['NetOperateCashFlow'] - fin_basic_data3['NetOperateCashFlow']
    return ttm_fin_basic_data

def get_fin_summary_data(code, report_date):
    """
    读取个股最新的财务报表摘要数据
    Parameters:
    --------
    :param code: str
        证券代码, 如SH600000, 600000
    :param report_date: datetime-like, str
        报告期, 格式: YYYY-MM-DD or YYYYMMDD
    :return: pd.Series
    --------
        0. ReportDate
        1. OperatingIncome:营业收入(万元)
        2. OperatingCost:营业成本(万元)
        3. OperatingProfit:营业利润(万元)
        4. TotalProfit:利润总额(万元)
        5. IncomeTax:所得税费用(万元)
        6. NetProfit:净利润(万元)
        7. EarningsPerShare:基本每股收益
        8. Cash:货币资金(万元)
        9. AccountsReceivable:应收账款(万元)
        10. Inventories:存货(万元)
        11. TotalCurrentAssets:流动资产合计(万元)
        12. NetFixedAssets:固定资产净额(万元)
        13. TotalAssets:资产总计(万元)
        14. TotalCurrentLiabilities:流动负债合计(万元)
        15. TotalNonCurrentLiabilities:非流动负债合计(万元)
        16. TotalLiabilities:负债合计(万元)
        17. TotalShareholderEquity:所有者权益(或股东权益)合计(万元)
        18. InitialCashAndCashEquivalentsBalance:期初现金及现金等价物余额(万元)
        19. NetCashFlowsFromOperatingActivities:经营活动产生的现金流量净额(万元)
        20. NetCashFlowsFromInvestingActivities:投资活动产生的现金流量净额(万元)
        21. NetCashFlowsFromFinancingActivities:筹资活动产生的现金流量净额(万元)
        22. NetIncreaseInCashAndCashEquivalents:现金及现金等价物增加额(万元)
        23. FinalCashAndCashEquivalentsBalance:期末现金及现金等价物余额(万元)
        读取失败, 返回None
    """
    code = code_to_symbol(code)
    date = to_date(report_date)
    if not is_fin_report_date(date):
        return None
    fin_summary_data_path = os.path.join('/Volumes/DB/FactorDB/ElementaryFactor/fin_data/fin_data_cwbbzy', '%s.csv' % code)
    df_fin_summary_data = pd.read_csv(fin_summary_data_path, na_values='--', parse_dates=[0],
                                      names=FIN_SUMMARY_DATA_HEADER, header=0)
    fin_summary_data = df_fin_summary_data[df_fin_summary_data.ReportDate == date]
    if fin_summary_data.shape[0] == 0:
        return None
    else:
        return fin_summary_data.iloc[0]

Populating the interactive namespace from numpy and matplotlib


  from pandas.core import datetools


# Size(市值因子)
## 定义

　　定义：　　1.0 * LNCAP

　　LNCAP　　<span style="border-bottom:2px solid black">Natural log of market cap</span>

　　　　　　　Given by the logarithm of the total market capitalization of the firm.
## 计算示例

In [8]:
secu_code = 'SH600000'
calc_date = '2017-12-29'

# 取得个股非复权行情数据
quote_header = ['date', 'open', 'high', 'low', 'close', 'vol', 'amount', 'turnover1', 'turnover2']
quote_data_path = Path('/Volumes/DB/FactorDB/ElementaryFactor/mkt_daily_NoFQ/', '%s.csv' % secu_code)
df_secu_quote = pd.read_csv(quote_data_path,names=quote_header,header=0)
secu_close = df_secu_quote[df_secu_quote['date'] <= calc_date].iloc[-1]['close']
# 取得个股最新的股本结构数据
cap_struct_header = ['code', 'date', 'reason', 'total', 'liquid_a', 'liquid_b', 'liquid_h']
cap_struct_path = Path('/Volumes/DB/FactorDB/ElementaryFactor/cap_struct/' '%s.csv' % secu_code)
df_cap_struct = pd.read_csv(cap_struct_path, names=cap_struct_header, header=0)
cap_struct = df_cap_struct[df_cap_struct['date'] <= calc_date].iloc[-1]
total_cap = cap_struct.total - cap_struct.liquid_b - cap_struct.liquid_h
# 计算总市值的自然对数值
lncap = np.log(secu_close * total_cap)
print('{}{}{}{:,.6f}'.format('LNCAP of ', secu_code, ' = ', lncap))

LNCAP of SH600000 = 26.635532


# Beta(贝塔因子)

　　Definition:　　1.0 * BETA

　　BETA　　　　Beta($\beta$)

Computed as the slope coefficient in a time-series regression of excess stock return, $r_t-r_{ft}$, against the cap-weighted excess return of the estimation universe $R_t$,
$$r_t - r_{ft} = \alpha + \beta R_t+e_t\tag{1}$$
The regression coefficients are estimated over thr trailling 252 days of returns with a half-life of 63 trading days.

其中$r_ft$是无风险收益率日序列，$r_t$是股票收益率日序列，$R_t$是市值加权指数(如中证全指、万德全A指数)超额收益序列，回归系数采取过去252个交易日的收益数据，采用指数加权移动平均算法，半衰期为63个交易日（时间越接近权重越大）

按照普通最小二乘法，对于参数的估计为：
$$\beta=\frac{Cov(x,y)}{Var(x)}=\frac{\sum_{i=1}^n(x_i-\bar{x})(y_i-\bar{y})}{\sum_{i=1}^n(x_i-\bar{x})^2}$$

指数加权移动平均(Exponentially Weighted Moving Average, EWMA)，是BARRA model中常用的一种加权方式，按照时间远近呈指数衰减，按照指数加权移动平均，对于参数的估计为：
$$\beta=\frac{\sum_{i=0}^tw_i(x_i-\bar{x})(y_i-\bar{y})}{\sum_{i=0}^tw_i(x_i-\bar{x})^2}$$
其中，$t$是数据的时间长度减去1，这里为252，$x_t$是距离现金最近的数据，其权重为$w_0$。指数加权移动平均算法见[附录](#appendix)。
## 计算示例

In [40]:
secu_code = 'SH600036'
benchmark_code = 'SH000001'
calc_date = '2017-12-29'
days=252
# 取得个股复权行情数据
quote_header = ['code','date','open','high','low','close','vol','amount','turnover1','turnvoer2','factor']
quote_data_path = Path('/Volumes/DB/FactorDB/ElementaryFactor/mkt_daily_FQ/', '%s.csv' % secu_code)
df_secu_quote = pd.read_csv(quote_data_path,names=quote_header,header=0)
# 使用过去252个交易日的复权行情数据
df_secu_quote = df_secu_quote[df_secu_quote.date <= calc_date].tail(days+1)
df_secu_quote.reset_index(drop=True, inplace=True)
# 计算个股的日收益率序列
arr_close = np.array(df_secu_quote.iloc[1:]['close'])
arr_pre_close = np.array(df_secu_quote.shift(1).iloc[1:]['close'])
arr_secu_daily_ret = arr_close / arr_pre_close - 1.

# 取得基准复权行情数据
quote_data_path = Path('/Volumes/DB/FactorDB/ElementaryFactor/mkt_daily_FQ/', '%s.csv' % benchmark_code)
df_benchmark_quote = pd.read_csv(quote_data_path, names=quote_header,header=0)
df_benchmark_quote = df_benchmark_quote[df_benchmark_quote['date'].isin(list(df_secu_quote.date))]
df_benchmark_quote.reset_index(drop=True, inplace=True)
# 计算基准的日收益率序列
arr_close = np.array(df_benchmark_quote.iloc[1:]['close'])
arr_pre_close = np.array(df_benchmark_quote.shift(1).iloc[1:]['close'])
arr_benchmark_daily_ret = arr_close / arr_pre_close - 1.

# 计算权重 - 指数移动加权平均
T = len(arr_benchmark_dail_ret)
time_spans = sorted(range(T), reverse=True)
alpha = 1 - np.exp(np.log(0.5)/63)
x = [1-alpha] * T
y = [alpha] * (T - 1)
y.insert(0,1)
weights = np.float_power(x, time_spans) * y

# 采用加权最小二乘法计算beta
arr_benchmark_daily_ret = sm.add_constant(arr_benchmark_daily_ret)
cap_model = sm.WLS(arr_secu_daily_ret, arr_benchmark_daily_ret, weights=weights)
results = cap_model.fit()
results.params

array([0.00108207, 1.02427672])

# Momentum(动量因子)
## 定义
    Definintion:    1.0*RSTR

    RSTR            Relative strength
    
Computed as the sum of excess log returns over the trailing T = 504 trading days with a lag of L=21 trading days,
$$RSTR = \sum_{i=L}^{T+L}w_t[ln(1+r_t)-ln(r+r_{ft}],\tag{2}$$

where $r_t$ is the stock return on day t, $r_{ft}$ is the risk-free return, and $w_t$ is an exponential weight with a half-life of 126 trading days.
## 计算示例 

In [3]:
secu_code = 'SH600000'
calc_date = '2017-12-29'
days_start = 504
days_end = 21
half_life = 126

# 取得个股复权行情数据
quote_header = ['code','date','open','high','low','close','vol','amount','turnover1','turnvoer2','factor']
quote_data_path = Path('/Volumes/DB/FactorDB/ElementaryFactor/mkt_daily_FQ/', '%s.csv' % secu_code)
df_secu_quote = pd.read_csv(quote_data_path,names=quote_header,header=0)
# 使用过去第21个交易日到第504个交易日的复权行情数据
df_secu_quote = df_secu_quote[df_secu_quote.date <= calc_date].tail(days_start+1)
df_secu_quote = df_secu_quote.head(len(df_secu_quote)-days_end)
df_secu_quote.reset_index(drop=True, inplace=True)
# 计算个股的日对数收益率序列
arr_close = np.array(df_secu_quote.iloc[1:]['close'])
arr_pre_close = np.array(df_secu_quote.shift(1).iloc[1:]['close'])
arr_secu_daily_ret = np.log(arr_close / arr_pre_close)
# 计算权重 - 指数移动加权平均
T = len(arr_secu_daily_ret)
time_spans = sorted(range(T), reverse=True)
alpha = 1 - np.exp(np.log(0.5)/half_life)
x = [1-alpha] * T
y = [alpha] * (T - 1)
y.insert(0,1)
weights = np.float_power(x, time_spans) * y
# 计算RSTR
rstr = np.sum(arr_secu_daily_ret * weights)
rstr

-0.0004597495866620215

# Residual Volatility(残差波动率因子)
    Definition:    0.74*DASTD + 0.16*CMRA + 0.10*HSIGMA
## DASTD(Daily standard deviation)
### 定义
Computed as that the volatility of daily excess return over the past 252 trading days with a half-life of 42 trading days.

是过去252个交易日日超额收益率波动率，按照指数加权权重加权平均，半衰期为42个交易日。
$$DASTD = \sqrt{\sum_{t=1}^Tw_t(r_t-u(r))^2}\tag{3}$$
### 计算示例

In [4]:
secu_code = 'SH600000'
calc_date = '2017-12-29'
trailing = 252
half_life = 42

# 取得个股复权行情数据
quote_header = ['code','date','open','high','low','close','vol','amount','turnover1','turnvoer2','factor']
quote_data_path = Path('/Volumes/DB/FactorDB/ElementaryFactor/mkt_daily_FQ/', '%s.csv' % secu_code)
df_secu_quote = pd.read_csv(quote_data_path,names=quote_header,header=0)
# 使用过去第252个交易日的复权行情数据
df_secu_quote = df_secu_quote[df_secu_quote.date <= calc_date].tail(trailing+1)
df_secu_quote.reset_index(drop=True, inplace=True)
# 计算个股的日对数收益率序列及收益率均值
arr_close = np.array(df_secu_quote.iloc[1:]['close'])
arr_pre_close = np.array(df_secu_quote.shift(1).iloc[1:]['close'])
arr_secu_daily_ret = np.log(arr_close / arr_pre_close)
avg_daily_ret = np.mean(arr_secu_daily_ret)
# 计算权重 - 指数移动加权平均
T = len(arr_secu_daily_ret)
time_spans = sorted(range(T), reverse=True)
alpha = 1 - np.exp(np.log(0.5)/half_life)
x = [1-alpha] * T
y = [alpha] * (T - 1)
y.insert(0,1)
weights = np.float_power(x, time_spans) * y
# 计算DASTD
dastr = np.sqrt(np.sum((arr_secu_daily_ret - avg_daily_ret)**2 * weights))
print('DASTR factor of %s = %f' % (secu_code, dastr))

DASTR factor of SH600000 = 0.009623


## CMRA(Cumulative range)
### 定义
This descriptor differentiates stocks that have experienced wide swing ove the last 12 months from those that have traded within a narrow range. Let $Z(T)$ be the cumulative excess log return over the past $T$ months, with each month defined as the previous 21 trading days

$$Z(T) = \sum_{\tau=1}^T[ln(1+r_\tau) - ln(1+r_{f\tau})],\tag{4}$$

where $r_\tau$ is the stock return for month $\tau$(compounded over 21 days), and $r_{f\tau}$ is the risk-free return. The cumulative range is given by

$$CMRA = ln(1+Z_{max}) - ln(1+Z_{min}),\tag{5}$$

where $Z_{max}=max\{Z(T)\},Z_{min}=min\{Z(T)\}$, and $T=1,\dots,12$.

CMRA是过去12个月超额收益的离差，也是表征股票收益率的波动大小，$Z(T)$是过去$T$个月超额收益对数值的累计值，$Z(T)$是一个时间序列，$T=1,2,3,\dots,12$。

### 计算示例

In [38]:
benchmark_code = 'SH000001'
secu_code = 'SH600000'
calc_date = '2017-12-29'
trailing = 12
days_scale = 21

# 取得个股复权行情数据
# quote_header = ['code','date','open','high','low','close','vol','amount','turnover1','turnvoer2','factor']
# quote_data_path = Path('/Volumes/DB/FactorDB/ElementaryFactor/mkt_daily_FQ/', '%s.csv' % secu_code)
# df_secu_quote = pd.read_csv(quote_data_path,names=quote_header,header=0)
# 使用过去第252个交易日的复权行情数据
# df_secu_quote = df_secu_quote[df_secu_quote.date <= calc_date].tail(trailing*days_scale+1)
# df_secu_quote.reset_index(drop=True, inplace=True)
# print(df_secu_quote)
# 计算个股的日对数收益率序列
# arr_close = np.array(df_secu_quote.iloc[1:]['close'])
# arr_pre_close = np.array(df_secu_quote.shift(1).iloc[1:]['close'])
# arr_secu_daily_ret = np.log(arr_close / arr_pre_close)
# print(arr_secu_daily_ret)
# 从期初开始每隔days_scale个交易日累加日对数收益率
# z = []
# for t in range(1, trailing+1):
#     k = t * days_scale -1
#     if k > len(arr_secu_daily_ret)-1:
#         k = len(arr_secu_daily_ret)-1
#         z.append(np.sum(arr_secu_daily_ret[:k]))
#         break
#     else:
#         z.append(np.sum(arr_secu_daily_ret[:k]))
        

# 计算每个月的个股变化率(1+r)
# z = []
# for t in range(1, trailing+1):
#     k = t * days_scale
#     if k > len(df_secu_quote)-1:
#         k = len(df_secu_quote)-1
#         z.append(df_secu_quote.iloc[k]['close']/df_secu_quote.iloc[0]['close'])
#         break
#     else:
#         z.append(df_secu_quote.iloc[k]['close']/df_secu_quote.iloc[0]['close'])

        
# 取得基准行情, 用于提取交易日信息
quote_header = ['code','date','open','high','low','close','vol','amount','turnover1','turnvoer2','factor']
quote_data_path = Path('/Volumes/DB/FactorDB/ElementaryFactor/mkt_daily_FQ/', '%s.csv' % benchmark_code)
df_benchmark_quote = pd.read_csv(quote_data_path,names=quote_header,header=0)
# 提取交易日序列
df_benchmark_quote = df_benchmark_quote[df_benchmark_quote.date <= calc_date].tail(trailing*days_scale+1)
trading_days = df_benchmark_quote['date'].values.tolist()
# 取得个股复权行情
quote_data_path = Path('/Volumes/DB/FactorDB/ElementaryFactor/mkt_daily_FQ/', '%s.csv' % secu_code)
df_secu_quote = pd.read_csv(quote_data_path,names=quote_header,header=0)
# 提取相应交易日的个股复权行情数据
df_secu_quote = df_secu_quote[df_secu_quote['date'].isin(trading_days)]
df_secu_quote.reset_index(drop=True, inplace=True)
# 计算每个月的个股价格变化率
z = []
if len(df_secu_quote) < 126:
    # 如果提取的个股行情长度小于126个交易日, 返回None
    cmra = np.nan
else:
    print('Init_day of secu:%s, init_day of benchmark:%s, init close:%f' % (df_secu_quote.iloc[0]['date'], trading_days[0], df_secu_quote.iloc[0]['close']))
    prev_trading_day = df_secu_quote.iloc[0]['date']
    for t in range(1, trailing+1):
        k = t * days_scale
        trading_day = trading_days[k]
        try:
            secu_trading_day = df_secu_quote[df_secu_quote['date'] <= trading_day].iloc[-1]['date']
            if secu_trading_day <= prev_trading_day:
                print('month #%d, benchmark_day:%s, secu_day:%s,z=None,' % (t, trading_day, secu_trading_day))
                continue
            else:
                z.append(df_secu_quote[df_secu_quote['date']==secu_trading_day].iloc[0]['close']/df_secu_quote.iloc[0]['close'])
#                 prev_trading_day = df_secu_quote[df_secu_quote['date'] <= trading_day].iloc[-1]['date']
                prev_trading_day = secu_trading_day
                print('month #%d, benchmark_day:%s, secu_day:%s, z=%f' % (t, trading_day, secu_trading_day, z[-1]))
        except Exception as e:
            print(e)
#         prev_tra
    cmra = np.log(max(z)) - np.log(min(z))
print(z)
print('CMRA factor of %s = %f' % (secu_code, cmra))

Init_day of secu:2016-12-20, init_day of benchmark:2016-12-20, init close:150.932900
month #1, benchmark_day:2017-01-19, secu_day:2017-01-19, z=1.015971
month #2, benchmark_day:2017-02-24, secu_day:2017-02-24, z=1.026413
month #3, benchmark_day:2017-03-27, secu_day:2017-03-27, z=0.985258
month #4, benchmark_day:2017-04-27, secu_day:2017-04-27, z=0.934275
month #5, benchmark_day:2017-05-31, secu_day:2017-05-31, z=1.038736
month #6, benchmark_day:2017-06-29, secu_day:2017-06-29, z=1.032265
month #7, benchmark_day:2017-07-28, secu_day:2017-07-28, z=1.089703
month #8, benchmark_day:2017-08-28, secu_day:2017-08-28, z=1.042781
month #9, benchmark_day:2017-09-26, secu_day:2017-09-26, z=1.040354
month #10, benchmark_day:2017-11-01, secu_day:2017-11-01, z=1.014467
month #11, benchmark_day:2017-11-30, secu_day:2017-11-30, z=1.044399
month #12, benchmark_day:2017-12-29, secu_day:2017-12-29, z=1.018512
[1.0159706730606781, 1.0264130616982778, 0.9852583499025064, 0.9342754296776913, 1.0387364186337

# Non-linear Size(非线性市值因子)
## 定义
　　　　Definition:　　1.0*NLSIZE

　　　　NLSIZE　　　　　Cube of Size

First, the standardized Size exposure(i.e, log of market cap) is cubed. The resulting factor is then orthogonalized with respect to the Size factor on a regression-weighted basis. Finally, the factor is winsorized and standardized.

NLSIZE为SIZE因子的立方，之后将结果和SIZE回归取残差去除其和SIZE因子的共线性，残差值再进行缩尾处理(winsorized)和标准化(standardized)。缩尾处理即极值处理。
## 计算示例

In [19]:
calc_date = '2017-12-29'

# 取得SIZE因子载荷
lncap_data_path = '/Volumes/DB/FactorDB/RiskFactor/LNCAP/LNCAP_{}.csv'.format(calc_date.replace('-',''))
df_lncap = pd.read_csv(lncap_data_path, header=0)
# Size因子数组
arr_size = np.array(df_lncap['factorvalue'])
# Size因子三次方数组
arr_size_cube = arr_size ** 3
# 相对Size因子正交化
model = sm.OLS(arr_size_cube, arr_size)
result = model.fit()
# 对残差值进行缩尾处理和标准化
n = len(result.resid)
arr_resid = result.resid.reshape(n,1)
arr_res_winsorized = clean_extreme_value(arr_resid)
arr_res_standardized = normalize_data(arr_res_winsorized)
df_nlsize = pd.DataFrame(dict({'date':df_lncap['date'].values, 'id':df_lncap['id'].values, 'factorvalue':arr_res_standardized.reshape(n,)}))
df_nlsize.head()

Unnamed: 0,date,factorvalue,id
0,2018-01-02,2.795924,SZ000002
1,2018-01-02,0.724944,SZ000009
2,2018-01-02,2.795924,SZ000001
3,2018-01-02,1.250179,SZ000027
4,2018-01-02,1.482972,SZ000039


# Value(估值因子)
## 定义
　　Definition:　　1.0*BTOP

　　BTOP　　　　Book-to-price ratio

Last reported book value of common equity divided by current market capitalization

上个季报公司普通股权账面价值(即净资产)除以公司当前的市值。

## 计算示例

In [32]:
secu_code = 'SH600000'
calc_date = '2017-12-29'
report_date = get_fin_report_date(calc_date)

# 读取最新的个股净资产数据
fin_basic_data = get_fin_basic_data(secu_code, report_date)
# 读取个股的市值因子载荷(LNCAP)
lncap_path = '/Volumes/DB/FactorDB/RiskFactor/LNCAP/LNCAP_{}.csv'.format(calc_date.replace('-', ''))
df_lncap = pd.read_csv(lncap_path, header=0)
lncap = df_lncap[df_lncap['id'] == secu_code].iloc[0]['factorvalue']
# 账面市值比 = 净资产/市值
btop = (fin_basic_data['TotalAsset'] - fin_basic_data['TotalLiability']) * 10000 / np.exp(lncap)
print('净资产 = %.2f' % (fin_basic_data['TotalAsset'] - fin_basic_data['TotalLiability']))
print('市值 = %.2f' % np.exp(lncap))
print('BTOP of %s = %.4f' % (secu_code, btop))

净资产 = 42270000.00
市值 = 369542692236.00
BTOP of SH600000 = 1.1438


# Liquidity(流动性因子)

$Definition: 0.35 * STOM + 0.35 * TOQ + 0.3 * STOA$

## 定义
### STOM(share turnover, one month)
Computed as the log of the sum of daily turnover during the previous 21 trading days,

$$STOM=\ln\left(\sum^{21}_{t=1}\frac{V_t}{S_t}\right),$$

where $V_t$ is the tradiing volume on day $t$, and $S_t$ is the number of shares outstanding.

即最近一个月的换手率和的对数值。
### STOQ(Average share turnover, trailing 3 months)
Let $STOM_\tau$ be the share turnover for month $\tau$, with each month consisting of 21 trading days. The quarterly share turnover is defined by

$$STOQ=\ln\left[\frac{1}{T}\sum^T_{\tau=1}\exp(STOM_\tau)\right],$$

where $T=3$ months.

即最近三个月stom的均值。
### STOA(Average share turnover, trailing 12 months)
Let $STOM_{\tau}$ be the share turnover for month $\tau$, with each month consisting 21 trading days. The annual share turnover is defined by

$$STOA=\ln\left[\frac{1}{T}\sum^T_{\tau=1}\exp(STOM_\tau)\right],$$

where $T=12$ months.

即最近12个月stom的均值。

The Liquidity factor is orthogonalized with respect to Size to reduce colinearity.

流动性因子需要和Size因子进行正交化，消除共线性。

## 计算示例

In [44]:
secu_code = 'SZ000554'
calc_date = '2017-12-29'

# 取得个股过去252个交易日的日行情数据（非复权）
mkt_data_path = Path('/Volumes/DB/FactorDB/ElementaryFactor/mkt_daily_NoFQ/', '{code}.CSV'.format(code=secu_code))
mkt_data_header = ['date', 'open', 'high', 'low', 'close', 'vol', 'amount', 'turnover1', 'turnover2']
df_mkt_data = pd.read_csv(mkt_data_path, names=mkt_data_header, header=0)
df_mkt_data = df_mkt_data[df_mkt_data.date <= calc_date].tail(252)
# STOM
stom = math.log(df_mkt_data.iloc[-21:]['turnover1'].sum())
# STOQ
stoq = math.log(df_mkt_data.iloc[-63:]['turnover1'].sum()/3.0)
# STOA
stoa = math.log(df_mkt_data['turnover1'].sum()/12.0)
# LIQUIDITY = 0.35*STOM + 0.35*STOQ + 0.3*STOA
raw_liquidity = 0.35*stom + 0.35*stoq + 0.3*stoa
# output
print("stom = {stom}, stoq = {stoq}, stoa = {stoa}, liquidity={liquidity}".format(stom=stom, stoq=stoq, stoa=stoa, liquidity=raw_liquidity))

stom = -1.0550804969668242, stoq = -0.681486596851669, stoa = 0.0048301866770218095, liquidity=-0.606349426833366


# EarningsYield(盈利预期因子)

$Definition: 0.68 * EPFWD + 0.21 * CETOP + 0.11 * ETOP$

## EPFWD(Predicted earnings-to-price ratio, 预期盈利市值比)
### 定义
Given by the 12-month forward-looking earnings divided by the current market capitalization. Forward-looking earnings are defined as a weighted average between the average analyst-predicted earnings for the current and next fiscal years.
### 计算示例

In [9]:
secu_code = 'SH600000'
calc_date = '2017-12-29'

# 读取个股的预期盈利
consensus_data_path = '/Volumes/DB/FactorDB/ElementaryFactor/consensus_data/predicted_earnings/predictedearnings_20171229.csv'
df_predictedearnings = pd.read_csv(consensus_data_path, header=0)
secu_predictedearnings = df_predictedearnings[df_predictedearnings['code']==secu_code].iloc[0]['predicted_earnings']
# 读取个股市值
lncap_path = '/Volumes/DB/FactorDB/RiskFactor/Size/LNCAP/LNCAP_20171229.csv'
df_lncap_data = pd.read_csv(lncap_path, header=0)
secu_lncap = df_lncap_data[df_lncap_data['id']==secu_code].iloc[0]['factorvalue']
secu_cap = np.exp(secu_lncap)
# epfwd = 预期盈利/市值
epfwd = secu_predictedearnings * 10000 / secu_cap
print('Predicted Earnings(M$) = {:,.4f}, Market Capitalization($) = {:,.2f}, EPFWD = {:.4f}'\
      .format(secu_predictedearnings, secu_cap, epfwd))
# print('EPFWD of %s is %.4f..' % (secu_code, epfwd))

Predicted Earnings(M$) = 6,013,524.5900, Market Capitalization($) = 369,542,692,236.00, EPFWD = 0.1627


## CETOP(Cash earnings-to-price ratio, 现金流量市值比)
### 定义
Given by the trailing 12-month cash earning divide by current price. 现金流量是过去12个月经营活动产生的净额。
### 计算示例

In [7]:
secu_code = 'SH600151'
calc_date = '2017-12-29'

# 读取个股的主要财务指标数据ttm值
ttm_fin_data = get_ttm_fin_basic_data(secu_code, calc_date)
print(np.isnan(ttm_fin_data['NetOperateCashFlow']))
ttm_cash = ttm_fin_data['NetOperateCashFlow']
# 读取个股市值
lncap_path = '/Volumes/DB/FactorDB/RiskFactor/Size/LNCAP/LNCAP_20171229.csv'
df_lncap_data = pd.read_csv(lncap_path, header=0)
secu_lncap = df_lncap_data[df_lncap_data['id']==secu_code].iloc[0]['factorvalue']
secu_cap = np.exp(secu_lncap)
# cetop = 经营活动现金流ttm值/市值
cetop = ttm_cash * 10000 / secu_cap
print('ttm_cash(M$) = {:,.2f}, Market Capitalization($) = {:,.0f}, CETOP = {:.4f}'.format(ttm_cash, secu_cap, cetop))

True
ttm_cash(M$) = nan, Market Capitalization($) = 10,885,974,881, CETOP = nan


## ETOP(Trailing earnings-to-price ratio, 盈利市值比)
### 定义
Given by the trailing 12-month earnings divided by the current market capitalization. Trailing earnings are defined as the last reported fiscal-year earnings plus the difference between current interim figure and the comparative interim figure from the previous year. 即盈利市值比, 盈利是过去12个月的ttm数据。
### 计算示例

In [3]:
secu_code = 'SH600151'
calc_date = '2017-12-29'

# 读取个股的ttm净利润
ttm_fin_data = get_ttm_fin_basic_data(secu_code, calc_date)
ttm_netprofit = ttm_fin_data['NetProfit']
# 读取个股市值
lncap_path = '/Volumes/DB/FactorDB/RiskFactor/Size/LNCAP/LNCAP_20171229.csv'
df_lncap_data = pd.read_csv(lncap_path, header=0)
secu_lncap = df_lncap_data[df_lncap_data['id']==secu_code].iloc[0]['factorvalue']
secu_cap = np.exp(secu_lncap)
# etop = ttm净利润/市值
etop = ttm_netprofit * 10000 / secu_cap
print('ttm_netprofit(M$) = {:,.2f}, Market Capitalization($) = {:,.0f}, ETOP = {:.4f}'\
      .format(ttm_netprofit, secu_cap, etop))

ttm_netprofit(M$) = -13,701.00, Market Capitalization($) = 10,885,974,881, ETOP = -0.0126


# Leverage(杠杆因子)
$Definition: 0.38*MLEV + 0.35*DTOA + 0.27*BLEV$
## MLEV(Market leverage, 市场杠杆)
### 定义
Computed as

$$MLEV=\frac{ME+PE+LD}{ME},$$

where ME is the market value of common equity on the last trading day, PE is the most recent book value of preferred equity, and LD is the most recent book value of long-term debt. 即ME是普通股市值, PE是优先股账面价值, LD是长期负债账面价值。
### 计算示例

In [6]:
secu_code = 'SH600000'
calc_date = '2017-12-29'
report_date = get_fin_report_date(calc_date)

# 读取最新财务数据
fin_summary_data = get_fin_summary_data(secu_code, report_date)
ld_data = fin_summary_data['TotalNonCurrentLiabilities']
if np.isnan(ld_data):
    ld_data = fin_summary_data['TotalLiabilities']
ld_data *= 10000
pe_data = 0
# 读取个股市值
lncap_path = '/Volumes/DB/FactorDB/RiskFactor/Size/LNCAP/LNCAP_%s.csv' % calc_date.replace('-', '')
df_lncap_data = pd.read_csv(lncap_path, header=0)
secu_lncap = df_lncap_data[df_lncap_data['id']==secu_code].iloc[0]['factorvalue']
me_data = np.exp(secu_lncap)
# mlev = (me+pe+ld)/me
mlev = (me_data + pe_data + ld_data) / me_data
print('me = {:,.0f}, pe = {:,.0f}, ld = {:,.0f}, mlev = {:.4f}'.format(me_data, pe_data, ld_data, mlev))

me = 369,542,692,236, pe = 0, ld = 5,641,137,000,000, mlev = 16.2652


## DTOA(Debe-to-assets, 资产负债比)
### 定义
computed as 

$$DTOA = \frac{TD}{TA}$$

where TD is the book value of total debt(long-term debt and current liabilitied), and TA is most recent book value of total assets. TD是总负债账面价值, TA是总资产账面价值。
### 计算示例

In [12]:
secu_code = 'SH600519'
calc_date = '2017-12-29'
report_date = get_fin_report_date(calc_date)

# 读取最新财务数据
fin_basic_data = get_fin_basic_data(secu_code, report_date)
td = fin_basic_data['TotalLiability']
ta = fin_basic_data['TotalAsset']
# dtoa = td / ta
dtoa = td / ta
print('td(M$) = {:,.4f}, ta(M$) = {:,.4f}, dtoa = {:.4f}'.format(td, ta, dtoa))

td(M$) = 3,939,919.0000, ta(M$) = 12,778,004.0000, dtoa = 0.3083


## BLEV(Book leverage, 账面杠杆)
### 定义
computed as

$$BLEV = \frac{BE + PE + LD}{BE}$$

where BE is the most recent book value of common equity, PE is the most recent book value of prferred equity, and LD id the most recent book value of long-term debt. BE是普通股账面价值, PE是优先股账面价值, LD是长期负债账面价值。
### 计算示例

In [17]:
secu_code = 'SH603996'
calc_date = '2017-12-29'
report_date = get_fin_report_date(calc_date)

# 读取个股最新财务报表摘要数据
fin_summary_data = get_fin_summary_data(secu_code, report_date)
be = fin_summary_data['TotalShareholderEquity']
ld = fin_summary_data['TotalNonCurrentLiabilities']
if np.isnan(ld):
    ld = fin_summary_data['TotalLiabilities']
pe = 0
# blev = (be + pe + ld) / be
blev = (be + pe + ld) / be
print('be(M$) = {:,.4f}, pe = {:,.4f}, ld = {:,.4f}, blev = {:.4f}'.format(be, pe, ld, blev))

be(M$) = 142,033.0000, pe = 0.0000, ld = 0.0000, blev = 1.0000


# <a id='appendix'>附录：指数加权移动平均</a>
指数加权移动平均的权重有两种表现形式，一种是递推公式，如下：

$y_0 = x_0$

$y_t = (1-\alpha)y_{t-1} + \alpha x_t$

根据递推公式，可以得到

$y_t = (1-\alpha)^tx_0 + (1-\alpha)^{t-1}\alpha x_1 + \ldots + (1-\alpha)\alpha x_{t-1} + \alpha x_t$

权重可以写为，

$
\left\{
\begin{array}{ll}
w_i = (1-\alpha)^i\alpha & (i < t)\\
w_i = (1-\alpha)^i       & (i = t)
\end{array}
\right.
$

可以得到$\sum_{i=0}^t = 1$.注意$i$表示距离现在时间，$i$越大，距离现在时间越长。

在python中可以直接调用pandas.ewma()或者pandas.ewm().mean()实现。注意的是，默认参数为adjust=True，采用近似的权重，权重为$w_i=(1-\alpha)^i$，当参数adjust=False，才会采用以上精确推算结果。

半衰期$h$和参数$\alpha$的关系为$\alpha = 1 - e^{\frac{ln(0.5)}{h}}$，这是因为最大权重$w_0 = \alpha$，经过了$h$天之后权重为$w_h = (1-\alpha)^h\alpha$，两者之比$\frac{w_h}{w_0}=(1-\alpha)^h=0.5$，可以得出$\alpha = 1 - e^{\frac{ln(0.5)}{h}}$，也就是说经过了$h$天后，权重变成了初始权重的一半。