### 1. 回归训练区间
2014-01-01  ~  2016-12-31

### 2. 回归股票选择
- 股票选择（HS300指数）
- 如果是以A股选择回归，那么回测就以A股去进行选股

### 3. 回归因子数据准备、收益率计算
- 因子数据：横截面数据拼接，添加日期数据、去除空值
- 收益率计算: 所有样本的收益率计算、取出价格为空值的计算
- 这个回归就是每个月用股票的回报率对股票上一期末的因子值做一个横截 面的回归。 

### 4. 目标值特征值提取进行回归估计
- 数据处理：去除收益为0(价格数据不存在)、去极值、标准化处理、市值中性化处理

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

## 1. 准备因子对应的日期数据
- 每月最后一天的日期

In [2]:
dates  = get_trading_dates(start_date="2014-01-01", end_date="2016-01-01")

# 再将日期数据转化成每月月末的数据. 每月最后一个交易日, 按月计算收益率
month_date = []
for i in range(len(dates) -1):
    if dates[i].year != dates[i+1].year:
      month_date.append(dates[i])
    elif dates[i].month != dates[i+1].month:
      month_date.append(dates[i])

#把最后一个交易日加入
month_date.append(dates[-1])

# 每月日期最后一天的列表获取
month_date

[datetime.date(2014, 1, 30),
 datetime.date(2014, 2, 28),
 datetime.date(2014, 3, 31),
 datetime.date(2014, 4, 30),
 datetime.date(2014, 5, 30),
 datetime.date(2014, 6, 30),
 datetime.date(2014, 7, 31),
 datetime.date(2014, 8, 29),
 datetime.date(2014, 9, 30),
 datetime.date(2014, 10, 31),
 datetime.date(2014, 11, 28),
 datetime.date(2014, 12, 31),
 datetime.date(2015, 1, 30),
 datetime.date(2015, 2, 27),
 datetime.date(2015, 3, 31),
 datetime.date(2015, 4, 30),
 datetime.date(2015, 5, 29),
 datetime.date(2015, 6, 30),
 datetime.date(2015, 7, 31),
 datetime.date(2015, 8, 31),
 datetime.date(2015, 9, 30),
 datetime.date(2015, 10, 30),
 datetime.date(2015, 11, 30),
 datetime.date(2015, 12, 31)]

## 2. 准备因子数据（特征值），根据训练每月最后交易日日期列表

In [21]:

# 特征值都是该月的因子数据（为了避免下个月在日期列表中不存在），

# 因子的数据处理：直接删除缺失值

# 获取HS300的股票列表：
stocks = index_components("000300.XSHG")
all_data = pd.DataFrame()

# month_date当中的datetime.date(2015,12,31)号因子数据不用取
for date in month_date[:-1]:
    # 查询因子数据
    q = query(fundamentals.eod_derivative_indicator.pe_ratio, 
              fundamentals.eod_derivative_indicator.pb_ratio, 
              fundamentals.eod_derivative_indicator.market_cap, 
              fundamentals.financial_indicator.ev, 
              fundamentals.financial_indicator.return_on_asset_net_profit, 
              fundamentals.financial_indicator.du_return_on_equity, 
              fundamentals.financial_indicator.earnings_per_share, 
              fundamentals.income_statement.revenue, 
              fundamentals.income_statement.total_expense).filter(fundamentals.stockcode.in_(stocks))
    
    # 获取因子数据, 由于是panel结构，所以要取出横截面结构[:,0,:]
    fund = get_fundamentals(q,entry_date=date)[:,0,:]
    
    # 拼接date，为(.ix)获取下一个月的股票收益率
    fund["date"] = date
    
    # 进行每月因子数据拼接
    all_data = pd.concat([all_data, fund])
    
# 删除缺失值（因为回归里面不允许有缺失值）
all_data = all_data.dropna()

# 在每支股票中，建立【每个月的因子值】对应的【下个月的收益率】
all_data["next_month_return"] = np.nan    # 相当于初始化收益率

In [22]:
all_data

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense,date,next_month_return
000001.XSHE,7.126,0.9684,1.08537e+11,1.85809e+12,0.6756,13.0164,1.43,3.7345e+10,2.2038e+10,2014-01-30,
000002.XSHE,5.3769,1.0571,8.12905e+10,1.91646e+11,1.7804,9.3272,0.56,6.34153e+10,5.396e+10,2014-01-30,
000060.XSHE,29.0435,1.9053,1.13668e+10,1.91828e+10,0.9316,3.068,0.08,9.75875e+09,9.64269e+09,2014-01-30,
000063.XSHE,33.5992,2.0244,4.56162e+10,9.63637e+10,0.5568,2.561,0.16,5.45575e+10,5.70062e+10,2014-01-30,
000069.XSHE,8.0826,1.4932,3.56303e+10,6.55072e+10,4.0417,13.7533,0.399,1.72365e+10,1.32602e+10,2014-01-30,
000100.XSHE,10.1129,1.5054,2.13287e+10,5.31028e+10,2.1248,10.2132,0.149,6.12241e+10,6.05763e+10,2014-01-30,
000157.XSHE,10.1368,0.935,3.89151e+10,7.0563e+10,4.387,9.2131,0.49,2.88971e+10,2.42468e+10,2014-01-30,
000333.XSHE,14.8607,2.4057,7.90211e+10,9.53968e+10,7.2895,17.3695,3.73,9.39206e+10,8.7225e+10,2014-01-30,
000338.XSHE,9.7984,1.262,3.49879e+10,5.65886e+10,4.0573,10.4072,1.35,4.33327e+10,4.00661e+10,2014-01-30,
000402.XSHE,5.1507,0.6684,1.48932e+10,4.74655e+10,2.1432,6.6307,0.44,1.01632e+10,8.23405e+09,2014-01-30,


## 3. 获取价格数据计算对应的收益率

In [4]:
all_price = pd.DataFrame()


# 获取每月最后一个交易日的价格数据
for date in month_date:
    
    price = get_price(stocks, start_date=date, end_date=date,fields="close")
    
    # 将每个price数组填充到DataFrame里面
    all_price = pd.concat([all_price, price],axis=0)    # 加不加axis结果都一样

    
# 转置方便后续计算处理
all_price = all_price.T


# 按行（股票样本）去除空值
all_price = all_price.dropna()


# 计算下月收益率：
# 循环all_price的列索引，不能循环到最后一个，所以-1
for i in range(len(all_price.columns)-1):
    
    # 下月收益率 = （下月收盘价 - 当月收盘价）/ 当月收盘价 = 下月收盘价 / 当月收盘价 - 1
    # 获取【当前列数据】和【后一列数据】，将计算出来的【下月收益率】替换掉【当前列数据】
    all_price.iloc[:,i] = all_price.iloc[:,i+1] / all_price.iloc[:,i] - 1

# all_price是【下个月的收益率】，不是本月的收益率
# 2014-01-30 00:00:00的收益率 是 2014-02-28 00:00:00的收益率

In [25]:
all_price

Unnamed: 0,2014-01-30 00:00:00,2014-02-28 00:00:00,2014-03-31 00:00:00,2014-04-30 00:00:00,2014-05-30 00:00:00,2014-06-30 00:00:00,2014-07-31 00:00:00,2014-08-29 00:00:00,2014-09-30 00:00:00,2014-10-31 00:00:00,...,2015-03-31 00:00:00,2015-04-30 00:00:00,2015-05-29 00:00:00,2015-06-30 00:00:00,2015-07-31 00:00:00,2015-08-31 00:00:00,2015-09-30 00:00:00,2015-10-30 00:00:00,2015-11-30 00:00:00,2015-12-31 00:00:00
002460.XSHE,0.296963,0.012888,0.000000,0.000000,0.090917,0.255158,-0.072004,-0.053634,-0.057181,0.097139,...,0.019841,0.207851,-0.185907,-0.143383,-0.013508,0.231291,0.352666,0.391955,0.327353,20.7659
601919.XSHG,0.038217,0.104294,-0.147222,-0.022801,-0.003333,0.103679,0.012121,0.215569,0.012315,0.299270,...,1.056452,-0.125490,-0.067265,-0.189103,0.149209,0.000000,0.000000,0.000000,-0.224420,9.0200
601727.XSHG,0.059517,0.045453,-0.053732,-0.013491,0.052039,0.049491,0.003426,0.365246,0.075628,0.087483,...,0.665744,0.191865,-0.311346,-0.109491,-0.175227,0.028388,0.000000,0.000000,0.027605,11.5400
002065.XSHE,0.166117,-0.116848,-0.068390,0.065955,0.009030,-0.148116,0.000000,0.000000,0.000000,0.033846,...,0.155984,0.202407,-0.327907,-0.128700,-0.262274,-0.050861,0.262821,-0.013543,0.148741,12.2334
000898.XSHE,-0.044353,0.032140,-0.010380,-0.020977,0.098867,0.131163,-0.023209,0.047490,0.158634,0.066030,...,0.156641,-0.041041,0.048266,-0.213700,-0.106271,-0.081870,0.006361,-0.037971,0.046063,4.5214
000538.XSHE,-0.064575,-0.022118,0.011905,0.021079,-0.092647,0.036015,-0.074519,0.024376,0.042910,0.047130,...,0.042460,0.079159,0.150574,-0.141492,-0.057088,-0.077412,0.052344,0.004900,0.072992,70.2228
600015.XSHG,0.000000,0.033060,0.005918,-0.027099,-0.007248,0.133906,-0.042053,0.007118,0.034159,0.159460,...,0.255805,-0.072835,0.012653,-0.123107,-0.041590,-0.025078,0.044511,0.048300,0.096663,9.4018
601958.XSHG,-0.020117,-0.031597,-0.048942,0.154311,0.055307,0.240801,-0.035394,0.017761,-0.050006,0.141980,...,0.358006,-0.100100,-0.157968,-0.233449,-0.136211,-0.125639,0.165680,0.005039,0.036290,8.2269
002608.XSHE,-0.022423,-0.011794,-0.053058,0.013310,0.073939,0.139404,0.099517,0.138840,-0.013117,0.000000,...,0.061022,-0.086269,-0.016522,-0.277600,-0.031008,0.000000,0.000000,0.000000,0.000000,8.7500
601169.XSHG,0.075726,-0.010429,0.003952,0.030180,0.026748,0.118827,-0.002719,0.095510,0.049802,0.088963,...,0.190659,-0.020016,0.046352,-0.193162,0.070612,-0.084033,0.018577,0.057010,0.135931,6.6427


## 4. 将收益率填充到因子对应的列（下个月收益率）当中

* 通过日期和股票代码（两表共有），整合all_date["next_month_return"], all_price

In [5]:
# 每个样本填充对应收益率
for i in range(len(all_data)): 
    
    # 在all_data表中获取股票代码和日期
    stock = all_data.index[i]
    # .ix[行号，字段名]：混合索引，第0行对应第一行
    date = all_data.ix[i,'date']
    
    # 在all_price表中，寻找收益率：
    # stock/date不一定在all_price里面
    if stock in all_price.index and date in all_price.columns:
        all_data.ix[i,'next_month_return'] = all_price.loc[stock,date]
        
# 把收益率为空的删除
all_data = all_data.dropna()

In [6]:
all_data

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense,date,next_month_return
000001.XSHE,7.126,0.9684,1.08537e+11,1.85809e+12,0.6756,13.0164,1.43,3.7345e+10,2.2038e+10,2014-01-30,-0.023689
000002.XSHE,5.3769,1.0571,8.12905e+10,1.91646e+11,1.7804,9.3272,0.56,6.34153e+10,5.396e+10,2014-01-30,-0.089430
000060.XSHE,29.0435,1.9053,1.13668e+10,1.91828e+10,0.9316,3.068,0.08,9.75875e+09,9.64269e+09,2014-01-30,0.079866
000063.XSHE,33.5992,2.0244,4.56162e+10,9.63637e+10,0.5568,2.561,0.16,5.45575e+10,5.70062e+10,2014-01-30,-0.009043
000069.XSHE,8.0826,1.4932,3.56303e+10,6.55072e+10,4.0417,13.7533,0.399,1.72365e+10,1.32602e+10,2014-01-30,-0.077553
000100.XSHE,10.1129,1.5054,2.13287e+10,5.31028e+10,2.1248,10.2132,0.149,6.12241e+10,6.05763e+10,2014-01-30,0.112017
000157.XSHE,10.1368,0.935,3.89151e+10,7.0563e+10,4.387,9.2131,0.49,2.88971e+10,2.42468e+10,2014-01-30,-0.009907
000333.XSHE,14.8607,2.4057,7.90211e+10,9.53968e+10,7.2895,17.3695,3.73,9.39206e+10,8.7225e+10,2014-01-30,-0.169011
000338.XSHE,9.7984,1.262,3.49879e+10,5.65886e+10,4.0573,10.4072,1.35,4.33327e+10,4.00661e+10,2014-01-30,-0.002856
000402.XSHE,5.1507,0.6684,1.48932e+10,4.74655e+10,2.1432,6.6307,0.44,1.01632e+10,8.23405e+09,2014-01-30,-0.065031


## 5. 特征值和目标值处理

In [None]:
# 需要使用的接口：

def mad(factor):
    """自实现3倍中位数绝对值偏差去极值"""
    # 1、找出因子的中位数 median
    median_factor = np.median(factor)
    
    # 2、得到每个因子值与中位数的绝对偏差值 |x – median|
    # 3、得到绝对偏差值的中位数， MAD = median(|x – median|)
    mad = np.median(abs(factor-median_factor))
    
    # 4、计算MAD_e = 1.4826*MAD，然后确定参数 n，做出调整
    # 定义3倍中位数的上下限制
    high = median_factor + (3 * 1.4826 * mad)
    low = median_factor - (3 * 1.4826 * mad)
    
    # 利用3倍中位数的值去极值
    factor = np.where(factor > high, high, factor)
    factor = np.where(factor < low, low, factor)
    return factor

  
def stand(factor):
    """自实现标准化，相当于StandScaler"""
    mean = factor.mean()
    std_dev = factor.std()
    return (factor-mean)/std_dev

In [15]:
# 目标值：下期收益率
y = all_data["next_month_return"]

# 特征值：9列因子值
x = all_data.drop(["date", "next_month_return"],axis=1)    # axis=1：按照列drop

# 保留一份原始market_cap因子值（作为中性化处理特征值）
x_market_cap = all_data['market_cap']


# 5.1 特征值处理：
# 对于每列数据（每个特征值）进行去极值，标准化，中性化处理
for name in x.columns:
    x[name] = mad(x[name])
    x[name] = stand(x[name])

In [16]:
x.head()

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense
000001.XSHE,-0.815344,-1.05121,1.4836,1.80072,-0.738091,0.970035,2.20963,1.62643,1.09411
000002.XSHE,-0.87527,-1.01664,0.798488,1.80072,-0.341725,0.393145,0.626803,1.62643,1.59898
000060.XSHE,-0.0644249,-0.685999,-0.959765,-0.839525,-0.646247,-0.585624,-0.793082,-0.227772,-0.0791039
000063.XSHE,0.0916587,-0.639572,-0.0985537,0.419896,-0.780713,-0.664905,-0.556435,1.62643,1.59898
000069.XSHE,-0.78257,-0.84664,-0.349651,-0.0836142,0.469554,1.08527,0.15055,0.3589,0.263298


In [17]:
# 特征值处理 - 市值中性化处理
# 特征值：市值因子值（未经处理过的），目标值：其他因子值
for name in x.columns:
    
    if name == "market_cap":
        # 不处理市值因子
        continue
        
    # 准备特征值和目标值
    # 特征值：x_market_cap
    # 目标值：其他因子值
    y_factor = x[name]
    
    # 建立线性回归方程
    lr = LinearRegression()
    
    # 注意：要把特征值转成2维数组【.reshape(-1,1)】
    lr.fit(x_market_cap.values.reshape(-1,1),y_factor)
    y_predict = lr.predict(x_market_cap.values.reshape(-1,1))
    
    # 得出真实值与预测值之间的误差，把误差当做新的因子值
    x[name] = y_factor - y_predict



In [18]:
x.head()

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense
000001.XSHE,-0.797964,-1.02889,1.4836,1.74339,-0.726999,0.958731,2.19398,1.57993,1.05032
000002.XSHE,-0.879092,-1.02154,0.798488,1.81332,-0.344164,0.39563,0.630244,1.63665,1.60861
000060.XSHE,-0.122657,-0.760778,-0.959765,-0.647451,-0.683412,-0.547748,-0.740656,-0.071991,0.0676206
000063.XSHE,0.0600775,-0.680128,-0.0985537,0.524064,-0.800869,-0.644364,-0.528002,1.71091,1.67856
000069.XSHE,-0.821921,-0.897174,-0.349651,0.0461843,0.444439,1.11086,0.185978,0.464173,0.36245


## 7. 建立回归方程，确定权重

- 特征值：因子数据（处理后）
- 目标值：下期收益率（标准化）

In [19]:
# 下期收益率标准化
y = stand(y)

lr = LinearRegression()
lr.fit(x,y)
# 确定每个因子对应的系数
lr.coef_

array([ 0.0348569 , -0.05684269, -0.14476781, -0.03907358, -0.04118705,
        0.14049031, -0.03542189,  0.38059331, -0.28289997])