In [1]:
import numpy as np
import pandas as pd
from jqdata import *
import tushare as ts
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

## 创建获取数据的时间列

In [2]:
# Yearly sampling dates for 2007-2017: 1 May of each year, plus the 1 April
# of the same year that is used as the start of the matching k-line window.
years = range(2007, 2018)
dateList = ['%d-05-01' % year for year in years]
dateList_1M = ['%d-04-01' % year for year in years]
print(dateList)
print(dateList_1M)

['2007-05-01', '2008-05-01', '2009-05-01', '2010-05-01', '2011-05-01', '2012-05-01', '2013-05-01', '2014-05-01', '2015-05-01', '2016-05-01', '2017-05-01']
['2007-04-01', '2008-04-01', '2009-04-01', '2010-04-01', '2011-04-01', '2012-04-01', '2013-04-01', '2014-04-01', '2015-04-01', '2016-04-01', '2017-04-01']


## 利用聚宽网提供的API获取有关数据并进行预处理

In [3]:
def _first_bar_mid_price(kline):
    """Return (open + close) / 2 of the first row of a k-line frame.

    The midpoint of the monthly bar's open and close is used as a stand-in
    for the stock's average price in that month. Returns NaN when the frame
    is empty, i.e. no bar came back for the requested window.
    """
    if kline.empty:
        return np.nan
    return (kline['open'].iloc[0] + kline['close'].iloc[0]) / 2


def _bucket_labels(values, low, mid, high, fraction=0.3):
    """Label each value by rank: bottom `fraction` -> low, top `fraction` -> high, rest -> mid.

    Returns an object array aligned with `values`, so the caller can assign
    it as a column without sorting (and later un-sorting) the frame.
    """
    count = len(values)
    tail = int(count * fraction)
    labels = np.full(count, mid, dtype=object)
    # Stable sort: ties are resolved by original row order, deterministically.
    order = np.argsort(values, kind='mergesort')
    labels[order[:tail]] = low
    # Slice from `count - tail` rather than `-tail`: with tail == 0 the
    # latter would select every row instead of none.
    labels[order[count - tail:]] = high
    return labels


# Build one frame per sampling year and concatenate once at the end.
yearly_frames = []
for i in range(len(dateList) - 1):
    # Refresh the CSI 300 constituent list as of this sampling date.
    stockList = get_index_stocks('000300.XSHG', date=dateList[i])
    print("--------\n", dateList[i], "\n沪深300股票数：", len(stockList))
    # Fetch market cap and book-to-market (BM) for the constituents.
    df = get_fundamentals(
        query(
            valuation.code,
            valuation.market_cap,
            (balance.total_owner_equities / valuation.market_cap / 100000000.0).label("BM"),
        ).filter(
            # A Python `in` test cannot be used here; the query needs in_().
            valuation.code.in_(stockList)
        ),
        date=dateList[i])

    # markReturn: market (CSI 300 index) return over the same one-year span,
    # computed from the first monthly bar of this window and of the next one.
    pd_markeReturn = ts.get_k_data('399300', ktype='M', index=True, start=dateList_1M[i], end=dateList[i])
    pd_markeReturnNext = ts.get_k_data('399300', ktype='M', index=True, start=dateList_1M[i+1], end=dateList[i+1])
    markReturn = (pd_markeReturnNext['open'].tolist()[0] + pd_markeReturnNext['close'].tolist()[0]) / (
        pd_markeReturn['open'].tolist()[0] + pd_markeReturn['close'].tolist()[0]) - 1

    # Collect this year's and next year's price per stock in plain lists and
    # assign them as whole columns (no element-wise chained assignment).
    prices = []
    next_prices = []
    for code in df['code']:
        symbol = code[:6]  # first six characters of the code, as passed to tushare
        df_price = ts.get_k_data(symbol, ktype='M', start=dateList_1M[i], end=dateList[i])
        print(code, " is OK.")
        prices.append(_first_bar_mid_price(df_price))
        df_nextPrice = ts.get_k_data(symbol, ktype='M', start=dateList_1M[i+1], end=dateList[i+1])
        next_prices.append(_first_bar_mid_price(df_nextPrice))
    df['price'] = prices
    df['nextPrice'] = next_prices
    # Stock return minus the index return over the same period. A missing
    # price propagates as NaN, and the row is removed by dropna() below
    # (comparing against NaN with != can never filter anything).
    df['priceRatio'] = df['nextPrice'] / df['price'] - 1 - markReturn

    df = df.dropna(axis=0, how='any').copy()
    # SMB: 30% / 40% / 30% split on market_cap -> S (small), M (mid), B (big).
    df['SMB'] = _bucket_labels(df['market_cap'].values, 'S', 'M', 'B')
    # HML: 30% / 40% / 30% split on BM -> L (low), M (mid), H (high).
    df['HML'] = _bucket_labels(df['BM'].values, 'L', 'M', 'H')
    yearly_frames.append(df)

result_columns = ["code", "market_cap", "SMB", "BM", "HML", "priceRatio"]
if yearly_frames:
    stockData = pd.concat(yearly_frames, ignore_index=True)[result_columns]
else:
    # No sampling years at all: still produce an empty table with the expected columns.
    stockData = pd.DataFrame(columns=result_columns)
stockData.to_csv('stockAll.csv')

--------
 2007-05-01 
沪深300股票数： 300
000001.XSHE  is OK.
000002.XSHE  is OK.
000009.XSHE  is OK.
000012.XSHE  is OK.
000021.XSHE  is OK.
000022.XSHE  is OK.
000024.XSHE  is OK.
000027.XSHE  is OK.
000029.XSHE  is OK.
000031.XSHE  is OK.
000036.XSHE  is OK.
000039.XSHE  is OK.
000059.XSHE  is OK.
000060.XSHE  is OK.
000061.XSHE  is OK.
000063.XSHE  is OK.
000066.XSHE  is OK.
000068.XSHE  is OK.
000069.XSHE  is OK.
000088.XSHE  is OK.
000089.XSHE  is OK.
000099.XSHE  is OK.
000100.XSHE  is OK.
000157.XSHE  is OK.
000400.XSHE  is OK.
000401.XSHE  is OK.
000402.XSHE  is OK.
000410.XSHE  is OK.
000422.XSHE  is OK.
000423.XSHE  is OK.
000425.XSHE  is OK.
000488.XSHE  is OK.
000503.XSHE  is OK.
000520.XSHE  is OK.
000527.XSHE  is OK.
000528.XSHE  is OK.
000538.XSHE  is OK.
000539.XSHE  is OK.
000541.XSHE  is OK.
000550.XSHE  is OK.
000559.XSHE  is OK.
000562.XSHE  is OK.
000568.XSHE  is OK.
000581.XSHE  is OK.
000601.XSHE  is OK.
000617.XSHE  is OK.
000623.XSHE  is OK.
000625.XSHE  is OK.
0006

## 抽出需要的数据，并对feature进行One-Hot编码，转换成六个feature

In [4]:
# Reload the saved table and keep only the two factor labels and the target column.
stockData = pd.read_csv('stockAll.csv')
stockData2 = stockData.loc[:, ["SMB", "HML", "priceRatio"]]

In [5]:
# One-hot encode the two categorical factor labels, SMB and HML.
# The encoding is computed from `stockData` instead of overwriting its own
# input, so re-running this cell does not fail on an already-encoded frame.
to_encode = ["SMB", "HML"]
stockData2 = pd.get_dummies(stockData[["SMB", "HML", "priceRatio"]], columns=to_encode)
print(stockData2.head())
# Fix the column order: the six indicator columns first, the target last.
stockData3 = stockData2[["SMB_S", "SMB_M", "SMB_B", "HML_H", "HML_M", "HML_L", "priceRatio"]]
print(stockData3.head())
stockData3.to_csv('TwoFactor_encode.csv')

   priceRatio  SMB_B  SMB_M  SMB_S  HML_H  HML_L  HML_M
0    0.064799      1      0      0      0      1      0
1    0.962011      1      0      0      0      0      1
2   -0.007515      0      1      0      0      0      1
3   -0.408805      0      1      0      0      0      1
4   -0.489338      0      1      0      0      0      1
   SMB_S  SMB_M  SMB_B  HML_H  HML_M  HML_L  priceRatio
0      0      0      1      0      0      1    0.064799
1      0      0      1      0      1      0    0.962011
2      0      1      0      0      1      0   -0.007515
3      0      1      0      0      1      0   -0.408805
4      0      1      0      0      1      0   -0.489338


## 建立线性回归模型，训练得到六个特征的系数，用系数的正负大小判断各feature对label的影响

In [6]:
# Features: the six one-hot indicator columns. Target: priceRatio.
feature_columns = ["SMB_S", "SMB_M", "SMB_B", "HML_H", "HML_M", "HML_L"]
x = stockData2.loc[:, feature_columns]
y = stockData2.loc[:, "priceRatio"]
# 70% of the rows are used for training; random_state is fixed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=1)

In [7]:
# Fit a Lasso with the penalty switched off (alpha=0) on the training rows
# and print one coefficient per indicator column, in feature order.
# NOTE(review): scikit-learn advises against alpha=0 with Lasso and points to
# LinearRegression instead. The three SMB indicators (and likewise the three
# HML indicators) sum to 1 on every row, so the coefficients are not uniquely
# determined. Changing the estimator would change the printed values —
# confirm before switching.
clf_ordinary = Lasso(alpha=0).fit(x_train, y_train)
print(clf_ordinary.coef_)

[ 0.03294995  0.00973541 -0.00561727  0.0360998   0.01214244 -0.0072663 ]


### 各feature对应系数： 
#### [SMB]    SMB_S(小盘股):0.03294995，SMB_M(中盘股):0.00973541， SMB_B(大盘股):-0.00561727
#### [HML]    HML_H(高价值):0.0360998，HML_M(中价值):0.01214244，HML_L(低价值):-0.0072663
<p style = " font-size:15px;line-height:2em;">可见SMB_S和HML_H明显对label（股价涨跌比）呈正向影响，而SMB_B和HML_L则明显呈负向影响。在此基础上可以认为具有高账面市值比（价值高）的和低市值（规模小）的公司，其股票价格涨的可能性比较大（相比沪深300同期波动）。这结果与Fama和French对美国证券市场以1963-1990年为样本期的研究结果相同，即价值股（高账面市值比）和小规模（低市值）股的历史表现要优于同期其它股票。</p>