Data source: BaoStock and TuShare

Sample period (build model and select factors): 09/30/2010 – 09/30/2021

Back-testing period: 09/30/2011 – 09/30/2021

Sample frequency: monthly, daily

Example: Stock selection starts on 2011.09.30. First, the data from 2010.09.30 to 2011.09.30 is used to select factors, and the same data is then used to train three prediction models.

# 引入库

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from datetime import datetime
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


# 主要超参数

In [2]:
# Key dates of the study, written as 'YYYYMMDD' strings.
day0 = "20100930"  # start of the data
day1 = "20110930"  # start of the back-test
day2 = "20210930"  # end of the back-test

# 提取数据

In [3]:
# Load the pickled daily data set.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('/Users/cr/Downloads/undergraduate time/大四上/FIN 4998/paper/code/data & data code/day','rb') as fh:
    bs_data = pickle.load(fh)


In [None]:
bs_data

# 处理收益率技术指标并获取每个月最后一个交易日

In [5]:
# Distinct date strings taken from the second element of every index entry of
# bs_data, parsed from 'YYYYMMDD' and sorted in ascending order.
unique_day_strings = {entry[1] for entry in bs_data.index.tolist()}
d = sorted(datetime.strptime(day, '%Y%m%d') for day in unique_day_strings)

In [6]:
# Get the last trading day of every month
def get_real_last(d):
    """Return the latest date present in ``d`` for each (year, month).

    Args:
        d: iterable of datetime-like objects (e.g. all trading days).

    Returns:
        A list of ``pandas.Timestamp`` values at midnight, one per calendar
        month that occurs in ``d``, ordered by year and then month. An empty
        input yields an empty list.
    """
    d = list(d)
    if not d:
        # Nothing to group; avoid building an empty frame.
        return []

    # Work on dates only: any time-of-day component is discarded.
    days = pd.Series(pd.to_datetime(d)).dt.normalize()
    # The maximum timestamp inside a (year, month) group is that month's last
    # day present in the input; groupby sorts the groups by (year, month).
    last_per_month = days.groupby([days.dt.year, days.dt.month]).max()
    return list(last_per_month)

# Month-end trading days as 'YYYYMMDD' strings, clipped to [day0, day2].
month_day = [datetime.strftime(stamp, '%Y%m%d') for stamp in get_real_last(d)]
# Positions of the three key dates; .index raises ValueError if one of them
# is not a month-end trading day.
dayindex0, dayindex1, dayindex2 = (
    month_day.index(key_day) for key_day in (day0, day1, day2)
)
month_day = month_day[dayindex0:dayindex2 + 1]

In [7]:
# Extract the trading data used for back-testing
# Keep only the calendar days between day1 and day2 that also appear in d.
daterange = sorted(set(pd.date_range(day1, day2)) & set(d))
daterange = [datetime.strftime(x, '%Y%m%d') for x in daterange]
trade_dic = {}
for i in ['close', 'pctChg', 'isST', 'tradestatus']:
    # Pivot the field, transpose it, then select the back-test dates as rows.
    temp = bs_data[i].unstack().T.loc[daterange, :]
    trade_dic[i] = temp

In [8]:
# Technical-indicator data and valuation indicators
# The bare string below is a no-op note listing the factors used. In English:
#   240~20-day momentum, 120~20-day momentum, 60~20-day momentum, 20-day reversal;
#   20-day return skewness, 20-day return kurtosis;
#   20-day mean turnover, 60-day mean turnover,
#   ratio of 20-day to 60-day turnover (abnormal turnover);
#   pe, pb, ps, pcf;
#   return.
'''
240~20日动量
120~20日动量
60~20日动量
20日反转

20日收益率偏度
20日收益率峰度

20日换手率均值
60日换手率均值
20日换手率与60日换手率比值（异常换手率）

pe
pb
ps
pcf

收益率
'''
# Only prints a blank line.
print('\n')





In [9]:
def to_factor(factor,month_day):
    """Select the ``month_day`` rows of ``factor`` and return them stacked.

    The selected rows are transposed before stacking, so the outer level of
    the resulting index comes from ``factor``'s columns and the inner level
    from the selected row labels.
    """
    selected = factor.loc[month_day]
    transposed = selected.transpose()
    return transposed.stack()

In [10]:
# Compute kurtosis and skewness of 'pctChg' over a rolling window of 20 rows
temp = bs_data['pctChg']
# NOTE(review): after unstack().T the rows are presumably trading days and the
# columns stock codes — confirm against the index layout of bs_data.
temp = temp.unstack().T
skew20 = temp.rolling(20).skew()
skew20 = to_factor(skew20,month_day)

kurt20 = temp.rolling(20).kurt()
kurt20 = to_factor(kurt20,month_day)

# Compute momentum and reversal
temp = bs_data['adclose']  # uses back-adjusted prices
temp = temp.unstack().T
# momN: value 20 rows back relative to the value N rows back, minus 1
mom240 = temp.shift(20) / temp.shift(240) - 1
mom240 = to_factor(mom240,month_day)
mom120 = temp.shift(20) / temp.shift(120) - 1
mom120 = to_factor(mom120,month_day)
mom60 = temp.shift(20) / temp.shift(60) - 1
mom60 = to_factor(mom60,month_day)
# rev20: current value relative to the value 20 rows back, minus 1
rev20 = temp / temp.shift(20) - 1
rev20 = to_factor(rev20,month_day)

# Compute the T+1 period return: the next month_day row divided by the current
# one, minus 1 (the last month_day row has no successor and yields NaN)
returnv = temp.loc[month_day,:]
returnv = returnv.shift(-1) / returnv - 1
returnv = returnv.T.stack()

# Compute turnover: rolling means over 20 and 60 rows, and their ratio
temp = bs_data['turn'] 
temp = temp.unstack().T
turn20 = temp.rolling(20).mean()
turn20 = to_factor(turn20,month_day)
turn60 = temp.rolling(60).mean()
turn60 = to_factor(turn60,month_day)
turn20_60 = turn20/turn60

# Extract pe, pb, pcf and ps on the month_day rows
value = pd.DataFrame()
for i in ['peTTM','pbMRQ','pcfNcfTTM','psTTM']:
    temp = bs_data[i]
    temp = temp.unstack().T
    temp = temp.loc[month_day,:]
    temp = temp.unstack()
    value = pd.concat([value,temp],axis=1)
# Column names are assigned by position, so this list must stay in the same
# order as the loop above.
value.columns = ['peTTM','pbMRQ','pcfNcfTTM','psTTM']

# Merge the data
# The columns are renamed by position: the order of the objects passed to
# concat must match the order of the names assigned below.
tdf = pd.concat([returnv,value,mom240,mom120,mom60,rev20,skew20,kurt20,turn20,turn60,turn20_60],axis=1)
tdf.columns = ['return','peTTM','pbMRQ','pcfNcfTTM','psTTM','mom240','mom120',
              'mom60','rev20','skew20','kurt20','turn20','turn60','turn20_60']

In [11]:
import gc
# bs_data is not referenced again in the visible cells: drop the name and
# run a garbage-collection pass right away.
del bs_data
gc.collect()

0

# 处理财务指标并合并数据

In [12]:
# Load the pickled financial-statement data.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('/home/aq/aq_data/tushare/fina_data','rb') as fh:
    ts_data = pickle.load(fh)
fina_fac = pd.read_csv('/home/aq/aq_data/tushare/财报指标.csv')
# Names listed in the CSV for the "fina_indicator" table type
ff = list(fina_fac.query('表格类型=="fina_indicator"')['名称'])
# Remove the identifier fields from the list of columns to keep;
# list.remove raises ValueError if one of them is not listed.
for i in ['ts_code','ann_date','end_date','update_flag']:
    ff.remove(i)
ts_data = ts_data[ff]

# Sort by stock, report period and announcement date, then keep one row per
# (ts_code, end_date): the one with the earliest ann_date.
ts_data = ts_data.reset_index(['ts_code','end_date','ann_date'])
ts_data = ts_data.sort_values(by=['ts_code','end_date','ann_date'],ascending=[True,True,True])
ts_data = ts_data.set_index(['ts_code','end_date'])
ts_data = ts_data.loc[~ts_data.index.duplicated(keep='first')]  # keep the most original financial report

# Handle single-quarter factors: compute a TTM version of each single-quarter
# factor (adds 30+ new factors)
# qlist = the columns whose name starts with 'q_'
l = [True if x[:2]=='q_' else False for x in ts_data.columns]
dl = pd.DataFrame(ts_data.columns)
dl['l'] = l
qlist = list(dl.query('l==True')[0])

for q in qlist:
    # Pivot to end_date rows x ts_code columns, then take a rolling mean over
    # 4 rows for every stock.
    # NOTE(review): the "_TTM" value is a 4-row rolling mean (not a sum), and
    # the window counts rows, so it presumably assumes consecutive quarterly
    # end_dates — confirm.
    temp = ts_data[q].unstack()
    temp = temp.T
    temp = temp.rolling(4).mean()
    temp = temp.unstack()
    ts_data[q+'_TTM'] = temp
    
    
# Drop annual reports; lag to the deadline date
# 'report' holds the characters after the first four of end_date; rows equal to
# "1231" are removed. This runs after the rolling mean above was computed.
ts_data['report'] = [x[1][4:] for x in ts_data.index.tolist()]
ts_data = ts_data.query('report!="1231"')

# Copy of the end_date index level as a regular column
ts_data['date'] = [x[1] for x in ts_data.index.tolist()]

def datedelay(x):
    """Map a report period end ('YYYYMMDD') to a later date in the same year.

    Month '03' maps to YYYY0430, '06' to YYYY0831 and '09' to YYYY1031; the
    day part of ``x`` is ignored. Any other month returns the string 'error'.
    """
    lagged_suffix = {'03': '0430', '06': '0831', '09': '1031'}
    suffix = lagged_suffix.get(x[4:6])
    if suffix is None:
        return 'error'
    return x[:4] + suffix

# Lag every report period end with datedelay
ts_data['dl_date'] = ts_data['date'].apply(datedelay)

print(ts_data.shape)
# Drop rows that datedelay could not map, remove the helper columns, and
# re-key the frame by (ts_code, dl_date).
ts_data = (
    ts_data.query('dl_date!="error"')
    .drop(labels=['ann_date','report','date'],axis=1)
    .reset_index(['ts_code'])
    .set_index(['ts_code','dl_date'])
)

(157446, 202)


In [157]:
# The bare string below is a no-op note; in English: "feature, stock, time".
'''
特征，股票，时间
'''
# Build nested plain dicts: ts_dic[column][outer index key][inner index key] = value,
# and print how long the build took in seconds.
b = time.time()
ts_dic = {}
for i in ts_data.columns:
    temp = ts_data[i].unstack().T
    per_column = {}
    for j in temp.columns:
        per_column[j] = dict(zip(temp.index, temp[j]))
    ts_dic[i] = per_column
e = time.time()
print(e-b)

62.61437654495239


In [191]:
def _latest_report_key(date):
    """Return the report key ('YYYYMMDD') to use for ``date``, or None.

    Months 04-07 use YYYY0430, 08-09 use YYYY0831, 10-12 use YYYY1031 and
    01-03 use 1031 of the previous year. Any other month part returns None.
    """
    year, month = date[:4], date[4:6]
    if month in ('04', '05', '06', '07'):
        return year + '0430'
    if month in ('08', '09'):
        return year + '0831'
    if month in ('10', '11', '12'):
        return year + '1031'
    if month in ('01', '02', '03'):
        return str(int(year) - 1) + '1031'
    return None


def find_finareport(ts_dic,col,code,date):
    """Look up the financial-report value for a stock on a given date.

    Args:
        ts_dic: nested mapping accessed as ``ts_dic[col][code][report_key]``.
        col: name of the financial column.
        code: stock code.
        date: date as a 'YYYYMMDD' string.

    Returns:
        The stored value, or ``np.nan`` when the column, stock or report key
        is missing, or when ``date`` cannot be turned into a report key.
    """
    try:
        key = _latest_report_key(date)
        if key is None:
            return np.nan
        return ts_dic[col][code][key]
    except (KeyError, TypeError, ValueError):
        # Missing entry or malformed date: keep the best-effort NaN result.
        # Catching only these lets KeyboardInterrupt/SystemExit propagate, so a
        # long-running loop over this function can still be interrupted.
        return np.nan

In [192]:
# Expose the index entries as a temporary column so every row can be looked up
# in ts_dic, then add one column per ts_data column.
tdf['index'] = tdf.index
for c in tqdm(ts_data.columns):
    tdf[c] = tdf['index'].apply(
        lambda key, col=c: find_finareport(ts_dic, col, key[0], key[1])
    )
# The helper column is no longer needed.
tdf = tdf.drop(labels=['index'], axis=1)

100%|██████████| 198/198 [01:39<00:00,  2.00it/s]


In [195]:
file = '/Users/cr/Downloads/undergraduate time/大四上/FIN 4998/paper/code/机器学习/'
# Each file is opened in a context manager so it is flushed and closed as soon
# as the dump finishes.
## Data used for training:
with open(file+'ml_data','wb') as fh:
    pickle.dump(tdf,fh)
## Data used for back-testing:
with open(file+'trade_dic','wb') as fh:
    pickle.dump(trade_dic,fh)

In [199]:
tdf.shape

(618317, 210)

In [200]:
tdf

Unnamed: 0,Unnamed: 1,return,peTTM,pbMRQ,pcfNcfTTM,psTTM,mom240,mom120,mom60,rev20,skew20,...,q_gr_yoy_TTM,q_gr_qoq_TTM,q_sales_yoy_TTM,q_sales_qoq_TTM,q_op_yoy_TTM,q_op_qoq_TTM,q_profit_yoy_TTM,q_profit_qoq_TTM,q_netprofit_yoy_TTM,q_netprofit_qoq_TTM
000001.SZ,20100930,0.135018,9.826565,1.858148,4.515850,3.506698,-0.168171,-0.250749,0.000000,-0.073672,0.920621,...,8.215075,4.414700,8.215075,4.414700,52.891500,7.157200,56.890175,5.507975,56.890175,5.507975
000001.SZ,20101029,-0.109723,10.471109,1.995982,3.154221,3.738848,-0.328346,-0.175273,-0.051399,0.108368,0.767350,...,15.415200,6.558875,15.415200,6.558875,59.490825,7.872025,60.698500,6.857750,60.698500,6.857750
000001.SZ,20101130,-0.036608,9.322188,1.776977,2.808131,3.328610,-0.234619,0.065677,0.065677,-0.121651,-1.065155,...,15.415200,6.558875,15.415200,6.558875,59.490825,7.872025,60.698500,6.857750,60.698500,6.857750
000001.SZ,20101231,-0.030399,8.980924,1.711926,2.705332,3.206758,-0.288841,-0.053684,-0.037747,-0.047073,0.497931,...,15.415200,6.558875,15.415200,6.558875,59.490825,7.872025,60.698500,6.857750,60.698500,6.857750
000001.SZ,20110131,0.040496,8.707913,1.659885,2.623092,3.109276,-0.261114,-0.098230,-0.165874,-0.030399,-0.351746,...,15.415200,6.558875,15.415200,6.558875,59.490825,7.872025,60.698500,6.857750,60.698500,6.857750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689009.SH,20210531,0.161255,247.402932,13.780468,42.839595,7.238830,,-0.107218,-0.108966,0.150706,1.247689,...,,,,,,,,,,
689009.SH,20210630,-0.196429,287.297921,16.002639,49.747699,8.406128,,0.000516,0.164363,0.098323,0.355943,...,,,,,,,,,,
689009.SH,20210730,0.096491,230.864401,12.859263,39.975829,6.754924,,-0.218787,0.089114,-0.126883,-1.805909,...,,,,,,,,,,
689009.SH,20210831,0.078667,128.371056,13.141128,41.702653,6.061203,,0.059948,-0.178553,0.146438,1.733691,...,,,,,,,,,,
