In [130]:
import numpy as np
from skimage.restoration import denoise_wavelet
import pandas as pd
import matplotlib.pyplot as plt
import talib as ta #导入需要的库

In [131]:
# 定义DWT降噪函数
def denoise(var):
    denoise = denoise_wavelet(var, method='VisuShrink', mode='hard', wavelet_levels=5, wavelet='sym2',
                                rescale_sigma='True')
    return denoise

In [132]:
import warnings
warnings.filterwarnings('ignore')#忽略警告信息

In [141]:
sz50 = pd.read_csv('/Users/wujiewang/Desktop/毕设实证/数据集/sz50/sz50stocks_front.csv', encoding='utf-8') #读取上证50成分股向前复权股价
#向前复权反映真实持仓成本，向后复权反应股价真实变化。考虑到最终将构建投资组合，因此用前复权，反映持仓成本。

In [145]:
sz50.drop('前收盘价(元)', axis = 1, inplace = True) #删去多余列

In [146]:
sz50.columns = ['code', 'name', 'date', 'open', 'close', 'h_price', 'l_price', 'volume', 'mkt_value']#重命名表头

In [161]:
# MAD去极值
def mad(factor):
    '''3倍中位数去极值'''
    # 求出因子值的中位数
    med = np.median(factor)

    # 求出因子值与中位数的差值，进行绝对值
    mad = np.median(abs(factor - med))

    # 定义几倍的中位数上下限
    high = med + (3 * 1.4826 * mad)
    low = med - (3 * 1.4826 * mad)

    # 替换上下限以外的值
    factor = np.where(factor > high, high, factor)
    factor = np.where(factor < low, low, factor)
    return factor

In [162]:
# 构建函数，自动计算动量类指标
def MOM_indicators(samp):
    #创建基准表
    output = (samp.iloc[:, 0:3].sort_values(by = 'date')).reset_index(drop = True)
    #计算技术指标，并同时降噪
    high, low, open_, close, volume= [samp['h_price'], samp['l_price'], samp['open'], samp['close'], samp['volume']]
    output['ADX'] = denoise(ta.ADX(high , low,  close, timeperiod=14))
    output['ADXR'] = denoise(ta.ADXR(high , low,  close, timeperiod=14))
    output['APO'] = denoise(ta.APO(close, fastperiod=12, slowperiod=26, matype=0))
    output['AROON_UP'], output['AROON_DOWN'] = ta.AROON(high , low, timeperiod=14)
    output['AROONOSC'] = denoise(ta.AROONOSC(high, low, timeperiod=14))
    output['BOP'] = denoise(ta.BOP(open_, high, low, close))
    output['CCI'] = denoise(ta.CCI(high, low, close, timeperiod=14))
    output['CMO'] = denoise(ta.CMO(close, timeperiod=14))
    output['DX'] = denoise(ta.DX(high, low, close, timeperiod=14))
    # 取出MACD_hist指标
    dif, dem, MACD_hist = ta.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
    output['MACD'] = denoise(MACD_hist)
    # 取出MACDEXT_hist指标
    dif, dem,  MACDEXT_hist = ta.MACDEXT(close, fastperiod=12, fastmatype=0, slowperiod=26, slowmatype=0, signalperiod=9, signalmatype=0)
    output['MACDEXT'] = denoise(MACDEXT_hist)
    output['MFI'] = denoise(ta.MFI(high, low, close, volume, timeperiod=14))
    output['MINUS_DI'] = denoise(ta.MINUS_DI(high, low, close, timeperiod=14))
    output['MINUS_DM'] = denoise(ta.MINUS_DM(high, low, timeperiod=14))
    output['MOM6'] = denoise(ta.MOM(close, timeperiod=6))
    output['MOM12'] = denoise(ta.MOM(close, timeperiod=12))
    output['MOM36'] = denoise(ta.MOM(close, timeperiod=36))
    output['PPO'] = denoise(ta.PPO(close, fastperiod=12, slowperiod=26, matype=0))
    output['ROC'] = denoise(ta.ROC(close, timeperiod=10))
    output['RSI'] = denoise(ta.RSI(close, timeperiod=14))
    output['ULTOSC'] = denoise(ta.ULTOSC(high, low, close, timeperiod1=7, timeperiod2=14, timeperiod3=28))
    output['WILLR'] = denoise(ta.WILLR(high, low, close, timeperiod=14))
    output.iloc[:, 3: ] = (output.iloc[:, 3: ]).apply(lambda x: mad(x)) #对所有指标进行MAD去极值
    return output

In [163]:
# 取出所有上证50个股名单
corp_list = list(sz50['code'].unique())

# 创建list，用于保存组合
MOM_list = []

# 分个股循环计算其指标
for name in corp_list:
    samp = sz50.query(f"code == '{name}'")
    samp_output = MOM_indicators(samp)
    MOM_list.append(samp_output)

# 拼接所有个股指标数据
MOM_tb = pd.concat(MOM_list)

In [164]:
MOM_tb

Unnamed: 0,code,name,date,ADX,ADXR,APO,AROON_UP,AROON_DOWN,AROONOSC,BOP,...,MINUS_DI,MINUS_DM,MOM6,MOM12,MOM36,PPO,ROC,RSI,ULTOSC,WILLR
0,600010.SH,包钢股份,2012/1/10,,,,,,,-0.072550,...,,,,,,,,,,
1,600010.SH,包钢股份,2012/1/11,,,,,,,-0.072938,...,,,,,,,,,,
2,600010.SH,包钢股份,2012/1/12,,,,,,,-0.050771,...,,,,,,,,,,
3,600010.SH,包钢股份,2012/1/13,,,,,,,-0.034648,...,,,,,,,,,,
4,600010.SH,包钢股份,2012/1/16,,,,,,,-0.024569,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,688599.SH,天合光能,2022/9/5,90.031401,88.326996,-0.196603,,,-85.714286,0.005826,...,26.678922,1.853493,-0.35,-0.80,-1.03,-1.252184,-2.652005,37.663651,31.262612,-93.750000
620,688599.SH,天合光能,2022/9/6,90.463236,88.593956,-0.259615,,,-100.000000,0.016692,...,28.372447,2.131101,-0.69,-1.13,-1.04,-1.656808,-5.852090,32.399420,27.011883,-97.278912
621,688599.SH,天合光能,2022/9/7,90.864226,88.883928,-0.246026,,,-92.857143,0.026502,...,24.783540,1.978879,-0.02,0.08,-0.14,-1.570775,-2.079395,49.188239,38.892722,-33.802817
622,688599.SH,天合光能,2022/9/8,91.236574,89.203624,-0.250897,,,-85.714286,0.040252,...,23.072038,1.837531,-0.49,-0.46,-1.17,-1.605309,-3.824092,43.386254,36.370377,-64.233577


In [169]:
# 构建函数，自动计算模式学习（Overlap）类指标
def OS_indicators(samp):
    # 创建基准表格
    output = (samp.iloc[:, 0:3].sort_values(by = 'date')).reset_index(drop = True)
    # 计算指标
    high, low, open_, close, volume= [samp['h_price'], samp['l_price'], samp['open'], samp['close'], samp['volume']]
    B_UP, B_MID, B_LOW = ta.BBANDS(close , timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)
    output['BBANDS_UP'], output['BBANDS_MD'], output['BBANDS_LOW'] = [denoise(B_UP), 
                                                                      denoise(B_MID),
                                                                      denoise(B_LOW),
                                                                     ]
    output['DEMA'] = denoise(ta.DEMA(close, timeperiod=30))
    output['EMA'] = denoise(ta.EMA(close, timeperiod = 30))
    output['HT_TRENDLINE'] = denoise(ta.HT_TRENDLINE(close))
    output['KAMA'] = denoise(ta.KAMA(close, timeperiod = 30))
    output['MA'] = denoise(ta.MA(close, timeperiod = 30, matype = 0))
    mama, kama = ta.MAMA(close, fastlimit = 0.5, slowlimit = 0.05)
    output['MAMA'], output['FAMA'] = [denoise(mama), denoise(kama)]
    output['MIDPOINT'] = denoise(ta.MIDPOINT(close, timeperiod = 14))
    output['MIDPRICE'] = denoise(ta.MIDPRICE(high, low, timeperiod = 14))
    output['SAR'] = denoise(ta.SAR(high, low, acceleration = 0, maximum =0))
    output['SAREXT'] = denoise(ta.SAREXT(high, low, startvalue = 0, offsetonreverse = 0,
                                 accelerationinitlong=0, accelerationlong=0, accelerationmaxlong=0, 
                                         accelerationinitshort=0, accelerationshort=0, accelerationmaxshort=0))
    output['SMA'] = denoise(ta.SMA(close, timeperiod = 30))
    output['T3'] = denoise(ta.T3(close, timeperiod = 5, vfactor = 0))
    output['TEMA'] = denoise(ta.TEMA(close, timeperiod = 30))
    output['TRIMA'] = denoise(ta.TRIMA(close, timeperiod = 30))
    output['WMA'] = denoise(ta.WMA(close, timeperiod = 30))
    output.iloc[:, 3: ] = (output.iloc[:, 3: ]).apply(lambda x: mad(x)) #MAD去极值
    return output

In [170]:
# 获得上证50个股名单
corp_list = list(sz50['code'].unique())

# 创建list，用于保存结果
OS_list = []

# 计算OS类指标
for name in corp_list:
    samp = sz50.query(f"code == '{name}'")
    samp_output = OS_indicators(samp)
    OS_list.append(samp_output)

# 拼接所有个股结果
OS_tb = pd.concat(OS_list)

In [171]:
OS_tb

Unnamed: 0,code,name,date,BBANDS_UP,BBANDS_MD,BBANDS_LOW,DEMA,EMA,HT_TRENDLINE,KAMA,...,FAMA,MIDPOINT,MIDPRICE,SAR,SAREXT,SMA,T3,TEMA,TRIMA,WMA
0,600010.SH,包钢股份,2012/1/10,,,,,,,,...,,,,,,,,,,
1,600010.SH,包钢股份,2012/1/11,,,,,,,,...,,,,,,,,,,
2,600010.SH,包钢股份,2012/1/12,,,,,,,,...,,,,,,,,,,
3,600010.SH,包钢股份,2012/1/13,,,,,,,,...,,,,,,,,,,
4,600010.SH,包钢股份,2012/1/16,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,688599.SH,天合光能,2022/9/5,15.776442,15.396,15.015558,15.423535,15.702123,15.592135,16.984432,...,15.594406,15.460,15.540,88.33,-88.33,15.745000,15.540049,15.362803,15.790042,15.631699
620,688599.SH,天合光能,2022/9/6,15.879101,15.212,14.544899,15.308882,15.633599,15.552942,16.890199,...,15.589477,15.255,15.335,88.33,-88.33,15.698333,15.472105,15.208913,15.761375,15.560409
621,688599.SH,天合光能,2022/9/7,15.853997,15.204,14.554003,15.318143,15.627561,15.527413,16.868897,...,15.584849,15.255,15.310,88.33,-88.33,15.675000,15.416551,15.238938,15.732833,15.550194
622,688599.SH,天合光能,2022/9/8,15.758514,15.140,14.521486,15.270981,15.592879,15.515858,16.837993,...,15.500434,15.255,15.285,88.33,-88.33,15.649000,15.366977,15.185209,15.704292,15.512452


In [191]:
# 构建函数，用于计算量价指标
def VnP_indicators(samp):
    # 创建基准表格
    output = (samp.iloc[:, 0:3].sort_values(by = 'date')).reset_index(drop = True)
    # 计算指标，并同时降噪
    high, low, open_, close, volume= [samp['h_price'], samp['l_price'], samp['open'], samp['close'], samp['volume']]
    output['AD'] = denoise(ta.AD(high, low, close, volume))
    output['ADOSC'] = denoise(ta.ADOSC(high, low, close, volume))
    output['OBV'] = denoise(ta.OBV(close, volume))
    output['ATR'] = denoise(ta.ATR(high, low, close))
    output['NATR'] = denoise(ta.NATR(high, low, close))
    output['TRANGE'] = denoise(ta.TRANGE(high, low, close))
    output['AVGPRICE'] = denoise(ta.AVGPRICE(open_, high, low, close))
    output['MEDPRICE'] = denoise(ta.MEDPRICE(high, low))
    output['TYPPRICE'] = denoise(ta.TYPPRICE(high, low, close))
    output['WCLPRICE'] = denoise(ta.WCLPRICE(high, low, close))
    output['HT_DCPERIOD'] = denoise(ta.HT_DCPERIOD(close))
    output['HT_DCPHASE'] = denoise(ta.HT_DCPHASE(close))
    inphase, quadrature= ta.HT_PHASOR(close)
    output['HT_PHASOR_inphase'] = denoise(inphase)
    output['HT_PHASOR_quadrature'] = denoise(quadrature)
    sine, leadsine = ta.HT_SINE(close)
    output['HT_SINE'] = denoise(sine)
    output['HT_TRENDMODE'] = denoise(ta.HT_TRENDMODE(close))
    output.iloc[:, 3: ] = (output.iloc[:, 3: ]).apply(lambda x: mad(x)) #去极值
    return output

In [192]:
# 获得上证50成分股名单
corp_list = list(sz50['code'].unique())

# 创建list用于保存结果
VnP_list = []

# 循环计算每个个股的量价指标
for name in corp_list:
    samp = sz50.query(f"code == '{name}'")
    samp_output = VnP_indicators(samp)
    VnP_list.append(samp_output)

# 拼接
VnP_tb = pd.concat(VnP_list)

In [194]:
VnP_tb

Unnamed: 0,code,name,date,AD,ADOSC,OBV,ATR,NATR,TRANGE,AVGPRICE,MEDPRICE,TYPPRICE,WCLPRICE,HT_DCPERIOD,HT_DCPHASE,HT_PHASOR_inphase,HT_PHASOR_quadrature,HT_SINE,HT_TRENDMODE
0,600010.SH,包钢股份,2012/1/10,2.864893e+08,,1.067824e+09,,,,3.539107,3.545006,3.537866,3.544964,,,,,,4.656613e-10
1,600010.SH,包钢股份,2012/1/11,-6.085505e+08,,2.605608e+07,,,,3.539107,3.545006,3.537866,3.544964,,,,,,4.656613e-10
2,600010.SH,包钢股份,2012/1/12,-1.915184e+09,,,,,,3.539107,3.545006,3.537866,3.544964,,,,,,4.656613e-10
3,600010.SH,包钢股份,2012/1/13,-2.370972e+09,,,,,,3.539107,3.545006,3.537866,3.544964,,,,,,4.656613e-10
4,600010.SH,包钢股份,2012/1/16,-2.172240e+09,,,,,,3.539107,3.545006,3.537866,3.544964,,,,,,4.656613e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,688599.SH,天合光能,2022/9/5,-5.837790e+08,-1.583330e+07,-1.564760e+09,0.496243,3.297298,0.66,15.457225,15.475298,15.477517,15.478627,14.554066,19.232712,-0.195925,-0.043696,0.329406,4.656613e-10
620,688599.SH,天合光能,2022/9/6,-5.837790e+08,-1.780207e+07,-1.564760e+09,0.536512,3.664697,1.06,15.434736,15.453703,15.457089,15.458782,15.647031,3.127502,-0.133655,0.158605,0.054558,4.656613e-10
621,688599.SH,天合光能,2022/9/7,-5.837790e+08,-1.414008e+07,-1.564760e+09,0.570332,3.670092,1.01,15.412868,15.432652,15.437092,15.439312,16.968898,0.374034,-0.010870,0.066577,0.006528,4.656613e-10
622,688599.SH,天合光能,2022/9/8,-5.837790e+08,-1.318813e+07,-1.564760e+09,0.568880,3.769914,0.55,15.388680,15.409571,15.415485,15.418442,18.503087,1.889353,-0.041134,-0.234776,0.032969,4.656613e-10


In [196]:
# 在行情数据基础上拼接动量、模式学习、量价指标
result = (sz50.loc[:, ['code', 'name', 'date', 'close', 'close', 'open', 'h_price', 'l_price']]).merge(MOM_tb, on = ['code', 'name', 'date'])
result = result.merge(OS_tb, on = ['code', 'name', 'date'])
result = result.merge(VnP_tb, on = ['code', 'name', 'date'])

In [212]:
result.to_csv('/Users/wujiewang/Desktop/毕设实证/数据集/sz50/dwt_factors.csv')

In [106]:
result[result['name'] == '中国石化']

Unnamed: 0,code,name,date,close,close.1,open,h_price,l_price,AD,ADOSC,...,FAMA,MIDPOINT,MIDPRICE,SAR,SAREXT,SMA,T3,TEMA,TRIMA,WMA
2673,600028.SH,中国石化,2015/4/28,5.28,5.28,5.30,5.65,5.16,-1.338090e+09,,...,,,,,,,,,,
2674,600028.SH,中国石化,2015/5/5,4.83,4.83,5.00,5.22,4.81,-2.247376e+09,,...,,,,,,,,,,
2675,600028.SH,中国石化,2015/4/27,5.22,5.22,4.91,5.22,4.84,-1.116735e+09,,...,,,,,,,,,,
2676,600028.SH,中国石化,2015/4/29,5.06,5.06,5.18,5.18,4.95,-1.223645e+09,,...,,,,,,,,,,
2677,600028.SH,中国石化,2015/4/30,4.95,4.95,5.05,5.09,4.92,-1.710647e+09,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5341,600028.SH,中国石化,2020/9/28,3.22,3.22,3.22,3.23,3.21,4.219148e+09,-6.268789e+07,...,3.253739,3.255,3.260,5.65,-5.65,3.252333,3.248454,3.231332,3.251458,3.247398
5342,600028.SH,中国石化,2020/10/29,3.28,3.28,3.25,3.28,3.25,4.219148e+09,-3.140581e+07,...,3.253518,3.255,3.260,5.65,-5.65,3.253000,3.246972,3.238724,3.250792,3.249183
5343,600028.SH,中国石化,2020/10/27,3.27,3.27,3.28,3.28,3.27,4.219148e+09,-2.857587e+07,...,3.254499,3.255,3.255,5.65,-5.65,3.253000,3.247209,3.243473,3.250333,3.250280
5344,600028.SH,中国石化,2020/9/25,3.22,3.22,3.23,3.23,3.21,4.219148e+09,-2.482037e+07,...,3.254526,3.255,3.255,5.65,-5.65,3.252333,3.246885,3.238635,3.249792,3.248151
