In [1]:
import pandas as pd
import numpy as np
import os
import sys
import math
from tqdm import tqdm_notebook
import warnings
# Silence all warnings (originally added to hide format warnings).
# NOTE(review): this also hides deprecation/syntax warnings from the rest of the notebook.
warnings.filterwarnings('ignore')

# Folder holding the daily-trading Excel exports. A raw string keeps the backslashes
# literal; the original 'D:\python\Asta\\' only worked because '\p' and '\A' are not
# recognised escape sequences.
DATA_DIR = r'D:\python\Asta'

# Keep real workbooks only: match on the '.xlsx' extension (not a substring of the name)
# and skip the '~$...' lock files Excel creates while a workbook is open.
# Sorting makes the concatenation order independent of the directory listing order.
excel_names = sorted(
    fname for fname in os.listdir(DATA_DIR)
    if fname.lower().endswith('.xlsx') and not fname.startswith('~$')
)
excels = [pd.read_excel(os.path.join(DATA_DIR, fname)) for fname in tqdm_notebook(excel_names)]

# Every workbook still carries its two descriptive rows (Chinese column label / unit)
# after concatenation, so downstream code has to filter them out.
rawdata = pd.concat(excels).reset_index(drop=True)
rawdata

  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,Stkcd,Trddt,Opnprc,Hiprc,Loprc,Clsprc,Dnshrtrd,Dretwd,Dretnd,ChangeRatio
0,证券代码,交易日期,日开盘价,日最高价,日最低价,日收盘价,日个股交易股数,考虑现金红利再投资的日个股回报率,不考虑现金红利的日个股回报率,涨跌幅
1,没有单位,没有单位,元/股,元/股,元/股,元/股,股,没有单位,没有单位,
2,000001,2018-12-17,10.16,10.33,10.1,10.29,57127487,0.011799,0.011799,0.011799
3,000001,2018-12-18,10.2,10.32,10.1,10.12,53774430,-0.016521,-0.016521,-0.016521
4,000001,2018-12-19,10.14,10.18,9.9,9.94,59800701,-0.017787,-0.017787,-0.017787
...,...,...,...,...,...,...,...,...,...,...
5266015,900957,2023-12-11,0.381,0.387,0.376,0.385,87900,0.005222,0.005222,0.005222
5266016,900957,2023-12-12,0.385,0.393,0.382,0.391,83378,0.015584,0.015584,0.015584
5266017,900957,2023-12-13,0.39,0.39,0.384,0.388,63100,-0.007673,-0.007673,-0.007673
5266018,900957,2023-12-14,0.378,0.397,0.378,0.395,89700,0.018041,0.018041,0.018041


In [2]:
columns = rawdata.columns

# The first column holds the security code, except on the two descriptive rows each
# workbook contributes (Chinese column label and unit text). Drop those rows by value
# instead of deleting "the last two sorted uniques", which removes two real codes
# whenever those junk values are absent (e.g. only one workbook was loaded).
HEADER_LABELS = ['证券代码', '没有单位']
code_column = rawdata.iloc[:, 0].dropna()
Stkcd = np.unique(code_column[~code_column.isin(HEADER_LABELS)])
Stkcd

array(['000001', '000002', '000004', ..., '900955', '900956', '900957'],
      dtype=object)

In [3]:
class DataMining:
    """Compute DTW distances between one target stock and every other stock.

    The per-feature distances form the similarity matrix (``vecdtw``) and the ANN
    regression label (``label``).

    Relies on module-level names defined elsewhere in the notebook: ``rawdata`` (the
    concatenated trading DataFrame), ``Stkcd`` (array of all security codes),
    ``columns`` (``rawdata.columns``) and ``tqdm_notebook``.
    """
    def __init__(self,stkcd,max_warping_window=math.inf):
        """Select the target stock's rows and initialise the result holders.

        stkcd: security code of the target stock.
        max_warping_window: half-width of the DTW warping band (default: unbounded).

        Attributes set here:
          obstk  -- rows of ``rawdata`` for the target stock, re-indexed from 0.
          vecdtw -- DTW distance features; filled by ``DTW`` with shape
                    (number of compared stocks, number of features).
        """
        self.stkcd=stkcd
        self.obstk=rawdata[rawdata['Stkcd']==stkcd].reset_index(drop=True)
        self.vecdtw=[]
        self.max_warping_window = max_warping_window

    def std(self,ts):
        """Min-max scale a time series to [0, 1].

        ts: 1-D sequence of numbers.
        Returns a float array; a constant series maps to all ones and an empty
        series is returned unchanged. NaN values propagate to the whole result.
        """
        ts=np.asarray(ts,dtype=float)
        if ts.size==0:
            return ts
        lo,hi=ts.min(),ts.max()
        if hi!=lo:
            return (ts-lo)/(hi-lo)
        return np.ones(len(ts))

    def _dtw_distance(self,ts1,ts2,d):
        """Return the DTW distance between two scaled series under point distance ``d``."""
        n,m=len(ts1),len(ts2)
        cost = sys.maxsize * np.ones((n,m))

        # Initialise the (0, 0) cell, then the first column and first row.
        cost[0, 0] = d(ts1[0], ts2[0])
        for i in range(1, n):
            cost[i, 0] = cost[i-1, 0] + d(ts1[i], ts2[0])
        for j in range(1, m):
            cost[0, j] = cost[0, j-1] + d(ts1[0], ts2[j])

        # Sliding window: only cells inside the warping band are filled.
        window=self.max_warping_window
        for i in range(1, n):
            for j in range(max(1, i - window), min(m, i + window)):
                choices = cost[i - 1, j - 1], cost[i, j-1], cost[i-1, j]
                cost[i, j] = min(choices) + d(ts1[i], ts2[j])

        return cost[-1, -1]

    def dtw(self,feature,start,l,d=lambda x,y:np.sqrt((x-y)**2)):
        """Compute the DTW distance of one feature between the target and every other stock.

        feature: column name (e.g. the daily opening price 'Opnprc').
        start:   start date, matched against the target stock's 'Trddt' column.
        l:       number of rows (trading days) in the sample window.
        d:       point-wise distance function.

        Returns a list with one distance per compared stock, in ``Stkcd`` order. Stocks
        with no row at the target's start position are skipped; the codes that were
        actually compared are stored in ``self.compared_stkcd`` so result rows can be
        mapped back to securities.

        Raises IndexError if ``start`` is not a trading date of the target stock.

        NOTE(review): other stocks are windowed by row position (the target's start
        offset), not by date, so series with different listing dates or gaps are not
        date-aligned -- confirm this is intended.
        """
        matches=self.obstk.index[(self.obstk['Trddt']==start).to_numpy()]
        if len(matches)==0:
            raise IndexError('start date %r not found for stock %r' % (start, self.stkcd))
        istart=int(matches[0])
        ts1=self.std(self.obstk.loc[istart:istart+l-1,feature])

        # Split the feature column by security code once, instead of re-scanning all
        # of rawdata for every candidate stock.
        series_by_code={
            code: values.to_numpy()
            for code, values in rawdata.groupby('Stkcd', sort=False)[feature]
        }

        vecdtw=[]
        compared=[]
        for stkcd in tqdm_notebook(np.delete(Stkcd,np.where(Stkcd==self.stkcd))):
            values=series_by_code.get(stkcd)
            # Skip stocks that have no row at the target's start position.
            if values is None or len(values)<=istart:
                continue
            ts2=self.std(values[istart:istart+l])
            vecdtw.append(self._dtw_distance(ts1,ts2,d))
            compared.append(stkcd)
        self.compared_stkcd=compared
        return vecdtw

    def Label(self,label,start,l,d=lambda x,y:np.sqrt((x-y)**2)):
        """Store the ANN label: DTW distances of column ``label`` in ``self.label``.

        The caller's ``d`` is forwarded to ``dtw`` (it was previously ignored).
        """
        self.label=np.array(self.dtw(label,start,l,d=d)).T

    def DTW(self,start,l,d=lambda x,y:np.sqrt((x-y)**2)):
        """Fill ``self.vecdtw`` with the DTW matrix between the target and the other stocks.

        Uses the features at positions 2..6 of ``columns``. The result has shape
        (number of compared stocks, number of features). The matrix is rebuilt on every
        call, so calling this twice no longer fails on the already-converted array.
        """
        per_feature=[self.dtw(feature=columns[i],start=start,l=l,d=d) for i in range(2,7)]
        self.vecdtw=np.array(per_feature).T

In [4]:
# Target stock: every DTW distance below is measured from this security to the others.
A=DataMining('601988')
# One-shot alternative to the per-feature DTW computation that follows
# (fills A.vecdtw directly):
# A.DTW(start='2018-12-17',l=250)

In [5]:
# DTW distance from the target stock to every other stock, one list per price/volume
# feature. The list is rebuilt from scratch (instead of appended to in five separate
# copy-pasted cells), so re-running this cell cannot add duplicate feature columns.
DTW_FEATURES = ['Opnprc', 'Hiprc', 'Loprc', 'Clsprc', 'Dnshrtrd']
A.vecdtw = [A.dtw(start='2018-12-17', l=250, feature=feature) for feature in DTW_FEATURES]

  0%|          | 0/5418 [00:00<?, ?it/s]

In [15]:
# Regression target for the ANN: DTW distance of the 'Dretwd' column between the target
# stock and every other stock, over the same start date and window length as the features.
A.Label(label='Dretwd',start='2018-12-17',l=250)

  0%|          | 0/5418 [00:00<?, ?it/s]

In [35]:
# Stack the per-feature DTW lists into a (stocks, features) matrix. Convert only while
# A.vecdtw is still in list form: transposing unconditionally would flip an already
# converted matrix back to (features, stocks) whenever this cell is re-run.
if isinstance(A.vecdtw, list):
    A.vecdtw = np.array(A.vecdtw).T
A.vecdtw

array([[62.98316365, 62.41194588, 61.95442501, 60.54244696, 21.43819194],
       [21.14861476, 19.98081826, 19.73495903, 17.50625799, 20.23442138],
       [23.49593291, 26.44545533, 32.59268161, 24.17913516, 19.20391331],
       ...,
       [46.1066487 , 44.67969667, 48.21198479, 45.91500449, 18.40698974],
       [42.76187107, 43.77029703, 44.13953194, 43.55326009, 20.97952198],
       [21.452159  , 20.55245915, 19.48039216, 20.32181655, 17.10672705]])

In [48]:
# Raw string: the original literal only worked because '\p' and '\s' are not
# recognised escape sequences.
simi = np.array(pd.read_excel(r'D:\python\similarity_to_601988.xlsx').loc[:, '相似度'])
# Column vector with one row per entry in the file; -1 lets NumPy infer the row count
# instead of hard-coding 5140, which breaks as soon as the file changes length.
simi = simi.reshape(-1, 1)

In [49]:
# Inspect the similarity column vector loaded from the Excel file.
print(simi)

[[1.        ]
 [0.9235003 ]
 [0.23260513]
 ...
 [0.02918053]
 [0.37926871]
 [0.17862576]]


In [55]:
# Turn the label vector into an (n, 1) column so it can be row-sliced like the features.
label_values = np.array(A.label)
A.label = label_values.reshape(len(label_values), 1)

In [69]:
# Inspect the label column (DTW distances of the label feature).
print(A.label)

[[16.38823203]
 [21.49306547]
 [23.3484508 ]
 ...
 [24.61511903]
 [17.44398919]
 [12.93938885]]


In [70]:
# Feature matrix = DTW distance columns plus the similarity column; target = label column.
# Both are cut to the number of similarity rows so the shapes agree.
# NOTE(review): this assumes row i of the similarity file refers to the same stock as
# row i of A.vecdtw / A.label -- the file's row order is not visible here; confirm.
n_rows = len(simi)
X = np.concatenate([A.vecdtw[:n_rows], simi], axis=1)
Y = A.label[:n_rows]

In [71]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state=0 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [72]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.datasets import make_regression

In [73]:
from keras.utils import set_random_seed

# Seed Python, NumPy and the Keras backend so weight initialisation and batch shuffling
# (and therefore every number reported below) are reproducible on re-run.
set_random_seed(0)

# NOTE(review): inputs and targets are fed unscaled (a MinMaxScaler variant was tried and
# left disabled); the DTW columns and the similarity column are on different scales.

# Define and fit the model. The input width is taken from the data instead of being
# hard-coded to 6, so adding or removing a feature column does not break the cell.
model = Sequential()
model.add(Dense(8, input_dim=x_train.shape[1], activation='sigmoid'))
model.add(Dense(6, activation='relu'))
model.add(Dense(1, activation='relu'))
model.compile(loss='mse', optimizer='adam')
model.fit(x_train, y_train, epochs=100, verbose=0)

<keras.src.callbacks.History at 0x21b062ed9d0>

In [76]:
# Make predictions on the held-out test split.
y_pred = model.predict(x_test)



In [81]:
# Inspect the predicted label values for the test rows.
print(y_pred)

[[31.309578]
 [29.609488]
 [34.239788]
 ...
 [27.543558]
 [30.088552]
 [27.812237]]


In [83]:
from sklearn.metrics import mean_squared_error 
# Mean squared error between the true and predicted labels on the test split.
print(mean_squared_error(y_test,y_pred))

288.5248622958635


In [85]:
import eli5
from eli5.permutation_importance import get_score_importances
def score(x, y):
    """Return the fitted model's score on (x, y) as negative MSE.

    Permutation importance reports "score decrease after shuffling a column", which
    assumes a higher-is-better score. Returning raw MSE made important features show up
    as negative decreases; negating it gives them positive decreases.
    """
    y_pred = model.predict(x, verbose=0)
    return -mean_squared_error(y, y_pred)

# random_state fixes the column shuffles so the importances are reproducible.
base_score, score_decreases = get_score_importances(score, x_test, y_test, random_state=0)
# Average the per-iteration decreases: one importance value per feature column.
feature_importances = np.mean(score_decreases, axis=0)



In [88]:
# Rescale the importances so they sum to 1 (each entry becomes a share of the total).
importance_total = sum(feature_importances)
feature_importances = np.array([value / importance_total for value in feature_importances])
print(feature_importances)

[0.06278977 0.63958443 0.02955962 0.17541652 0.0656632  0.02698647]


In [100]:
import math
# Find the first row of X with the smallest importance-weighted feature sum.
# NOTE(review): the last column of X is the '相似度' (similarity) value while the others
# are DTW distances; minimising the weighted sum treats a high similarity as a penalty --
# confirm this is intended.
minindex, minsum = 0, math.inf
for row_idx, row in enumerate(X):
    weighted = sum(feature_importances * row)
    if weighted < minsum:
        minindex, minsum = row_idx, weighted
print(minindex)

2801


In [103]:
delStkcd = np.delete(Stkcd, np.where(Stkcd == A.stkcd))

# A.dtw skips every stock that has no row at the target's start position, so the rows
# of A.vecdtw / X correspond to the stocks that were kept, not to all of delStkcd.
# Rebuild that kept list (same rule, same start date as the DTW calls) before mapping
# the winning row index back to a security code.
start_pos = A.obstk[A.obstk['Trddt'] == '2018-12-17'].index[0]
rows_per_stock = rawdata.groupby('Stkcd').size()
keptStkcd = np.array(
    [code for code in delStkcd if rows_per_stock.get(code, 0) > start_pos],
    dtype=object,
)
print(keptStkcd[minindex])

301360
