In [1]:
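# This model is one of the main models for the technology-blockade scenario. It has five parts:
# data acquisition, data preprocessing, model construction, model training and indicator detection.
# In detail:
# - preprocessing uses two algorithms: MinMax normalisation and random shuffling of the data;
# - model construction uses one algorithm: the autoencoder;
# - model training uses two algorithms: the stochastic-gradient-descent family (Adam, SGD, etc.) and back-propagation;
# - to keep the model robust, two further optimisation techniques are added: random parameter initialisation and regularisation.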


import baostock as bs
import pandas as pd
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn import preprocessing
import seaborn as sns

sns.set(color_codes=True)
import matplotlib.pyplot as plt

### 1. Data acquisition ###
# Single place for the CSV cache path: it is written after the download and
# read back immediately afterwards.
CSV_PATH = "D:\\ymkd.csv"

# Log in to the baostock service and echo the response status.
lg = bs.login()
print('login respond error_code:'+lg.error_code)
print('login respond  error_msg:'+lg.error_msg)

# Everything that needs the session runs inside try/finally so that
# bs.logout() is always called, even if the query or the CSV export raises.
try:
    # Fetch historical K-line data for sh.603259 (frequency="5" requests minute bars).
    # Minute-bar fields: date,time,code,open,high,low,close,volume,amount,adjustflag
    rs = bs.query_history_k_data_plus("sh.603259",
        "date,time,code,open,high,low,close,volume,amount,adjustflag",
        start_date='2021-05-01', end_date='2021-12-31',
        frequency="5", adjustflag="3")
    print('query_history_k_data_plus respond error_code:'+rs.error_code)
    print('query_history_k_data_plus respond  error_msg:'+rs.error_msg)

    # Collect the result set row by row. `and` short-circuits, so rs.next()
    # is not called at all once the response reports an error.
    data_list = []
    while rs.error_code == '0' and rs.next():
        data_list.append(rs.get_row_data())
    result = pd.DataFrame(data_list, columns=rs.fields)

    # Export the data set to CSV and show it.
    result.to_csv(CSV_PATH, index=False)
    print(result)
finally:
    # Log out of the service.
    bs.logout()

# Read the data back, keeping only the numeric columns and using the bar
# timestamp (`time`) as the index.
merged_data = pd.read_csv(CSV_PATH, index_col='time',
                          usecols=['time','open','high','low','close','volume','amount','adjustflag'])
merged_data.index = pd.to_datetime(merged_data.index, format='%Y%m%d%H%M%S%f')

#### BIAS (deviation-rate) indicator
def bias(df, N):
    """Add an N-row BIAS column to *df* in place and return its latest value.

    BIAS is the percentage deviation of ``close`` from its rolling mean over
    the last ``N`` rows. ``min_periods=1`` means the first rows use however
    many rows are available instead of producing NaN. The result is rounded
    to two decimals and stored in a column named ``bias_<N>``.

    Args:
        df: DataFrame with a ``close`` column; it is modified in place.
        N: rolling window length in rows.

    Returns:
        The value of the new column in the last row of *df*.
    """
    column = f'bias_{N}'
    closes = df['close']
    moving_avg = closes.rolling(N, min_periods=1).mean()

    def to_two_decimals(value):
        return round(value, 2)

    deviation_pct = (closes - moving_avg) / moving_avg * 100
    df[column] = deviation_pct.map(to_two_decimals)
    return df.iloc[-1][column]
# Add the 6-row BIAS column to the frame; bias() writes it in place, so its
# return value (the latest BIAS) is not needed here.
bias(merged_data,6)
print(merged_data.head())
# merged_data.plot()

# Time windows for the training and test sets.
# NOTE(review): label slicing on a DatetimeIndex includes both endpoints, so a
# bar stamped exactly 2021-06-30 15:00:00 ends up in BOTH sets — confirm
# whether that overlap is intended.
dataset_train = merged_data['20210506093500':'20210630150000']
dataset_test = merged_data['20210630150000':]
dataset_train.plot(figsize=(12, 6))



### 2. Data preprocessing ###
# Algorithm 1: MinMax normalisation. The scaler is fitted on the training
# window only and then reused unchanged for the test window.
scaler = preprocessing.MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(dataset_train),columns=dataset_train.columns,index=dataset_train.index)

# Algorithm 2: random shuffling of the training rows.
# DataFrame.sample returns a NEW frame and leaves the original untouched, so
# the result has to be assigned back; the bare call used previously was a
# no-op. Rows keep their timestamps as index labels, so chronological order
# can be restored with sort_index() wherever it is needed.
X_train = X_train.sample(frac=1)

X_test = pd.DataFrame(scaler.transform(dataset_test),columns=dataset_test.columns,index=dataset_test.index)



### 3. Model construction: build the autoencoder ###
# Algorithm 3: autoencoder
def AutoEncoder_build(model, X_train, act_func):
    """Build and compile a small dense autoencoder.

    The network is a stack of Dense layers with widths
    ``10 -> 2 -> 10 -> n_features``, where ``n_features = X_train.shape[1]``
    is used both as the input width and as the output width.

    Args:
        model: Ignored. A fresh ``tf.keras.Sequential`` is always created and
            returned; the parameter is kept only so existing calls keep
            working.
        X_train: Training data; only ``X_train.shape[1]`` is read.
        act_func: Activation used by the three hidden layers. The output
            layer is created without an activation argument.

    Returns:
        The compiled model (loss ``'mse'``, optimizer ``'adam'``).
    """
    # Seed TensorFlow's global random generator before any layer is created.
    tf.random.set_seed(10)

    n_features = X_train.shape[1]

    # Sequential() is a container that applies its layers one after another.
    model = tf.keras.Sequential()

    # First hidden layer, connected to the input vector X.
    model.add(tf.keras.layers.Dense(10, activation=act_func,
                                    # Algorithm 4: random parameter initialisation
                                    kernel_initializer='glorot_uniform',
                                    # Algorithm 5: regularisation. With a factor of
                                    # 0.0 the L2 penalty contributes nothing; raise
                                    # it to actually regularise.
                                    kernel_regularizer=tf.keras.regularizers.l2(0.0),
                                    input_shape=(n_features,)
                                    )
              )

    # Bottleneck layer.
    model.add(tf.keras.layers.Dense(2, activation=act_func,
                                    kernel_initializer='glorot_uniform'))

    # Mirror of the first hidden layer.
    model.add(tf.keras.layers.Dense(10, activation=act_func,
                                    kernel_initializer='glorot_uniform'))

    # Output layer: one unit per input feature.
    model.add(tf.keras.layers.Dense(n_features,
                                    kernel_initializer='glorot_uniform'))

    model.compile(loss='mse', optimizer='adam')

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    tf.keras.utils.plot_model(model, show_shapes=True)

    return model


# Build the autoencoder with ELU activations. AutoEncoder_build requires a
# `model` argument, so an empty Sequential is passed in directly.
model = AutoEncoder_build(
    model=tf.keras.Sequential(),
    X_train=X_train,
    act_func='elu',
)



### 4. Model training ###
# Algorithm 6: stochastic-gradient-descent family of optimisers (Adam, SGD, ...)
# Algorithm 7: back-propagation
def AutoEncoder_main(model, Epochs, BATCH_SIZE, validation_split):
    """Fit *model* to reconstruct the module-level ``X_train`` from a noisy copy.

    Gaussian noise (mean 0, std 1, scaled by 0.5) is added to ``X_train`` and
    the result is clipped to [0, 1]; the noisy data is the network input and
    the clean data is the target.

    Args:
        model: Model exposing a Keras-style ``fit`` method.
        Epochs: Number of training epochs.
        BATCH_SIZE: Mini-batch size.
        validation_split: Passed straight to ``fit`` as ``validation_split``.

    Returns:
        Whatever ``model.fit`` returns.
    """
    noise_scale = 0.5
    clean_inputs = np.array(X_train)

    # Corrupt the inputs, then clamp them to [0, 1].
    gaussian_noise = np.random.normal(0, 1, X_train.shape)
    noisy_inputs = np.clip(clean_inputs + noise_scale * gaussian_noise, 0., 1.)

    fit_options = {
        'batch_size': BATCH_SIZE,
        'epochs': Epochs,
        'shuffle': True,
        'validation_split': validation_split,
        'verbose': 1,
    }
    return model.fit(noisy_inputs, clean_inputs, **fit_options)


# Figure
def plot_AE_history(history):
    """Plot the per-epoch training loss and, when available, validation loss.

    Args:
        history: Object whose ``history`` attribute is a mapping with a
            ``'loss'`` entry and optionally a ``'val_loss'`` entry (as
            returned by ``AutoEncoder_main``).

    The y-axis is fixed to [0, 0.1]; the figure is shown immediately.
    """
    losses = history.history

    plt.plot(losses['loss'],
             'b',
             label='Training loss')

    # 'val_loss' is absent when training ran without validation data (for
    # example validation_split=0), so only draw that curve when it exists
    # instead of failing with KeyError.
    if 'val_loss' in losses:
        plt.plot(losses['val_loss'],
                 'r',
                 label='Validation loss')

    plt.legend(loc='upper right')
    plt.xlabel('Epochs')
    plt.ylabel('Loss, [mse]')
    plt.ylim([0, .1])
    plt.show()


# Train for 40 epochs with batch size 10, passing validation_split=0.05.
history = AutoEncoder_main(model=model, Epochs=40, BATCH_SIZE=10, validation_split=0.05)

# Uncomment to show the loss curves.
#plot_AE_history(history)



### 5. Indicator detection ###
def _reconstruct(frame):
    """Run *frame* through the trained model, keeping its columns and index."""
    return pd.DataFrame(model.predict(np.array(frame)),
                        columns=frame.columns,
                        index=frame.index)


def _reconstruction_mae(reconstructed, original):
    """Return the per-row mean absolute error between two aligned frames."""
    return np.mean(np.abs(reconstructed - original), axis=1)


# Rows whose reconstruction error exceeds this value are flagged as anomalies.
# (The spelling of the name is kept because it is a module-level variable.)
threshod = 0.15

# --- Training set: how well does the model reconstruct the data it learnt? ---
# The training set is predicted and scored once; the result feeds both the
# error-distribution plot and the combined time-series plot further down.
X_pred_train = _reconstruct(X_train)
scored_train = pd.DataFrame(index=X_train.index)
scored_train['Loss_mae'] = _reconstruction_mae(X_pred_train, X_train)

plt.figure()
# NOTE: sns.distplot is deprecated since seaborn 0.11 (histplot/displot are the
# replacements); it is kept so the seaborn version requirement does not change.
sns.distplot(scored_train['Loss_mae'],
             bins = 10,
             kde= True,
            color = 'blue')
plt.xlim([0.0,.5])
plt.show()

scored_train['Threshold'] = threshod
scored_train['Anomaly'] = scored_train['Loss_mae'] > scored_train['Threshold']

# --- Test set: anomaly diagnosis ---
X_pred = _reconstruct(X_test)
scored = pd.DataFrame(index=X_test.index)
scored['Loss_mae'] = _reconstruction_mae(X_pred, X_test)
scored['Threshold'] = threshod
scored['Anomaly'] = scored['Loss_mae'] > scored['Threshold']
scored.head()

# Combine both score tables and order them by timestamp so the line plot is
# drawn in chronological order regardless of the row order of X_train.
scored = pd.concat([scored_train, scored]).sort_index()

# NOTE(review): three columns are plotted but only two colours are given —
# confirm the intended colour mapping.
scored.plot(logy=True,  figsize = (10,6),xlim = ['2021-12-01','2021-12-31'],ylim = [1e-2,1e1], color = ['blue','red'])

# Vertical marker at 2021-12-15 15:00:00.
plt.vlines(['2021-12-15 15:00:00'],ymin=1e-2,ymax=1e1,label='vlines',color='green')
plt.show()

ModuleNotFoundError: No module named 'baostock'