In [1]:
# !rm models -r
# !rm models_2 -r
# Create the output directories (relative to the current working directory).
# -p keeps the cell idempotent: re-running it no longer errors when they already exist.
!mkdir -p models
!mkdir -p models_2

In [2]:
import pandas as pd
from pandas import DataFrame
import datetime
from sklearn.preprocessing import StandardScaler # pip3 install --upgrade --force-reinstall scikit-learn --target . -i https://pypi.mirrors.ustc.edu.cn/simple
from collections import deque
import numpy as np

from tensorflow.keras.models import Sequential #pip3 install --upgrade --force-reinstall keras --target . -i https://pypi.mirrors.ustc.edu.cn/simple
from tensorflow.keras.models import load_model #pip3 install --upgrade --force-reinstall keras --target . -i https://pypi.mirrors.ustc.edu.cn/simple
from tensorflow.keras.layers import LSTM,Dense,Dropout
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,Callback,CSVLogger,ReduceLROnPlateau
from sklearn.model_selection import train_test_split
# from keras.utils import multi_gpu_utils
import os
from io import StringIO
import gzip
import shutil
import matplotlib.pyplot as plt
import math
import time
from shutil import copyfile
# copy our file into the working directory (make sure it has .py suffix)
copyfile(src = "/kaggle/input/stocks-code/stocks.py", dst = "../working/stocks.py")
 
# import the names this notebook needs from the copied module
from stocks import stocks_all
from stocks import bankuai
# Make GPUs 0 and 1 visible to CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = '0,1'
plt.style.use('fivethirtyeight')

# NOTE(review): TensorFlow is already imported above; these environment variables are
# usually expected to be set before the import to take effect - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # silence TensorFlow warnings

# Timestamp tag (YYYYMMDDHHMMSS) used to name this run's log files.
# The previous format '%Y%m%d%h%m%s' had the wrong directive case: %h is the abbreviated
# month name, %m is the month number and %s is a non-portable epoch-seconds extension,
# which produced names such as '20230301Mar031677694666'.
model_saved_log_char = datetime.datetime.now().strftime('%Y%m%d%H%M%S')


In [3]:
# Date range of the data
start = datetime.datetime(year=2000, month=1, day=1)
end = datetime.date.today()

# Early-stopping parameters
EarlyStopping_monitor = 'val_loss'  # quantity that is monitored
EarlyStopping_patience = 10  # number of epochs without change before stopping

# Hyper-parameter grid
_mem_days = [1, 3, 5]  # sliding-window lengths: how many days of data feed one prediction
_lstm_layers = [1, 5]  # LSTM layer counts
_dense_layers = [1, 5]  # dense layer counts
# units is the size of the hidden state inside each LSTM cell.
# An LSTM cell has 3 gates and 4 activation functions (3 sigmoid, 1 tanh), i.e. four
# feed-forward layers of `units` neurons each.
_units = [32, 64]

# # quick test grid
# _mem_days=[3]
# _lstm_layers,_dense_layers=[1],[1]
# _units= [32]

optimizer = 'adam'  # optimizer
loss = 'mse'  # loss function
metrics = ['mape']  # evaluation metric
# Number of samples drawn from the training set per step. batch_size=1 is online learning
# (plain SGD); small data sets can use the full set. GPUs tend to perform better with
# power-of-two batches (16, 32, 64, 128, ...) than with multiples of 10 or 100.
batch_size = 32
epochs = 50  # one epoch is one full forward/backward pass over the data set

model_verbose = 0

In [4]:
# Output locations
# path = '/kaggle/input/stocks-data-20221216/'
log_file_name = '/kaggle/working/models'       # directory holding one '<code>.csv' epoch log per stock
model_saved_file='/kaggle/working/models_2'    # directory the saved models are written to
BASE_PATH = '/kaggle/input/all-stocks/all_stocks'

model_saved_log = f'/kaggle/working/models_2/{ model_saved_log_char}_models.csv'

# Make sure both output directories exist regardless of the current working directory.
os.makedirs(log_file_name, exist_ok=True)
os.makedirs(model_saved_file, exist_ok=True)

# Start this run's model index with its header row:
# code,loss,mape,val_loss,val_mape,modelname
model_log = 'code,loss,mape,val_loss,val_mape,modelname\n'
# `with` closes the handle even if the write fails.
with open(model_saved_log, 'a') as log_csv_file:
    log_csv_file.write(model_log)

In [5]:
exception_file_full_name = f'/kaggle/working/models_2/{ model_saved_log_char}_exception.txt'

# Create the exception log for this run and write a banner line carrying the run date.
exception_log = f'---------------Exception:{str(end)}------------------\n'
# `with` closes the handle even if the write fails.
with open(exception_file_full_name, 'a') as exception_file:
    exception_file.write(exception_log)

In [6]:
# Model callback class
class CustomCallback(Callback):
    """Keras callback that logs per-epoch metrics and optionally saves the model.

    At the end of every epoch it can:
      * save the model as an .h5 file under ``path`` and append one row to the run-wide
        model index (the module-level ``model_saved_log`` CSV) when ``saveModelFile`` is set;
      * append one row of metrics to the per-stock CSV ``csv_file_name`` when
        ``saveModelLog`` is set and the training loss is not NaN.

    Parameters
    ----------
    path : directory the model files are written to.
    csv_file_name : per-stock epoch log; created with a header row if missing.
    code : stock code, stored in both logs and in the model file name.
    the_mem_days, the_lstm_layers, the_dense_layers, the_units :
        hyper-parameters of the model being trained; recorded, not interpreted.
    saveModelFile : save the model after every epoch.
    saveModelLog : append the epoch metrics to ``csv_file_name``.
    """

    # Class-level defaults; each one is overwritten per instance in __init__.
    code = ''
    the_mem_days=0
    the_lstm_layers=0
    the_dense_layers=0
    the_units = 0
    csv_file_name = ''
    model_path = ''
    saveModelFile = False
    saveModelLog = True

    # Columns: epoch,loss,mape,val_loss,val_mape,code,the_mem_days,the_lstm_layers,the_dense_layers,the_units
    csv_file = DataFrame()

    def __init__(self,path,csv_file_name,code,the_mem_days,the_lstm_layers,the_dense_layers,the_units,
                 saveModelFile=False,saveModelLog=True):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.model_path = path
        self.csv_file_name = csv_file_name
        self.code = code
        self.the_mem_days = the_mem_days
        self.the_lstm_layers = the_lstm_layers
        self.the_dense_layers = the_dense_layers
        self.the_units = the_units
        self.saveModelFile = saveModelFile
        self.saveModelLog=saveModelLog

        # Create the per-stock log with its header row on first use.
        if not os.path.exists(csv_file_name):
            with open(csv_file_name, 'a') as _temp_file:
                _temp_file.write('epoch,loss,mape,val_loss,val_mape,code,the_mem_days,the_lstm_layers,the_dense_layers,the_units\n')

        # Read codes back as strings so that leading zeros (e.g. '002326') survive the
        # read -> append -> rewrite cycle performed for every new callback instance.
        self.csv_file = pd.read_csv(csv_file_name, lineterminator='\n', header=0, dtype={'code': str})

    def on_epoch_end(self, epoch, logs=None):
        """Save the model and/or append the epoch metrics, depending on the flags.

        ``logs`` must contain 'loss', 'mape', 'val_loss' and 'val_mape'; a missing key
        raises KeyError.
        """
        if logs is None:
            logs = {}

        if self.saveModelFile:
            loss = logs['loss']
            mape = logs['mape']
            val_loss = logs['val_loss']
            val_mape = logs['val_mape']
            filepath =  f'{self.model_path}/{loss:.2f}_{self.code}_{epoch:02}_mem_{self.the_mem_days}_ltsm_{self.the_lstm_layers}_dense_{self.the_dense_layers}_unit_{self.the_units}.h5'

            # Save first, index second: the index never points at a file that failed to save.
            self.model.save(filepath,save_format='h5')

            # Index columns: code,loss,mape,val_loss,val_mape,modelname
            # The code is prefixed with 'c' so that it is parsed as a string when read back.
            model_log = f'c{self.code},{loss:.2f},{mape:.2f},{val_loss:.2f},{val_mape:.2f},{filepath}\n'
            with open(model_saved_log, 'a+') as log_csv_file:
                log_csv_file.write(model_log)

        # Epochs whose loss is NaN are not recorded.
        if self.saveModelLog and not math.isnan(logs['loss']):
            row = {
                'epoch':epoch,
                'loss' : float(round(logs['loss'],2) ),
                'mape':round(logs['mape'],2)  ,
                'val_loss': round(logs['val_loss'],2) ,
                'val_mape': round(logs['val_mape'],2)  ,

                'code': self.code,
                'the_mem_days': self.the_mem_days,
                'the_lstm_layers': self.the_lstm_layers,
                'the_dense_layers': self.the_dense_layers,
                'the_units': self.the_units
            }

            # Append at the next positional label, then rewrite the whole file.
            row_index = len(self.csv_file)
            self.csv_file.loc[row_index] = row

            self.csv_file.to_csv(self.csv_file_name,index=False)

    def test(self):
        """Print 'suc'; a trivial smoke check."""
        print('suc')

In [7]:
def open_dataframe(path):
    """Load a gzip-compressed, GBK-encoded CSV file into a DataFrame.

    Parameters
    ----------
    path : path of the compressed file.

    Returns
    -------
    DataFrame indexed by the "日期" column, with "CODE" read as string.

    Raises
    ------
    OSError if the file cannot be opened; gzip/Unicode/pandas errors if the content is
    not a gzip stream of GBK-encoded CSV text.
    """
    # `with` closes the handle even when decompression or parsing fails later on.
    with open(path, 'rb') as os_file:
        file_stream = os_file.read()

    message = gzip.decompress(file_stream).decode('GBK')

    return pd.read_csv(StringIO(message), lineterminator='\n',
                       header=0,dtype={"CODE": str},index_col="日期")

In [8]:
def set_nane_zero(x):
    """Return 0 for the literal string 'None'; return any other value unchanged."""
    return 0 if x == 'None' else x
# Open the data file and clean it (stage 1)
def lstm_cleanm_data(file_path,klt):
    """Load one stock file and remove rows with missing or zero values.

    Parameters
    ----------
    file_path : path passed to ``open_dataframe``.
    klt : data-type selector; when it equals 101 only the first nine columns that
        remain after dropping 'CODE' are kept.
        # presumably these are 开盘, 收盘, 最高, 最低, 成交量, 成交额, 振幅, 涨跌额, 换手率 - verify against the data files

    Returns
    -------
    DataFrame sorted by index in ascending order, without the 'CODE' column, in which
    no row has a missing value, the string 'None' or 0 in any of the nine price/volume columns.

    Raises
    ------
    KeyError if 'CODE' or one of the nine columns is absent.
    """
    data = open_dataframe(file_path)

    data = data.drop(['CODE'], axis=1)
#     data = data.drop(['NAME'], axis=1)
    data = data.sort_index(ascending=True)

    # Select columns according to the data type.
    if klt==101:
        # .copy() makes the selection an independent frame before columns are reassigned below.
        data = data[data.columns[0:9]].copy()

    # Normalise missing markers: NaN and the string 'None' both become 0.
    # Assigning the result back (instead of a chained inplace fillna) is what actually
    # updates `data` under pandas Copy-on-Write.
    for col in data.columns:
        data[col] = data[col].fillna(0).map(set_nane_zero)

    # Drop every row that holds a 0 - i.e. an originally missing or zero value - in one of
    # the required columns. Rows are removed by index label, one column at a time.
    for col in ['开盘', '收盘', '最高', '最低', '成交量', '成交额', '振幅', '涨跌额', '换手率']:
        null_idx = data.loc[data[col] == 0].index
        data = data.drop(null_idx)

    return data

In [9]:
# Data processing
def stock_price_ltsm_data_processing(f,predays,isPredict:bool = False):
    """Turn a cleaned price frame into sliding-window samples for the LSTM.

    Parameters
    ----------
    f : DataFrame with a '收盘' column. Side effect kept from the original: a 'labels'
        column (收盘 shifted up by ``predays`` rows) is added to ``f`` as its last column;
        it is excluded from the features again and not used otherwise.
    predays : window length in rows; converted with ``int``.
    isPredict : when true, one extra window covering the last ``predays`` rows is
        appended, with NaN as its label placeholder.

    Returns
    -------
    (x, y) : ``x`` has shape (n_windows, predays, n_features) and holds standardised
        features; ``y[k]`` is the unscaled value of column 0 in the row that follows
        window ``k``.
        # NOTE(review): the label is column 0 of the frame, not '收盘', although the
        # 'labels' column is derived from '收盘' - confirm which target is intended.
    """
    predays = int(predays)
    f['labels'] = f['收盘'].shift(-predays)

    # Every column except the freshly added 'labels'; .values converts to an ndarray.
    x_data = f.iloc[:, :-1].values

    # Standardise each feature column.
    scaler = StandardScaler()
    sca_x = scaler.fit_transform(x_data)

    n_rows = len(sca_x)
    stop = n_rows + 1 if isPredict else n_rows

    # Window i covers rows [i - predays, i); its label is taken from row i.
    x = []
    y = []
    for i in range(predays, stop):
        x.append(sca_x[i - predays:i, :])
        y.append(x_data[i, 0] if i < n_rows else np.nan)

    if not x:
        # Too few rows for a single window: return correctly shaped empty arrays instead of
        # failing on the 1-D array that np.array([]) would produce.
        return np.empty((0, predays, sca_x.shape[1])), np.empty((0,))

    return np.array(x), np.array(y)

In [10]:
# Reference point and budget for the timeout check in lstm_model_fit.
_time_start = time.time()
# Budget in hours: lstm_model_fit compares the elapsed seconds with _time_limit*60*60.
_time_limit = 10

In [11]:
import zipfile
 

def file2zip(packagePath, zipPath):
    '''
    Compress every file below a directory into a zip archive.

    Entries are stored under their path relative to ``packagePath`` with forward
    slashes, as zip archives require. An existing archive at ``zipPath`` is replaced,
    and the archive itself is skipped when it lives inside ``packagePath``.

    :param packagePath: directory to compress
    :param zipPath: path of the archive to create
    :return: None
    '''
    if os.path.exists(zipPath):
        os.remove(zipPath)

    zip_abs = os.path.abspath(zipPath)
    # `with` closes the archive (and writes its central directory) even on errors.
    with zipfile.ZipFile(zipPath, 'w', zipfile.ZIP_DEFLATED) as archive:
        for path, dirNames, fileNames in os.walk(packagePath):
            for name in fileNames:
                fullName = os.path.join(path, name)
                # Do not add the archive that is currently being written to itself.
                if os.path.abspath(fullName) == zip_abs:
                    continue
                # relpath yields a clean relative entry name on every platform; the old
                # hard-coded '\\' separator ended up inside the entry names on Linux.
                archive.write(fullName, os.path.relpath(fullName, packagePath))

In [12]:
# Model building
def build_models(f,code,mem_days,lstm_layers,dense_layers,units,saveModelFile ,saveModelLog,thread_count ):
    """Train one model per combination of the given hyper-parameter lists.

    For every window length in ``mem_days`` the data is windowed once and split 80/20
    without shuffling; one network is then trained for every combination of
    ``lstm_layers`` x ``dense_layers`` x ``units``.

    Parameters
    ----------
    f : cleaned price DataFrame; a deep copy is processed for each window length.
    code : stock code, used in log/model file names and progress output.
    mem_days, lstm_layers, dense_layers, units : iterables of values to try.
    saveModelFile, saveModelLog : forwarded to ``CustomCallback``.
    thread_count : label printed in the progress line.

    Returns
    -------
    Number of models trained.

    Training settings (optimizer, loss, metrics, batch_size, epochs, model_verbose and the
    early-stopping parameters) are read from the notebook's module-level configuration.
    """
    from itertools import product

    build_models_times = 0

    for the_mem_days in mem_days:
        new_df = f.copy(deep=True)
        x, y = stock_price_ltsm_data_processing(new_df,the_mem_days,False)
        x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=False, test_size=0.2)

        # Same iteration order as three nested loops: lstm layers, then dense layers, then units.
        for the_lstm_layers, the_dense_layers, the_units in product(lstm_layers, dense_layers, units):
            callback = [
                EarlyStopping(monitor=EarlyStopping_monitor, patience=EarlyStopping_patience),
                # CSVLogger(filename, separator=',', append=True),
                # NOTE(review): this patience equals the configured early-stopping patience
                # of 10, so training may stop before the learning rate is ever reduced - confirm.
                ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=0, mode='auto',
                                  min_delta=0.0001, cooldown=0, min_lr=0),
                CustomCallback(model_saved_file,f'{log_file_name}/{code}.csv',code,the_mem_days,the_lstm_layers,the_dense_layers,the_units,
                               saveModelFile=saveModelFile,saveModelLog=saveModelLog)
            ]

            # Build the network.
            model = Sequential()
            model.add(CuDNNLSTM(the_units,input_shape=x.shape[1:],return_sequences=True)) # first layer
            model.add(Dropout(0.1)) # against over-fitting

            # `the_lstm_layers` additional sequence-returning LSTM layers ...
            for _ in range(the_lstm_layers):
                model.add(CuDNNLSTM(the_units,return_sequences=True))
                model.add(Dropout(0.1))

            # ... followed by a final LSTM layer that returns only its last output.
            model.add(CuDNNLSTM(the_units))
            model.add(Dropout(0.1))

            for _ in range(the_dense_layers):
                model.add(Dense(the_units,activation='relu'))  # fully connected layer
                model.add(Dropout(0.1))

            model.add(Dense(1)) # output layer

            # Use the configured optimizer/loss/metrics instead of duplicated literals.
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

            print(f'thread{thread_count},{code},NO.{build_models_times}:{the_mem_days},{the_lstm_layers},{the_dense_layers},{the_units},{str(datetime.datetime.now())}')
            model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test),verbose=model_verbose,callbacks=callback)
            build_models_times+=1
    return build_models_times

In [13]:
def _prune_saved_models(code):
    """Keep only the lowest-loss saved model(s) of ``code``; delete the others.

    Rows of ``code`` in the run-wide model index whose loss is above the minimum are
    removed from the index, and their model files are deleted when present.
    """
    save_model_csv = pd.read_csv(model_saved_log)
    # Codes are stored with a leading 'c' so that they are parsed as strings, not integers.
    is_code = save_model_csv['code'] == 'c'+code
    min_loss = save_model_csv.loc[is_code, 'loss'].min()
    rows = save_model_csv.loc[is_code & (save_model_csv['loss'] > min_loss)]

    for filename in rows['modelname']:
        if os.path.exists(filename):
            os.remove(filename)

    save_model_csv.drop(rows.index).to_csv(model_saved_log, index=False)


def lstm_model_fit(begin,files,thread_count,klt):
    """Train, log and prune models for every stock code in ``files[begin:]``.

    Parameters
    ----------
    begin : index of the first entry of ``files`` to process.
    files : sequence of stock codes (strings).
    thread_count : label used in the progress output.
    klt : data-type selector; part of the input path and forwarded to ``lstm_cleanm_data``.

    Returns
    -------
    None. Stops early (after printing 'time out') once more than ``_time_limit`` hours
    have passed since ``_time_start``.

    A failure while processing one code is printed, appended to the exception log and
    does not stop the remaining codes.
    """
    for index in range(begin,len(files)):
        # Stop before the time budget is exhausted so the outputs can still be packaged.
        if time.time()-_time_start >_time_limit*60*60 :
            print('time out')
            return

        code = files[index]
        try:
            file_path = f'{BASE_PATH}/{klt}/{code}.gzip'
            if not os.path.exists(file_path):
                print(f'code:{code};not exit')
                continue

            print(f'thread{thread_count},{index},{code}:lstm_model_fit')
            data = lstm_cleanm_data(file_path,klt)

            print(f'fit_model：thread{thread_count},{index},{code}:lstm_cleanm_data_{file_path}')
            fit_model = build_models(data.copy(deep=True),code,_mem_days,_lstm_layers,_dense_layers,_units,True,True,f'{thread_count},{index}')
            print(f'log_df：thread{thread_count},{index},{code}:fit_model_{fit_model}')

            # Remove the models that are not the best one from the index and from disk.
            _prune_saved_models(code)
            print(f'thread{thread_count},{index},{code}:save_model_csv_{model_saved_log}')
        except Exception as reason:
            # Continue with the next code in the same loop rather than recursing, so many
            # failures cannot exhaust the recursion limit.
            print(f'-----------------Exception-----------------')
            print(f'Exception:thread{thread_count},{index}:{str(reason)}')
            # Record the failing code and reason (previously the CSV header was written here).
            with open(exception_file_full_name, 'a') as exception_file:
                exception_file.write(f'\'{code}\':{reason}\n')

    print('-----------------完成循环-----------------')

In [14]:
# _data =lstm_cleanm_data( '/kaggle/input/stocks-data-20221216/301089.gzip')
# ttt = build_models(_data.copy(deep=True),'301089',_mem_days,_lstm_layers,_dense_layers,_units,True,True,0)
# Test settings: these assignments override the hyper-parameter grid configured earlier.
_mem_days=[3] # sliding-window length: how many days of data feed one prediction
_lstm_layers,_dense_layers=[1],[1] # layer counts
_units= [64]

batch_size=10 # samples per training step (overrides the earlier value)
epochs=100 # one epoch is one full forward/backward pass over the data set


model_verbose = 0

# Time budget in hours; the timer is restarted for this run.
_time_limit = 10.2
# print(str(_lstm_layers))
_time_start = time.time()

# Slice of stocks_all processed by this run.
_file_begin = 1860
_file_end = 1861
#stocks_all,bankuai
# files = os.listdir(path)
print(f'文件范围{_file_begin}-{_file_end}')

lstm_model_fit(0,stocks_all[_file_begin:_file_end],0,101)
# threding_lstm(3,0,file[_file_begin:_file_end])

# Report completion time and elapsed seconds.
comp_time = datetime.datetime.now()
_time_end = time.time()
print(f'完成时间{comp_time},用时：{_time_end-_time_start} s')

文件范围1860-1861
thread0,0,002326:lstm_model_fit
fit_model：thread0,0,002326:lstm_cleanm_data_/kaggle/input/all-stocks/all_stocks/101/002326.gzip
thread0,0,002326,NO.0:3,1,1,64,2023-03-01 18:17:51.036505
log_df：thread0,0,002326:fit_model_1
thread0,0,002326:save_model_csv_/kaggle/working/models_2/20230301Mar031677694666_models.csv
-----------------完成循环-----------------
完成时间2023-03-01 18:19:20.244566,用时：93.68566679954529 s


In [15]:
# Compress the working directory into output.zip so the results can be downloaded as one file.
file2zip('/kaggle/working/', '/kaggle/working/output.zip')
# Prints the UTC time, whereas the training cell reports local time.
print(f'打包完成{datetime.datetime.utcnow()}')

打包完成2023-03-01 18:19:20.336732


In [16]:
from IPython.display import FileLink

# Display a link to the archive; the path is relative to the current working directory.
FileLink('output.zip')