In [1]:
import os
import sys
import pandas as pd
import numpy as np
import joblib
from pathlib import Path

from pprint import pprint


from sklearn import metrics
from tensorflow import keras
from keras.models import Sequential, load_model
from keras.layers import Bidirectional, Dropout, Activation, Dense, LSTM
from keras.regularizers import l1, l2, l1_l2
from keras.callbacks import EarlyStopping,ModelCheckpoint

from scikeras.wrappers import KerasClassifier as scikeras_KerasClassifier

sys.path.insert(0, os.path.abspath('../dataset_mngr'))

import split_merge as sm
import balance_light as balance
import model_mngr as modmgr

print('done')

2024-05-24 07:54:55.674914: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-24 07:54:56.169480: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


done


In [4]:
PATH_DATA = "/Data"
PATH_DATA_DTS=PATH_DATA+"/DTS_FULL/"
PATH_MODELS= "/usr/local/models/"

SUFFIX_TRAIN="_TRAIN.zip"
SUFFIX_VAL="_VAL.zip"
SUFFIX_CONF="_CONF.zip"

print('done')

done


Update and save the scaler if needed

In [5]:
dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_20d"

df_class=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_TRAIN,sep=",",index_col=["OPEN_DATETIME"],parse_dates=["OPEN_DATETIME"])
df_class=df_class.dropna(subset=[label])
df_class=df_class.sort_index()

df_norm,norm_scaler= balance.normalize_df(df_in=df_class,str_label=label,tuple_ft_range=(-1,1))

file_name=dts_name+"_train_colab_lstm_norm_2405"
scaler_name=file_name+"_scaler.save"
joblib.dump(norm_scaler,filename=PATH_MODELS+scaler_name)
print('done')

done


Load train et val df, normalize,  undersample  and preparation for LSTM

In [6]:
dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_20d"
file_name=dts_name+"_train_colab_lstm_norm_2405"
scaler_name=file_name+"_scaler.save"
scaler=joblib.load(PATH_MODELS+scaler_name)


df_class=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_TRAIN,sep=",",index_col=["OPEN_DATETIME"],parse_dates=["OPEN_DATETIME"])
df_class=df_class.dropna(subset=[label])
df_class=df_class.loc['1995-01-01':] # drop rows < 1995-01-01
df_class=df_class.sort_index()
df_class_val=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_VAL,sep=",",index_col=["OPEN_DATETIME"],parse_dates=["OPEN_DATETIME"])
df_class_val=df_class_val.dropna(subset=[label])
df_class_val=df_class_val.sort_index()

# normalize df_class and df_class_val
df_class_train_norm=balance.normalize_df_scaler(df_in=df_class, str_label=label,scaler=scaler)
df_class_val_norm=balance.normalize_df_scaler(df_in=df_class_val, str_label=label,scaler=scaler)

list_feat = df_class.columns.values.tolist()
list_feat.remove(label)

nb_val=20000 #211000
df_class_train_norm=balance.class_custom_undersampler(df_class_train_norm,label,nb_val)
df_x_train, col_y_train = sm.split_df_x_y(
    df_in=df_class_train_norm, list_features=list_feat, str_label=label, drop_na=True)
# print(col_y_train.value_counts().sort_index())

nb_val=5000#53000
df_class_val_norm=balance.class_custom_undersampler(df_class_val_norm,label,nb_val)
df_x_val, col_y_val = sm.split_df_x_y(
    df_in=df_class_val_norm, list_features=list_feat, str_label=label, drop_na=True)

sequence_length = 10

x_train=df_x_train.values
y_train=col_y_train.values
x_val=df_x_val.values
y_val=col_y_val.values
x_train_lstm,y_train_lstm=sm.prepare_sequences(x_train,y_train,sequence_length)
x_val_lstm,y_val_lstm=sm.prepare_sequences(x_val,y_val,sequence_length)

print('done')

done


In [7]:
list_param_valid = [
                    {'fit__batch_size': 256, 'model__dropout': 0.05, 'model__layers': [64, 10], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.9},
                    # {'fit__batch_size': 32, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.7},
                    # {'fit__batch_size': 32, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.5},
                    # {'fit__batch_size': 64, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.9},
                    # {'fit__batch_size': 64, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.7},
                    # {'fit__batch_size': 64, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.5},
]

input_dim = x_train.shape[-1]
num_classes = 4
epochs = 3#350
suffix="lstm_v1"
filename_tmp_model =PATH_MODELS+dts_name+"_"+suffix+".keras"
patience = 3

val_accuracy=0.0
obj_acc=0.4
cpt_param=0
try_limit=3
pct_check_class=0.7 # check if at least n% of the validation set per class

len_val=len(x_val_lstm)
check_class_limit=(len_val/num_classes)*pct_check_class
check_class=False # check if at least obj_acc accuracy per class

while(cpt_param<len(list_param_valid) and check_class==False):
    param_valid=list_param_valid[cpt_param]
    print(param_valid)
    cpt=0

    while(cpt<try_limit and check_class==False):
        cpt+=1
        es = EarlyStopping(monitor="val_loss", patience=patience, mode="min", verbose=2)
        mc = ModelCheckpoint(filename_tmp_model, monitor="val_loss",
                            mode="min", save_freq="epoch", save_best_only=True)
        lstm_model = scikeras_KerasClassifier(model=modmgr.create_scikeras_lstm_model, optimizer="adam",optimizer__momentum=param_valid['optimizer__momentum'],
                                            optimizer__lr=param_valid['optimizer__lr'], model__layers=param_valid['model__layers'], model__dropout=param_valid['model__dropout'],
                                                callbacks=[es, mc], verbose=1)

        history = lstm_model.fit(
            x_train_lstm, y_train_lstm, batch_size=param_valid['fit__batch_size'], epochs=epochs, validation_data=(x_val_lstm, y_val_lstm))

        train_loss = history.history_['loss']
        val_loss = history.history_['val_loss']

        # Plot loss
        # epochs_done = range(1, len(train_loss) + 1)
        # plt.plot(epochs_done, train_loss, 'bo-', label='Training Loss')
        # plt.plot(epochs_done, val_loss, 'ro-', label='Validation Loss')
        # plt.legend()
        # plt.show()

        saved_model = load_model(filename_tmp_model)
        # saved_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['loss'])
        # loss, accuracy = saved_model.evaluate(x_valid, y_valid)

        # Prediction on validation
        y_pred = saved_model.predict(x_val_lstm)
        y_pred_classes = np.argmax(y_pred, axis=1)

        # Accuracy on validation
        val_accuracy = metrics.accuracy_score(y_val_lstm.argmax(axis=1), y_pred_classes)
        print(f"Accuracy on Validation Set: {val_accuracy} {cpt=}")

        # check prediction au moins 30 par classe
        if val_accuracy>=obj_acc:
            check_class=True
            for i in range(num_classes):
                nb_lab=sum(y_pred_classes == i)
                if nb_lab<check_class_limit  :
                    check_class=False
                    print(f"Check class {i=} {nb_lab=} {check_class=} {check_class_limit=}")

    if cpt>=try_limit :
        cpt_param+=1
        print(f"Optim fail {cpt=} param suivant {cpt_param=}")

if cpt>=try_limit :
    print(f"Optim fail {cpt=}")

else :
    confusion = metrics.confusion_matrix(y_val_lstm.argmax(axis=1), y_pred_classes)
    print(confusion)

    for i in range(num_classes):
        print(f"Categ {i}: real {sum(y_val_lstm.argmax(axis=1) == i)} predict {sum(y_pred_classes == i)}")

    #check saved model
    saved_model = load_model(filename_tmp_model)
    y_pred = saved_model.predict(x_val_lstm)
    confusion = metrics.confusion_matrix(y_val_lstm.argmax(axis=1), y_pred.argmax(axis=1))
    print(confusion)

{'fit__batch_size': 256, 'model__dropout': 0.05, 'model__layers': [64, 10], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.9}
layers=[64, 10] meta={'classes_': array([0, 1, 2, 3]), 'target_type_': 'multilabel-indicator', 'y_dtype_': dtype('bool'), 'y_ndim_': 2, 'X_dtype_': dtype('float64'), 'X_shape_': (79991, 10, 27), 'n_features_in_': 10, 'target_encoder_': ClassifierLabelEncoder(), 'n_classes_': 4, 'n_outputs_': 1, 'n_outputs_expected_': 1, 'feature_encoder_': FunctionTransformer()}


  super().__init__(**kwargs)


Epoch 1/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 56ms/step - accuracy: 0.2763 - loss: 1.3819 - val_accuracy: 0.2656 - val_loss: 1.3833
Epoch 2/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 51ms/step - accuracy: 0.2850 - loss: 1.3771 - val_accuracy: 0.2805 - val_loss: 1.3753
Epoch 3/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 40ms/step - accuracy: 0.2977 - loss: 1.3697 - val_accuracy: 0.2888 - val_loss: 1.3677
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step
Accuracy on Validation Set: 0.28877995097794007 cpt=1
layers=[64, 10] meta={'classes_': array([0, 1, 2, 3]), 'target_type_': 'multilabel-indicator', 'y_dtype_': dtype('bool'), 'y_ndim_': 2, 'X_dtype_': dtype('float64'), 'X_shape_': (79991, 10, 27), 'n_features_in_': 10, 'target_encoder_': ClassifierLabelEncoder(), 'n_classes_': 4, 'n_outputs_': 1, 'n_outputs_expected_': 1, 'feature_encoder_': FunctionTransformer()}
Epoch 1/3


  super().__init__(**kwargs)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 39ms/step - accuracy: 0.2729 - loss: 1.3837 - val_accuracy: 0.2754 - val_loss: 1.3817
Epoch 2/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 32ms/step - accuracy: 0.2855 - loss: 1.3771 - val_accuracy: 0.2996 - val_loss: 1.3696
Epoch 3/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 57ms/step - accuracy: 0.3013 - loss: 1.3675 - val_accuracy: 0.3150 - val_loss: 1.3572
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step
Accuracy on Validation Set: 0.3149917462858286 cpt=2
layers=[64, 10] meta={'classes_': array([0, 1, 2, 3]), 'target_type_': 'multilabel-indicator', 'y_dtype_': dtype('bool'), 'y_ndim_': 2, 'X_dtype_': dtype('float64'), 'X_shape_': (79991, 10, 27), 'n_features_in_': 10, 'target_encoder_': ClassifierLabelEncoder(), 'n_classes_': 4, 'n_outputs_': 1, 'n_outputs_expected_': 1, 'feature_encoder_': FunctionTransformer()}


  super().__init__(**kwargs)


Epoch 1/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 41ms/step - accuracy: 0.2734 - loss: 1.3829 - val_accuracy: 0.2791 - val_loss: 1.3811
Epoch 2/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 37ms/step - accuracy: 0.2914 - loss: 1.3753 - val_accuracy: 0.2914 - val_loss: 1.3681
Epoch 3/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 35ms/step - accuracy: 0.3000 - loss: 1.3673 - val_accuracy: 0.3011 - val_loss: 1.3604
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step
Accuracy on Validation Set: 0.3010854884698114 cpt=3
Optim fail cpt=3 param suivant cpt_param=1
Optim fail cpt=3


In [None]:
input_dim = x_train.shape[-1]
window_size = sequence_length
dropout = 0.2
num_classes = 4

# cat_y_train = keras.utils.to_categorical(col_y_train, num_classes)
# cat_y_valid = keras.utils.to_categorical(col_y_valid, num_classes)

# df_x_train_exp = np.expand_dims(df_x_train, axis=2)
# df_x_valid_exp = np.expand_dims(df_x_valid, axis=2)


model_LSTM = Sequential()
model_LSTM.add(LSTM(units=20, return_sequences=False,#True
               input_shape=(window_size, input_dim)))
#,kernel_regularizer=l2(0.1), recurrent_regularizer=l2(0.1), bias_regularizer=l2(0.1)
model_LSTM.add(Dropout(rate=dropout))   
# model_LSTM.add(Dropout(rate=dropout))
# model_LSTM.add(Bidirectional(LSTM((window_size * 2), return_sequences=True)))
# model_LSTM.add(Dropout(rate=dropout))
# model_LSTM.add(Bidirectional(LSTM(window_size, return_sequences=False)))
model_LSTM.add(Dense(units=num_classes, activation='softmax'))

model_LSTM.compile(loss='categorical_crossentropy',
                   optimizer='adam', metrics=['accuracy'])

history = model_LSTM.fit(x_train_lstm, y_train_lstm, batch_size=1024,
                         shuffle=False, epochs=20, validation_data=(x_val_lstm, y_val_lstm))#,verbose=0

train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# Plot loss
epochs = range(1, len(train_accuracy) + 1)
plt.plot(epochs, train_accuracy, 'bo-', label='Training accuracy')
plt.plot(epochs, val_accuracy, 'ro-', label='Validation accuracy')
plt.legend()
plt.show()


In [53]:
# print if keras can use the gpu to train the model
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())



[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12302096189872760406
xla_global_id: -1
]
