In [None]:
import os
from datetime import datetime
import pandas as pd
import numpy as np
import joblib
import importlib
import ast
from pathlib import Path
from sqlalchemy.orm import sessionmaker
import matplotlib.pyplot as plt
from pprint import pprint
import seaborn as sns
import gc

from sklearn import metrics

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset,RandomSampler
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

import sqlite_io as sio
import add_indicators as indic
import split_merge as sm
import balance  # wait for new release https://github.com/scikit-learn-contrib/imbalanced-learn/issues/1081
import model_mngr as modmgr

# Re-import the project-local helper modules so that edits made to their source
# files are picked up without restarting the kernel.
# NOTE(review): `indic` (add_indicators) is imported above but is not reloaded here —
# confirm whether that omission is intentional.
importlib.reload(sio)
importlib.reload(modmgr)
importlib.reload(sm)
importlib.reload(balance)

In [5]:
# Root data folder and the two SQLite databases used by this notebook:
# PATH_DB_FWK backs `con_fwk` (also bound to the SQLAlchemy session), PATH_DB_STOCK backs `con_stock`.
# NOTE(review): these are absolute Windows paths; the notebook only runs on a machine with this layout.
PATH_DATA = "C:\\Projets\\Data"
PATH_DB_FWK="C:\\Projets\\Data\\sqlite\\dataset_market.db"
PATH_DB_STOCK="C:\\Projets\\Data\\sqlite\\dataset_paris_stock_adjusted.db"
# Folder where every dataset file, scaler and pickle of this notebook is written / read.
PATH_DATA_DTS=PATH_DATA+"\\DTS_FULL\\"

# File-name suffixes of the train / validation / confirmation datasets.
# The ".zip" extension makes pandas infer zip compression when writing and reading the CSV.
SUFFIX_TRAIN="_TRAIN.zip"
SUFFIX_VAL="_VAL.zip"
SUFFIX_CONF="_CONF.zip"

CONNECTION TO SQLITE

In [6]:
# Close a connection left over from a previous run of this cell before opening a new one,
# so that re-running the cell does not accumulate open connections.
if "con_stock" in locals():
        sio.close_connection(con_stock)
con_stock = sio.get_connection(str_db_path=PATH_DB_STOCK)

if "con_fwk" in locals():
        sio.close_connection(con_fwk)
con_fwk = sio.get_connection(str_db_path=PATH_DB_FWK)

# SQLAlchemy session bound to the framework database connection.
my_session_maker = sessionmaker(bind=con_fwk)
session=my_session_maker()

# Table of the stock database that is passed to sio.get_candles_to_df as target_table.
table_stock="DS_PARIS_1D_ADJ_CLEAN"

GET DATA

In [None]:
# Dataset identifier and symbol group passed to the indicator helpers.
dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"

# Candles for all tradable codes, indexed (at least) by a 'CODE' level.
df_base=sio.get_candles_to_df(session=session,con=con_stock, target_table=table_stock,tradable=True)

# Indicators are computed code by code. The per-code frames are collected in a
# list and concatenated ONCE: calling pd.concat inside the loop re-copies the whole
# accumulated frame at every iteration (quadratic cost) and starts from an empty
# DataFrame, which recent pandas versions warn about.
# sort=False keeps the codes in order of first appearance, as the original loop did.
list_df_by_code = [
    indic.add_indicators_to_df(con=con_fwk, df_in=sub_df, dts_name=dts_name, symbol=multi_symbol)
    for _, sub_df in df_base.groupby(level='CODE', sort=False)
]
# pd.concat raises on an empty list, so fall back to an empty frame when there is no code.
df_work = pd.concat(list_df_by_code) if list_df_by_code else pd.DataFrame()

df_work.sort_index(inplace=True)
df_work.info()

In [None]:
# df_work[10000:10010]
# pd.set_option('display.max_columns', None)
# print(df_work.describe())

# Persist the full indicator dataset (rounded to 5 decimals) as a zip-compressed CSV.
base_csv_path = PATH_DATA_DTS + dts_name + "_BASE.zip"
df_work.round(5).to_csv(base_csv_path, sep=",")

START HERE FOR BASE DATASET (all labels included)

In [7]:
# Dataset identifier (used to build the file name) and symbol group used by the indicator helpers.
dts_name="PARIS_TREND_1D_20D_V2"
# dts_name="PARIS_TREND_1D_50D_V1"
multi_symbol="PARIS_STOCK"

# Reload the base dataset saved above, restoring the (OPEN_DATETIME, CODE) MultiIndex.
base_csv_path = PATH_DATA_DTS + dts_name + "_BASE.zip"
df_work = pd.read_csv(
    base_csv_path,
    sep=",",
    index_col=["OPEN_DATETIME", "CODE"],
    parse_dates=["OPEN_DATETIME"],
)
df_work.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,OPEN,HIGH,LOW,CLOSE,VOLUME,sma20,pos_sma20,sma50,sma200,pos_sma50,...,adx14,adx14_neg,adx14_pos,adx14_dif,avg_vol14,pos_avg_vol14,pos_sma20_200,williamsr_14,perf_sma_50_5d,perf_sma_200_5d
OPEN_DATETIME,CODE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2010-04-26,AB.PA,12.98,12.98,12.2,12.68,62866.0,,,12.68,,0.0,...,0.0,0.0,0.0,0.0,,,,,,
2010-04-27,AB.PA,12.74,12.83,12.61,12.7,22370.0,,,12.69,,0.00079,...,0.0,0.0,0.0,0.0,,,,,,
2010-04-28,AB.PA,12.7,12.7,12.41,12.5,8211.0,,,12.62667,,-0.01003,...,0.0,0.0,0.0,0.0,,,,,,
2010-04-29,AB.PA,12.6,12.65,12.46,12.64,4676.0,,,12.63,,0.00079,...,0.0,0.0,0.0,0.0,,,,,,
2010-04-30,AB.PA,12.63,12.71,12.55,12.65,4470.0,,,12.634,,0.00127,...,0.0,0.0,0.0,0.0,,,,,,


Check Dataframe

In [8]:
# Rows without a pos_sma200 value are discarded.
df_work = df_work.dropna(subset=['pos_sma200'])

# Clamp williamsr_14 to the [-100, 0] interval: values above 0 become 0,
# values below -100 become -100, missing values are left untouched.
df_work['williamsr_14'] = df_work['williamsr_14'].clip(lower=-100, upper=0)

# print min and max of the columns williamsr_14, perf_sma_50_5d, perf_sma_200_5d
# print(f"{df_work['williamsr_14'].min()=}")  inf-100
# print(f"{df_work['williamsr_14'].max()=}") sup 0

# df_check=df_work[df_work['perf_sma_50_5d'] > 1]
# df_check=df_check[df_check['ret_1d'] <= 2]
# print(df_check.index.get_level_values('CODE').unique())
# df_check[df_check.index.get_level_values('CODE')=='AI.PA']
# df_check.head(5)
# df_check=df_work[df_work.index.get_level_values('CODE')=='AI.PA']
# CATG
# mask = df_work['stdev20_1d'] > 1000
# df_work.drop(df_work[mask].index, inplace=True)
# df_check[6000:6010]


In [None]:
# Visual sanity check: display ten rows (positions 10000 to 10009) of the cleaned frame.
df_work[10000:10010]

In [9]:
# Remove the indicators registered with ind_type=0 for this dataset.
# NOTE(review): presumably these are intermediate indicators (sma20, sma50, ... no longer
# appear in the frame displayed afterwards) — confirm the meaning of ind_type=0 in add_indicators.
df_work = indic.drop_indicators_by_type(
    con=con_fwk, df_in=df_work, dts_name=dts_name, symbol=multi_symbol, ind_type=0)
# Indicators registered with ind_type=2; the printed result lists the label columns (lab_perf_*).
list_label = indic.get_ind_list_by_type_for_dts(
    con=con_fwk, dts_name=dts_name, symbol_code=multi_symbol, ind_type=2)
print(list_label)

           LABEL
0   lab_perf_20d
1   lab_perf_50d
2  lab_perf_125d


In [10]:
# The CODE level is deliberately kept here; it is moved to a TICKER column in a later cell.
# df_work=df_work.droplevel('CODE') !!!!!!
# Sort in place on the (OPEN_DATETIME, CODE) index, then display ten rows as a sanity check.
df_work.sort_index(inplace=True)
df_work[10000:10010]

Unnamed: 0_level_0,Unnamed: 1_level_0,OPEN,HIGH,LOW,CLOSE,VOLUME,pos_sma20,pos_sma50,pos_sma200,pos_sma50_200,pos_sma20_50,...,pos_donchian20_lo,adx14,adx14_neg,adx14_pos,adx14_dif,pos_avg_vol14,pos_sma20_200,williamsr_14,perf_sma_50_5d,perf_sma_200_5d
OPEN_DATETIME,CODE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1995-11-24,BN.PA,5.3121,5.3292,5.3035,5.3121,1112239.0,0.01177,0.01509,0.03347,0.01811,0.00328,...,0.0784,20.2887,17.98263,24.68813,6.70549,0.57971,0.02145,-43.92655,-0.00423,0.00475
1995-11-24,BOI.PA,2.4317,2.4317,2.3817,2.3817,3392.0,-0.03791,-0.04912,0.10459,0.16165,-0.01165,...,0.0,40.6824,27.6034,14.7141,-12.8893,0.08447,0.14812,-100.0,-0.00193,0.008
1995-11-24,CDI.PA,5.5472,5.5958,5.5472,5.5958,85024.0,0.01444,0.04278,0.15177,0.10453,0.02793,...,0.04362,14.69556,19.13562,20.95757,1.82195,0.11627,0.13538,-0.0,0.00866,0.00701
1995-11-24,ELEC.PA,3.9661,3.9661,3.9661,3.9661,162.0,-0.02968,-0.0517,-0.03635,0.01619,-0.0227,...,0.02507,14.74654,55.61796,43.88082,-11.73714,0.63943,-0.00688,-71.42857,-0.00965,0.00166
1995-11-24,GFC.PA,3.3486,3.3486,3.2076,3.2093,32614.0,0.00242,0.03218,0.24682,0.20795,0.02969,...,0.06339,33.095,12.06264,26.76666,14.70403,2.22395,0.24381,-61.60998,0.0024,0.01102
1995-11-24,LAT.PA,24.8529,24.8529,24.8529,24.8529,154.0,-0.00517,-0.01589,-0.05827,-0.04307,-0.01078,...,0.05672,11.41274,49.78475,43.88114,-5.90361,0.26857,-0.05338,-31.34328,-0.00242,-0.00235
1995-11-24,LI.PA,2.0554,2.0554,2.0435,2.0435,143309.0,0.01303,-0.00332,-0.02706,-0.02382,-0.01615,...,0.03726,36.83981,9.74307,17.22198,7.47891,2.38002,-0.03958,-41.56051,-0.00162,-0.00054
1995-11-24,RE.PA,4.871,4.871,4.7188,4.7188,33953.0,0.00203,-0.01415,-0.00043,0.01391,-0.01614,...,0.06898,12.87952,19.22271,23.88377,4.66107,0.72411,-0.00245,-44.43796,-0.01148,0.00182
1995-11-24,SAVE.PA,15.3615,15.6143,15.3615,15.3615,28928.0,0.03593,0.05608,0.05257,-0.00332,0.01945,...,0.08714,48.09569,4.50842,30.13034,25.62192,0.41972,0.01606,-36.89894,0.00149,0.00358
1995-11-24,TEP.PA,1.4524,1.4524,1.3325,1.3325,83775.0,-0.13757,-0.15298,-0.16537,-0.01463,-0.01787,...,0.0,26.08714,65.25452,17.32163,-47.93289,3.00736,-0.03224,-100.0,-0.00313,-0.00357


In [12]:
# Label column and algorithm for which the feature selection is applied below.
lab_studied = "lab_perf_50d"
algo_studied = "LSTM_CLASS"
dts_name="PARIS_TREND_1D_20D_V2"

# Keep only the indicators selected for this label / algorithm pair
# (the selection rules live in indic.drop_indicators_not_selected).
df_work_lab = indic.drop_indicators_not_selected(con=con_fwk, df_in=df_work, dts_name=dts_name, symbol=multi_symbol,label=lab_studied,algo=algo_studied)
# print(df_work_lab.head(5))

# Move CODE from the index into a TICKER column so that the dataset can be split on the datetime index
df_work_lab['TICKER'] = df_work_lab.index.get_level_values('CODE')
df_work_lab=df_work_lab.droplevel('CODE')

# Non-random split into train / valid / confirm frames, returned in a dict keyed
# 'df_<label>_train', 'df_<label>_valid' and 'df_<label>_confirm'.
# NOTE(review): split_strat=(80,10,10) presumably expresses percentages applied per "M" (monthly)
# period, but the printed shapes are not in an 80/10/10 row ratio — confirm in split_merge.
df_split=sm.split_df_by_label_strat(
    df_in=df_work_lab, list_label=[lab_studied], split_timeframe="M",random_split=False,split_strat=(80,10,10))
df_selected = df_split['df_'+lab_studied+'_train']
df_valid = df_split['df_'+lab_studied+'_valid']
df_confirm = df_split['df_'+lab_studied+'_confirm']
# Each split is sorted in place on its OPEN_DATETIME index.
df_selected.sort_index(inplace=True)
df_valid.sort_index(inplace=True)
df_confirm.sort_index(inplace=True)

print(f"selected: {df_selected.shape=} valid: {df_valid.shape=} confirm: {df_confirm.shape=}")
df_selected[10000:10010]

selected: df_selected.shape=(838987, 29) valid: df_valid.shape=(231843, 29) confirm: df_confirm.shape=(244596, 29)


Unnamed: 0_level_0,pos_sma20,pos_sma50,pos_sma200,rsi14,sma20_rsi14,ret_5d,pos_top20,pos_top50,pos_bot20,pos_bot50,...,cmf_20,adx14,adx14_neg,adx14_pos,adx14_dif,pos_avg_vol14,pos_sma20_200,perf_sma_50_5d,perf_sma_200_5d,TICKER
OPEN_DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996-01-04,0.03032,0.0445,0.04734,68.61368,57.45887,0.02203,0.0,0.0,0.06381,0.10456,...,0.68106,34.559,16.30154,32.80825,16.50671,1.43364,0.01652,0.01085,0.0035,SAVE.PA
1996-01-04,0.00911,-0.01411,-0.06019,51.10742,48.38847,0.0184,-0.03493,-0.11954,0.05738,0.10499,...,-0.00654,21.08814,30.49097,26.14443,-4.34654,0.10186,-0.06867,-0.01641,-0.00343,TEP.PA
1996-01-04,0.0506,0.03251,0.1697,58.77622,45.64184,0.03184,-0.02624,-0.02624,0.10498,0.10498,...,0.11007,19.47531,27.82168,32.68546,4.86378,1.73709,0.11336,0.0085,0.01067,TFI.PA
1996-01-04,0.01749,-0.03933,0.04489,49.29749,38.37917,0.04334,-0.04469,-0.10454,0.0657,0.0657,...,0.24571,22.52874,21.82438,25.90113,4.07675,1.35506,0.02694,-0.00896,0.00673,VIRP.PA
1996-01-05,0.01285,0.00631,-0.00376,52.83213,49.86578,-0.00893,-0.02243,-0.03173,0.04537,0.05623,...,0.14429,14.92596,28.76524,30.40562,1.64038,0.50658,-0.01641,0.0069,0.00293,BN.PA
1996-01-05,0.03801,0.02653,0.10581,61.28371,49.0916,0.03733,0.0,-0.03275,0.05942,0.07388,...,0.50814,29.47521,17.54691,16.77613,-0.77078,0.12624,0.06531,-0.00504,0.00746,BOI.PA
1996-01-05,0.03567,0.12029,0.28952,66.79495,71.57781,0.00596,-0.01651,-0.01651,0.11823,0.22153,...,0.42051,41.67504,14.69324,43.10116,28.40792,0.71424,0.24511,0.02108,0.01041,CDI.PA
1996-01-05,-0.00497,-0.04962,-0.10276,44.68482,41.48714,0.00438,-0.04037,-0.11778,0.00438,0.00438,...,0.05534,10.2024,38.68822,47.13757,8.44935,1.14001,-0.09827,-0.00897,0.00085,ELEC.PA
1996-01-05,0.01013,0.0241,0.19591,57.52017,55.21953,0.01443,-0.01506,-0.05496,0.04269,0.0758,...,-0.01669,26.80769,14.80556,34.98566,20.1801,0.10965,0.18391,0.00753,0.01116,GFC.PA
1996-01-05,0.06142,0.06319,0.01671,63.83981,49.25421,0.0,0.0,0.0,0.17008,0.17008,...,0.08836,15.6068,35.58655,52.39946,16.81291,0.95004,-0.04213,0.00509,-0.0004,LAT.PA


In [13]:
# Exploratory step: add a '<label>_class' column with 5 classes to the training split while
# keeping the original label (bool_replace_label=False), then print the min / max label value
# of each class. These boundaries are the ones hard-coded as `categ_50` in the next cell.
label=lab_studied
df_class=balance.add_class_by_lab_nb_lines(df_in=df_selected,str_label=lab_studied,nb_class=5,bool_replace_label=False)
min_max_lab_by_class = df_class.groupby(label+'_class')[label].agg(['min', 'max'])
print(min_max_lab_by_class)

                        min      max
lab_perf_50d_class                  
0                  -0.83200 -0.07920
1                  -0.07919 -0.00751
2                  -0.00750  0.04948
3                   0.04949  0.12576
4                   0.12577  4.92040


In [14]:
# Replace the continuous label by its class index (0..4) in the three splits.
label=lab_studied
# Training split: classes are computed by balance.add_class_by_lab_nb_lines with nb_class=5
# (the printed value counts show near-equal class sizes).
df_class=balance.add_class_by_lab_nb_lines(df_in=df_selected,str_label=lab_studied,nb_class=5,bool_replace_label=True)
df_class.sort_index(inplace=True)
# Class id -> [lower, upper] label bounds. The inner boundaries are copied by hand from the
# min/max table printed by the previous cell, so they must be updated if the training split changes.
categ_50={0:[-1,-0.0792],1:[-0.0792,-0.0075],2:[-0.0075,0.04948],3:[0.04948,0.12576],4:[0.12576,5]}
# categ_20={0:[-1,-0.0520],1:[-0.0520,-0.0089],2:[-0.0089,0.0235],3:[0.0235,0.0713],4:[0.0713,4]}
# Validation and confirmation splits reuse the training boundaries instead of computing their own.
df_class_val=balance.add_lab_by_class(df_in=df_valid,str_label=lab_studied, categ=categ_50,bool_replace_label=True) # categ
df_class_val.sort_index(inplace=True)
df_class_conf=balance.add_lab_by_class(df_in=df_confirm,str_label=lab_studied, categ=categ_50,bool_replace_label=True) # categ
df_class_conf.sort_index(inplace=True)
# First and last non-null label of each split (shows the date range covered by each one).
print(df_class.loc[:, label].dropna().iloc[[0, -1]])
print(df_class_val.loc[:, label].dropna().iloc[[0, -1]])
print(df_class_conf.loc[:, label].dropna().iloc[[0, -1]])
# df_class_clean=df_class.drop(['OPEN','HIGH','LOW','CLOSE','VOLUME','lab_perf_125d','lab_perf_20d','lab_perf_50d'],axis=1)
# Number of rows per class in each split.
data = df_class[label]
print(data.value_counts().sort_index())
data_val = df_class_val[label]
print(data_val.value_counts().sort_index())
data_conf = df_class_conf[label]
print(data_conf.value_counts().sort_index())
df_class[10000:10010]
# min_max_lab_by_class = df_class.groupby(label+'_class')[label].agg(['min', 'max'])
# print(min_max_lab_by_class)

# lab_perf_20d : train min nb rows 211000 validation 53000 confirm 55000

OPEN_DATETIME
1989-10-27    4
2017-01-31    4
Name: lab_perf_50d, dtype: int64
OPEN_DATETIME
2017-02-01    3.0
2020-06-30    4.0
Name: lab_perf_50d, dtype: float64
OPEN_DATETIME
2020-07-01    4.0
2023-11-01    1.0
Name: lab_perf_50d, dtype: float64
lab_perf_50d
0    167804
1    167794
2    167818
3    167779
4    167792
Name: count, dtype: int64
lab_perf_50d
0.0    56691
1.0    48735
2.0    45584
3.0    41231
4.0    39552
Name: count, dtype: int64
lab_perf_50d
0.0    61105
1.0    51970
2.0    43750
3.0    41295
4.0    46476
Name: count, dtype: int64


Unnamed: 0_level_0,pos_sma20,pos_sma50,pos_sma200,rsi14,sma20_rsi14,ret_5d,pos_top20,pos_top50,pos_bot20,pos_bot50,...,adx14,adx14_neg,adx14_pos,adx14_dif,pos_avg_vol14,pos_sma20_200,perf_sma_50_5d,perf_sma_200_5d,TICKER,lab_perf_50d
OPEN_DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996-01-04,0.03032,0.0445,0.04734,68.61368,57.45887,0.02203,0.0,0.0,0.06381,0.10456,...,34.559,16.30154,32.80825,16.50671,1.43364,0.01652,0.01085,0.0035,SAVE.PA,1
1996-01-04,0.00911,-0.01411,-0.06019,51.10742,48.38847,0.0184,-0.03493,-0.11954,0.05738,0.10499,...,21.08814,30.49097,26.14443,-4.34654,0.10186,-0.06867,-0.01641,-0.00343,TEP.PA,4
1996-01-04,0.0506,0.03251,0.1697,58.77622,45.64184,0.03184,-0.02624,-0.02624,0.10498,0.10498,...,19.47531,27.82168,32.68546,4.86378,1.73709,0.11336,0.0085,0.01067,TFI.PA,2
1996-01-04,0.01749,-0.03933,0.04489,49.29749,38.37917,0.04334,-0.04469,-0.10454,0.0657,0.0657,...,22.52874,21.82438,25.90113,4.07675,1.35506,0.02694,-0.00896,0.00673,VIRP.PA,3
1996-01-05,0.01285,0.00631,-0.00376,52.83213,49.86578,-0.00893,-0.02243,-0.03173,0.04537,0.05623,...,14.92596,28.76524,30.40562,1.64038,0.50658,-0.01641,0.0069,0.00293,BN.PA,1
1996-01-05,0.03801,0.02653,0.10581,61.28371,49.0916,0.03733,0.0,-0.03275,0.05942,0.07388,...,29.47521,17.54691,16.77613,-0.77078,0.12624,0.06531,-0.00504,0.00746,BOI.PA,3
1996-01-05,0.03567,0.12029,0.28952,66.79495,71.57781,0.00596,-0.01651,-0.01651,0.11823,0.22153,...,41.67504,14.69324,43.10116,28.40792,0.71424,0.24511,0.02108,0.01041,CDI.PA,4
1996-01-05,-0.00497,-0.04962,-0.10276,44.68482,41.48714,0.00438,-0.04037,-0.11778,0.00438,0.00438,...,10.2024,38.68822,47.13757,8.44935,1.14001,-0.09827,-0.00897,0.00085,ELEC.PA,2
1996-01-05,0.01013,0.0241,0.19591,57.52017,55.21953,0.01443,-0.01506,-0.05496,0.04269,0.0758,...,26.80769,14.80556,34.98566,20.1801,0.10965,0.18391,0.00753,0.01116,GFC.PA,3
1996-01-05,0.06142,0.06319,0.01671,63.83981,49.25421,0.0,0.0,0.0,0.17008,0.17008,...,15.6068,35.58655,52.39946,16.81291,0.95004,-0.04213,0.00509,-0.0004,LAT.PA,4


In [15]:
# Save the three class-labelled datasets (rounded to 5 decimals) as zip-compressed CSV files.
file_name="PARIS_TREND_1D_50D_V2"
for df_to_save, suffix in (
    (df_class, SUFFIX_TRAIN),
    (df_class_val, SUFFIX_VAL),
    (df_class_conf, SUFFIX_CONF),
):
    df_to_save.round(5).to_csv(PATH_DATA_DTS + file_name + suffix, sep=",")

Calculate and save scaler

In [16]:
# From here on the notebook works on the 50-day dataset saved by the previous cell.
dts_name="PARIS_TREND_1D_50D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_50d"

# Reload the training dataset with an (OPEN_DATETIME, TICKER) MultiIndex and drop unlabelled rows.
df_class=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_TRAIN,sep=",",index_col=["OPEN_DATETIME","TICKER"],parse_dates=["OPEN_DATETIME"])
df_class=df_class.dropna(subset=[label])
df_class=df_class.sort_index()

# The scaler is fitted on the training data only, with a target range of (-1, 1);
# the normalised frame returned alongside it is not used again in this cell.
df_norm,norm_scaler= balance.normalize_df(df_in=df_class,str_label=label,tuple_ft_range=(-1,1))

# Persist the fitted scaler so that validation / test data can be normalised with the same parameters.
file_name=dts_name+"_train_colab_lstm_norm_2405"
scaler_name=file_name+"_scaler.save"
joblib.dump(norm_scaler,filename=PATH_DATA_DTS+scaler_name)

# df_class_val=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_VAL,sep=",",index_col=["OPEN_DATETIME"],parse_dates=["OPEN_DATETIME"])
# df_class_val.dropna(subset=[label], inplace=True)
# df_class_val.sort_index(inplace=True)

# list_feat = df_class.columns.values.tolist()
# list_feat.remove(label)
# X, y = sm.split_df_x_y(
#     df_in=df_class, list_features=list_feat, str_label=label, drop_na=True)
# nb_val=211000
# method = RandomUnderSampler(sampling_strategy={0:nb_val,1:nb_val,2:nb_val,3:nb_val}) 
# df_x_train, col_y_train=  method.fit_resample(X, y)
# print(col_y_train.value_counts().sort_index())

# X, y = sm.split_df_x_y(
#     df_in=df_class_val, list_features=list_feat, str_label=label, drop_na=True)
# nb_val=53000
# method = RandomUnderSampler(sampling_strategy={0:nb_val,1:nb_val,2:nb_val,3:nb_val}) # 53000 pour lab 20 et nn pour lab 50
# df_x_val, col_y_val=  method.fit_resample(X, y)
# print(col_y_val.value_counts().sort_index())

['C:\\Projets\\Data\\DTS_FULL\\PARIS_TREND_1D_50D_V2_train_colab_lstm_norm_2405_scaler.save']

Load the train and validation DataFrames, normalize them, and prepare the sequences for the LSTM

In [17]:
dts_name="PARIS_TREND_1D_50D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_50d"
file_name=dts_name+"_train_colab_lstm_norm_2405"
scaler_name=file_name+"_scaler.save"
# Scaler fitted on the training dataset and saved with joblib.dump under the same name.
scaler=joblib.load(PATH_DATA_DTS+scaler_name)


def load_labelled_dataset(csv_path, str_label):
    """Read a dataset CSV and return it sorted, without unlabelled rows.

    Args:
        csv_path: path of the CSV file (zip-compressed when the name ends with ".zip").
        str_label: name of the label column; rows where it is missing are dropped.

    Returns:
        DataFrame indexed by (OPEN_DATETIME, TICKER), sorted on that index.
    """
    df_loaded = pd.read_csv(csv_path, sep=",", index_col=["OPEN_DATETIME", "TICKER"],
                            parse_dates=["OPEN_DATETIME"])
    return df_loaded.dropna(subset=[str_label]).sort_index()


df_class = load_labelled_dataset(PATH_DATA_DTS+dts_name+SUFFIX_TRAIN, label)
# The date slice is applied AFTER sorting: label-slicing a MultiIndex that is not
# lexsorted can raise UnsortedIndexError. The resulting frame is the same as before.
df_class = df_class.loc['1995-01-01':] # drop rows < 1995-01-01
df_class_val = load_labelled_dataset(PATH_DATA_DTS+dts_name+SUFFIX_VAL, label)

# normalize df_class and df_class_val with the scaler fitted on the training data
df_class_train_norm=balance.normalize_df_scaler(df_in=df_class, str_label=label,scaler=scaler)
df_class_val_norm=balance.normalize_df_scaler(df_in=df_class_val, str_label=label,scaler=scaler)

print(f"{df_class_train_norm.shape=} {df_class_val_norm.shape=}")
print(df_class_train_norm[10000:10005])
# print type of index of df_class_train_norm


df_class_train_norm.shape=(831990, 28) df_class_val_norm.shape=(231793, 28)
                       pos_sma20  pos_sma50  pos_sma200     rsi14  \
OPEN_DATETIME TICKER                                                
1998-01-20    RE.PA    -0.401871  -0.530231   -0.504369  0.387174   
              SAVE.PA  -0.429049  -0.551279   -0.530901  0.333374   
              TEP.PA   -0.379269  -0.510884   -0.525588  0.509715   
              TFI.PA   -0.396483  -0.499134   -0.470829  0.504751   
              VIRP.PA  -0.414437  -0.515018   -0.598517  0.253438   

                       sma20_rsi14    ret_5d  pos_top20  pos_top50  pos_bot20  \
OPEN_DATETIME TICKER                                                            
1998-01-20    RE.PA       0.182813 -0.503245   0.983320   0.983966  -0.918870   
              SAVE.PA     0.195811 -0.515616   0.980994   0.981731  -0.967591   
              TEP.PA      0.198386 -0.495371   0.984695   0.985288  -0.902783   
              TFI.PA      0.370092 

In [None]:
# Debug helper: show the Python type of the first two index entries of the normalised training frame.
print(f"{type(df_class_train_norm.index[0])= } {type(df_class_train_norm.index[1])= }")

In [18]:
# Feature columns = every column of the training frame except the label.
list_feat = df_class.columns.values.tolist()
list_feat.remove(label)

sequence_length = 10
col_sequence = "SEQUENCE"

# Sort by ticker first so that the rows of one ticker are contiguous and in chronological order.
df_class_train_norm_sorted = df_class_train_norm.sort_index(level=['TICKER', 'OPEN_DATETIME'])
df_class_val_norm_sorted = df_class_val_norm.sort_index(level=['TICKER', 'OPEN_DATETIME'])


def build_sequences_by_ticker(df_sorted, list_features, sequence_length, str_new_col, log_every=20):
    """Build the sequence column ticker by ticker and return one combined frame.

    sm.prepare_sequences_df is called separately on the rows of each ticker, so a
    sequence is never built from rows belonging to two different tickers.

    The per-ticker results are collected in a list and concatenated once at the end.
    Concatenating inside the loop re-copies the whole accumulated frame at every
    iteration, which is quadratic in the number of rows.

    Args:
        df_sorted: frame whose index has a 'TICKER' level, sorted by ticker then date.
        list_features: feature columns forwarded to sm.prepare_sequences_df.
        sequence_length: sequence length forwarded to sm.prepare_sequences_df.
        str_new_col: name of the sequence column forwarded to sm.prepare_sequences_df.
        log_every: a progress line is printed (and gc.collect() run) every `log_every` tickers.

    Returns:
        The concatenation of the per-ticker frames, in ticker order
        (an empty DataFrame when `df_sorted` contains no ticker).
    """
    list_sub_df = []
    groups = df_sorted.groupby(level='TICKER', sort=False)
    for cnt, (ticker, sub_df) in enumerate(groups, start=1):
        list_sub_df.append(sm.prepare_sequences_df(
            df_in=sub_df, list_features=list_features,
            sequence_length=sequence_length, str_new_col=str_new_col))
        if cnt % log_every == 0:
            print(f"time {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} {cnt=} {ticker=}")
            gc.collect()
    # pd.concat raises on an empty list.
    return pd.concat(list_sub_df) if list_sub_df else pd.DataFrame()


df_class_train_seq = build_sequences_by_ticker(
    df_class_train_norm_sorted, list_feat, sequence_length, col_sequence)

print((f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} train seq ok"))

df_class_val_seq = build_sequences_by_ticker(
    df_class_val_norm_sorted, list_feat, sequence_length, col_sequence)

print(f"{df_class_train_seq.shape=} {df_class_val_seq.shape=}")
print(df_class_train_seq[10000:10005])

# df_class_train_norm=sm.prepare_sequences_df(df_in=df_class_train_norm,list_features=list_feat,sequence_length=sequence_length,str_new_col=col_sequence)
# df_class_val_norm=sm.prepare_sequences_df(df_in=df_class_val_norm,list_features=list_feat,sequence_length=sequence_length,str_new_col=col_sequence)

# df_x_train, col_y_train = sm.split_df_x_y(
#     df_in=df_class_train_norm, list_features=list_feat, str_label=label, drop_na=True)

# df_x_val, col_y_val = sm.split_df_x_y(
#     df_in=df_class_val_norm, list_features=list_feat, str_label=label, drop_na=True)



# x_train=df_x_train.values
# y_train=col_y_train.values
# x_val=df_x_val.values
# y_val=col_y_val.values
# x_train_lstm,y_train_lstm=sm.prepare_sequences(x_train,y_train,sequence_length)
# x_val_lstm,y_val_lstm=sm.prepare_sequences(x_val,y_val,sequence_length)



time 2024-07-25 16:25:42 cnt=20 ticker='ALDEL.PA'
time 2024-07-25 16:30:23 cnt=40 ticker='ALNOV.PA'
time 2024-07-25 16:34:46 cnt=60 ticker='BEN.PA'
time 2024-07-25 16:39:52 cnt=80 ticker='CDI.PA'
time 2024-07-25 16:44:16 cnt=100 ticker='DG.PA'
time 2024-07-25 16:48:28 cnt=120 ticker='ETL.PA'
time 2024-07-25 16:52:56 cnt=140 ticker='GTT.PA'
time 2024-07-25 16:57:20 cnt=160 ticker='LBIRD.PA'
time 2024-07-25 17:02:48 cnt=180 ticker='MRN.PA'
time 2024-07-25 17:08:16 cnt=200 ticker='POXEL.PA'
time 2024-07-25 17:15:57 cnt=220 ticker='SCR.PA'
time 2024-07-25 17:23:48 cnt=240 ticker='TRI.PA'
2024-07-25 17:29:42 train seq ok
time 2024-07-25 17:31:24 cnt=20 ticker='ALCYB.PA'
time 2024-07-25 17:33:11 cnt=40 ticker='ALLDL.PA'
time 2024-07-25 17:34:58 cnt=60 ticker='ATO.PA'
time 2024-07-25 17:37:02 cnt=80 ticker='CAP.PA'
time 2024-07-25 17:39:13 cnt=100 ticker='CRLA.PA'
time 2024-07-25 17:41:11 cnt=120 ticker='ENGI.PA'
time 2024-07-25 17:43:20 cnt=140 ticker='GDS.PA'
time 2024-07-25 17:45:15 cnt=16

In [19]:
# TODO: move the sequence serialisation below into a reusable function.
# Run a garbage-collection pass before copying the large sequence frames.
gc.collect()
def format_float(x):
    """Return `x` formatted with exactly five decimals, or None when `x` is None."""
    if x is None:
        return None
    return f"{x:.5f}"

def array_to_string(x):
    """Return the text form of array `x` with '_' between elements, or None when `x` is None."""
    if x is None:
        return None
    return np.array2string(x, separator='_')


# Element-wise version of format_float: turns a numeric array into an array of 5-decimal strings.
vfunc = np.vectorize(format_float)


def sequences_to_text(df_in, seq_col):
    """Return a copy of `df_in` whose `seq_col` arrays are serialised as text.

    Every array is first formatted element-wise with 5 decimals (vfunc) and then
    rendered by array_to_string with '_' as element separator, so that the column
    can be written to a CSV file. `df_in` itself is left untouched.
    """
    df_out = df_in.copy()
    df_out[seq_col] = df_out[seq_col].apply(vfunc).apply(array_to_string)
    return df_out


df_class_train_seq2 = sequences_to_text(df_class_train_seq, col_sequence)
df_class_val_seq2 = sequences_to_text(df_class_val_seq, col_sequence)

# The files are written with a ".zip" extension so that (a) pandas compresses them and
# (b) the names match the "_TRAIN_seq_6.zip" / "_VAL_seq_6.zip" files read by the loading cell.
# Without the extension the loading cell could not find the files produced here.
df_class_train_seq2.round(5).to_csv(
    PATH_DATA_DTS+dts_name+"_TRAIN_seq_6.zip", sep=",", float_format='%.5f')
df_class_val_seq2.round(5).to_csv(
    PATH_DATA_DTS+dts_name+"_VAL_seq_6.zip", sep=",", float_format='%.5f')

START HERE TO LOAD DATASETS WITH SEQUENCE

In [21]:
dts_name="PARIS_TREND_1D_50D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_50d"
file_name=dts_name+"_train_colab_lstm_norm_2405"
col_sequence = "SEQUENCE"


def _read_sequence_csv(csv_path):
    """Read a sequence CSV, drop rows without a sequence and sort on (TICKER, OPEN_DATETIME)."""
    df_read = pd.read_csv(csv_path, sep=",", index_col=["TICKER", "OPEN_DATETIME"],
                          parse_dates=["OPEN_DATETIME"])
    return df_read.dropna(subset=[col_sequence]).sort_index()


def _text_to_array(seq_text):
    """Rebuild the float32 array from its '_'-separated text form."""
    return np.array(ast.literal_eval(seq_text.replace("_", ",")), dtype=np.float32)


df_class_train_csv = _read_sequence_csv(PATH_DATA_DTS+dts_name+"_TRAIN_seq_6.zip")
df_class_val_csv = _read_sequence_csv(PATH_DATA_DTS+dts_name+"_VAL_seq_6.zip")
gc.collect()

# Only the label and the sequence are needed from here on; the index is kept as is.
df_class_train_csv = df_class_train_csv[[label, col_sequence]]
df_class_val_csv = df_class_val_csv[[label, col_sequence]]

# Turn the serialised sequences back into float32 numpy arrays.
df_class_train_csv[col_sequence] = df_class_train_csv[col_sequence].map(_text_to_array)
df_class_val_csv[col_sequence] = df_class_val_csv[col_sequence].map(_text_to_array)

print(f"{df_class_train_csv.shape=}")
print(df_class_train_csv[1015:1020])
print(f"{df_class_val_csv.shape=}")
print(df_class_val_csv[1015:1020])

# The decision is made between market sessions, so the label is shifted by one row per ticker;
# the first row of each ticker has no label after the shift and is dropped.
# NOTE(review): shift(1) gives each row the label of the PREVIOUS row of the same ticker —
# confirm that this is the intended direction (shift(-1) would take the next row's label).
# NOTE(review): re-running this part on already shifted frames shifts the labels once more.
label_train_shifted = df_class_train_csv.groupby(level='TICKER')[label].shift(1)
df_class_train_csv[label] = label_train_shifted
df_class_train_csv = df_class_train_csv.dropna(subset=[label])
label_val_shifted = df_class_val_csv.groupby(level='TICKER')[label].shift(1)
df_class_val_csv[label] = label_val_shifted
df_class_val_csv = df_class_val_csv.dropna(subset=[label])
print(df_class_train_csv[1014:1019])
print(df_class_val_csv[1014:1019])


df_class_train_csv.shape=(829695, 2)
                      lab_perf_50d  \
TICKER OPEN_DATETIME                 
AB.PA  2015-02-11                1   
       2015-02-12                0   
       2015-02-13                0   
       2015-02-16                0   
       2015-02-17                0   

                                                               SEQUENCE  
TICKER OPEN_DATETIME                                                     
AB.PA  2015-02-11     [[-0.39987, -0.49388, -0.45321, 0.33119, 0.245...  
       2015-02-12     [[-0.40479, -0.49565, -0.45233, 0.33764, 0.256...  
       2015-02-13     [[-0.39282, -0.4837, -0.43821, 0.39571, 0.2664...  
       2015-02-16     [[-0.35487, -0.4481, -0.40128, 0.51362, 0.2836...  
       2015-02-17     [[-0.35342, -0.4441, -0.39371, 0.5362, 0.29874...  
df_class_val_csv.shape=(229327, 2)
                       lab_perf_50d  \
TICKER  OPEN_DATETIME                 
ABCA.PA 2017-09-21              3.0   
        2017-09-22        

In [22]:
# Cache the parsed frames as pickle files: reloading them avoids re-parsing the text sequences.
pickle_prefix = PATH_DATA_DTS + dts_name
df_class_train_csv.to_pickle(pickle_prefix + "_TRAIN_seq_6.pckl")
df_class_val_csv.to_pickle(pickle_prefix + "_VAL_seq_6.pckl")

START HERE TO DIRECTLY LOAD THE PICKLE FILES

In [None]:
# NOTE(review): this cell loads the 20-day dataset ("PARIS_TREND_1D_20D_V2", label "lab_perf_20d"),
# whereas the cell that writes the pickle files uses "PARIS_TREND_1D_50D_V2" / "lab_perf_50d".
# On a fresh run only the 50D pickles are produced by this notebook — confirm that the 20D files
# exist on disk or align the names.
dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_20d"
file_name=dts_name+"_train_colab_lstm_norm_2405"
col_sequence = "SEQUENCE"

# NOTE(review): pd.read_pickle can execute arbitrary code; only load pickle files produced by this notebook.
df_class_train_csv=pd.read_pickle(PATH_DATA_DTS+dts_name+"_TRAIN_seq_6.pckl")  #the train will be split in train + val
df_class_test_csv=pd.read_pickle(PATH_DATA_DTS+dts_name+"_VAL_seq_6.pckl") #the val is finally used as a test dataset
print(df_class_train_csv[1014:1019])

In [None]:
# Split df_class_train_csv into train and validation sets on OPEN_DATETIME with
# sm.split_df_by_label_strat and split_strat=(80,20,0) (no confirmation part).

# TICKER is moved from the index to a column on a NEW frame so that the split is done on the
# datetime index alone. The original code used reset_index(..., inplace=True), which made the
# cell fail on a second run (the TICKER level no longer existed) and silently changed
# df_class_train_csv for every other cell.
df_train_by_date = df_class_train_csv.reset_index(level='TICKER')

df_split=sm.split_df_by_label_strat(
    df_in=df_train_by_date, list_label=[label], split_timeframe="D",random_split=False,split_strat=(80,20,0))

# Put TICKER back into the index -> (OPEN_DATETIME, TICKER) — and sort, without mutating
# the frames held in df_split.
df_train_split = df_split['df_'+label+'_train'].set_index('TICKER', append=True).sort_index()
df_val_split = df_split['df_'+label+'_valid'].set_index('TICKER', append=True).sort_index()
print(df_train_split[1014:1019])


In [None]:
# Class distribution of each split before undersampling.
print(df_train_split[label].value_counts().sort_index()) # undersampling at 109200
print(df_val_split[label].value_counts().sort_index()) # undersampling at 43900
print(df_class_test_csv[label].value_counts().sort_index()) # undersampling at 41500

# Undersample so that every class has nb_val rows (reduced sizes; full-size values are in the trailing comments).
nb_val=30000 #109200
df_class_train_under=balance.class_custom_undersampler(df_train_split,label,nb_val) # undersampling todo

nb_val=5000 #41500
df_class_val_under=balance.class_custom_undersampler(df_val_split,label,nb_val)
df_class_test_under=balance.class_custom_undersampler(df_class_test_csv,label,nb_val)

print(df_class_train_under[label].value_counts().sort_index())
print(df_class_val_under[label].value_counts().sort_index())
print(df_class_test_under[label].value_counts().sort_index())


def sequences_to_tensors(df_in, seq_col, label_col):
    """Convert the sequence and label columns of `df_in` into a pair of tensors.

    The sequence column holds one numpy array per row (object dtype). The arrays are
    stacked into ONE contiguous array before the tensor is created: handing PyTorch
    a pandas Series / list of arrays is either rejected (object dtype) or converted
    element by element, which is extremely slow.

    Args:
        df_in: frame holding the two columns.
        seq_col: name of the column of equally-shaped numpy arrays.
        label_col: name of the class column.

    Returns:
        (x, y): x is a float tensor with one stacked sequence per row of `df_in`,
        y is an int64 tensor of the class ids.

    Raises:
        ValueError: if `df_in` is empty or the sequences do not all have the same shape.
    """
    x_tensor = torch.as_tensor(np.stack(df_in[seq_col].to_numpy()), dtype=torch.float)
    y_tensor = torch.tensor(df_in[label_col].to_numpy(), dtype=torch.int64)
    return x_tensor, y_tensor


x_train_tensor, y_train_tensor = sequences_to_tensors(df_class_train_under, col_sequence, label)
x_val_tensor, y_val_tensor = sequences_to_tensors(df_class_val_under, col_sequence, label)
x_test_tensor, y_test_tensor = sequences_to_tensors(df_class_test_under, col_sequence, label)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)

# initiate a pytorch randomsampler for train data
# train_sampler = RandomSampler(train_dataset,num_samples=100000,replacement=True)

batch_size=512
num_workers=7

# NOTE(review): the training loader does not shuffle. Depending on the row order returned by
# balance.class_custom_undersampler, batches may be dominated by a single class or period —
# confirm that shuffle=False is intentional.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False,drop_last=True,num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size,drop_last=True,num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=batch_size,drop_last=True,num_workers=num_workers)

print(f"{train_loader.dataset.tensors[0].shape=} {val_loader.dataset.tensors[0].shape=} {test_loader.dataset.tensors[0].shape=}")

In [None]:
# Sanity check: shapes of the feature tensors behind each loader, then the first test batch.
print(f"{train_loader.dataset.tensors[0].shape=} {val_loader.dataset.tensors[0].shape=} {test_loader.dataset.tensors[0].shape=}")
#print next(iter(train_loader))
# next(iter(...)) starts the loader's worker processes, so this line can take a while.
pprint(next(iter(test_loader)))

Correlation (copied from the TensorFlow notebook; not tested here!)

In [None]:
# Heatmap of the strongly correlated feature pairs (|r| >= CORR_THRESHOLD).
# NOTE(review): df_x_train is not defined in the visible part of this notebook
# (the cell was copied from the TensorFlow notebook) - confirm it exists before
# running.
CORR_THRESHOLD = 0.7

corr_train = df_x_train.corr()

# Blank out the diagonal only (self-correlation). The former replace(1, 0)
# also wiped off-diagonal pairs whose correlation is exactly 1, i.e. perfectly
# redundant features - precisely what this plot is meant to reveal.
corr_train = corr_train.mask(np.eye(len(corr_train), dtype=bool))

# Keep strong correlations only; weaker ones become NaN so that rows/columns
# without any strong partner can be dropped from the plot.
corr_train = corr_train.where(corr_train.abs() >= CORR_THRESHOLD)
corr_train = corr_train.dropna(axis=0, how='all').dropna(axis=1, how='all')

# corr_train_check=corr_train[corr_train >0.8]
corr_train_check = corr_train

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr_train_check, annot=False, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)

In [None]:
# Scatter of `pos_sma200` against `pos_top50`, coloured by `lab_perf_20d`.
sns.scatterplot(
    x='pos_sma200',
    y='pos_top50',
    hue='lab_perf_20d',
    palette='Set1',
    data=df_class,
)

In [None]:
###############################################
###### REFACTO USING PYTORCH LIGHTNING ########
###############################################

# Define LSTM model
class DynamicLSTMModel(pl.LightningModule):
    """Classifier whose layer stack is assembled at runtime from a config list.

    Args:
        layer_configs: ordered list of dicts, one per layer. Every dict has a
            'type' key ('LSTM', 'Linear', 'Softmax', 'ReLU' or 'Sigmoid') plus
            the keys required by that type:
              * 'LSTM':    'input_dim', 'hidden_dim', 'num_layers', 'dropout',
                           'bidirectional'
              * 'Linear':  'input_dim', 'output_dim'
              * 'Softmax': 'dim'
        lr: learning rate given to the Adam optimizer.
        criterion: callable invoked as ``criterion(y_hat, y)``; returns the loss.

    Raises:
        ValueError: if a config names an unsupported layer type.
    """

    def __init__(self, layer_configs, lr, criterion):
        super().__init__()
        self.layers = nn.ModuleList(
            [self._build_layer(config) for config in layer_configs])

        self.lr = lr
        self.criterion = criterion
        # Not used inside this class; kept so external code reading it still works.
        self.validation_step_outputs = []

        # Stores the constructor arguments in the checkpoint so that
        # load_from_checkpoint() can rebuild the model without arguments.
        self.save_hyperparameters()

    @staticmethod
    def _build_layer(config):
        """Instantiate the torch layer described by one config dict."""
        layer_type = config['type']
        if layer_type == 'LSTM':
            return nn.LSTM(input_size=config['input_dim'], hidden_size=config['hidden_dim'],
                           num_layers=config['num_layers'], batch_first=True,
                           dropout=config['dropout'], bidirectional=config['bidirectional'])
        if layer_type == 'Linear':
            return nn.Linear(config['input_dim'], config['output_dim'])
        if layer_type == 'Softmax':
            return nn.Softmax(dim=config['dim'])
        if layer_type == 'ReLU':
            return nn.ReLU()
        if layer_type == 'Sigmoid':
            return nn.Sigmoid()
        raise ValueError(f"Unsupported layer type: {config['type']}")

    def forward(self, x):
        """Apply the layers in order and return the output of the last one.

        After every LSTM layer only the last time step is kept
        (``x[:, -1, :]``), so the layers that follow receive a 2-D tensor.
        """
        for layer in self.layers:
            if isinstance(layer, nn.LSTM):
                # No explicit (h0, c0): nn.LSTM starts from zero states created
                # with the dtype and device of its input, which is what the
                # hand-built zero tensors were emulating (float32 only).
                x, _ = layer(x)
                # NOTE(review): for a bidirectional LSTM the backward half of
                # this slice has only processed the final time step - confirm
                # this is the intended summary of the sequence.
                x = x[:, -1, :]
            else:
                x = layer(x)
        return x

    def _shared_step(self, batch, stage):
        """Run one batch, log ``{stage}_loss`` / ``{stage}_acc``.

        Returns:
            Tuple ``(loss, correct, total)``: the criterion value, the number of
            correctly predicted samples and the batch size.
        """
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        # Predictions are only used for counting, so keep them out of the graph.
        predicted = y_hat.detach().argmax(dim=1)
        correct = (predicted == y).sum().item()
        total = len(y)
        # on_step + on_epoch: Lightning records both a per-step and a per-epoch
        # value (the training script reads 'test_acc_epoch').
        self.log(f"{stage}_loss", loss, on_step=True,
                 on_epoch=True, prog_bar=True, logger=True)
        self.log(f"{stage}_acc", correct/total, on_step=True,
                 on_epoch=True, prog_bar=True, logger=True)
        return loss, correct, total

    def training_step(self, batch, batch_idx):
        """Training batch: returns a dict whose 'loss' entry is optimised."""
        loss, correct, total = self._shared_step(batch, "train")
        return {"loss": loss, "train_loss": loss,
                "train_correct": correct, "train_total": total}

    def validation_step(self, batch, batch_idx):
        """Validation batch: logs val_loss / val_acc and returns the raw counts."""
        loss, correct, total = self._shared_step(batch, "val")
        return {"loss": loss, "val_loss": loss,
                "val_correct": correct, "val_total": total}

    def test_step(self, batch, batch_idx):
        """Test batch: logs test_loss / test_acc and returns the raw counts."""
        loss, correct, total = self._shared_step(batch, "test")
        return {"loss": loss, "test_loss": loss, "test_correct": correct,
                "test_total": total, "test_acc": correct/total}

    def configure_optimizers(self):
        """Optimise every parameter with Adam at learning rate ``self.lr``."""
        return Adam(self.parameters(), lr=self.lr)


# ---- Run settings ----------------------------------------------------------
input_dim = x_train_tensor.shape[2]  # feature count = last axis of the training tensor
num_classes = 5
epochs = 10  # 350
suffix = "lstm_pytorch_v1"
tb_directory = "tb_logs"
debug = False
patience = 5  # early-stopping patience

obj_acc = 0.25  # minimum test accuracy before a model is examined further
cpt_param = 0  # index of the parameter set currently being tried
try_limit = 3  # training attempts per parameter set
pct_check_class = 0.3  # check if at least n% of the validation set per class
criterion = nn.CrossEntropyLoss()

len_val = x_val_tensor.shape[0]
# Minimum number of validation samples that must be predicted in each class.
check_class_limit = (len_val/num_classes)*pct_check_class
check_class = False  # becomes True once a model passes both acceptance checks

# nn.CrossEntropyLoss applies log-softmax itself and expects raw logits, so
# every network ends on a Linear layer. A trailing Softmax would normalise the
# scores twice and flatten the gradients; arg-max predictions are unaffected.
list_param_valid = [
    {'layer_configs': [
        {'type': 'Linear', 'input_dim': input_dim, 'output_dim': input_dim},
        {'type': 'LSTM', 'input_dim': input_dim, 'hidden_dim': 64,
         'num_layers': 1, 'dropout': 0.0, 'bidirectional': True},
        # Bidirectional LSTM: output width is 2 * hidden_dim
        {'type': 'Linear', 'input_dim': 64 * 2, 'output_dim': num_classes},
    ], 'optimizer__lr': 0.01},
    {'layer_configs': [
        {'type': 'Linear', 'input_dim': input_dim, 'output_dim': input_dim},
        {'type': 'LSTM', 'input_dim': input_dim, 'hidden_dim': 32,
         'num_layers': 2, 'dropout': 0.2, 'bidirectional': False},
        # Unidirectional LSTM: output width equals hidden_dim
        {'type': 'Linear', 'input_dim': 32, 'output_dim': num_classes},
    ], 'optimizer__lr': 0.01},
]

# ---- Search loop: parameter sets x training attempts -----------------------
while cpt_param < len(list_param_valid) and not check_class:  # loop for parameters
    gc.collect()
    param_valid = list_param_valid[cpt_param]  # select the current param line
    print(f"{param_valid=}")
    cpt = 0
    filename_tmp_model = dts_name+"_"+suffix+".pckl"

    while cpt < try_limit and not check_class:  # loop for train models until good results
        cpt += 1

        model = DynamicLSTMModel(layer_configs=param_valid['layer_configs'],
                                 lr=param_valid['optimizer__lr'], criterion=criterion)

        if cpt == 1 and debug:
            print(model)
            model_params = list(model.parameters())
            print(len(model_params))
            for model_param in model_params:
                print(model_param.size())

        # Keep only the checkpoint with the lowest validation loss.
        checkpoint_callback = ModelCheckpoint(
            dirpath=PATH_DATA+"\\Models\\",
            filename=f"{dts_name}_{suffix}_{datetime.now().strftime('%Y%m%d')}_{cpt_param}_{cpt}",
            save_top_k=1,
            verbose=True,
            monitor='val_loss',
            mode='min',
            save_last=False
        )

        early_stop_callback = EarlyStopping(
            monitor="val_loss", min_delta=0.001, patience=patience, verbose=True, mode="min")
        logger = TensorBoardLogger(tb_directory, name="my_model")
        trainer = pl.Trainer(max_epochs=epochs, callbacks=[
                             early_stop_callback, checkpoint_callback], logger=logger)

        trainer.fit(model, train_loader, val_loader)

        # Store the architecture as text next to the run so that it can be read
        # back in TensorBoard (HTML line breaks / non-breaking spaces keep the layout).
        writer = SummaryWriter(log_dir=tb_directory+"/model_summary")
        model_summary = str(model).replace(
            '\n', '<br/>').replace(' ', '&nbsp;')
        writer.add_text("model_v"+str(logger.version), model_summary)
        writer.close()

        # Evaluate the best checkpoint, not the weights of the last epoch.
        print(f"{checkpoint_callback.best_model_path=}")
        best_model = DynamicLSTMModel.load_from_checkpoint(
            checkpoint_callback.best_model_path)
        result = trainer.test(best_model, dataloaders=test_loader)

        if result[0]['test_acc_epoch'] > obj_acc:
            # Confusion matrix on the validation set: inference mode (dropout
            # off, no autograd graph) and on whatever device the model sits on.
            best_model.eval()
            with torch.no_grad():
                y_pred = best_model(x_val_tensor.to(best_model.device)).cpu()
            y_pred_classes = y_pred.argmax(dim=1)
            confusion = metrics.confusion_matrix(
                y_val_tensor.numpy(), y_pred_classes.numpy())

            print(confusion)

            check_class = True

            # Reject models that (almost) never predict one of the classes.
            for i in range(num_classes):
                nb_lab = int((y_pred_classes == i).sum())
                if nb_lab < check_class_limit:
                    check_class = False
                    print(
                        f"Check class {i=} {nb_lab=} {check_class=} {check_class_limit=}")

            # check saved model, load to check it's OK
            if check_class:
                # Persist the model that was just validated (best checkpoint).
                torch.save(best_model, filename_tmp_model)
                # The file holds a fully pickled module written a line above,
                # so it is trusted; weights_only=False is required because
                # recent torch versions refuse such files by default.
                saved_model = torch.load(filename_tmp_model, weights_only=False)
                saved_model.eval()
                with torch.no_grad():
                    y_pred = saved_model(x_val_tensor.to(saved_model.device)).cpu()
                y_pred_classes = y_pred.argmax(dim=1)
                confusion = metrics.confusion_matrix(
                    y_val_tensor.numpy(), y_pred_classes.numpy())
                print(confusion)

    # Every attempt failed for this parameter set: move on to the next one.
    if not check_class:
        cpt_param += 1
        print(f"Optim fail {cpt=} param suivant {cpt_param=}")

In [None]:
# Load the TensorBoard notebook extension and open the dashboard on the
# "tb_logs" directory (the value of `tb_directory` in the training cell).
%load_ext tensorboard
%tensorboard --logdir tb_logs

In [None]:
########################################
###### SAVE CODE FOR BASIC PYTORCH #####
###### BEFORE PYTORCH LIGHTNING ########

# Parameter sets tried in order by the plain-PyTorch loop of this cell.
# Keys: dropout / hidden size / layer count of LSTMModel and the Adam learning rate.
list_param_valid = [
    dict(
        model__dropout=0.05,
        model__hidden_dim=16,
        model__num_layers=2,
        optimizer__lr=0.1,
    ),
]

# Define LSTM model
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: size of the last axis of the input (features per time step).
        hidden_dim: hidden size of the LSTM.
        num_layers: number of stacked LSTM layers.
        num_classes: size of the output vector produced for each sample.
        dropout: dropout value passed to nn.LSTM.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_classes, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return the linear-layer output for the last time step of ``x``.

        ``x`` is indexed as ``out[:, -1, :]`` after the batch-first LSTM, i.e.
        it is a batched 3-D tensor.
        """
        # No explicit (h0, c0): nn.LSTM starts from zero states that follow the
        # dtype and device of the input. The hand-built zero tensors were always
        # float32 on the CPU, which fails once the model or the data is moved
        # to another device or precision.
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])

# ---- Run settings ----------------------------------------------------------
input_dim = x_train_tensor.shape[2]  # feature count = last axis of the training tensor
num_classes = 5
epochs = 6  # 350
suffix = "lstm_pytorch_v1"
filename_tmp_model = dts_name+"_"+suffix+".pckl"
patience = 3  # NOTE(review): not used by this loop

val_accuracy = 0.0
obj_acc = 0.25  # minimum validation accuracy before a model is examined further
cpt_param = 0  # index of the parameter set currently being tried
try_limit = 5  # training attempts per parameter set
pct_check_class = 0.4  # check if at least n% of the validation set per class
criterion = nn.CrossEntropyLoss()

len_val = x_val_tensor.shape[0]
# Minimum number of validation samples that must be predicted in each class.
check_class_limit = (len_val/num_classes)*pct_check_class
check_class = False  # becomes True once a model passes both acceptance checks

while cpt_param < len(list_param_valid) and not check_class:
    param_valid = list_param_valid[cpt_param]  # select the current param line
    print(param_valid)
    cpt = 0

    while cpt < try_limit and not check_class:
        cpt += 1

        model = LSTMModel(input_dim=input_dim, hidden_dim=param_valid['model__hidden_dim'],
                          num_layers=param_valid['model__num_layers'], num_classes=num_classes,
                          dropout=param_valid['model__dropout'])
        optimizer = Adam(model.parameters(), lr=param_valid['optimizer__lr'])

        if cpt == 1:
            print(model)
            model_params = list(model.parameters())
            print(len(model_params))
            for model_param in model_params:
                print(model_param.size())

        # Training loop
        hist = np.zeros(epochs)  # loss of the last batch of each epoch
        model.train()  # once is enough: eval() is only called after the last epoch
        for epoch in range(epochs):
            for x_batch, y_batch in train_loader:
                optimizer.zero_grad()
                outputs = model(x_batch)
                loss = criterion(outputs, y_batch)

                loss.backward()
                optimizer.step()

            print(f"Epoch {epoch+1} CrossEntropyLoss: {loss.item()}")
            hist[epoch] = loss.item()

        # Validation
        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for x_batch, y_batch in val_loader:
                outputs = model(x_batch)
                predicted = outputs.argmax(dim=1)
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()

        val_accuracy = correct / total
        print(f"Epoch {epoch+1}, Loss: {loss.item()}, Validation Accuracy: {val_accuracy}")

        if val_accuracy > obj_acc:
            print(f"Optim success {cpt=} {val_accuracy=}")
            check_class = True  # exit directly

            # Confusion matrix on the full validation set; the model is already
            # in eval mode and no autograd graph is needed for predictions.
            with torch.no_grad():
                y_pred = model(x_val_tensor)
            y_pred_classes = y_pred.argmax(dim=1)
            confusion = metrics.confusion_matrix(
                y_val_tensor.numpy(), y_pred_classes.numpy())
            print(confusion)

            # Reject models that (almost) never predict one of the classes.
            for i in range(num_classes):
                nb_lab = int((y_pred_classes == i).sum())
                if nb_lab < check_class_limit:
                    check_class = False
                    print(f"Check class {i=} {nb_lab=} {check_class=} {check_class_limit=}")

            # check saved model, load to check it's OK
            if check_class:
                torch.save(model, filename_tmp_model)
                # The file holds a fully pickled module written a line above,
                # so it is trusted; weights_only=False is required because
                # recent torch versions refuse such files by default.
                saved_model = torch.load(filename_tmp_model, weights_only=False)
                saved_model.eval()
                with torch.no_grad():
                    y_pred = saved_model(x_val_tensor)
                y_pred_classes = y_pred.argmax(dim=1)
                confusion = metrics.confusion_matrix(
                    y_val_tensor.numpy(), y_pred_classes.numpy())
                print(confusion)

    # Every attempt failed for this parameter set: move on to the next one.
    if not check_class:
        cpt_param += 1
        print(f"Optim fail {cpt=} param suivant {cpt_param=}")

In [None]:
# Training-loss curve of the most recently trained plain-PyTorch model.
# `hist` holds one value per epoch: the loss of the last batch of that epoch.
plt.plot(hist, label="Training loss")
plt.xlabel("Epoch (0-based)")
plt.ylabel("CrossEntropyLoss (last batch of the epoch)")
plt.title("Training loss per epoch")
plt.legend()
plt.show()

In [None]:
# Legacy Keras/TensorFlow LSTM experiment.
# NOTE(review): Sequential, LSTM, Dropout, Dense, x_train, sequence_length,
# x_train_lstm, y_train_lstm, x_val_lstm and y_val_lstm are not defined in the
# visible part of this notebook - confirm where they come from before running.
# NOTE(review): this cell overwrites input_dim, num_classes (4 here, 5 in the
# PyTorch cells), val_accuracy and epochs (which becomes a range at the end).
input_dim = x_train.shape[-1]
window_size = sequence_length
dropout = 0.2
num_classes = 4

# cat_y_train = keras.utils.to_categorical(col_y_train, num_classes)
# cat_y_valid = keras.utils.to_categorical(col_y_valid, num_classes)

# df_x_train_exp = np.expand_dims(df_x_train, axis=2)
# df_x_valid_exp = np.expand_dims(df_x_valid, axis=2)


# Single LSTM layer (20 units, last output only) -> dropout -> softmax classifier.
model_LSTM = Sequential()
model_LSTM.add(LSTM(units=20, return_sequences=False,#True
               input_shape=(window_size, input_dim)))
#,kernel_regularizer=l2(0.1), recurrent_regularizer=l2(0.1), bias_regularizer=l2(0.1)
model_LSTM.add(Dropout(rate=dropout))   
# model_LSTM.add(Dropout(rate=dropout))
# model_LSTM.add(Bidirectional(LSTM((window_size * 2), return_sequences=True)))
# model_LSTM.add(Dropout(rate=dropout))
# model_LSTM.add(Bidirectional(LSTM(window_size, return_sequences=False)))
model_LSTM.add(Dense(units=num_classes, activation='softmax'))

# NOTE(review): 'categorical_crossentropy' presumably requires one-hot encoded
# targets - confirm how y_train_lstm / y_val_lstm are built.
model_LSTM.compile(loss='categorical_crossentropy',
                   optimizer='adam', metrics=['accuracy'])

# Train without shuffling for 20 epochs and evaluate on the validation arrays
# after every epoch.
history = model_LSTM.fit(x_train_lstm, y_train_lstm, batch_size=1024,
                         shuffle=False, epochs=20, validation_data=(x_val_lstm, y_val_lstm))#,verbose=0

train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# Plot training and validation accuracy per epoch
epochs = range(1, len(train_accuracy) + 1)
plt.plot(epochs, train_accuracy, 'bo-', label='Training accuracy')
plt.plot(epochs, val_accuracy, 'ro-', label='Validation accuracy')
plt.legend()
plt.show()


In [None]:
# List the compute devices TensorFlow can see; a GPU entry means Keras can use
# the GPU to train the model.
from tensorflow.python.client import device_lib

visible_devices = device_lib.list_local_devices()
print(visible_devices)

