In [1]:
import os
from datetime import datetime
import pandas as pd
import numpy as np
import joblib
import importlib
import ast
from pathlib import Path
from sqlalchemy.orm import sessionmaker
import matplotlib.pyplot as plt
from pprint import pprint
import seaborn as sns
import gc

from sklearn import metrics

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset,RandomSampler
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

import sqlite_io as sio
import add_indicators as indic
import split_merge as sm
import balance  # wait for new release https://github.com/scikit-learn-contrib/imbalanced-learn/issues/1081
import model_mngr as modmgr

importlib.reload(sio)
importlib.reload(modmgr)
importlib.reload(sm)
importlib.reload(balance)

<module 'balance' from 'C:\\Projets\\MarketDataEnrichment\\dataset_mngr\\balance.py'>

In [2]:
PATH_DATA = "C:\\Projets\\Data"
PATH_DB_FWK="C:\\Projets\\Data\\sqlite\\dataset_market.db"
PATH_DB_STOCK="C:\\Projets\\Data\\sqlite\\dataset_paris_stock_adjusted.db"
PATH_DATA_DTS=PATH_DATA+"\\DTS_FULL\\"

SUFFIX_TRAIN="_TRAIN.zip"
SUFFIX_VAL="_VAL.zip"
SUFFIX_CONF="_CONF.zip"

CONNECTION TO SQLITE

In [8]:
if "con_stock" in locals():
        sio.close_connection(con_stock)
con_stock = sio.get_connection(str_db_path=PATH_DB_STOCK)

if "con_fwk" in locals():
        sio.close_connection(con_fwk)
con_fwk = sio.get_connection(str_db_path=PATH_DB_FWK)

my_session_maker = sessionmaker(bind=con_fwk)
session=my_session_maker()

table_stock="DS_PARIS_1D_ADJ_CLEAN"

GET DATA

In [None]:
dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"

df_base=sio.get_candles_to_df(session=session,con=con_stock, target_table=table_stock,tradable=True)
df_work=pd.DataFrame()
for code_value in df_base.index.get_level_values('CODE').unique():
    sub_df=df_base[df_base.index.get_level_values('CODE') == code_value]
    df_work_tmp = indic.add_indicators_to_df(con=con_fwk, df_in=sub_df, dts_name=dts_name,symbol=multi_symbol)
    df_work = pd.concat([df_work, df_work_tmp])
    
df_work.sort_index(inplace=True)
df_work.info() 

In [8]:
# df_work[10000:10010]
# pd.set_option('display.max_columns', None)
# print(df_work.describe())

df_work.round(5).to_csv(
    PATH_DATA_DTS+dts_name+"_BASE.zip", sep=",")

START HERE FOR BASE DATASET

In [4]:
dts_name="PARIS_TREND_1D_20D_V2"
# dts_name="PARIS_TREND_1D_50D_V1"
multi_symbol="PARIS_STOCK"

df_work=pd.read_csv(PATH_DATA_DTS+dts_name+"_BASE.zip",sep=",",index_col=["OPEN_DATETIME","CODE"],parse_dates=["OPEN_DATETIME"])
df_work.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,OPEN,HIGH,LOW,CLOSE,VOLUME,sma20,pos_sma20,sma50,sma200,pos_sma50,...,adx14,adx14_neg,adx14_pos,adx14_dif,avg_vol14,pos_avg_vol14,pos_sma20_200,williamsr_14,perf_sma_50_5d,perf_sma_200_5d
OPEN_DATETIME,CODE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2010-04-26,AB.PA,12.98,12.98,12.2,12.68,62866.0,,,12.68,,0.0,...,0.0,0.0,0.0,0.0,,,,,,
2010-04-27,AB.PA,12.74,12.83,12.61,12.7,22370.0,,,12.69,,0.00079,...,0.0,0.0,0.0,0.0,,,,,,
2010-04-28,AB.PA,12.7,12.7,12.41,12.5,8211.0,,,12.62667,,-0.01003,...,0.0,0.0,0.0,0.0,,,,,,
2010-04-29,AB.PA,12.6,12.65,12.46,12.64,4676.0,,,12.63,,0.00079,...,0.0,0.0,0.0,0.0,,,,,,
2010-04-30,AB.PA,12.63,12.71,12.55,12.65,4470.0,,,12.634,,0.00127,...,0.0,0.0,0.0,0.0,,,,,,


Check Dataframe

In [5]:
# drop rows with no pos_sma200 
df_work=df_work.dropna(subset=['pos_sma200'])

# if williamsr_14 >0 =0 if williamsr_14<-100 = -100
# df_work['williamsr_14']=df_work['williamsr_14'].apply(lambda x: 0 if x>0 else x)
# df_work['williamsr_14']=df_work['williamsr_14'].apply(lambda x: -100 if x<-100 else x)

# if williamsr_14 >0 =0 if williamsr_14<-100 = -100
df_work.loc[df_work['williamsr_14'] > 0, 'williamsr_14'] = 0
df_work.loc[df_work['williamsr_14'] < -100, 'williamsr_14'] = -100

# print min and max of the columns williamsr_14, perf_sma_50_5d, perf_sma_200_5d
# print(f"{df_work['williamsr_14'].min()=}")  inf-100
# print(f"{df_work['williamsr_14'].max()=}") sup 0

# df_check=df_work[df_work['perf_sma_50_5d'] > 1]
# df_check=df_check[df_check['ret_1d'] <= 2]
# print(df_check.index.get_level_values('CODE').unique())
# df_check[df_check.index.get_level_values('CODE')=='AI.PA']
# df_check.head(5)
# df_check=df_work[df_work.index.get_level_values('CODE')=='AI.PA']
# CATG
# mask = df_work['stdev20_1d'] > 1000
# df_work.drop(df_work[mask].index, inplace=True)
# df_check[6000:6010]


In [10]:
df_work[10000:10010]

Unnamed: 0_level_0,Unnamed: 1_level_0,OPEN,HIGH,LOW,CLOSE,VOLUME,pos_sma20,pos_sma50,pos_sma200,pos_sma50_200,pos_sma20_50,...,pos_donchian20_lo,adx14,adx14_neg,adx14_pos,adx14_dif,pos_avg_vol14,pos_sma20_200,williamsr_14,perf_sma_50_5d,perf_sma_200_5d
OPEN_DATETIME,CODE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-06-02,ABEO.PA,7.6136,7.9789,7.6136,7.9789,6332.0,0.09139,-0.03555,-0.53194,-0.51468,-0.11631,...,0.26911,26.15567,22.77986,24.12433,1.34447,0.93774,-0.57113,-31.8163,-0.03229,-0.02455
2020-06-03,ABEO.PA,8.1711,8.671,8.1519,8.4595,8982.0,0.14359,0.02936,-0.50131,-0.51553,-0.09989,...,0.34555,25.89215,20.36515,32.16729,11.80214,1.50382,-0.56393,-11.45907,-0.03301,-0.02453
2020-06-04,ABEO.PA,8.5557,8.5557,7.7866,8.4019,7735.0,0.12282,0.02657,-0.50225,-0.51513,-0.08572,...,0.28909,24.74737,23.41954,28.54604,5.12651,1.39693,-0.55669,-16.46677,-0.03314,-0.02459
2020-06-05,ABEO.PA,8.4211,8.7864,8.248,8.5749,6423.0,0.13198,0.05137,-0.48943,-0.51437,-0.07121,...,0.30793,24.10544,21.58743,29.66498,8.07755,1.15302,-0.54895,-12.08848,-0.03011,-0.02463
2020-06-08,ABEO.PA,8.7672,9.0363,8.5941,8.7479,19545.0,0.14221,0.07638,-0.4765,-0.51365,-0.05763,...,0.33431,23.9359,20.13379,31.31449,11.18069,3.09019,-0.54168,-14.42361,-0.02444,-0.02464
2020-06-09,ABEO.PA,8.7479,9.0363,8.3826,8.8056,8807.0,0.13831,0.08706,-0.47041,-0.51282,-0.04502,...,0.34312,23.25077,21.27235,28.39698,7.12462,1.37399,-0.53476,-11.53788,-0.02087,-0.02461
2020-06-10,ABEO.PA,9.0363,9.0363,8.2673,8.6518,6484.0,0.10674,0.06883,-0.47704,-0.51072,-0.03425,...,0.26761,22.33359,20.60949,25.39907,4.78959,1.06259,-0.52748,-19.22981,-0.01504,-0.02473
2020-06-11,ABEO.PA,8.5557,8.5557,8.0173,8.1711,5301.0,0.03645,0.01229,-0.50358,-0.5096,-0.02332,...,0.19718,20.89547,22.22109,23.2208,0.99971,0.83871,-0.52104,-50.00289,-0.01374,-0.02486
2020-06-12,ABEO.PA,7.9981,8.075,7.5175,7.6328,11060.0,-0.03547,-0.05092,-0.53389,-0.50888,-0.01602,...,0.0847,20.25323,26.93381,21.20351,-5.7303,1.63685,-0.51675,-81.1131,-0.01393,-0.02496
2020-06-15,ABEO.PA,7.9789,7.9789,7.5944,7.5944,5996.0,-0.04161,-0.05262,-0.53395,-0.50807,-0.01149,...,0.07924,19.65686,25.52877,20.0974,-5.43137,0.87697,-0.51372,-91.4621,-0.01365,-0.02485


In [9]:
df_work = indic.drop_indicators_by_type(
    con=con_fwk, df_in=df_work, dts_name=dts_name, symbol=multi_symbol, ind_type=0)
list_label = indic.get_ind_list_by_type_for_dts(
    con=con_fwk, dts_name=dts_name, symbol_code=multi_symbol, ind_type=2)
print(list_label)

           LABEL
0   lab_perf_20d
1   lab_perf_50d
2  lab_perf_125d


In [11]:
# df_work=df_work.droplevel('CODE') !!!!!!
df_work.sort_index(inplace=True)
df_work[10000:10010]

Unnamed: 0_level_0,Unnamed: 1_level_0,OPEN,HIGH,LOW,CLOSE,VOLUME,pos_sma20,pos_sma50,pos_sma200,pos_sma50_200,pos_sma20_50,...,pos_donchian20_lo,adx14,adx14_neg,adx14_pos,adx14_dif,pos_avg_vol14,pos_sma20_200,williamsr_14,perf_sma_50_5d,perf_sma_200_5d
OPEN_DATETIME,CODE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1995-11-24,BN.PA,5.3121,5.3292,5.3035,5.3121,1112239.0,0.01177,0.01509,0.03347,0.01811,0.00328,...,0.0784,20.2887,17.98263,24.68813,6.70549,0.57971,0.02145,-43.92655,-0.00423,0.00475
1995-11-24,BOI.PA,2.4317,2.4317,2.3817,2.3817,3392.0,-0.03791,-0.04912,0.10459,0.16165,-0.01165,...,0.0,40.6824,27.6034,14.7141,-12.8893,0.08447,0.14812,-100.0,-0.00193,0.008
1995-11-24,CDI.PA,5.5472,5.5958,5.5472,5.5958,85024.0,0.01444,0.04278,0.15177,0.10453,0.02793,...,0.04362,14.69556,19.13562,20.95757,1.82195,0.11627,0.13538,-0.0,0.00866,0.00701
1995-11-24,ELEC.PA,3.9661,3.9661,3.9661,3.9661,162.0,-0.02968,-0.0517,-0.03635,0.01619,-0.0227,...,0.02507,14.74654,55.61796,43.88082,-11.73714,0.63943,-0.00688,-71.42857,-0.00965,0.00166
1995-11-24,GFC.PA,3.3486,3.3486,3.2076,3.2093,32614.0,0.00242,0.03218,0.24682,0.20795,0.02969,...,0.06339,33.095,12.06264,26.76666,14.70403,2.22395,0.24381,-61.60998,0.0024,0.01102
1995-11-24,LAT.PA,24.8529,24.8529,24.8529,24.8529,154.0,-0.00517,-0.01589,-0.05827,-0.04307,-0.01078,...,0.05672,11.41274,49.78475,43.88114,-5.90361,0.26857,-0.05338,-31.34328,-0.00242,-0.00235
1995-11-24,LI.PA,2.0554,2.0554,2.0435,2.0435,143309.0,0.01303,-0.00332,-0.02706,-0.02382,-0.01615,...,0.03726,36.83981,9.74307,17.22198,7.47891,2.38002,-0.03958,-41.56051,-0.00162,-0.00054
1995-11-24,RE.PA,4.871,4.871,4.7188,4.7188,33953.0,0.00203,-0.01415,-0.00043,0.01391,-0.01614,...,0.06898,12.87952,19.22271,23.88377,4.66107,0.72411,-0.00245,-44.43796,-0.01148,0.00182
1995-11-24,SAVE.PA,15.3615,15.6143,15.3615,15.3615,28928.0,0.03593,0.05608,0.05257,-0.00332,0.01945,...,0.08714,48.09569,4.50842,30.13034,25.62192,0.41972,0.01606,-36.89894,0.00149,0.00358
1995-11-24,TEP.PA,1.4524,1.4524,1.3325,1.3325,83775.0,-0.13757,-0.15298,-0.16537,-0.01463,-0.01787,...,0.0,26.08714,65.25452,17.32163,-47.93289,3.00736,-0.03224,-100.0,-0.00313,-0.00357


In [13]:
lab_studied = "lab_perf_20d"
algo_studied = "LSTM_CLASS"
dts_name="PARIS_TREND_1D_20D_V2"

df_work_lab = indic.drop_indicators_not_selected(con=con_fwk, df_in=df_work, dts_name=dts_name, symbol=multi_symbol,label=lab_studied,algo=algo_studied)
# print(df_work_lab.head(5))

# move CODE to column to be able to slit the dataset
df_work_lab['TICKER'] = df_work_lab.index.get_level_values('CODE')
df_work_lab=df_work_lab.droplevel('CODE')

df_split=sm.split_df_by_label_strat(
    df_in=df_work_lab, list_label=[lab_studied], split_timeframe="M",random_split=False,split_strat=(80,10,10))
df_selected = df_split['df_'+lab_studied+'_train']
df_valid = df_split['df_'+lab_studied+'_valid']
df_confirm = df_split['df_'+lab_studied+'_confirm']
df_selected.sort_index(inplace=True)
df_valid.sort_index(inplace=True)
df_confirm.sort_index(inplace=True)

print(f"selected: {df_selected.shape=} valid: {df_valid.shape=} confirm: {df_confirm.shape=}")
df_selected[10000:10010]

selected: df_selected.shape=(844051, 29) valid: df_valid.shape=(233081, 29) confirm: df_confirm.shape=(247146, 29)


Unnamed: 0_level_0,pos_sma20,pos_sma50,pos_sma200,rsi14,sma20_rsi14,ret_5d,pos_top20,pos_top50,pos_bot20,pos_bot50,...,cmf_20,adx14,adx14_neg,adx14_pos,adx14_dif,pos_avg_vol14,pos_sma20_200,perf_sma_50_5d,perf_sma_200_5d,TICKER
OPEN_DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996-01-04,0.03032,0.0445,0.04734,68.61368,57.45887,0.02203,0.0,0.0,0.06381,0.10456,...,0.68106,34.559,16.30154,32.80825,16.50671,1.43364,0.01652,0.01085,0.0035,SAVE.PA
1996-01-04,0.00911,-0.01411,-0.06019,51.10742,48.38847,0.0184,-0.03493,-0.11954,0.05738,0.10499,...,-0.00654,21.08814,30.49097,26.14443,-4.34654,0.10186,-0.06867,-0.01641,-0.00343,TEP.PA
1996-01-04,0.0506,0.03251,0.1697,58.77622,45.64184,0.03184,-0.02624,-0.02624,0.10498,0.10498,...,0.11007,19.47531,27.82168,32.68546,4.86378,1.73709,0.11336,0.0085,0.01067,TFI.PA
1996-01-04,0.01749,-0.03933,0.04489,49.29749,38.37917,0.04334,-0.04469,-0.10454,0.0657,0.0657,...,0.24571,22.52874,21.82438,25.90113,4.07675,1.35506,0.02694,-0.00896,0.00673,VIRP.PA
1996-01-05,0.01285,0.00631,-0.00376,52.83213,49.86578,-0.00893,-0.02243,-0.03173,0.04537,0.05623,...,0.14429,14.92596,28.76524,30.40562,1.64038,0.50658,-0.01641,0.0069,0.00293,BN.PA
1996-01-05,0.03801,0.02653,0.10581,61.28371,49.0916,0.03733,0.0,-0.03275,0.05942,0.07388,...,0.50814,29.47521,17.54691,16.77613,-0.77078,0.12624,0.06531,-0.00504,0.00746,BOI.PA
1996-01-05,0.03567,0.12029,0.28952,66.79495,71.57781,0.00596,-0.01651,-0.01651,0.11823,0.22153,...,0.42051,41.67504,14.69324,43.10116,28.40792,0.71424,0.24511,0.02108,0.01041,CDI.PA
1996-01-05,-0.00497,-0.04962,-0.10276,44.68482,41.48714,0.00438,-0.04037,-0.11778,0.00438,0.00438,...,0.05534,10.2024,38.68822,47.13757,8.44935,1.14001,-0.09827,-0.00897,0.00085,ELEC.PA
1996-01-05,0.01013,0.0241,0.19591,57.52017,55.21953,0.01443,-0.01506,-0.05496,0.04269,0.0758,...,-0.01669,26.80769,14.80556,34.98566,20.1801,0.10965,0.18391,0.00753,0.01116,GFC.PA
1996-01-05,0.06142,0.06319,0.01671,63.83981,49.25421,0.0,0.0,0.0,0.17008,0.17008,...,0.08836,15.6068,35.58655,52.39946,16.81291,0.95004,-0.04213,0.00509,-0.0004,LAT.PA


In [14]:
label=lab_studied
df_class=balance.add_class_by_lab_nb_lines(df_in=df_selected,str_label=lab_studied,nb_class=5,bool_replace_label=False)
min_max_lab_by_class = df_class.groupby(label+'_class')[label].agg(['min', 'max'])
print(min_max_lab_by_class)

                        min      max
lab_perf_20d_class                  
0                  -0.80165 -0.05202
1                  -0.05201 -0.00892
2                  -0.00891  0.02358
3                   0.02359  0.07135
4                   0.07136  3.82176


In [16]:
label=lab_studied
df_class=balance.add_class_by_lab_nb_lines(df_in=df_selected,str_label=lab_studied,nb_class=5,bool_replace_label=True)
df_class.sort_index(inplace=True)
# categ_50={0:[-1,-0.05456],1:[-0.07876,-0.00783],2:[-0.00783,0.04790],3:[0.04790,0.12406],4:[0.12406,6]}
categ_20={0:[-1,-0.0520],1:[-0.0520,-0.0089],2:[-0.0089,0.0235],3:[0.0235,0.0713],4:[0.0713,4]}
df_class_val=balance.add_lab_by_class(df_in=df_valid,str_label=lab_studied, categ=categ_20,bool_replace_label=True) # categ
df_class_val.sort_index(inplace=True)
df_class_conf=balance.add_lab_by_class(df_in=df_confirm,str_label=lab_studied, categ=categ_20,bool_replace_label=True) # categ
df_class_conf.sort_index(inplace=True)
print(df_class.loc[:, label].dropna().iloc[[0, -1]])
print(df_class_val.loc[:, label].dropna().iloc[[0, -1]])
print(df_class_conf.loc[:, label].dropna().iloc[[0, -1]])
# df_class_clean=df_class.drop(['OPEN','HIGH','LOW','CLOSE','VOLUME','lab_perf_125d','lab_perf_20d','lab_perf_50d'],axis=1)
data = df_class[label]
print(data.value_counts().sort_index())
data_val = df_class_val[label]
print(data_val.value_counts().sort_index())
data_conf = df_class_conf[label]
print(data_conf.value_counts().sort_index())
df_class[10000:10010]
# min_max_lab_by_class = df_class.groupby(label+'_class')[label].agg(['min', 'max'])
# print(min_max_lab_by_class)

# lab_perf_20d : train min nb rows 211000 validation 53000 confirm 55000

OPEN_DATETIME
1989-10-27    4
2017-02-28    4
Name: lab_perf_20d, dtype: int64
OPEN_DATETIME
2017-03-01    4.0
2020-07-31    4.0
Name: lab_perf_20d, dtype: float64
OPEN_DATETIME
2020-08-03    3.0
2023-12-13    2.0
Name: lab_perf_20d, dtype: float64
lab_perf_20d
0    168828
1    168821
2    168810
3    168787
4    168805
Name: count, dtype: int64
lab_perf_20d
0.0    52773
1.0    48096
2.0    45225
3.0    44833
4.0    42132
Name: count, dtype: int64
lab_perf_20d
0.0    61234
1.0    48511
2.0    42878
3.0    44301
4.0    50215
Name: count, dtype: int64


Unnamed: 0_level_0,pos_sma20,pos_sma50,pos_sma200,rsi14,sma20_rsi14,ret_5d,pos_top20,pos_top50,pos_bot20,pos_bot50,...,adx14,adx14_neg,adx14_pos,adx14_dif,pos_avg_vol14,pos_sma20_200,perf_sma_50_5d,perf_sma_200_5d,TICKER,lab_perf_20d
OPEN_DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996-01-04,0.03032,0.0445,0.04734,68.61368,57.45887,0.02203,0.0,0.0,0.06381,0.10456,...,34.559,16.30154,32.80825,16.50671,1.43364,0.01652,0.01085,0.0035,SAVE.PA,3
1996-01-04,0.00911,-0.01411,-0.06019,51.10742,48.38847,0.0184,-0.03493,-0.11954,0.05738,0.10499,...,21.08814,30.49097,26.14443,-4.34654,0.10186,-0.06867,-0.01641,-0.00343,TEP.PA,2
1996-01-04,0.0506,0.03251,0.1697,58.77622,45.64184,0.03184,-0.02624,-0.02624,0.10498,0.10498,...,19.47531,27.82168,32.68546,4.86378,1.73709,0.11336,0.0085,0.01067,TFI.PA,2
1996-01-04,0.01749,-0.03933,0.04489,49.29749,38.37917,0.04334,-0.04469,-0.10454,0.0657,0.0657,...,22.52874,21.82438,25.90113,4.07675,1.35506,0.02694,-0.00896,0.00673,VIRP.PA,4
1996-01-05,0.01285,0.00631,-0.00376,52.83213,49.86578,-0.00893,-0.02243,-0.03173,0.04537,0.05623,...,14.92596,28.76524,30.40562,1.64038,0.50658,-0.01641,0.0069,0.00293,BN.PA,2
1996-01-05,0.03801,0.02653,0.10581,61.28371,49.0916,0.03733,0.0,-0.03275,0.05942,0.07388,...,29.47521,17.54691,16.77613,-0.77078,0.12624,0.06531,-0.00504,0.00746,BOI.PA,3
1996-01-05,0.03567,0.12029,0.28952,66.79495,71.57781,0.00596,-0.01651,-0.01651,0.11823,0.22153,...,41.67504,14.69324,43.10116,28.40792,0.71424,0.24511,0.02108,0.01041,CDI.PA,4
1996-01-05,-0.00497,-0.04962,-0.10276,44.68482,41.48714,0.00438,-0.04037,-0.11778,0.00438,0.00438,...,10.2024,38.68822,47.13757,8.44935,1.14001,-0.09827,-0.00897,0.00085,ELEC.PA,2
1996-01-05,0.01013,0.0241,0.19591,57.52017,55.21953,0.01443,-0.01506,-0.05496,0.04269,0.0758,...,26.80769,14.80556,34.98566,20.1801,0.10965,0.18391,0.00753,0.01116,GFC.PA,3
1996-01-05,0.06142,0.06319,0.01671,63.83981,49.25421,0.0,0.0,0.0,0.17008,0.17008,...,15.6068,35.58655,52.39946,16.81291,0.95004,-0.04213,0.00509,-0.0004,LAT.PA,4


In [17]:
#  SAVE DATASETS

df_class.round(5).to_csv(
    PATH_DATA_DTS+dts_name+SUFFIX_TRAIN, sep=",")
df_class_val.round(5).to_csv(
    PATH_DATA_DTS+dts_name+SUFFIX_VAL, sep=",")
df_class_conf.round(5).to_csv(
    PATH_DATA_DTS+dts_name+SUFFIX_CONF, sep=",")

Calculate and save scaler

In [18]:
dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_20d"

df_class=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_TRAIN,sep=",",index_col=["OPEN_DATETIME","TICKER"],parse_dates=["OPEN_DATETIME"])
df_class=df_class.dropna(subset=[label])
df_class=df_class.sort_index()

df_norm,norm_scaler= balance.normalize_df(df_in=df_class,str_label=label,tuple_ft_range=(-1,1))

file_name=dts_name+"_train_colab_lstm_norm_2405"
scaler_name=file_name+"_scaler.save"
joblib.dump(norm_scaler,filename=PATH_DATA_DTS+scaler_name)

# df_class_val=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_VAL,sep=",",index_col=["OPEN_DATETIME"],parse_dates=["OPEN_DATETIME"])
# df_class_val.dropna(subset=[label], inplace=True)
# df_class_val.sort_index(inplace=True)

# list_feat = df_class.columns.values.tolist()
# list_feat.remove(label)
# X, y = sm.split_df_x_y(
#     df_in=df_class, list_features=list_feat, str_label=label, drop_na=True)
# nb_val=211000
# method = RandomUnderSampler(sampling_strategy={0:nb_val,1:nb_val,2:nb_val,3:nb_val}) 
# df_x_train, col_y_train=  method.fit_resample(X, y)
# print(col_y_train.value_counts().sort_index())

# X, y = sm.split_df_x_y(
#     df_in=df_class_val, list_features=list_feat, str_label=label, drop_na=True)
# nb_val=53000
# method = RandomUnderSampler(sampling_strategy={0:nb_val,1:nb_val,2:nb_val,3:nb_val}) # 53000 pour lab 20 et nn pour lab 50
# df_x_val, col_y_val=  method.fit_resample(X, y)
# print(col_y_val.value_counts().sort_index())

['C:\\Projets\\Data\\DTS_FULL\\PARIS_TREND_1D_20D_V2_train_colab_lstm_norm_2405_scaler.save']

Load train et val df, normalize,  undersample  and preparation for LSTM

In [None]:
##########  SAVE   ##########
#############################

dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_20d"
file_name=dts_name+"_train_colab_lstm_norm_2405"
scaler_name=file_name+"_scaler.save"
scaler=joblib.load(PATH_DATA_DTS+scaler_name)


df_class=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_TRAIN,sep=",",index_col=["OPEN_DATETIME","TICKER"],parse_dates=["OPEN_DATETIME"])
df_class=df_class.dropna(subset=[label])
df_class=df_class.loc['1995-01-01':] # drop rows < 1995-01-01
df_class=df_class.sort_index()
df_class_val=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_VAL,sep=",",index_col=["OPEN_DATETIME","TICKER"],parse_dates=["OPEN_DATETIME"])
df_class_val=df_class_val.dropna(subset=[label])
df_class_val=df_class_val.sort_index()

# normalize df_class and df_class_val
df_class_train_norm=balance.normalize_df_scaler(df_in=df_class, str_label=label,scaler=scaler)
df_class_val_norm=balance.normalize_df_scaler(df_in=df_class_val, str_label=label,scaler=scaler)

list_feat = df_class.columns.values.tolist()
list_feat.remove(label)

df_x_train, col_y_train = sm.split_df_x_y(
    df_in=df_class_train_norm, list_features=list_feat, str_label=label, drop_na=True)
# nb_val=100000#208000
# method = RandomUnderSampler(sampling_strategy={0:nb_val,1:nb_val,2:nb_val,3:nb_val}) 
# df_x_train, col_y_train=  method.fit_resample(X, y)
# print(col_y_train.value_counts().sort_index())

df_x_val, col_y_val = sm.split_df_x_y(
    df_in=df_class_val_norm, list_features=list_feat, str_label=label, drop_na=True)
# nb_val=25000#53000
# method = RandomUnderSampler(sampling_strategy={0:nb_val,1:nb_val,2:nb_val,3:nb_val}) # 53000 pour lab 20 et nn pour lab 50
# df_x_val, col_y_val=  method.fit_resample(X, y)
# print(col_y_val.value_counts().sort_index())

sequence_length = 10

x_train=df_x_train.values
y_train=col_y_train.values
x_val=df_x_val.values
y_val=col_y_val.values
x_train_lstm,y_train_lstm=sm.prepare_sequences(x_train,y_train,sequence_length)
x_val_lstm,y_val_lstm=sm.prepare_sequences(x_val,y_val,sequence_length)

x_train_tensor = torch.tensor(x_train_lstm, dtype=torch.float)
y_train_tensor = torch.tensor(y_train_lstm, dtype=torch.float)
x_val_tensor = torch.tensor(x_val_lstm, dtype=torch.float)
y_val_tensor = torch.tensor(y_val_lstm, dtype=torch.float)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

# initiate a pytorch randomsampler for train data
train_sampler = RandomSampler(train_dataset,num_samples=100000,replacement=True)

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=False,sampler=train_sampler,drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=512,drop_last=True)

#######################################
### SAVE SEQUENCE MANIPULATION ########

dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_20d"
file_name=dts_name+"_train_colab_lstm_norm_2405"
col_sequence = "SEQUENCE"
df_class_train_csv=pd.read_csv(PATH_DATA_DTS+dts_name+"_TRAIN_seq_6.zip",sep=",",index_col=["TICKER","OPEN_DATETIME"],parse_dates=["OPEN_DATETIME"])
df_class_train_csv=df_class_train_csv.dropna(subset=[col_sequence])
df_class_train_csv=df_class_train_csv.sort_index()
df_class_val_csv=pd.read_csv(PATH_DATA_DTS+dts_name+"_VAL_seq_6.zip",sep=",",index_col=["TICKER","OPEN_DATETIME"],parse_dates=["OPEN_DATETIME"])
df_class_val_csv=df_class_val_csv.dropna(subset=[col_sequence])
df_class_val_csv=df_class_val_csv.sort_index()

# keep only index, label and sequence
df_class_train_csv=df_class_train_csv[[label,col_sequence]]
df_class_val_csv=df_class_val_csv[[label,col_sequence]]

# TODO !!!!!!!!!!!
# !!!!!!!!!!!!!
# df_class_train_csv['col_sequence_2'] = df_class_train_csv[col_sequence].str.replace("_", ",").apply(ast.literal_eval)
# df_class_train_csv['col_sequence_3']  = df_class_train_csv['col_sequence_2'] .apply(lambda x: np.array(x, dtype=np.float32))
# df_class_train_csv['col_sequence_2'] = df_class_train_csv[col_sequence].apply(lambda x: np.fromstring(x.strip('[]'), sep='_'))

df_class_val_csv['col_sequence_2'] = df_class_val_csv[col_sequence].str.replace("_", ",").apply(ast.literal_eval)
df_class_val_csv['col_sequence_3']  = df_class_val_csv['col_sequence_2'] .apply(lambda x: np.array(x, dtype=np.float32))


# print(f"{df_class_train_csv.shape=}")
# print(df_class_train_csv[1005:1010])
print(f"{df_class_val_csv.shape=}")
print(df_class_val_csv[1015:1020])

# decision is made between market sessions so we have shift the label of 1 day for each ticker
df_class_val_csv[label] = df_class_val_csv.groupby(level='TICKER')[label].shift(1)
df_class_val_csv=df_class_val_csv.dropna(subset=[label])
print(df_class_val_csv[1014:1019])


In [3]:
dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_20d"
file_name=dts_name+"_train_colab_lstm_norm_2405"
scaler_name=file_name+"_scaler.save"
scaler=joblib.load(PATH_DATA_DTS+scaler_name)


df_class=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_TRAIN,sep=",",index_col=["OPEN_DATETIME","TICKER"],parse_dates=["OPEN_DATETIME"])
df_class=df_class.dropna(subset=[label])
df_class=df_class.loc['1995-01-01':] # drop rows < 1995-01-01
df_class=df_class.sort_index()
df_class_val=pd.read_csv(PATH_DATA_DTS+dts_name+SUFFIX_VAL,sep=",",index_col=["OPEN_DATETIME","TICKER"],parse_dates=["OPEN_DATETIME"])
df_class_val=df_class_val.dropna(subset=[label])
df_class_val=df_class_val.sort_index()

# normalize df_class and df_class_val
df_class_train_norm=balance.normalize_df_scaler(df_in=df_class, str_label=label,scaler=scaler)
df_class_val_norm=balance.normalize_df_scaler(df_in=df_class_val, str_label=label,scaler=scaler)

print(f"{df_class_train_norm.shape=} {df_class_val_norm.shape=}")
print(df_class_train_norm[10000:10005])
# print type of index of df_class_train_norm


df_class_train_norm.shape=(837054, 28) df_class_val_norm.shape=(233059, 28)
                       pos_sma20  pos_sma50  pos_sma200     rsi14  \
OPEN_DATETIME TICKER                                                
1998-01-20    RE.PA    -0.401871  -0.530231   -0.504369  0.387174   
              SAVE.PA  -0.429049  -0.551279   -0.530901  0.333374   
              TEP.PA   -0.379269  -0.510884   -0.525588  0.509715   
              TFI.PA   -0.396483  -0.499134   -0.470829  0.504751   
              VIRP.PA  -0.414437  -0.515018   -0.598517  0.253438   

                       sma20_rsi14    ret_5d  pos_top20  pos_top50  pos_bot20  \
OPEN_DATETIME TICKER                                                            
1998-01-20    RE.PA       0.182813 -0.503245   0.983320   0.983966  -0.918870   
              SAVE.PA     0.195811 -0.515616   0.980994   0.981731  -0.967591   
              TEP.PA      0.198386 -0.495371   0.984695   0.985288  -0.902783   
              TFI.PA      0.370092 

In [4]:
print(f"{type(df_class_train_norm.index[0])= } {type(df_class_train_norm.index[1])= }")

type(df_class_train_norm.index[0])= <class 'tuple'> type(df_class_train_norm.index[1])= <class 'tuple'>


In [27]:
import gc

list_feat = df_class.columns.values.tolist()
list_feat.remove(label)

sequence_length = 10
col_sequence = "SEQUENCE"

# for each TICKER in index of df_class_train_norm, sort data with index and prepare sequences
df_class_train_norm_sorted = df_class_train_norm.sort_index(level=['TICKER', 'OPEN_DATETIME'])
df_class_val_norm_sorted = df_class_val_norm.sort_index(level=['TICKER', 'OPEN_DATETIME'])

# Prepare sequences for each TICKER
df_class_train_seq = pd.DataFrame()
cnt=0
for ticker in df_class_train_norm_sorted.index.get_level_values('TICKER').unique():
    sub_df=df_class_train_norm_sorted[df_class_train_norm_sorted.index.get_level_values('TICKER') == ticker]
    sub_df = sm.prepare_sequences_df(
        df_in=sub_df, list_features=list_feat, sequence_length=sequence_length, str_new_col=col_sequence)
    cnt+=1
    if cnt%20==0:
        print(f"time {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} {cnt=} {ticker=}")
        gc.collect()
    # if cnt==3:
    #     break
    
# concatenate all TICKER data in the same df
    df_class_train_seq = pd.concat([df_class_train_seq, sub_df])

print((f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} train seq ok"))

df_class_val_seq = pd.DataFrame()
cnt=0
for ticker in df_class_val_norm_sorted.index.get_level_values('TICKER').unique():
    sub_df=df_class_val_norm_sorted[df_class_val_norm_sorted.index.get_level_values('TICKER') == ticker]
    sub_df = sm.prepare_sequences_df(
        df_in=sub_df, list_features=list_feat, sequence_length=sequence_length, str_new_col=col_sequence)
    cnt+=1
    if cnt%20==0:
        print(f"time {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} {cnt=} {ticker=}")
        gc.collect()
    # if cnt==3:
    #     break
    
# concatenate all TICKER data in the same df
    df_class_val_seq = pd.concat([df_class_val_seq, sub_df])

print(f"{df_class_train_seq.shape=} {df_class_val_seq.shape=}")
print(df_class_train_seq[10000:10005])

# df_class_train_norm=sm.prepare_sequences_df(df_in=df_class_train_norm,list_features=list_feat,sequence_length=sequence_length,str_new_col=col_sequence)
# df_class_val_norm=sm.prepare_sequences_df(df_in=df_class_val_norm,list_features=list_feat,sequence_length=sequence_length,str_new_col=col_sequence)

# df_x_train, col_y_train = sm.split_df_x_y(
#     df_in=df_class_train_norm, list_features=list_feat, str_label=label, drop_na=True)

# df_x_val, col_y_val = sm.split_df_x_y(
#     df_in=df_class_val_norm, list_features=list_feat, str_label=label, drop_na=True)



# x_train=df_x_train.values
# y_train=col_y_train.values
# x_val=df_x_val.values
# y_val=col_y_val.values
# x_train_lstm,y_train_lstm=sm.prepare_sequences(x_train,y_train,sequence_length)
# x_val_lstm,y_val_lstm=sm.prepare_sequences(x_val,y_val,sequence_length)



time 2024-07-05 17:04:48 cnt=20 ticker='ALDEL.PA'
time 2024-07-05 17:08:38 cnt=40 ticker='ALNOV.PA'
time 2024-07-05 17:13:25 cnt=60 ticker='BEN.PA'
time 2024-07-05 17:19:47 cnt=80 ticker='CDI.PA'
time 2024-07-05 17:25:08 cnt=100 ticker='DG.PA'
time 2024-07-05 17:29:26 cnt=120 ticker='ETL.PA'
time 2024-07-05 17:33:04 cnt=140 ticker='GTT.PA'
time 2024-07-05 17:36:51 cnt=160 ticker='LBIRD.PA'
time 2024-07-05 17:42:34 cnt=180 ticker='MRN.PA'
time 2024-07-05 17:46:25 cnt=200 ticker='POXEL.PA'
time 2024-07-05 17:52:45 cnt=220 ticker='SCR.PA'
time 2024-07-05 17:59:31 cnt=240 ticker='TRI.PA'
2024-07-05 18:02:44 train seq ok
time 2024-07-05 18:03:53 cnt=20 ticker='ALCYB.PA'
time 2024-07-05 18:04:52 cnt=40 ticker='ALLDL.PA'
time 2024-07-05 18:05:56 cnt=60 ticker='ATO.PA'
time 2024-07-05 18:07:13 cnt=80 ticker='CAP.PA'
time 2024-07-05 18:08:45 cnt=100 ticker='CRLA.PA'
time 2024-07-05 18:09:58 cnt=120 ticker='ENGI.PA'
time 2024-07-05 18:11:19 cnt=140 ticker='GDS.PA'
time 2024-07-05 18:12:53 cnt=16

In [34]:
# TODO put this in a function ??
gc.collect()
def format_float(x):
    return '{:.5f}'.format(x) if x is not None else None

def array_to_string(x):
    return np.array2string(x,separator='_') if x is not None else None


vfunc = np.vectorize(format_float) 

df_class_train_seq2=df_class_train_seq.copy()
df_class_val_seq2=df_class_val_seq.copy()

df_class_train_seq2[col_sequence] = df_class_train_seq2[col_sequence].apply(vfunc)
df_class_val_seq2[col_sequence] = df_class_val_seq2[col_sequence].apply(vfunc)

df_class_train_seq2[col_sequence] = df_class_train_seq2[col_sequence].apply(array_to_string)
df_class_val_seq2[col_sequence] = df_class_val_seq2[col_sequence].apply(array_to_string)

df_class_train_seq2.round(5).to_csv(
    PATH_DATA_DTS+dts_name+"_TRAIN_seq_6", sep=",", float_format='%.5f')
df_class_val_seq2.round(5).to_csv(
    PATH_DATA_DTS+dts_name+"_VAL_seq_6", sep=",", float_format='%.5f')

START HERE TO LOAD DATASETS WITH SEQUENCE

In [3]:
dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_20d"
file_name=dts_name+"_train_colab_lstm_norm_2405"
col_sequence = "SEQUENCE"
df_class_train_csv=pd.read_csv(PATH_DATA_DTS+dts_name+"_TRAIN_seq_6.zip",sep=",",index_col=["TICKER","OPEN_DATETIME"],parse_dates=["OPEN_DATETIME"])
df_class_train_csv=df_class_train_csv.dropna(subset=[col_sequence])
df_class_train_csv=df_class_train_csv.sort_index()
df_class_val_csv=pd.read_csv(PATH_DATA_DTS+dts_name+"_VAL_seq_6.zip",sep=",",index_col=["TICKER","OPEN_DATETIME"],parse_dates=["OPEN_DATETIME"])
df_class_val_csv=df_class_val_csv.dropna(subset=[col_sequence])
df_class_val_csv=df_class_val_csv.sort_index()

# keep only index, label and sequence
df_class_train_csv=df_class_train_csv[[label,col_sequence]]
df_class_val_csv=df_class_val_csv[[label,col_sequence]]

df_class_train_csv[col_sequence] = df_class_train_csv[col_sequence].str.replace("_", ",").apply(ast.literal_eval)
df_class_train_csv[col_sequence]  = df_class_train_csv[col_sequence] .apply(lambda x: np.array(x, dtype=np.float32))

df_class_val_csv[col_sequence] = df_class_val_csv[col_sequence].str.replace("_", ",").apply(ast.literal_eval)
df_class_val_csv[col_sequence]  = df_class_val_csv[col_sequence] .apply(lambda x: np.array(x, dtype=np.float32))


print(f"{df_class_train_csv.shape=}")
print(df_class_train_csv[1015:1020])
print(f"{df_class_val_csv.shape=}")
print(df_class_val_csv[1015:1020])

# decision is made between market sessions so we have shift the label of 1 day for each ticker
df_class_train_csv[label] = df_class_train_csv.groupby(level='TICKER')[label].shift(1)
df_class_train_csv=df_class_train_csv.dropna(subset=[label])
df_class_val_csv[label] = df_class_val_csv.groupby(level='TICKER')[label].shift(1)
df_class_val_csv=df_class_val_csv.dropna(subset=[label])
print(df_class_train_csv[1014:1019])
print(df_class_val_csv[1014:1019])


df_class_train_csv.shape=(834759, 2)
                      lab_perf_20d  \
TICKER OPEN_DATETIME                 
AB.PA  2015-02-11                0   
       2015-02-12                0   
       2015-02-13                1   
       2015-02-16                1   
       2015-02-17                0   

                                                               SEQUENCE  
TICKER OPEN_DATETIME                                                     
AB.PA  2015-02-11     [[-0.39987, -0.49388, -0.45321, 0.33119, 0.245...  
       2015-02-12     [[-0.40479, -0.49565, -0.45233, 0.33764, 0.256...  
       2015-02-13     [[-0.39282, -0.4837, -0.43821, 0.39571, 0.2664...  
       2015-02-16     [[-0.35487, -0.4481, -0.40128, 0.51362, 0.2836...  
       2015-02-17     [[-0.35342, -0.4441, -0.39371, 0.5362, 0.29874...  
df_class_val_csv.shape=(230586, 2)
                       lab_perf_20d  \
TICKER  OPEN_DATETIME                 
ABCA.PA 2017-10-16              2.0   
        2017-10-17        

In [4]:
df_class_train_csv.to_pickle(PATH_DATA_DTS+dts_name+"_TRAIN_seq_6.pckl")
df_class_val_csv.to_pickle(PATH_DATA_DTS+dts_name+"_VAL_seq_6.pckl")

START HERE TO DIRECTLY LOAD THE PICKLE FILES

In [3]:
dts_name="PARIS_TREND_1D_20D_V2"
multi_symbol="PARIS_STOCK"
label = "lab_perf_20d"
file_name=dts_name+"_train_colab_lstm_norm_2405"
col_sequence = "SEQUENCE"

df_class_train_csv=pd.read_pickle(PATH_DATA_DTS+dts_name+"_TRAIN_seq_6.pckl")  #the train will be split in train + val
df_class_test_csv=pd.read_pickle(PATH_DATA_DTS+dts_name+"_VAL_seq_6.pckl") #the val is finally used as a test dataset
print(df_class_train_csv[1014:1019])

                      lab_perf_20d  \
TICKER OPEN_DATETIME                 
AB.PA  2015-02-11              0.0   
       2015-02-12              0.0   
       2015-02-13              0.0   
       2015-02-16              1.0   
       2015-02-17              1.0   

                                                               SEQUENCE  
TICKER OPEN_DATETIME                                                     
AB.PA  2015-02-11     [[-0.39987, -0.49388, -0.45321, 0.33119, 0.245...  
       2015-02-12     [[-0.40479, -0.49565, -0.45233, 0.33764, 0.256...  
       2015-02-13     [[-0.39282, -0.4837, -0.43821, 0.39571, 0.2664...  
       2015-02-16     [[-0.35487, -0.4481, -0.40128, 0.51362, 0.2836...  
       2015-02-17     [[-0.35342, -0.4441, -0.39371, 0.5362, 0.29874...  


In [4]:
# df_class_train_csv split into train and val with 0.75/0.25 by open datetime using sm.split_df_by_label_strat
df_class_train_csv.reset_index(level='TICKER',inplace=True)

df_split=sm.split_df_by_label_strat(
    df_in=df_class_train_csv, list_label=[label], split_timeframe="D",random_split=False,split_strat=(80,20,0))
df_train_split=df_split['df_'+label+'_train']
df_val_split=df_split['df_'+label+'_valid']

df_train_split.set_index('TICKER',append=True,inplace=True)
df_val_split.set_index('TICKER',append=True,inplace=True)
df_train_split.sort_index(inplace=True)
df_val_split.sort_index(inplace=True)
print(df_train_split[1014:1019])


                       lab_perf_20d  \
OPEN_DATETIME TICKER                  
1995-05-18    LI.PA             1.0   
              RE.PA             2.0   
              SAVE.PA           2.0   
              TEP.PA            2.0   
              VIRP.PA           1.0   

                                                                SEQUENCE  
OPEN_DATETIME TICKER                                                      
1995-05-18    LI.PA    [[-0.44468, -0.57928, -0.59788, 0.1422, 0.0469...  
              RE.PA    [[-0.45272, -0.55703, -0.62491, 0.13047, 0.225...  
              SAVE.PA  [[-0.44879, -0.57938, -0.62212, 0.04232, -0.00...  
              TEP.PA   [[-0.47529, -0.6054, -0.6432, -0.20304, -0.006...  
              VIRP.PA  [[-0.44257, -0.56061, -0.74347, 0.358, 0.35239...  


In [5]:
# print(f"{df_class_train_csv.shape=} {df_class_val_csv.shape=}")
print(df_train_split[label].value_counts().sort_index()) # undersampling at 109200
print(df_val_split[label].value_counts().sort_index()) # undersampling at 43900
print(df_class_test_csv[label].value_counts().sort_index()) # undersampling at 41500

nb_val=30000 #109200
df_class_train_under=balance.class_custom_undersampler(df_train_split,label,nb_val) # undersampling todo

nb_val=5000 #41500
df_class_val_under=balance.class_custom_undersampler(df_val_split,label,nb_val)
df_class_test_under=balance.class_custom_undersampler(df_class_test_csv,label,nb_val)

print(df_class_train_under[label].value_counts().sort_index()) 
print(df_class_val_under[label].value_counts().sort_index()) 
print(df_class_test_under[label].value_counts().sort_index()) 


x_train_tensor = torch.as_tensor(df_class_train_under[col_sequence], dtype=torch.float)
y_train_tensor = torch.tensor(df_class_train_under[label], dtype=torch.int64)

# x_val_tensor = torch.tensor(df_class_val_under['col_sequence_3'], dtype=torch.float)
x_val_tensor = torch.as_tensor(df_class_val_under[col_sequence], dtype=torch.float)
y_val_tensor = torch.tensor(df_class_val_under[label], dtype=torch.int64)

x_test_tensor = torch.as_tensor(df_class_test_under[col_sequence], dtype=torch.float)
y_test_tensor = torch.tensor(df_class_test_under[label], dtype=torch.int64)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)

# initiate a pytorch randomsampler for train data
# train_sampler = RandomSampler(train_dataset,num_samples=100000,replacement=True)

batch_size=512
num_workers=7

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False,drop_last=True,num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size,drop_last=True,num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=batch_size,drop_last=True,num_workers=num_workers)
 
print(f"{train_loader.dataset.tensors[0].shape=} {val_loader.dataset.tensors[0].shape=} {test_loader.dataset.tensors[0].shape=}") 

lab_perf_20d
0.0    123176
1.0    110279
2.0    109418
3.0    109278
4.0    114613
Name: count, dtype: int64
lab_perf_20d
0.0    43973
1.0    56382
2.0    57460
3.0    57611
4.0    52314
Name: count, dtype: int64
lab_perf_20d
0.0    52460
1.0    47659
2.0    44551
3.0    44057
4.0    41585
Name: count, dtype: int64
lab_perf_20d
0.0    30000
1.0    30000
2.0    30000
3.0    30000
4.0    30000
Name: count, dtype: int64
lab_perf_20d
0.0    5000
1.0    5000
2.0    5000
3.0    5000
4.0    5000
Name: count, dtype: int64
lab_perf_20d
0.0    5000
1.0    5000
2.0    5000
3.0    5000
4.0    5000
Name: count, dtype: int64


  x_train_tensor = torch.as_tensor(df_class_train_under[col_sequence], dtype=torch.float)
  x_train_tensor = torch.as_tensor(df_class_train_under[col_sequence], dtype=torch.float)
  y_train_tensor = torch.tensor(df_class_train_under[label], dtype=torch.int64)
  x_val_tensor = torch.as_tensor(df_class_val_under[col_sequence], dtype=torch.float)


train_loader.dataset.tensors[0].shape=torch.Size([150000, 10, 27]) val_loader.dataset.tensors[0].shape=torch.Size([25000, 10, 27]) test_loader.dataset.tensors[0].shape=torch.Size([25000, 10, 27])


  y_val_tensor = torch.tensor(df_class_val_under[label], dtype=torch.int64)
  x_test_tensor = torch.as_tensor(df_class_test_under[col_sequence], dtype=torch.float)
  y_test_tensor = torch.tensor(df_class_test_under[label], dtype=torch.int64)


In [None]:
print(f"{train_loader.dataset.tensors[0].shape=} {val_loader.dataset.tensors[0].shape=} {test_loader.dataset.tensors[0].shape=}")
#print next(iter(train_loader))
pprint(next(iter(test_loader)))

Correlation (Copy from the Tensorflow notebook), not tested here !

In [None]:
corr_train = df_x_train.corr()
plt.clf()
fig, ax = plt.subplots(figsize=(6, 6))
corr_train.replace(1,0,inplace=True)
corr_train=corr_train.applymap(lambda x : None if x< 0.7 and x>-0.7 else x)
corr_train.dropna(axis=0,how='all',inplace=True)
corr_train.dropna(axis=1,how='all',inplace=True)

# corr_train_check=corr_train[corr_train >0.8]
corr_train_check=corr_train
sns.heatmap(corr_train_check, annot=False, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)

In [None]:
sns.scatterplot(data=df_class, x='pos_sma200', y='pos_top50', hue='lab_perf_20d', palette='Set1')

In [12]:
###############################################
###### REFACTO USING PYTORCH LIGHTNING ########
###############################################

# Define LSTM model
class DynamicLSTMModel(pl.LightningModule):
    def __init__(self, layer_configs, lr, criterion):
        super(DynamicLSTMModel, self).__init__()
        self.layers = nn.ModuleList()

        for config in layer_configs:
            # print(f"{config=}")
            if config['type'] == 'LSTM':
                layer = nn.LSTM(input_size=config['input_dim'], hidden_size=config['hidden_dim'], num_layers=config['num_layers'],
                                batch_first=True, dropout=config['dropout'], bidirectional=config['bidirectional'])
            elif config['type'] == 'Linear':
                layer = nn.Linear(config['input_dim'], config['output_dim'])
            elif config['type'] == 'Softmax':
                layer = nn.Softmax(dim=config['dim'])
            else:
                raise ValueError(f"Unsupported layer type: {config['type']}")
            self.layers.append(layer)

        self.lr = lr
        self.criterion = criterion
        self.validation_step_outputs = []

        self.save_hyperparameters()

    def forward(self, x):
        for layer in self.layers:
            if isinstance(layer, nn.LSTM):
                # LSTM layers require special handling for initial states
                batch_size = x.size(0)
                hidden_dim = layer.hidden_size
                num_layers = layer.num_layers * 2 if layer.bidirectional else layer.num_layers
                h0 = torch.zeros(num_layers, batch_size,
                                 hidden_dim).to(x.device)
                c0 = torch.zeros(num_layers, batch_size,
                                 hidden_dim).to(x.device)
                x, _ = layer(x, (h0, c0))
                x = x[:, -1, :]
            else:
                x = layer(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        _, predicted = torch.max(y_hat.data, 1)
        loss = self.criterion(y_hat, y)
        correct = (predicted == y).sum().item()
        total = len(y)
        self.log("train_loss", loss, on_step=True,
                 on_epoch=True, prog_bar=True, logger=True)
        self.log("train_acc", correct/total, on_step=True,
                 on_epoch=True, prog_bar=True, logger=True)
        output = {"loss": loss, "train_loss": loss,
                  "train_correct": correct, "train_total": total}
        return output

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        _, predicted = torch.max(y_hat.data, 1)
        loss = self.criterion(y_hat, y)
        correct = (predicted == y).sum().item()
        total = len(y)
        # output=f"val_loss: {loss}, val_correct: {correct}, val_total: {y.size(0)}"
        output = {"loss": loss, "val_loss": loss,
                  "val_correct": correct, "val_total": total}
        # self.log(output)
        self.log("val_loss", loss, on_step=True,
                 on_epoch=True, prog_bar=True, logger=True)
        self.log("val_acc", correct/total, on_step=True,
                 on_epoch=True, prog_bar=True, logger=True)
        return output

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        _, predicted = torch.max(y_hat.data, 1)
        loss = self.criterion(y_hat, y)
        correct = (predicted == y).sum().item()
        total = len(y)
        # output=f"val_loss: {loss}, val_correct: {correct}, val_total: {y.size(0)}"
        output = {"loss": loss, "test_loss": loss, "test_correct": correct,
                  "test_total": total, "test_acc": correct/total}
        # self.log(output)
        self.log("test_loss", loss, on_step=True,
                 on_epoch=True, prog_bar=True, logger=True)
        self.log("test_acc", correct/total, on_step=True,
                 on_epoch=True, prog_bar=True, logger=True)
        return output

    # def test_epoch_end(self, outputs):
    #     avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
    #     self.log('test_loss_epoch', avg_loss)

    # def on_validation_epoch_end(self, outputs):
    #     avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
    #     total_correct = sum(x['val_correct'] for x in outputs)
    #     total = sum(x['val_total'] for x in outputs)
    #     tensorboard_logs = {'val_loss': avg_loss}
    #     return {'val_loss': avg_loss, 'progress_bar': tensorboard_logs, 'val_acc': total_correct / total}

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=self.lr)


input_dim = x_train_tensor.shape[2]
num_classes = 5
epochs = 10  # 350
suffix = "lstm_pytorch_v1"
tb_directory = "tb_logs"
debug = False
patience = 5

obj_acc = 0.25
cpt_param = 0
try_limit = 3
pct_check_class = 0.3  # check if at least n% of the validation set per class
criterion = nn.CrossEntropyLoss()

len_val = x_val_tensor.shape[0]
check_class_limit = (len_val/num_classes)*pct_check_class
check_class = False  # check if at least obj_acc accuracy per class

list_param_valid = [
    {'layer_configs': [
        {'type': 'Linear', 'input_dim': input_dim, 'output_dim': input_dim},
        {'type': 'LSTM', 'input_dim': input_dim, 'hidden_dim': 64,         'num_layers': 1, 'dropout': 0.0, 'bidirectional': True},
        # Note: LSTM bidirectional output is doubled
        {'type': 'Linear', 'input_dim': 64 * 2, 'output_dim': num_classes},
        {'type': 'Softmax', 'dim': 1}
    ], 'optimizer__lr': 0.01},
    {'layer_configs': [
        {'type': 'Linear', 'input_dim': input_dim, 'output_dim': input_dim},
        {'type': 'LSTM', 'input_dim': input_dim, 'hidden_dim': 32,
         'num_layers': 2, 'dropout': 0.2, 'bidirectional': False},
        # Note: LSTM bidirectional output is doubled
        {'type': 'Linear', 'input_dim': 32, 'output_dim': num_classes},
        {'type': 'Softmax', 'dim': 1}
    ], 'optimizer__lr': 0.01},
    # {'fit__batch_size': 256, 'model__dropout': 0.05, 'model__layers': [64, 10], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.9},
    # {'fit__batch_size': 32, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.7},
    # {'fit__batch_size': 32, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.5},
    # {'fit__batch_size': 64, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.9},
    # {'fit__batch_size': 64, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.7},
    # {'fit__batch_size': 64, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.5},
]

while (cpt_param < len(list_param_valid) and check_class == False):  # loop for parameters
    gc.collect()
    param_valid = list_param_valid[cpt_param]  # select the current param line
    print(f"{param_valid=}")
    cpt = 0
    filename_tmp_model = dts_name+"_"+suffix+".pckl"

    while (cpt < try_limit and check_class == False):  # loop for train models until good results
        cpt += 1

        model = DynamicLSTMModel(layer_configs=param_valid['layer_configs'], lr=param_valid['optimizer__lr'], criterion=criterion)

        if cpt == 1 and debug:
            print(model)
            print(len(list(model.parameters())))
            for i in range(len(list(model.parameters()))):
                print(list(model.parameters())[i].size())

        checkpoint_callback = ModelCheckpoint(
            dirpath=PATH_DATA+"\\Models\\",  # Specify the directory to save the model
            # Specify the filename format
            filename=f"{dts_name}_{suffix}_{datetime.now().strftime('%Y%m%d')}_{cpt_param}_{cpt}",
            save_top_k=1,  # Save only the top k models according to the monitored quantity
            verbose=True,
            monitor='val_loss',  # Specify the metric to monitor
            mode='min',  # Mode can be either 'min', 'max', or 'auto'
            save_last=False  # Optionally, you can choose to save the last model
        )

        early_stop_callback = EarlyStopping(
            monitor="val_loss", min_delta=0.001, patience=patience, verbose=True, mode="min")
        logger = TensorBoardLogger(tb_directory, name="my_model")
        trainer = pl.Trainer(max_epochs=epochs, callbacks=[
                             early_stop_callback, checkpoint_callback], logger=logger)

        trainer.fit(model, train_loader, val_loader)

        writer = SummaryWriter(log_dir=tb_directory+"/model_summary")
        model_summary = str(model).replace(
            '\n', '<br/>').replace(' ', '&nbsp;')
        writer.add_text("model_v"+str(logger.version), model_summary)
        writer.close()

        # trainer.test(dataloaders=test_loader)
        print(f"{checkpoint_callback.best_model_path=}")
        best_model = DynamicLSTMModel.load_from_checkpoint(
            checkpoint_callback.best_model_path)
        result = trainer.test(best_model, dataloaders=test_loader)
        # print(f"{result[0]=}")
        # print(
        #     f"Optim {cpt=} {checkpoint_callback.best_model_path=} {result[0]['test_acc_epoch']=}")

        if result[0]['test_acc_epoch'] > obj_acc:
            # calculate the confusion matrix
            y_pred = best_model(x_val_tensor)
            _, y_pred_classes = torch.max(y_pred, 1)
            confusion = metrics.confusion_matrix(y_val_tensor, y_pred_classes)

            print(confusion)

            check_class = True

            for i in range(num_classes):
                nb_lab = sum(y_pred_classes == i)
                if nb_lab < check_class_limit:
                    check_class = False
                    print(
                        f"Check class {i=} {nb_lab=} {check_class=} {check_class_limit=}")

            # check saved model, load to check it's OK
            if check_class:
                torch.save(model, filename_tmp_model)
                saved_model = torch.load(filename_tmp_model)
                saved_model.eval()
                y_pred = saved_model(x_val_tensor)
                _, y_pred_classes = torch.max(y_pred, 1)
                confusion = metrics.confusion_matrix(
                    y_val_tensor, y_pred_classes)
                print(confusion)

    if cpt >= try_limit:
        cpt_param += 1
        print(f"Optim fail {cpt=} param suivant {cpt_param=}")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | layers    | ModuleList       | 49.0 K | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
49.0 K    Trainable params
0         Non-trainable params
49.0 K    Total params
0.196     Total estimated model params size (MB)


param_valid={'layer_configs': [{'type': 'Linear', 'input_dim': 27, 'output_dim': 27}, {'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 64, 'num_layers': 1, 'dropout': 0.0, 'bidirectional': True}, {'type': 'Linear', 'input_dim': 128, 'output_dim': 5}, {'type': 'Softmax', 'dim': 1}], 'optimizer__lr': 0.01}
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 64, 'num_layers': 1, 'dropout': 0.0, 'bidirectional': True}
config={'type': 'Linear', 'input_dim': 128, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Epoch 0: 100%|██████████| 292/292 [00:50<00:00,  5.81it/s, v_num=52, train_loss_step=1.690, train_acc_step=0.213, val_loss_step=1.750, val_acc_step=0.158, val_loss_epoch=1.700, val_acc_epoch=0.200, train_loss_epoch=1.640, train_acc_epoch=0.214]

Metric val_loss improved. New best score: 1.704
Epoch 0, global step 292: 'val_loss' reached 1.70439 (best 1.70439), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_1-v2.ckpt' as top 1


Epoch 1: 100%|██████████| 292/292 [00:50<00:00,  5.83it/s, v_num=52, train_loss_step=1.580, train_acc_step=0.254, val_loss_step=1.620, val_acc_step=0.221, val_loss_epoch=1.650, val_acc_epoch=0.200, train_loss_epoch=1.680, train_acc_epoch=0.204] 

Metric val_loss improved by 0.058 >= min_delta = 0.001. New best score: 1.647
Epoch 1, global step 584: 'val_loss' reached 1.64658 (best 1.64658), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_1-v2.ckpt' as top 1


Epoch 2: 100%|██████████| 292/292 [00:51<00:00,  5.71it/s, v_num=52, train_loss_step=1.580, train_acc_step=0.271, val_loss_step=1.630, val_acc_step=0.215, val_loss_epoch=1.650, val_acc_epoch=0.200, train_loss_epoch=1.640, train_acc_epoch=0.201] 

Epoch 2, global step 876: 'val_loss' reached 1.64645 (best 1.64645), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_1-v2.ckpt' as top 1


Epoch 3: 100%|██████████| 292/292 [01:19<00:00,  3.65it/s, v_num=52, train_loss_step=1.850, train_acc_step=0.0527, val_loss_step=1.730, val_acc_step=0.180, val_loss_epoch=1.700, val_acc_epoch=0.201, train_loss_epoch=1.660, train_acc_epoch=0.204]

Epoch 3, global step 1168: 'val_loss' was not in top 1


Epoch 4: 100%|██████████| 292/292 [01:54<00:00,  2.55it/s, v_num=52, train_loss_step=1.850, train_acc_step=0.0527, val_loss_step=1.730, val_acc_step=0.180, val_loss_epoch=1.700, val_acc_epoch=0.201, train_loss_epoch=1.700, train_acc_epoch=0.200]

Epoch 4, global step 1460: 'val_loss' was not in top 1


Epoch 5: 100%|██████████| 292/292 [01:55<00:00,  2.53it/s, v_num=52, train_loss_step=1.850, train_acc_step=0.0527, val_loss_step=1.730, val_acc_step=0.180, val_loss_epoch=1.700, val_acc_epoch=0.201, train_loss_epoch=1.700, train_acc_epoch=0.200]

Epoch 5, global step 1752: 'val_loss' was not in top 1


Epoch 6: 100%|██████████| 292/292 [01:43<00:00,  2.81it/s, v_num=52, train_loss_step=1.850, train_acc_step=0.0527, val_loss_step=1.730, val_acc_step=0.180, val_loss_epoch=1.700, val_acc_epoch=0.201, train_loss_epoch=1.700, train_acc_epoch=0.200]

Monitored metric val_loss did not improve in the last 5 records. Best score: 1.647. Signaling Trainer to stop.
Epoch 6, global step 2044: 'val_loss' was not in top 1


Epoch 6: 100%|██████████| 292/292 [01:43<00:00,  2.81it/s, v_num=52, train_loss_step=1.850, train_acc_step=0.0527, val_loss_step=1.730, val_acc_step=0.180, val_loss_epoch=1.700, val_acc_epoch=0.201, train_loss_epoch=1.700, train_acc_epoch=0.200]
checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_1-v2.ckpt'
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 64, 'num_layers': 1, 'dropout': 0.0, 'bidirectional': True}
config={'type': 'Linear', 'input_dim': 128, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Testing DataLoader 0: 100%|██████████| 48/48 [00:01<00:00, 39.09it/s]


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | layers    | ModuleList       | 49.0 K | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
49.0 K    Trainable params
0         Non-trainable params
49.0 K    Total params
0.196     Total estimated model params size (MB)


result[0]={'test_loss_epoch': 1.6465201377868652, 'test_acc_epoch': 0.2010498046875}
Optim cpt=1 checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_1-v2.ckpt' result[0]['test_acc_epoch']=0.2010498046875
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 64, 'num_layers': 1, 'dropout': 0.0, 'bidirectional': True}
config={'type': 'Linear', 'input_dim': 128, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Epoch 0: 100%|██████████| 292/292 [00:44<00:00,  6.55it/s, v_num=53, train_loss_step=1.610, train_acc_step=0.213, val_loss_step=1.610, val_acc_step=0.158, val_loss_epoch=1.610, val_acc_epoch=0.201, train_loss_epoch=1.620, train_acc_epoch=0.212]

Metric val_loss improved. New best score: 1.609
Epoch 0, global step 292: 'val_loss' reached 1.60866 (best 1.60866), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_2-v1.ckpt' as top 1


Epoch 1: 100%|██████████| 292/292 [01:22<00:00,  3.55it/s, v_num=53, train_loss_step=1.600, train_acc_step=0.211, val_loss_step=1.600, val_acc_step=0.211, val_loss_epoch=1.600, val_acc_epoch=0.219, train_loss_epoch=1.610, train_acc_epoch=0.217] 

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 1.605
Epoch 1, global step 584: 'val_loss' reached 1.60472 (best 1.60472), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_2-v1.ckpt' as top 1


Epoch 2: 100%|██████████| 292/292 [01:19<00:00,  3.68it/s, v_num=53, train_loss_step=1.590, train_acc_step=0.234, val_loss_step=1.600, val_acc_step=0.209, val_loss_epoch=1.600, val_acc_epoch=0.223, train_loss_epoch=1.610, train_acc_epoch=0.230] 

Metric val_loss improved by 0.002 >= min_delta = 0.001. New best score: 1.602
Epoch 2, global step 876: 'val_loss' reached 1.60233 (best 1.60233), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_2-v1.ckpt' as top 1


Epoch 3: 100%|██████████| 292/292 [00:57<00:00,  5.09it/s, v_num=53, train_loss_step=1.590, train_acc_step=0.250, val_loss_step=1.590, val_acc_step=0.266, val_loss_epoch=1.600, val_acc_epoch=0.226, train_loss_epoch=1.600, train_acc_epoch=0.239] 

Epoch 3, global step 1168: 'val_loss' was not in top 1


Epoch 4: 100%|██████████| 292/292 [01:09<00:00,  4.21it/s, v_num=53, train_loss_step=1.590, train_acc_step=0.232, val_loss_step=1.590, val_acc_step=0.262, val_loss_epoch=1.600, val_acc_epoch=0.233, train_loss_epoch=1.600, train_acc_epoch=0.242] 

Metric val_loss improved by 0.002 >= min_delta = 0.001. New best score: 1.600
Epoch 4, global step 1460: 'val_loss' reached 1.60023 (best 1.60023), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_2-v1.ckpt' as top 1


Epoch 5: 100%|██████████| 292/292 [01:05<00:00,  4.44it/s, v_num=53, train_loss_step=1.580, train_acc_step=0.262, val_loss_step=1.600, val_acc_step=0.242, val_loss_epoch=1.620, val_acc_epoch=0.206, train_loss_epoch=1.600, train_acc_epoch=0.240] 

Epoch 5, global step 1752: 'val_loss' was not in top 1


Epoch 6: 100%|██████████| 292/292 [01:50<00:00,  2.65it/s, v_num=53, train_loss_step=1.590, train_acc_step=0.229, val_loss_step=1.590, val_acc_step=0.225, val_loss_epoch=1.600, val_acc_epoch=0.222, train_loss_epoch=1.600, train_acc_epoch=0.242] 

Epoch 6, global step 2044: 'val_loss' was not in top 1


Epoch 7: 100%|██████████| 292/292 [01:50<00:00,  2.65it/s, v_num=53, train_loss_step=1.580, train_acc_step=0.219, val_loss_step=1.590, val_acc_step=0.225, val_loss_epoch=1.610, val_acc_epoch=0.222, train_loss_epoch=1.600, train_acc_epoch=0.243] 

Epoch 7, global step 2336: 'val_loss' was not in top 1


Epoch 8: 100%|██████████| 292/292 [01:42<00:00,  2.84it/s, v_num=53, train_loss_step=1.580, train_acc_step=0.213, val_loss_step=1.590, val_acc_step=0.238, val_loss_epoch=1.600, val_acc_epoch=0.221, train_loss_epoch=1.600, train_acc_epoch=0.240] 

Epoch 8, global step 2628: 'val_loss' was not in top 1


Epoch 9: 100%|██████████| 292/292 [01:40<00:00,  2.90it/s, v_num=53, train_loss_step=1.590, train_acc_step=0.207, val_loss_step=1.590, val_acc_step=0.242, val_loss_epoch=1.600, val_acc_epoch=0.240, train_loss_epoch=1.590, train_acc_epoch=0.239] 

Monitored metric val_loss did not improve in the last 5 records. Best score: 1.600. Signaling Trainer to stop.
Epoch 9, global step 2920: 'val_loss' reached 1.59940 (best 1.59940), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_2-v1.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 292/292 [01:40<00:00,  2.90it/s, v_num=53, train_loss_step=1.590, train_acc_step=0.207, val_loss_step=1.590, val_acc_step=0.242, val_loss_epoch=1.600, val_acc_epoch=0.240, train_loss_epoch=1.590, train_acc_epoch=0.239]
checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_2-v1.ckpt'
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 64, 'num_layers': 1, 'dropout': 0.0, 'bidirectional': True}
config={'type': 'Linear', 'input_dim': 128, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Testing DataLoader 0: 100%|██████████| 48/48 [00:03<00:00, 13.16it/s]


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | layers    | ModuleList       | 49.0 K | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
49.0 K    Trainable params
0         Non-trainable params
49.0 K    Total params
0.196     Total estimated model params size (MB)


result[0]={'test_loss_epoch': 1.5895849466323853, 'test_acc_epoch': 0.2444254606962204}
Optim cpt=2 checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_2-v1.ckpt' result[0]['test_acc_epoch']=0.2444254606962204
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 64, 'num_layers': 1, 'dropout': 0.0, 'bidirectional': True}
config={'type': 'Linear', 'input_dim': 128, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Epoch 0: 100%|██████████| 292/292 [01:07<00:00,  4.30it/s, v_num=54, train_loss_step=1.590, train_acc_step=0.254, val_loss_step=1.600, val_acc_step=0.244, val_loss_epoch=1.610, val_acc_epoch=0.199, train_loss_epoch=1.620, train_acc_epoch=0.217]

Metric val_loss improved. New best score: 1.615
Epoch 0, global step 292: 'val_loss' reached 1.61464 (best 1.61464), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_3-v1.ckpt' as top 1


Epoch 1: 100%|██████████| 292/292 [00:51<00:00,  5.65it/s, v_num=54, train_loss_step=1.570, train_acc_step=0.277, val_loss_step=1.610, val_acc_step=0.223, val_loss_epoch=1.630, val_acc_epoch=0.198, train_loss_epoch=1.630, train_acc_epoch=0.215] 

Epoch 1, global step 584: 'val_loss' was not in top 1


Epoch 2: 100%|██████████| 292/292 [00:59<00:00,  4.88it/s, v_num=54, train_loss_step=1.630, train_acc_step=0.0762, val_loss_step=1.600, val_acc_step=0.227, val_loss_epoch=1.610, val_acc_epoch=0.199, train_loss_epoch=1.630, train_acc_epoch=0.201]

Epoch 2, global step 876: 'val_loss' reached 1.61392 (best 1.61392), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_3-v1.ckpt' as top 1


Epoch 3: 100%|██████████| 292/292 [01:05<00:00,  4.46it/s, v_num=54, train_loss_step=1.620, train_acc_step=0.195, val_loss_step=1.600, val_acc_step=0.268, val_loss_epoch=1.610, val_acc_epoch=0.224, train_loss_epoch=1.620, train_acc_epoch=0.213] 

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 1.610
Epoch 3, global step 1168: 'val_loss' reached 1.61025 (best 1.61025), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_3-v1.ckpt' as top 1


Epoch 4: 100%|██████████| 292/292 [01:16<00:00,  3.81it/s, v_num=54, train_loss_step=1.610, train_acc_step=0.248, val_loss_step=1.590, val_acc_step=0.264, val_loss_epoch=1.610, val_acc_epoch=0.220, train_loss_epoch=1.610, train_acc_epoch=0.228] 

Metric val_loss improved by 0.002 >= min_delta = 0.001. New best score: 1.608
Epoch 4, global step 1460: 'val_loss' reached 1.60797 (best 1.60797), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_3-v1.ckpt' as top 1


Epoch 5: 100%|██████████| 292/292 [01:16<00:00,  3.83it/s, v_num=54, train_loss_step=1.590, train_acc_step=0.242, val_loss_step=1.590, val_acc_step=0.262, val_loss_epoch=1.610, val_acc_epoch=0.224, train_loss_epoch=1.610, train_acc_epoch=0.232] 

Metric val_loss improved by 0.002 >= min_delta = 0.001. New best score: 1.606
Epoch 5, global step 1752: 'val_loss' reached 1.60628 (best 1.60628), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_3-v1.ckpt' as top 1


Epoch 6: 100%|██████████| 292/292 [01:27<00:00,  3.34it/s, v_num=54, train_loss_step=1.590, train_acc_step=0.234, val_loss_step=1.590, val_acc_step=0.262, val_loss_epoch=1.600, val_acc_epoch=0.227, train_loss_epoch=1.600, train_acc_epoch=0.236] 

Metric val_loss improved by 0.002 >= min_delta = 0.001. New best score: 1.604
Epoch 6, global step 2044: 'val_loss' reached 1.60414 (best 1.60414), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_3-v1.ckpt' as top 1


Epoch 7: 100%|██████████| 292/292 [01:08<00:00,  4.28it/s, v_num=54, train_loss_step=1.590, train_acc_step=0.250, val_loss_step=1.580, val_acc_step=0.266, val_loss_epoch=1.610, val_acc_epoch=0.226, train_loss_epoch=1.600, train_acc_epoch=0.236] 

Epoch 7, global step 2336: 'val_loss' was not in top 1


Epoch 8: 100%|██████████| 292/292 [01:22<00:00,  3.56it/s, v_num=54, train_loss_step=1.590, train_acc_step=0.232, val_loss_step=1.580, val_acc_step=0.275, val_loss_epoch=1.600, val_acc_epoch=0.231, train_loss_epoch=1.600, train_acc_epoch=0.243] 

Metric val_loss improved by 0.002 >= min_delta = 0.001. New best score: 1.602
Epoch 8, global step 2628: 'val_loss' reached 1.60205 (best 1.60205), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_3-v1.ckpt' as top 1


Epoch 9: 100%|██████████| 292/292 [01:07<00:00,  4.35it/s, v_num=54, train_loss_step=1.600, train_acc_step=0.227, val_loss_step=1.590, val_acc_step=0.217, val_loss_epoch=1.610, val_acc_epoch=0.230, train_loss_epoch=1.600, train_acc_epoch=0.242] 

Epoch 9, global step 2920: 'val_loss' was not in top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 292/292 [01:07<00:00,  4.35it/s, v_num=54, train_loss_step=1.600, train_acc_step=0.227, val_loss_step=1.590, val_acc_step=0.217, val_loss_epoch=1.610, val_acc_epoch=0.230, train_loss_epoch=1.600, train_acc_epoch=0.242]
checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_3-v1.ckpt'
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 64, 'num_layers': 1, 'dropout': 0.0, 'bidirectional': True}
config={'type': 'Linear', 'input_dim': 128, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Testing DataLoader 0: 100%|██████████| 48/48 [00:01<00:00, 26.56it/s]


result[0]={'test_loss_epoch': 1.5952256917953491, 'test_acc_epoch': 0.2432454377412796}
Optim cpt=3 checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_0_3-v1.ckpt' result[0]['test_acc_epoch']=0.2432454377412796
Optim fail cpt=3 param suivant cpt_param=1


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | layers    | ModuleList       | 17.2 K | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
17.2 K    Trainable params
0         Non-trainable params
17.2 K    Total params
0.069     Total estimated model params size (MB)


param_valid={'layer_configs': [{'type': 'Linear', 'input_dim': 27, 'output_dim': 27}, {'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 32, 'num_layers': 2, 'dropout': 0.2, 'bidirectional': False}, {'type': 'Linear', 'input_dim': 32, 'output_dim': 5}, {'type': 'Softmax', 'dim': 1}], 'optimizer__lr': 0.01}
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 32, 'num_layers': 2, 'dropout': 0.2, 'bidirectional': False}
config={'type': 'Linear', 'input_dim': 32, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Epoch 0: 100%|██████████| 292/292 [00:30<00:00,  9.55it/s, v_num=55, train_loss_step=1.660, train_acc_step=0.213, val_loss_step=1.650, val_acc_step=0.158, val_loss_epoch=1.630, val_acc_epoch=0.200, train_loss_epoch=1.640, train_acc_epoch=0.215]

Metric val_loss improved. New best score: 1.629
Epoch 0, global step 292: 'val_loss' reached 1.62876 (best 1.62876), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt' as top 1


Epoch 1: 100%|██████████| 292/292 [00:31<00:00,  9.27it/s, v_num=55, train_loss_step=1.600, train_acc_step=0.271, val_loss_step=1.650, val_acc_step=0.215, val_loss_epoch=1.650, val_acc_epoch=0.200, train_loss_epoch=1.660, train_acc_epoch=0.204]  

Epoch 1, global step 584: 'val_loss' was not in top 1


Epoch 2: 100%|██████████| 292/292 [00:30<00:00,  9.56it/s, v_num=55, train_loss_step=1.600, train_acc_step=0.209, val_loss_step=1.630, val_acc_step=0.203, val_loss_epoch=1.630, val_acc_epoch=0.200, train_loss_epoch=1.640, train_acc_epoch=0.207]  

Epoch 2, global step 876: 'val_loss' reached 1.62858 (best 1.62858), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt' as top 1


Epoch 3: 100%|██████████| 292/292 [00:31<00:00,  9.18it/s, v_num=55, train_loss_step=1.590, train_acc_step=0.271, val_loss_step=1.630, val_acc_step=0.215, val_loss_epoch=1.620, val_acc_epoch=0.200, train_loss_epoch=1.640, train_acc_epoch=0.212]  

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 1.624
Epoch 3, global step 1168: 'val_loss' reached 1.62427 (best 1.62427), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt' as top 1


Epoch 4: 100%|██████████| 292/292 [00:33<00:00,  8.63it/s, v_num=55, train_loss_step=1.580, train_acc_step=0.254, val_loss_step=1.600, val_acc_step=0.244, val_loss_epoch=1.620, val_acc_epoch=0.199, train_loss_epoch=1.630, train_acc_epoch=0.208] 

Metric val_loss improved by 0.009 >= min_delta = 0.001. New best score: 1.616
Epoch 4, global step 1460: 'val_loss' reached 1.61572 (best 1.61572), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt' as top 1


Epoch 5: 100%|██████████| 292/292 [00:35<00:00,  8.16it/s, v_num=55, train_loss_step=1.600, train_acc_step=0.189, val_loss_step=1.610, val_acc_step=0.229, val_loss_epoch=1.610, val_acc_epoch=0.202, train_loss_epoch=1.620, train_acc_epoch=0.210] 

Metric val_loss improved by 0.005 >= min_delta = 0.001. New best score: 1.610
Epoch 5, global step 1752: 'val_loss' reached 1.61035 (best 1.61035), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt' as top 1


Epoch 6: 100%|██████████| 292/292 [00:49<00:00,  5.90it/s, v_num=55, train_loss_step=1.610, train_acc_step=0.207, val_loss_step=1.610, val_acc_step=0.199, val_loss_epoch=1.610, val_acc_epoch=0.201, train_loss_epoch=1.610, train_acc_epoch=0.190]  

Epoch 6, global step 2044: 'val_loss' reached 1.60966 (best 1.60966), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt' as top 1


Epoch 7: 100%|██████████| 292/292 [00:46<00:00,  6.29it/s, v_num=55, train_loss_step=1.600, train_acc_step=0.242, val_loss_step=1.610, val_acc_step=0.232, val_loss_epoch=1.610, val_acc_epoch=0.199, train_loss_epoch=1.610, train_acc_epoch=0.193] 

Epoch 7, global step 2336: 'val_loss' reached 1.60950 (best 1.60950), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt' as top 1


Epoch 8: 100%|██████████| 292/292 [00:48<00:00,  5.99it/s, v_num=55, train_loss_step=1.610, train_acc_step=0.211, val_loss_step=1.610, val_acc_step=0.201, val_loss_epoch=1.610, val_acc_epoch=0.203, train_loss_epoch=1.610, train_acc_epoch=0.191]  

Metric val_loss improved by 0.001 >= min_delta = 0.001. New best score: 1.609
Epoch 8, global step 2628: 'val_loss' reached 1.60923 (best 1.60923), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt' as top 1


Epoch 9: 100%|██████████| 292/292 [00:49<00:00,  5.88it/s, v_num=55, train_loss_step=1.610, train_acc_step=0.213, val_loss_step=1.610, val_acc_step=0.207, val_loss_epoch=1.610, val_acc_epoch=0.202, train_loss_epoch=1.610, train_acc_epoch=0.184]  

Epoch 9, global step 2920: 'val_loss' reached 1.60885 (best 1.60885), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 292/292 [00:49<00:00,  5.88it/s, v_num=55, train_loss_step=1.610, train_acc_step=0.213, val_loss_step=1.610, val_acc_step=0.207, val_loss_epoch=1.610, val_acc_epoch=0.202, train_loss_epoch=1.610, train_acc_epoch=0.184]
checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt'
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 32, 'num_layers': 2, 'dropout': 0.2, 'bidirectional': False}
config={'type': 'Linear', 'input_dim': 32, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Testing DataLoader 0: 100%|██████████| 48/48 [00:01<00:00, 26.73it/s]


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | layers    | ModuleList       | 17.2 K | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
17.2 K    Trainable params
0         Non-trainable params
17.2 K    Total params
0.069     Total estimated model params size (MB)


result[0]={'test_loss_epoch': 1.6089013814926147, 'test_acc_epoch': 0.2014973908662796}
Optim cpt=1 checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_1.ckpt' result[0]['test_acc_epoch']=0.2014973908662796
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 32, 'num_layers': 2, 'dropout': 0.2, 'bidirectional': False}
config={'type': 'Linear', 'input_dim': 32, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Epoch 0: 100%|██████████| 292/292 [00:38<00:00,  7.63it/s, v_num=56, train_loss_step=1.670, train_acc_step=0.213, val_loss_step=1.710, val_acc_step=0.158, val_loss_epoch=1.670, val_acc_epoch=0.200, train_loss_epoch=1.640, train_acc_epoch=0.209]

Metric val_loss improved. New best score: 1.668
Epoch 0, global step 292: 'val_loss' reached 1.66812 (best 1.66812), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_2.ckpt' as top 1


Epoch 1: 100%|██████████| 292/292 [00:45<00:00,  6.45it/s, v_num=56, train_loss_step=1.570, train_acc_step=0.271, val_loss_step=1.600, val_acc_step=0.215, val_loss_epoch=1.620, val_acc_epoch=0.200, train_loss_epoch=1.630, train_acc_epoch=0.212] 

Metric val_loss improved by 0.051 >= min_delta = 0.001. New best score: 1.617
Epoch 1, global step 584: 'val_loss' reached 1.61696 (best 1.61696), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_2.ckpt' as top 1


Epoch 2: 100%|██████████| 292/292 [00:43<00:00,  6.66it/s, v_num=56, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.200]  

Metric val_loss improved by 0.007 >= min_delta = 0.001. New best score: 1.610
Epoch 2, global step 876: 'val_loss' reached 1.60954 (best 1.60954), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_2.ckpt' as top 1


Epoch 3: 100%|██████████| 292/292 [00:44<00:00,  6.59it/s, v_num=56, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.184]  

Epoch 3, global step 1168: 'val_loss' reached 1.60951 (best 1.60951), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_2.ckpt' as top 1


Epoch 4: 100%|██████████| 292/292 [00:48<00:00,  6.01it/s, v_num=56, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.184]  

Epoch 4, global step 1460: 'val_loss' reached 1.60948 (best 1.60948), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_2.ckpt' as top 1


Epoch 5: 100%|██████████| 292/292 [00:46<00:00,  6.30it/s, v_num=56, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.184]  

Epoch 5, global step 1752: 'val_loss' reached 1.60947 (best 1.60947), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_2.ckpt' as top 1


Epoch 6: 100%|██████████| 292/292 [00:45<00:00,  6.39it/s, v_num=56, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.183]  

Epoch 6, global step 2044: 'val_loss' reached 1.60947 (best 1.60947), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_2.ckpt' as top 1


Epoch 7: 100%|██████████| 292/292 [00:46<00:00,  6.25it/s, v_num=56, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.184]  

Monitored metric val_loss did not improve in the last 5 records. Best score: 1.610. Signaling Trainer to stop.
Epoch 7, global step 2336: 'val_loss' was not in top 1


Epoch 7: 100%|██████████| 292/292 [00:46<00:00,  6.24it/s, v_num=56, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.184]
checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_2.ckpt'
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 32, 'num_layers': 2, 'dropout': 0.2, 'bidirectional': False}
config={'type': 'Linear', 'input_dim': 32, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Testing DataLoader 0: 100%|██████████| 48/48 [00:01<00:00, 38.78it/s]


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | layers    | ModuleList       | 17.2 K | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
17.2 K    Trainable params
0         Non-trainable params
17.2 K    Total params
0.069     Total estimated model params size (MB)


result[0]={'test_loss_epoch': 1.6094847917556763, 'test_acc_epoch': 0.200439453125}
Optim cpt=2 checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_2.ckpt' result[0]['test_acc_epoch']=0.200439453125
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 32, 'num_layers': 2, 'dropout': 0.2, 'bidirectional': False}
config={'type': 'Linear', 'input_dim': 32, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Epoch 0: 100%|██████████| 292/292 [00:32<00:00,  8.88it/s, v_num=57, train_loss_step=1.590, train_acc_step=0.254, val_loss_step=1.600, val_acc_step=0.244, val_loss_epoch=1.610, val_acc_epoch=0.199, train_loss_epoch=1.620, train_acc_epoch=0.214]

Metric val_loss improved. New best score: 1.613
Epoch 0, global step 292: 'val_loss' reached 1.61308 (best 1.61308), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_3.ckpt' as top 1


Epoch 1: 100%|██████████| 292/292 [00:35<00:00,  8.19it/s, v_num=57, train_loss_step=1.600, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.620, train_acc_epoch=0.209]  

Metric val_loss improved by 0.002 >= min_delta = 0.001. New best score: 1.611
Epoch 1, global step 584: 'val_loss' reached 1.61101 (best 1.61101), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_3.ckpt' as top 1


Epoch 2: 100%|██████████| 292/292 [00:36<00:00,  8.09it/s, v_num=57, train_loss_step=1.600, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.191]  

Epoch 2, global step 876: 'val_loss' reached 1.61015 (best 1.61015), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_3.ckpt' as top 1


Epoch 3: 100%|██████████| 292/292 [00:37<00:00,  7.74it/s, v_num=57, train_loss_step=1.600, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.191]  

Epoch 3, global step 1168: 'val_loss' was not in top 1


Epoch 4: 100%|██████████| 292/292 [00:34<00:00,  8.40it/s, v_num=57, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.189]  

Metric val_loss improved by 0.001 >= min_delta = 0.001. New best score: 1.610
Epoch 4, global step 1460: 'val_loss' reached 1.60980 (best 1.60980), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_3.ckpt' as top 1


Epoch 5: 100%|██████████| 292/292 [00:35<00:00,  8.14it/s, v_num=57, train_loss_step=1.600, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.188] 

Epoch 5, global step 1752: 'val_loss' was not in top 1


Epoch 6: 100%|██████████| 292/292 [00:34<00:00,  8.48it/s, v_num=57, train_loss_step=1.620, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.201] 

Epoch 6, global step 2044: 'val_loss' was not in top 1


Epoch 7: 100%|██████████| 292/292 [00:35<00:00,  8.13it/s, v_num=57, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.190]  

Epoch 7, global step 2336: 'val_loss' reached 1.60975 (best 1.60975), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_3.ckpt' as top 1


Epoch 8: 100%|██████████| 292/292 [00:34<00:00,  8.50it/s, v_num=57, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.184]  

Epoch 8, global step 2628: 'val_loss' reached 1.60972 (best 1.60972), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_3.ckpt' as top 1


Epoch 9: 100%|██████████| 292/292 [00:33<00:00,  8.61it/s, v_num=57, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.186]  

Monitored metric val_loss did not improve in the last 5 records. Best score: 1.610. Signaling Trainer to stop.
Epoch 9, global step 2920: 'val_loss' reached 1.60970 (best 1.60970), saving model to 'C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_3.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 292/292 [00:33<00:00,  8.60it/s, v_num=57, train_loss_step=1.610, train_acc_step=0.209, val_loss_step=1.610, val_acc_step=0.203, val_loss_epoch=1.610, val_acc_epoch=0.200, train_loss_epoch=1.610, train_acc_epoch=0.186]
checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_3.ckpt'
config={'type': 'Linear', 'input_dim': 27, 'output_dim': 27}
config={'type': 'LSTM', 'input_dim': 27, 'hidden_dim': 32, 'num_layers': 2, 'dropout': 0.2, 'bidirectional': False}
config={'type': 'Linear', 'input_dim': 32, 'output_dim': 5}
config={'type': 'Softmax', 'dim': 1}
Testing DataLoader 0: 100%|██████████| 48/48 [00:01<00:00, 36.43it/s]


result[0]={'test_loss_epoch': 1.6096946001052856, 'test_acc_epoch': 0.2002360075712204}
Optim cpt=3 checkpoint_callback.best_model_path='C:\\Projets\\Data\\Models\\PARIS_TREND_1D_20D_V2_lstm_pytorch_v1_20240723_1_3.ckpt' result[0]['test_acc_epoch']=0.2002360075712204
Optim fail cpt=3 param suivant cpt_param=2


In [13]:
%load_ext tensorboard
%tensorboard --logdir tb_logs

In [None]:
########################################
###### SAVE CODE FOR BASIC PYTORCH #####
###### BEFORE PYTORCH LIGHTNING ########

list_param_valid = [
                    {'model__dropout': 0.05, 'model__hidden_dim': 16, 'model__num_layers': 2, 'optimizer__lr': 0.1},
                    # {'fit__batch_size': 256, 'model__dropout': 0.05, 'model__layers': [64, 10], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.9},
                    # {'fit__batch_size': 32, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.7},
                    # {'fit__batch_size': 32, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.5},
                    # {'fit__batch_size': 64, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.9},
                    # {'fit__batch_size': 64, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.7},
                    # {'fit__batch_size': 64, 'fit__epochs': 350, 'model__dropout': 0.05, 'model__layers': [128, 20], 'optimizer__lr': 0.1, 'optimizer__momentum': 0.5},
]

# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, num_classes, dropout):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

input_dim = x_train_tensor.shape[2]
num_classes = 5
epochs = 6#350
suffix="lstm_pytorch_v1"
filename_tmp_model = dts_name+"_"+suffix+".pckl"
patience = 3

val_accuracy=0.0
obj_acc=0.25
cpt_param=0 
try_limit=5
pct_check_class=0.4 # check if at least n% of the validation set per class
criterion = nn.CrossEntropyLoss()

len_val=x_val_tensor.shape[0]
check_class_limit=(len_val/num_classes)*pct_check_class
check_class=False # check if at least obj_acc accuracy per class

while(cpt_param<len(list_param_valid) and check_class==False):
    param_valid=list_param_valid[cpt_param] #select the current param line
    print(param_valid)
    cpt=0

    while(cpt<try_limit and check_class==False):
        cpt+=1
        
        model = LSTMModel(input_dim=input_dim, hidden_dim=param_valid['model__hidden_dim'], num_layers=param_valid['model__num_layers'], num_classes=num_classes, dropout=param_valid['model__dropout'])
        criterion = nn.CrossEntropyLoss()
        optimizer = Adam(model.parameters(), lr=param_valid['optimizer__lr'])

        if cpt==1:
            print(model)
            print(len(list(model.parameters())))
            for i in range(len(list(model.parameters()))):
                print(list(model.parameters())[i].size())

        # Training loop
        hist = np.zeros(epochs)
        for epoch in range(epochs):
            for i, (x_batch, y_batch) in enumerate(train_loader):
                model.train()
                optimizer.zero_grad()
                outputs = model(x_batch)
                loss = criterion(outputs, y_batch)

                loss.backward()
                optimizer.step()

            if epoch % 1 == 0 :   #change % 
                print(f"Epoch {epoch+1} CrossEntropyLoss: {loss.item()}")
            hist[epoch] = loss.item()


        # Validation
        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for x_batch, y_batch in val_loader:
                outputs = model(x_batch)
                _, predicted = torch.max(outputs.data, 1)
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()

        val_accuracy = correct / total
        print(f"Epoch {epoch+1}, Loss: {loss.item()}, Validation Accuracy: {val_accuracy}")

        if val_accuracy>obj_acc:
            print(f"Optim success {cpt=} {val_accuracy=}")
            check_class=True #exit directly

            # calculate the confusion matrix
            y_pred = model(x_val_tensor)
            _, y_pred_classes = torch.max(y_pred, 1)
            confusion = metrics.confusion_matrix(y_val_tensor, y_pred_classes)
            print(confusion)

            for i in range(num_classes):
                nb_lab=sum(y_pred_classes == i)
                if nb_lab<check_class_limit  :
                    check_class=False
                    print(f"Check class {i=} {nb_lab=} {check_class=} {check_class_limit=}")
                # print(f"Categ {i}: real {sum(y_val_tensor == i)} predict {sum(y_pred_classes == i)}")


            #check saved model, load to check it's OK
            if check_class:
                torch.save(model, filename_tmp_model)
                saved_model = torch.load(filename_tmp_model)
                saved_model.eval()
                y_pred = saved_model(x_val_tensor)
                _, y_pred_classes = torch.max(y_pred, 1)
                confusion = metrics.confusion_matrix(y_val_tensor, y_pred_classes)
                print(confusion)

    if cpt>=try_limit :
        cpt_param+=1
        print(f"Optim fail {cpt=} param suivant {cpt_param=}")

In [None]:
plt.plot(hist, label="Training loss")
plt.legend()
plt.show()

In [None]:
input_dim = x_train.shape[-1]
window_size = sequence_length
dropout = 0.2
num_classes = 4

# cat_y_train = keras.utils.to_categorical(col_y_train, num_classes)
# cat_y_valid = keras.utils.to_categorical(col_y_valid, num_classes)

# df_x_train_exp = np.expand_dims(df_x_train, axis=2)
# df_x_valid_exp = np.expand_dims(df_x_valid, axis=2)


model_LSTM = Sequential()
model_LSTM.add(LSTM(units=20, return_sequences=False,#True
               input_shape=(window_size, input_dim)))
#,kernel_regularizer=l2(0.1), recurrent_regularizer=l2(0.1), bias_regularizer=l2(0.1)
model_LSTM.add(Dropout(rate=dropout))   
# model_LSTM.add(Dropout(rate=dropout))
# model_LSTM.add(Bidirectional(LSTM((window_size * 2), return_sequences=True)))
# model_LSTM.add(Dropout(rate=dropout))
# model_LSTM.add(Bidirectional(LSTM(window_size, return_sequences=False)))
model_LSTM.add(Dense(units=num_classes, activation='softmax'))

model_LSTM.compile(loss='categorical_crossentropy',
                   optimizer='adam', metrics=['accuracy'])

history = model_LSTM.fit(x_train_lstm, y_train_lstm, batch_size=1024,
                         shuffle=False, epochs=20, validation_data=(x_val_lstm, y_val_lstm))#,verbose=0

train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# Plot loss
epochs = range(1, len(train_accuracy) + 1)
plt.plot(epochs, train_accuracy, 'bo-', label='Training accuracy')
plt.plot(epochs, val_accuracy, 'ro-', label='Validation accuracy')
plt.legend()
plt.show()


In [53]:
# print if keras can use the gpu to train the model
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())



[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12302096189872760406
xla_global_id: -1
]
