In [1]:
import os
import math
import random #to set the seed to replicate results
from datetime import datetime,timedelta #for today's date
from dateutil.relativedelta import relativedelta
import sys

import psycopg2
import psycopg2.extras
from psycopg2.extensions import AsIs

import pandas as pd 
import numpy as np #for the e_logarithmic filter (and also some other mathematical operations)
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler #for preprocessing, it scales features using statistics that are robust to outliers.
from sklearn.linear_model import LinearRegression
from scipy.stats import linregress #for the slope and the value of Y at X=0 of the linear trend line
from scipy.optimize import curve_fit
import tsmoothie #for the Kalman filter, it is an efficient recursive filter that evaluates the state of a dynamic system starting from a series of measurements subject to noise.

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM #the two main layers of the model
from tensorflow.keras import optimizers #for the training of the model
from tensorflow.keras.models import load_model

# Seed every random number generator this notebook imports (Python's `random`,
# NumPy and TensorFlow) so results can be replicated. Seeding `random` alone
# leaves the NumPy and TensorFlow generators unseeded.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# List the GPUs TensorFlow can see.
print(tf.config.list_physical_devices('GPU'))

2024-01-11 11:00:40.478005: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-11 11:00:40.478178: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-11 11:00:40.821494: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-11 11:00:41.401551: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2024-01-11 11:00:47.798957: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-11 11:00:49.450219: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-11 11:00:49.450490: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

#### UTILS

In [2]:
def building_data_sequences(data_X,data_Y, timesteps):
    """Cut the data into overlapping windows (blocks) of `timesteps` rows.

    Window k covers rows k .. k+timesteps-1 of `data_X` and is paired with the
    value of `data_Y` on the window's last row.

    Parameters:
        data_X: 2-D array indexed as data_X[rows, :].
        data_Y: sequence of true values, indexed by row.
        timesteps: number of rows (e.g. days) in each window.

    Returns:
        A tuple (windows, [targets]): `windows` is np.array of the stacked
        windows and the second item is a one-element list holding the array
        of per-window targets.
    """
    window_starts = range(len(data_X) - timesteps + 1)
    windows = [data_X[start:start + timesteps, :] for start in window_starts]
    window_targets = [data_Y[start + timesteps - 1] for start in window_starts]
    return np.array(windows), [np.array(window_targets)]

def sir_parameters(x,y):
  """Compute the analytical parameters of the linear trend line of y over x.

  "sir" stands for slope, intercept, rvalue; the average trend-line distance
  was added later.

  Returns:
      (slope, intercept, r_squared, avg_trend_line_distance) where r_squared
      is the squared correlation coefficient (linregress reports r, not r^2)
      and avg_trend_line_distance is the mean absolute gap between the fitted
      line and y.
  """
  fit = linregress(x, y)
  # Fitted y values on the trend line; only needed for the average distance.
  fitted_y = fit.slope*x + fit.intercept
  mean_abs_distance = np.mean(np.abs(fitted_y - y))
  r_squared = fit.rvalue**2
  return fit.slope, fit.intercept, r_squared, mean_abs_distance

#### Defining variables

In [3]:
# Time step: the window size. Kept as a string; cells that need a number call int().
model_case_version_time_steps = '10'
# Number of periods x in the MPNxP target. Also kept as a string.
model_case_version_main_target_code = '5'

# Date stamp (YYYYMMDD), for now only used in file names.
# To pin a specific date instead: today = '20220706'
today = f"{datetime.today():%Y%m%d}"
print('Today is', today)

Today is 20240111


In [4]:
# Name of the case this run works on.
case = 'EURUSD=X'
print(case)

EURUSD=X


In [5]:
# Targets we are going to work on; this must be changed whenever the targets change.
targets = [f'MPN{model_case_version_main_target_code}P']
# Average for each target, used during training to normalise/rescale some
# analytical parameters.
avg_prices_list = []

In [6]:
# Weighting-exponent presets:
#   1L -> dispersion = 1
#   4L -> slope + intercept + rsqr + dispersion = 1
#   5L -> all the weightings = 1
#   new parameters case -> intercept + slope + end intercept + correlation + dispersion
slope_weighting_exponent_ratio = 1
intercept_weighting_exponent_ratio = 1
end_intercept_weighting_exponent_ratio = 0
rsqr_weighting_exponent_ratio = 1
dispersion_weighting_exponent_ratio = 3

slope_weighting_exponent_predicted_actual = 0
intercept_weighting_exponent_predicted_actual = 0
rsqr_weighting_exponent_predicted_actual = 0
dispersion_weighting_exponent_predicted_actual = 0

# Compact code built from the ratio exponents, in the order
# intercept, slope, end intercept, rsqr, dispersion.
# (The misspelt variable name is kept because other cells refer to it.)
analytical_parametrs = ''.join(
    str(exponent)
    for exponent in (
        intercept_weighting_exponent_ratio,
        slope_weighting_exponent_ratio,
        end_intercept_weighting_exponent_ratio,
        rsqr_weighting_exponent_ratio,
        dispersion_weighting_exponent_ratio,
    )
)
print(analytical_parametrs)

11013


#### Model Configs

In [7]:
# Model parameters; see the original documentation (case_version_cat tab) for details.
n_epochs = 100
batch = 64
correction_n_epochs = 100
correction_batch = 64

# Exponents used to define the number of nodes of each layer
# (presumably nodes = 2**exponent, going by the "twoexp" prefix - confirm where the model is built).
twoexp_nodes_number_layer_1 = 7
twoexp_nodes_number_layer_2 = 10
twoexp_nodes_number_layer_3 = 7
twoexp_nodes_number_layer_4 = 6
twoexp_nodes_number_layer_5 = 0

# Learning rates.
lr = 0.0005
correction_lr = 0.0005

#### Train parameters

In [8]:
# Other variables used during training.

# Maximum number of iterations of the while loop used later in the notebook.
max_iterations = 1
# Precision related to the quality of the compound_run_term value we want to
# obtain (which is representative of the quality of the model).
precision = 1e-11
# Used to compute the attenuated_padding_value (see custom_loss_function).
attenuation_factor = 0.75

#### Data

In [9]:
# Connection settings for PostgreSQL.
#
# SECURITY: the database password must not live in the notebook. It is read
# from the PGPASSWORD environment variable instead. The value that used to be
# hard-coded here should be treated as leaked and rotated.
hostname = 'database-1.ctzm0hf7fhri.eu-central-1.rds.amazonaws.com'
database = 'dyDATA_new'
username = 'postgres'
# None when PGPASSWORD is unset. psycopg2 is expected to omit a None password
# and let libpq fall back to its own lookup (e.g. ~/.pgpass) - confirm against
# the installed psycopg2 version.
pwd = os.environ.get('PGPASSWORD')
port_id = 5432
conn = None

In [10]:
# Query that retrieves the data of one particular asset from the database:
# every row of the asset's features_targets_input_view whose
# cleaned_raw_features_environment_PK equals 4.
asset_script = (
    f'SELECT * FROM "ASSET_{case}".features_targets_input_view '
    'WHERE features_targets_input_view."cleaned_raw_features_environment_PK"= 4'
)
asset_script

'SELECT * FROM "ASSET_EURUSD=X".features_targets_input_view WHERE features_targets_input_view."cleaned_raw_features_environment_PK"= 4'

#### Database Connection

In [11]:
# Run `asset_script` against the database and load the result into
# `dohlcav_mpnxp_data`, one DataFrame column per column of the result set.
# Start from a known state so the `finally` clause does not depend on a value
# of `conn` left over from another cell or an earlier run.
conn = None
try:
  with psycopg2.connect(
      host = hostname,
      dbname = database,
      user = username,
      password = pwd,
      port = port_id
  ) as conn:

    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        print('You are connect to the Database:',database)
        select_script = asset_script # do not forget to set to asset_script when data has been uploaded
        cur.execute(select_script)
        data = cur.fetchall()
        # Column names come from the cursor description (first field of each entry).
        cols = [rec[0] for rec in cur.description]
        dohlcav_mpnxp_data= pd.DataFrame(data = data, columns = cols)
        print('Your SQL has executed successfully')

except Exception as error:
  # Report the problem, then stop the run. Swallowing the error would leave
  # `dohlcav_mpnxp_data` undefined (or stale from a previous run), and the
  # following cells would fail or silently work on the wrong data.
  print(error)
  raise

finally:
  # The `with` block on the connection does not close it, so close it
  # explicitly once one has been opened.
  if conn is not None:
     conn.close()

You are connect to the Database: dyDATA_new
Your SQL has executed successfully


In [12]:
# Asset-specific adjustments to the freshly loaded data.
if case == 'TSLA':
  # Keep only the rows dated 2020-01-01 or later and renumber the index.
  dohlcav_mpnxp_data = (
      dohlcav_mpnxp_data
      .loc[dohlcav_mpnxp_data['cleaned_raw_features_DCP_date_current_period'] >= '2020-01-01']
      .reset_index(drop=True)
  )
elif case == 'EURUSD=X':
  # Overwrite the volume column with the constant 1
  # (presumably because no usable volume exists for this FX pair - confirm).
  dohlcav_mpnxp_data['cleaned_raw_features_VTCP_volume_of_transactions_current_period'] = 1

dohlcav_mpnxp_data.head()

Unnamed: 0,cleaned_raw_features_id,cleaned_raw_features_DCP_date_current_period,calculated_features_DNCP,cleaned_raw_features_OPCP_open_price_current_period,cleaned_raw_features_HPCP_high_price_current_period,cleaned_raw_features_LPCP_low_price_current_period,cleaned_raw_features_CPCP_close_price_current_period,cleaned_raw_features_ACPCP_adjusted_close_price_current_period,cleaned_raw_features_VTCP_volume_of_transactions_current_period,cleaned_raw_features_environment_PK,...,calculated_targets_MPN7P,calculated_targets_HPN7P,calculated_targets_LPN7P,calculated_targets_MPN10P,calculated_targets_HPN10P,calculated_targets_LPN10P,calculated_targets_MPN20P,calculated_targets_HPN20P,calculated_targets_LPN20P,calculated_targets_environment_PK
0,1,2003-12-01,37956.0,1.203398,1.204007,1.194401,1.196501,1.196501,1,4,...,1.214403,1.227702,1.1946,1.218895,1.232498,1.1946,1.231004,1.267202,1.1946,4
1,2,2003-12-02,37957.0,1.196101,1.210903,1.1946,1.208897,1.208897,1,4,...,1.218695,1.227702,1.204398,1.219674,1.235895,1.204398,1.232498,1.267202,1.204398,4
2,3,2003-12-03,37958.0,1.209,1.213003,1.2077,1.212298,1.212298,1,4,...,1.219552,1.230603,1.204398,1.222255,1.242005,1.204398,1.235973,1.267202,1.204398,4
3,4,2003-12-04,37959.0,1.212004,1.214403,1.204398,1.208094,1.208094,1,4,...,1.221702,1.232498,1.206593,1.224455,1.244199,1.206593,1.2394,1.267202,1.206593,4
4,5,2003-12-05,37960.0,1.207802,1.219096,1.206593,1.218695,1.218695,1,4,...,1.223496,1.235895,1.212298,1.227152,1.244199,1.212298,1.240049,1.267202,1.212298,4


In [13]:
# Keep the first 9 columns of the loaded frame; this relies on the view's
# column order (in the recorded run: id, date, day number, open, high, low,
# close, adjusted close and volume).
filtered_columns_1 = dohlcav_mpnxp_data.columns[:9].tolist()

In [14]:
# Select the main target columns: every column whose name contains the period
# suffix of the main target (targets[0] without its first three characters,
# e.g. '5P' for 'MPN5P').
# NOTE(review): this is a plain substring test, so a suffix such as '5P' would
# also match a column like '...MPN15P' if the view ever had one - confirm.
filtered_columns_2 = [
    column_name
    for column_name in dohlcav_mpnxp_data.columns
    if targets[0][3:] in column_name
]

In [15]:
# Special case when the main target is MPN1P: rotate the first three entries
# so the element currently in third position comes first, followed by the
# former first and second elements.
if model_case_version_main_target_code == '1':
  filtered_columns_2[:3] = [
      filtered_columns_2[2],
      filtered_columns_2[0],
      filtered_columns_2[1],
  ]

In [16]:
filtered_columns_2

['calculated_targets_MPN5P',
 'calculated_targets_HPN5P',
 'calculated_targets_LPN5P']

In [17]:
#to add the last two constant columns to the table
filtered_columns_3=['calculated_targets_HPN1P','calculated_targets_LPN1P']

In [18]:
filtered_columns=filtered_columns_1+filtered_columns_2+filtered_columns_3

In [19]:
print(filtered_columns)

['cleaned_raw_features_id', 'cleaned_raw_features_DCP_date_current_period', 'calculated_features_DNCP', 'cleaned_raw_features_OPCP_open_price_current_period', 'cleaned_raw_features_HPCP_high_price_current_period', 'cleaned_raw_features_LPCP_low_price_current_period', 'cleaned_raw_features_CPCP_close_price_current_period', 'cleaned_raw_features_ACPCP_adjusted_close_price_current_period', 'cleaned_raw_features_VTCP_volume_of_transactions_current_period', 'calculated_targets_MPN5P', 'calculated_targets_HPN5P', 'calculated_targets_LPN5P', 'calculated_targets_HPN1P', 'calculated_targets_LPN1P']


In [20]:
# Keep only the selected columns. The explicit copy makes the result an
# independent frame, so later in-place edits (column relabelling, column
# assignment) cannot trigger pandas' SettingWithCopyWarning.
dohlcav_mpnxp_data = dohlcav_mpnxp_data[filtered_columns].copy()

In [21]:
dohlcav_mpnxp_data

Unnamed: 0,cleaned_raw_features_id,cleaned_raw_features_DCP_date_current_period,calculated_features_DNCP,cleaned_raw_features_OPCP_open_price_current_period,cleaned_raw_features_HPCP_high_price_current_period,cleaned_raw_features_LPCP_low_price_current_period,cleaned_raw_features_CPCP_close_price_current_period,cleaned_raw_features_ACPCP_adjusted_close_price_current_period,cleaned_raw_features_VTCP_volume_of_transactions_current_period,calculated_targets_MPN5P,calculated_targets_HPN5P,calculated_targets_LPN5P,calculated_targets_HPN1P,calculated_targets_LPN1P
0,1,2003-12-01,37956.0,1.203398,1.204007,1.194401,1.196501,1.196501,1,1.210903,1.224005,1.194600,1.210903,1.194600
1,2,2003-12-02,37957.0,1.196101,1.210903,1.194600,1.208897,1.208897,1,1.214403,1.227702,1.204398,1.213003,1.207700
2,3,2003-12-03,37958.0,1.209000,1.213003,1.207700,1.212298,1.212298,1,1.219096,1.227702,1.204398,1.214403,1.204398
3,4,2003-12-04,37959.0,1.212004,1.214403,1.204398,1.208094,1.208094,1,1.219552,1.227702,1.206593,1.219096,1.206593
4,5,2003-12-05,37960.0,1.207802,1.219096,1.206593,1.218695,1.218695,1,1.222105,1.230603,1.212298,1.224005,1.215407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5037,5038,2023-03-22,45007.0,1.077424,1.080147,1.075951,1.077424,1.077424,1,1.079914,1.092900,1.072156,1.092900,1.086862
5038,5039,2023-03-23,45008.0,1.087039,1.092900,1.086862,1.087039,1.087039,1,1.079914,1.092900,1.072156,1.084246,1.072156
5039,5040,2023-03-24,45009.0,1.083776,1.084246,1.072156,1.076426,1.076426,1,1.079914,1.092900,1.072156,1.079600,1.074807
5040,5041,2023-03-26,45011.0,1.077819,1.079600,1.074807,1.077819,1.077819,1,1.079914,1.092900,1.072156,1.081081,1.079914


In [22]:
# Give the selected columns shorter, self-describing labels.
if model_case_version_main_target_code == '1':
  # Positional relabelling: all 14 labels are assigned in column order
  # (presumably because with MPN1P as main target the HPN1P/LPN1P columns
  # appear twice, which a name-based rename could not tell apart - confirm).
  dohlcav_mpnxp_data.columns = [
      "ID",
      "DCP_date_current_period",
      "DNCP_day_number_current_period",
      "OPCP_open_price_current_period",
      "HPCP_high_price_current_period",
      "LPCP_low_price_current_period",
      "CPCP_close_price_current_period",
      "ACPCP_adjusted_close_price_current_period",
      "VTCP_volume_of_transactions_current_period",
      f"MPN{model_case_version_main_target_code}P_median_price_next_{model_case_version_main_target_code}_periods",
      f"HPN{model_case_version_main_target_code}P_highest_price_next_{model_case_version_main_target_code}_periods",
      f"LPN{model_case_version_main_target_code}P_lowest_price_next_{model_case_version_main_target_code}_periods",
      "HPN1P_high_price_next_1_period",
      "LPN1P_low_price_next_1_period",
  ]
else:
  # Name-based relabelling for every other main target.
  dohlcav_mpnxp_data = dohlcav_mpnxp_data.rename(columns={
      "cleaned_raw_features_id": "ID",
      "cleaned_raw_features_DCP_date_current_period": "DCP_date_current_period",
      "calculated_features_DNCP": "DNCP_day_number_current_period",
      "cleaned_raw_features_OPCP_open_price_current_period": "OPCP_open_price_current_period",
      "cleaned_raw_features_HPCP_high_price_current_period": "HPCP_high_price_current_period",
      "cleaned_raw_features_LPCP_low_price_current_period": "LPCP_low_price_current_period",
      "cleaned_raw_features_CPCP_close_price_current_period": "CPCP_close_price_current_period",
      "cleaned_raw_features_ACPCP_adjusted_close_price_current_period": "ACPCP_adjusted_close_price_current_period",
      "cleaned_raw_features_VTCP_volume_of_transactions_current_period": "VTCP_volume_of_transactions_current_period",
      filtered_columns_2[0]: f"MPN{model_case_version_main_target_code}P_median_price_next_{model_case_version_main_target_code}_periods",
      filtered_columns_2[1]: f"HPN{model_case_version_main_target_code}P_highest_price_next_{model_case_version_main_target_code}_periods",
      filtered_columns_2[2]: f"LPN{model_case_version_main_target_code}P_lowest_price_next_{model_case_version_main_target_code}_periods",
      filtered_columns_3[0]: "HPN1P_high_price_next_1_period",
      filtered_columns_3[1]: "LPN1P_low_price_next_1_period",
  })

In [23]:
dohlcav_mpnxp_data

Unnamed: 0,ID,DCP_date_current_period,DNCP_day_number_current_period,OPCP_open_price_current_period,HPCP_high_price_current_period,LPCP_low_price_current_period,CPCP_close_price_current_period,ACPCP_adjusted_close_price_current_period,VTCP_volume_of_transactions_current_period,MPN5P_median_price_next_5_periods,HPN5P_highest_price_next_5_periods,LPN5P_lowest_price_next_5_periods,HPN1P_high_price_next_1_period,LPN1P_low_price_next_1_period
0,1,2003-12-01,37956.0,1.203398,1.204007,1.194401,1.196501,1.196501,1,1.210903,1.224005,1.194600,1.210903,1.194600
1,2,2003-12-02,37957.0,1.196101,1.210903,1.194600,1.208897,1.208897,1,1.214403,1.227702,1.204398,1.213003,1.207700
2,3,2003-12-03,37958.0,1.209000,1.213003,1.207700,1.212298,1.212298,1,1.219096,1.227702,1.204398,1.214403,1.204398
3,4,2003-12-04,37959.0,1.212004,1.214403,1.204398,1.208094,1.208094,1,1.219552,1.227702,1.206593,1.219096,1.206593
4,5,2003-12-05,37960.0,1.207802,1.219096,1.206593,1.218695,1.218695,1,1.222105,1.230603,1.212298,1.224005,1.215407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5037,5038,2023-03-22,45007.0,1.077424,1.080147,1.075951,1.077424,1.077424,1,1.079914,1.092900,1.072156,1.092900,1.086862
5038,5039,2023-03-23,45008.0,1.087039,1.092900,1.086862,1.087039,1.087039,1,1.079914,1.092900,1.072156,1.084246,1.072156
5039,5040,2023-03-24,45009.0,1.083776,1.084246,1.072156,1.076426,1.076426,1,1.079914,1.092900,1.072156,1.079600,1.074807
5040,5041,2023-03-26,45011.0,1.077819,1.079600,1.074807,1.077819,1.077819,1,1.079914,1.092900,1.072156,1.081081,1.079914


In [24]:
# Path under which the results of the LMB are saved: the case name followed
# by a fixed suffix.
model_path = f'{case}2022_DECEMBER_FIKRI'

model_path

'EURUSD=X2022_DECEMBER_FIKRI'

In [25]:
# Check that the specified path exists, to be sure results are saved in the right place.
print("YES" if os.path.exists(model_path) else "NO")

YES


In [26]:
# dohlcav_mpnxp_data = dohlcav_mpnxp_data.replace(',','', regex=True) #remove the ',' otherwise it's impossible to deal with numbers in the dataset
dohlcav_mpnxp_data.tail(int(model_case_version_main_target_code)+1) # to visualize likely columns with NaN values in the dataset


Unnamed: 0,ID,DCP_date_current_period,DNCP_day_number_current_period,OPCP_open_price_current_period,HPCP_high_price_current_period,LPCP_low_price_current_period,CPCP_close_price_current_period,ACPCP_adjusted_close_price_current_period,VTCP_volume_of_transactions_current_period,MPN5P_median_price_next_5_periods,HPN5P_highest_price_next_5_periods,LPN5P_lowest_price_next_5_periods,HPN1P_high_price_next_1_period,LPN1P_low_price_next_1_period
5036,5037,2023-03-21,45006.0,1.071915,1.078725,1.070492,1.071915,1.071915,1,1.079914,1.0929,1.072156,1.080147,1.075951
5037,5038,2023-03-22,45007.0,1.077424,1.080147,1.075951,1.077424,1.077424,1,1.079914,1.0929,1.072156,1.0929,1.086862
5038,5039,2023-03-23,45008.0,1.087039,1.0929,1.086862,1.087039,1.087039,1,1.079914,1.0929,1.072156,1.084246,1.072156
5039,5040,2023-03-24,45009.0,1.083776,1.084246,1.072156,1.076426,1.076426,1,1.079914,1.0929,1.072156,1.0796,1.074807
5040,5041,2023-03-26,45011.0,1.077819,1.0796,1.074807,1.077819,1.077819,1,1.079914,1.0929,1.072156,1.081081,1.079914
5041,5042,2023-03-27,45012.0,1.08003,1.081081,1.079914,1.080964,1.080964,1,1.079914,1.0929,1.072156,1.081081,1.079914


In [27]:
# Names of the target columns in our data set: the median/highest/lowest price
# of the main target horizon, followed by the two fixed 1-period columns.
targets_list = [
    f"MPN{model_case_version_main_target_code}P_median_price_next_{model_case_version_main_target_code}_periods",
    f"HPN{model_case_version_main_target_code}P_highest_price_next_{model_case_version_main_target_code}_periods",
    f"LPN{model_case_version_main_target_code}P_lowest_price_next_{model_case_version_main_target_code}_periods",
    'HPN1P_high_price_next_1_period',
    'LPN1P_low_price_next_1_period',
]

In [28]:
targets_list

['MPN5P_median_price_next_5_periods',
 'HPN5P_highest_price_next_5_periods',
 'LPN5P_lowest_price_next_5_periods',
 'HPN1P_high_price_next_1_period',
 'LPN1P_low_price_next_1_period']

In [29]:
# Find the full column name of the main target: the entry of targets_list that
# contains targets[0] as a substring. If several entries match, the last one
# wins; if none matches, main_target_column is left unassigned.
for i in targets_list:
  if targets[0] in i:
    main_target_column=i

In [30]:
main_target_column

'MPN5P_median_price_next_5_periods'

In [31]:
def stop_target_value(x):
  """Locate where the missing values begin in the main target (MPNxP) column.

  The notebook marks this function as redundant because the FDU now does this
  work; it is kept for reference.

  Parameters:
      x: iterable of column names of the global `dohlcav_mpnxp_data`; only
         names containing the global `targets[0]` are inspected.

  Returns:
      The 0-based position of the first value that is NaN or the marker
      '#NUM!' in the first inspected column that has one, or None when no
      such value exists.
  """
  for column_name in x:
    if targets[0] in column_name:
       # Test the '#NUM!' marker BEFORE converting to float: float('#NUM!')
       # raises ValueError, so the marker could never be detected otherwise.
       is_missing = dohlcav_mpnxp_data[column_name].apply(
           lambda y: y == '#NUM!' or math.isnan(float(y)))
       # Walk by position so the search does not depend on the index labels.
       for position, flag in enumerate(is_missing):
          if flag:
             return position
# Disabled legacy path, kept for reference:
# if dohlcav_mpnxp_data[main_target_column].isnull().values.any():
#      stop_target=stop_target_value(targets_list)
# else:
# Open question left by the authors: what is stop target?
# As computed here it is the last index label of the frame plus one.
stop_target = dohlcav_mpnxp_data.index[-1] + 1

In [32]:
def fill_nan_values(x,target_name):
  """Fill the tail of a target column with its last valid value.

  The notebook marks this function as redundant because the FDU now does this
  job; it is kept for reference.

  The first value of `x` that is NaN or the marker '#NUM!' is located; from
  that row onwards the `target_name` column of the global
  `dohlcav_mpnxp_data` is overwritten (in place) with the value of the row
  just before it. Rows are addressed with `.loc`, so the frame is assumed to
  carry the default 0..n-1 index.

  Parameters:
      x: the target column to inspect (a pandas Series).
      target_name: name of the column of `dohlcav_mpnxp_data` to fill.

  Raises:
      ValueError: if the very first value is missing, because there is no
          earlier valid value to copy.
  """
  # Test the '#NUM!' marker BEFORE converting to float: float('#NUM!') raises
  # ValueError, so the marker could never be detected otherwise.
  is_missing = x.apply(lambda y: y == '#NUM!' or math.isnan(float(y)))
  stop = next((position for position, flag in enumerate(is_missing) if flag), None)
  if stop is None:
    # Nothing is missing: leave the frame untouched instead of failing.
    return
  if stop == 0:
    raise ValueError(
        "column %r starts with a missing value; there is no earlier valid value to fill with"
        % (target_name,))
  target_values_for_NaN = dohlcav_mpnxp_data[target_name].loc[stop-1]
  dohlcav_mpnxp_data.loc[stop:,(target_name)] = target_values_for_NaN
'''to activate this function, uncomment the below lines of codes'''
# for i in targets_list:
#   if dohlcav_mpnxp_data[i].isnull().values.any():
#     fill_nan_values(dohlcav_mpnxp_data[i],i)

'to activate this function, uncomment the below lines of codes'

In [33]:
# The data is separated in two parts: the first is used for training the model
# and the second, which runs until the last update of the data set, for
# testing it. To do that we simply compute the size of the training set and
# use this value (later in the code) to split the data set.
# NOTE(review): the original note gave 2020-2021 as the training period and
# 2022 onwards as the test period; the cells visible here take the boundaries
# from the data and from `temp_train_end_date` instead - confirm which is intended.

# Convert the date values of the data set to datetimes.
dohlcav_mpnxp_data['DCP_date_current_period'] = pd.to_datetime(
    dohlcav_mpnxp_data['DCP_date_current_period']
)

In [34]:
dohlcav_mpnxp_data.columns

Index(['ID', 'DCP_date_current_period', 'DNCP_day_number_current_period',
       'OPCP_open_price_current_period', 'HPCP_high_price_current_period',
       'LPCP_low_price_current_period', 'CPCP_close_price_current_period',
       'ACPCP_adjusted_close_price_current_period',
       'VTCP_volume_of_transactions_current_period',
       'MPN5P_median_price_next_5_periods',
       'HPN5P_highest_price_next_5_periods',
       'LPN5P_lowest_price_next_5_periods', 'HPN1P_high_price_next_1_period',
       'LPN1P_low_price_next_1_period'],
      dtype='object')

In [35]:
# str(dohlcav_mpnxp_data[dohlcav_mpnxp_data['DCP_date_current_period'] == '2012-05-23']['DCP_date_current_period'])

In [36]:
# Start date of the training set: the first date in the data.
start_date = pd.Timestamp(str(dohlcav_mpnxp_data['DCP_date_current_period'].iloc[0]))
# filtered_data = dohlcav_mpnxp_data[dohlcav_mpnxp_data['DCP_date_current_period'] == '2008-01-02']
# start_date = filtered_data['DCP_date_current_period'].iloc[0]

# Nominal training end date. It must be a date that is actually present in the
# data, because its row is looked up by exact match.
temp_train_end_date = pd.Timestamp('2022-12-30')
matching_rows = dohlcav_mpnxp_data.index[dohlcav_mpnxp_data['DCP_date_current_period']==temp_train_end_date]
if len(matching_rows) == 0:
  # Fail with an explicit message instead of a bare IndexError from `.values[0]`.
  raise ValueError(
      "temp_train_end_date %s is not a date present in the data; "
      "choose a date that has a row" % temp_train_end_date.date())
idx = matching_rows.values[0]

# Step back by the number of periods of the main target so that the training
# set avoids the tail values (presumably the rows whose target looks past the
# nominal end date - confirm).
new_idx = idx - int(model_case_version_main_target_code)
if new_idx not in dohlcav_mpnxp_data.index:
  raise ValueError(
      "stepping %s periods back from %s leaves the data set"
      % (model_case_version_main_target_code, temp_train_end_date.date()))

train_end_date = dohlcav_mpnxp_data.loc[new_idx,'DCP_date_current_period']
# Predictions run until the last date in the data.
prediction_end_date = pd.Timestamp(str(dohlcav_mpnxp_data['DCP_date_current_period'].iloc[-1]))

In [37]:
# Training period label: "<start date>_<nominal training end date>".
period = f'{start_date.date()}_{temp_train_end_date.date()}'
# Output file name encoding the run date, case, period, target and the
# batch / epochs / learning-rate / time-step settings.
# NOTE(review): the trailing 'L0.75' is a literal; if it is meant to mirror
# `attenuation_factor`, the two have to be kept in sync by hand - confirm.
filename = (
    f'proceedit {today} SPP-{case}_{period}_MPN{model_case_version_main_target_code}P'
    f'_LSTM-15710760-B{batch}E{n_epochs}L{lr}T{model_case_version_time_steps}'
    f'-DOHLCAV-FTEKR_LVSP_LSTM-15710760-B{batch}E{n_epochs}L{lr}T{model_case_version_time_steps}'
    f'-PR_{analytical_parametrs}L0.75_AH'
)
print(filename)

proceedit 20240111 SPP-EURUSD=X_2003-12-01_2022-12-30_MPN5P_LSTM-15710760-B64E100L0.0005T10-DOHLCAV-FTEKR_LVSP_LSTM-15710760-B64E100L0.0005T10-PR_11013L0.75_AH


In [38]:
start_date

Timestamp('2003-12-01 00:00:00')

In [39]:
dohlcav_mpnxp_data.loc[idx,]

ID                                                           4980
DCP_date_current_period                       2022-12-30 00:00:00
DNCP_day_number_current_period                            44925.0
OPCP_open_price_current_period                           1.066075
HPCP_high_price_current_period                           1.069793
LPCP_low_price_current_period                             1.06392
CPCP_close_price_current_period                          1.066075
ACPCP_adjusted_close_price_current_period                1.066075
VTCP_volume_of_transactions_current_period                      1
MPN5P_median_price_next_5_periods                        1.060636
HPN5P_highest_price_next_5_periods                       1.071237
LPN5P_lowest_price_next_5_periods                        1.048526
HPN1P_high_price_next_1_period                           1.071237
LPN1P_low_price_next_1_period                            1.065326
Name: 4979, dtype: object

In [40]:
train_end_date

Timestamp('2022-12-23 00:00:00')

In [41]:
prediction_end_date

Timestamp('2023-03-27 00:00:00')

In [42]:
# Boolean masks used to filter the data and take just what we need:
# rows dated up to and including train_end_date (training) and rows dated up
# to and including prediction_end_date (prediction).
train_mask = dohlcav_mpnxp_data['DCP_date_current_period'].le(train_end_date)
prediction_mask = dohlcav_mpnxp_data['DCP_date_current_period'].le(prediction_end_date)

In [43]:
# Number of rows selected by each mask (counting the True entries gives the
# same number as filtering the frame and reading its row count).
training_size = int(train_mask.sum())
prediction_size = int(prediction_mask.sum())
print('Training size: ', training_size)
print('Prediction size: ', prediction_size)
# The test size (dohlcav_mpnxp_data.shape[0] - training_size) is not printed
# because the testing percentage is 0.

Training size:  4975
Prediction size:  5042


In [44]:
# Dates used in the predictions output file, to know from which point to start
# pasting the results: the date column from row (time steps - 1) onwards,
# formatted as YYYY-MM-DD and renumbered from 0.
dates = (
    dohlcav_mpnxp_data
    .iloc[int(model_case_version_time_steps)-1:, 1]
    .map(lambda stamp: stamp.strftime('%Y-%m-%d'))
    .reset_index(drop=True)
)
dates

0       2003-12-12
1       2003-12-15
2       2003-12-16
3       2003-12-17
4       2003-12-18
           ...    
5028    2023-03-22
5029    2023-03-23
5030    2023-03-24
5031    2023-03-26
5032    2023-03-27
Name: DCP_date_current_period, Length: 5033, dtype: object

In [45]:
# Build the data frame with just the necessary columns by removing the 'ID'
# and date columns.
# Pay attention: every time the targets change, the column names change too.
df = dohlcav_mpnxp_data.drop(columns=["ID", "DCP_date_current_period"])
df

Unnamed: 0,DNCP_day_number_current_period,OPCP_open_price_current_period,HPCP_high_price_current_period,LPCP_low_price_current_period,CPCP_close_price_current_period,ACPCP_adjusted_close_price_current_period,VTCP_volume_of_transactions_current_period,MPN5P_median_price_next_5_periods,HPN5P_highest_price_next_5_periods,LPN5P_lowest_price_next_5_periods,HPN1P_high_price_next_1_period,LPN1P_low_price_next_1_period
0,37956.0,1.203398,1.204007,1.194401,1.196501,1.196501,1,1.210903,1.224005,1.194600,1.210903,1.194600
1,37957.0,1.196101,1.210903,1.194600,1.208897,1.208897,1,1.214403,1.227702,1.204398,1.213003,1.207700
2,37958.0,1.209000,1.213003,1.207700,1.212298,1.212298,1,1.219096,1.227702,1.204398,1.214403,1.204398
3,37959.0,1.212004,1.214403,1.204398,1.208094,1.208094,1,1.219552,1.227702,1.206593,1.219096,1.206593
4,37960.0,1.207802,1.219096,1.206593,1.218695,1.218695,1,1.222105,1.230603,1.212298,1.224005,1.215407
...,...,...,...,...,...,...,...,...,...,...,...,...
5037,45007.0,1.077424,1.080147,1.075951,1.077424,1.077424,1,1.079914,1.092900,1.072156,1.092900,1.086862
5038,45008.0,1.087039,1.092900,1.086862,1.087039,1.087039,1,1.079914,1.092900,1.072156,1.084246,1.072156
5039,45009.0,1.083776,1.084246,1.072156,1.076426,1.076426,1,1.079914,1.092900,1.072156,1.079600,1.074807
5040,45011.0,1.077819,1.079600,1.074807,1.077819,1.077819,1,1.079914,1.092900,1.072156,1.081081,1.079914


In [46]:
# Rename the columns to their short codes for a more compact, readable df.
# Pay attention: every time the targets change, the column names change too.
df = df.rename(columns={
    "DNCP_day_number_current_period": "DNCP",
    "OPCP_open_price_current_period": "OPCP",
    "HPCP_high_price_current_period": "HPCP",
    "LPCP_low_price_current_period": "LPCP",
    "CPCP_close_price_current_period": "CPCP",
    "ACPCP_adjusted_close_price_current_period": "ACPCP",
    "VTCP_volume_of_transactions_current_period": "VTCP",
    f"MPN{model_case_version_main_target_code}P_median_price_next_{model_case_version_main_target_code}_periods":
        f"MPN{model_case_version_main_target_code}P",
    f"HPN{model_case_version_main_target_code}P_highest_price_next_{model_case_version_main_target_code}_periods":
        f"HPN{model_case_version_main_target_code}P",
    f"LPN{model_case_version_main_target_code}P_lowest_price_next_{model_case_version_main_target_code}_periods":
        f"LPN{model_case_version_main_target_code}P",
    # The fixed 1-period columns get lower-case codes.
    'HPN1P_high_price_next_1_period': 'hpn1p',
    'LPN1P_low_price_next_1_period': 'lpn1p',
})

df.head()

Unnamed: 0,DNCP,OPCP,HPCP,LPCP,CPCP,ACPCP,VTCP,MPN5P,HPN5P,LPN5P,hpn1p,lpn1p
0,37956.0,1.203398,1.204007,1.194401,1.196501,1.196501,1,1.210903,1.224005,1.1946,1.210903,1.1946
1,37957.0,1.196101,1.210903,1.1946,1.208897,1.208897,1,1.214403,1.227702,1.204398,1.213003,1.2077
2,37958.0,1.209,1.213003,1.2077,1.212298,1.212298,1,1.219096,1.227702,1.204398,1.214403,1.204398
3,37959.0,1.212004,1.214403,1.204398,1.208094,1.208094,1,1.219552,1.227702,1.206593,1.219096,1.206593
4,37960.0,1.207802,1.219096,1.206593,1.218695,1.218695,1,1.222105,1.230603,1.212298,1.224005,1.215407


In [47]:
model_case_version_main_target_code

'5'

In [48]:
# Short code of the main target and the position of its column in df.
base_target_code = f'MPN{model_case_version_main_target_code}P'
base_target_column_index = df.columns.get_loc(base_target_code)
base_target_column_index

7

In [49]:
def new_target_column(target_code , shift_back_period):
  """Return the `target_code` column of the global `df`, delayed by some rows.

  Row r of the result holds the value the column had at row
  r - shift_back_period; the first `shift_back_period` rows, which have no
  earlier value, are filled with 0. The result always has as many entries as
  the column.

  Parameters:
      target_code: name of the column of `df` to shift.
      shift_back_period: number of rows to delay by (0 returns the values
          unchanged; a value larger than the column yields all zeros).

  Returns:
      A numpy array with the shifted values.

  Raises:
      ValueError: if shift_back_period is negative.
  """
  if shift_back_period < 0:
    raise ValueError("shift_back_period must be >= 0, got %r" % (shift_back_period,))
  source_values = df[target_code].to_numpy()
  # Cap the shift at the column length so the result keeps the column's length.
  shift = min(shift_back_period, len(source_values))
  first_dates_handling = np.zeros(shift, dtype=source_values.dtype)
  # Slice with an explicit end: `[:-shift]` would be empty for shift == 0.
  kept_values = source_values[:len(source_values) - shift]
  return np.concatenate((first_dates_handling, kept_values))

In [50]:
df.columns

Index(['DNCP', 'OPCP', 'HPCP', 'LPCP', 'CPCP', 'ACPCP', 'VTCP', 'MPN5P',
       'HPN5P', 'LPN5P', 'hpn1p', 'lpn1p'],
      dtype='object')

In [51]:
#Adding multiple targets
# For k = 1..horizon, insert a column 'MPN-kP' holding the base target delayed
# by k rows (see new_target_column). Each new column is placed directly after
# the previously inserted one, so the order is base target, MPN-1P, MPN-2P, ...
# Every new column name is also appended to the notebook-level `targets` list.
# NOTE(review): this cell mutates `df` and `targets` in place; running it a
# second time is expected to fail because DataFrame.insert rejects an existing
# column name by default - confirm before re-running without a kernel restart.
new_target_index = base_target_column_index
for i in range(int(model_case_version_main_target_code)):
  new_target_code = 'MPN-' + str(i+1) + 'P'
  df.insert(new_target_index+1,new_target_code,new_target_column(base_target_code,i+1))
  new_target_index = new_target_index + 1
  targets.append(new_target_code)

df

Unnamed: 0,DNCP,OPCP,HPCP,LPCP,CPCP,ACPCP,VTCP,MPN5P,MPN-1P,MPN-2P,MPN-3P,MPN-4P,MPN-5P,HPN5P,LPN5P,hpn1p,lpn1p
0,37956.0,1.203398,1.204007,1.194401,1.196501,1.196501,1,1.210903,0.000000,0.000000,0.000000,0.000000,0.000000,1.224005,1.194600,1.210903,1.194600
1,37957.0,1.196101,1.210903,1.194600,1.208897,1.208897,1,1.214403,1.210903,0.000000,0.000000,0.000000,0.000000,1.227702,1.204398,1.213003,1.207700
2,37958.0,1.209000,1.213003,1.207700,1.212298,1.212298,1,1.219096,1.214403,1.210903,0.000000,0.000000,0.000000,1.227702,1.204398,1.214403,1.204398
3,37959.0,1.212004,1.214403,1.204398,1.208094,1.208094,1,1.219552,1.219096,1.214403,1.210903,0.000000,0.000000,1.227702,1.206593,1.219096,1.206593
4,37960.0,1.207802,1.219096,1.206593,1.218695,1.218695,1,1.222105,1.219552,1.219096,1.214403,1.210903,0.000000,1.230603,1.212298,1.224005,1.215407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5037,45007.0,1.077424,1.080147,1.075951,1.077424,1.077424,1,1.079914,1.079914,1.077819,1.077424,1.073054,1.067886,1.092900,1.072156,1.092900,1.086862
5038,45008.0,1.087039,1.092900,1.086862,1.087039,1.087039,1,1.079914,1.079914,1.079914,1.077819,1.077424,1.073054,1.092900,1.072156,1.084246,1.072156
5039,45009.0,1.083776,1.084246,1.072156,1.076426,1.076426,1,1.079914,1.079914,1.079914,1.079914,1.077819,1.077424,1.092900,1.072156,1.079600,1.074807
5040,45011.0,1.077819,1.079600,1.074807,1.077819,1.077819,1,1.079914,1.079914,1.079914,1.079914,1.079914,1.077819,1.092900,1.072156,1.081081,1.079914


In [52]:
'''Let's check also whether there are missing values or not'''

print("Number of NaN values:")
print(df.isna().sum())

Number of NaN values:
DNCP      0
OPCP      0
HPCP      0
LPCP      0
CPCP      0
ACPCP     0
VTCP      0
MPN5P     0
MPN-1P    0
MPN-2P    0
MPN-3P    0
MPN-4P    0
MPN-5P    0
HPN5P     0
LPN5P     0
hpn1p     0
lpn1p     0
dtype: int64


In [53]:
# Ratio transformation: express prices relative to the current close (CPCP).
# The four current-period ratios are inserted right after VTCP (positions 7-10).
df.insert(7,'OPCP_Ratio',df['OPCP']/df['CPCP'])
df.insert(8,'HPCP_Ratio',df['HPCP']/df['CPCP'])
df.insert(9,'LPCP_Ratio',df['LPCP']/df['CPCP'])
df.insert(10,'ACPCP_Ratio',df['ACPCP']/df['CPCP'])
# The MPNxP ratio is lagged because MPNxP describes the NEXT x periods.
# The lag must equal the horizon x, not a fixed 5: with a shorter lag the
# column would contain prices that are still in the future, and with a longer
# one the rows dropped afterwards (the first x rows) would not cover the NaNs
# produced by the shift.
df.insert(df.columns.get_loc('MPN'+model_case_version_main_target_code+'P') ,'MPN'+ model_case_version_main_target_code +'P_Ratio',df['MPN'+ model_case_version_main_target_code +'P'].shift(int(model_case_version_main_target_code))/df['CPCP'].shift(int(model_case_version_main_target_code)))
# Each remaining ratio column is inserted next to the column it is derived from.
df.insert(df.columns.get_loc('HPN'+model_case_version_main_target_code+'P'),'HPN'+ model_case_version_main_target_code +'P_Ratio',df['HPN'+ model_case_version_main_target_code +'P']/df['CPCP'])
df.insert(df.columns.get_loc('LPN'+model_case_version_main_target_code+'P'),'LPN'+ model_case_version_main_target_code +'P_Ratio',df['LPN'+ model_case_version_main_target_code +'P']/df['CPCP'])
df.insert(df.columns.get_loc('hpn1p')+1,'hpn1p_Ratio',df['hpn1p']/df['CPCP'])
df.insert(df.columns.get_loc('lpn1p')+1,'lpn1p_Ratio',df['lpn1p']/df['CPCP'])

In [54]:
pd.set_option('display.max_columns',None)

In [55]:
df.head(10)

Unnamed: 0,DNCP,OPCP,HPCP,LPCP,CPCP,ACPCP,VTCP,OPCP_Ratio,HPCP_Ratio,LPCP_Ratio,ACPCP_Ratio,MPN5P_Ratio,MPN5P,MPN-1P,MPN-2P,MPN-3P,MPN-4P,MPN-5P,HPN5P_Ratio,HPN5P,LPN5P_Ratio,LPN5P,hpn1p,hpn1p_Ratio,lpn1p,lpn1p_Ratio
0,37956.0,1.203398,1.204007,1.194401,1.196501,1.196501,1,1.005764,1.006273,0.998244,1.0,,1.210903,0.0,0.0,0.0,0.0,0.0,1.022987,1.224005,0.998411,1.1946,1.210903,1.012036,1.1946,0.998411
1,37957.0,1.196101,1.210903,1.1946,1.208897,1.208897,1,0.989414,1.001659,0.988173,1.0,,1.214403,1.210903,0.0,0.0,0.0,0.0,1.015555,1.227702,0.996278,1.204398,1.213003,1.003396,1.2077,0.99901
2,37958.0,1.209,1.213003,1.2077,1.212298,1.212298,1,0.99728,1.000582,0.996208,1.0,,1.219096,1.214403,1.210903,0.0,0.0,0.0,1.012707,1.227702,0.993484,1.204398,1.214403,1.001737,1.204398,0.993484
3,37959.0,1.212004,1.214403,1.204398,1.208094,1.208094,1,1.003236,1.005222,0.996941,1.0,,1.219552,1.219096,1.214403,1.210903,0.0,0.0,1.01623,1.227702,0.998757,1.206593,1.219096,1.009107,1.206593,0.998757
4,37960.0,1.207802,1.219096,1.206593,1.218695,1.218695,1,0.991062,1.000329,0.99007,1.0,,1.222105,1.219552,1.219096,1.214403,1.210903,0.0,1.009771,1.230603,0.994751,1.212298,1.224005,1.004358,1.215407,0.997302
5,37963.0,1.216797,1.224005,1.215407,1.222001,1.222001,1,0.995741,1.00164,0.994604,1.0,1.012036,1.222703,1.222105,1.219552,1.219096,1.214403,1.210903,1.00859,1.232498,0.992059,1.212298,1.227702,1.004665,1.219795,0.998195
6,37964.0,1.222105,1.227702,1.219795,1.224995,1.224995,1,0.997641,1.00221,0.995755,1.0,1.004554,1.224905,1.222703,1.222105,1.219552,1.219096,1.214403,1.008898,1.235895,0.989635,1.212298,1.226603,1.001312,1.216205,0.992824
7,37965.0,1.224905,1.226603,1.216205,1.219096,1.219096,1,1.004765,1.006158,0.997628,1.0,1.005608,1.229997,1.224905,1.222703,1.222105,1.219552,1.219096,1.018791,1.242005,0.994423,1.212298,1.223496,1.003609,1.212298,0.994423
8,37966.0,1.219096,1.223496,1.212298,1.222404,1.222404,1,0.997294,1.000893,0.991732,1.0,1.009485,1.231998,1.229997,1.224905,1.222703,1.222105,1.219552,1.017829,1.244199,0.996237,1.217804,1.230603,1.006707,1.221299,0.999096
9,37967.0,1.222703,1.230603,1.221299,1.227898,1.227898,1,0.995769,1.002203,0.994626,1.0,1.002799,1.235895,1.231998,1.229997,1.224905,1.222703,1.222105,1.013276,1.244199,0.99178,1.217804,1.232498,1.003747,1.217804,0.99178


In [56]:
# Drop the first `horizon` rows: in those rows the lagged MPNxP_Ratio is NaN
# and the most-delayed MPN-kP column is still zero-padded.
# NOTE(review): `df` is rebound to a slice of itself, so re-running this cell
# removes another `horizon` rows each time.
df = df.iloc[int(model_case_version_main_target_code):]

In [57]:
df.tail(15)

Unnamed: 0,DNCP,OPCP,HPCP,LPCP,CPCP,ACPCP,VTCP,OPCP_Ratio,HPCP_Ratio,LPCP_Ratio,ACPCP_Ratio,MPN5P_Ratio,MPN5P,MPN-1P,MPN-2P,MPN-3P,MPN-4P,MPN-5P,HPN5P_Ratio,HPN5P,LPN5P_Ratio,LPN5P,hpn1p,hpn1p_Ratio,lpn1p,lpn1p_Ratio
5027,44993.0,1.055075,1.057328,1.052665,1.055075,1.055075,1,1.0,1.002136,0.997716,1.0,1.004622,1.068365,1.058649,1.05847,1.05847,1.059973,1.06262,1.019842,1.076009,0.997117,1.052034,1.058649,1.003388,1.053852,0.998841
5028,44994.0,1.054952,1.058649,1.053852,1.054952,1.054952,1,1.0,1.003504,0.998957,1.0,0.993513,1.068365,1.068365,1.058649,1.05847,1.05847,1.059973,1.01996,1.076009,0.997233,1.052034,1.069862,1.014133,1.057574,1.002485
5029,44995.0,1.05847,1.069862,1.057574,1.05847,1.05847,1,1.0,1.010763,0.999154,1.0,0.998582,1.067965,1.068365,1.068365,1.058649,1.05847,1.05847,1.016571,1.076009,0.993919,1.052034,1.073722,1.014409,1.065235,1.006391
5030,44998.0,1.068365,1.073722,1.065235,1.068365,1.068365,1,1.0,1.005014,0.997071,1.0,0.996094,1.067886,1.067965,1.068365,1.068365,1.058649,1.05847,1.007156,1.076009,0.984714,1.052034,1.074714,1.005943,1.067965,0.999626
5031,44999.0,1.072501,1.074714,1.067965,1.072501,1.072501,1,1.0,1.002063,0.995771,1.0,0.990472,1.067886,1.067886,1.067965,1.068365,1.068365,1.058649,1.005803,1.078725,0.980916,1.052034,1.076009,1.003271,1.052034,0.980916
5032,45000.0,1.072766,1.076009,1.052034,1.072766,1.072766,1,1.0,1.003024,0.980674,1.0,1.012596,1.067886,1.067886,1.067886,1.067965,1.068365,1.068365,1.006881,1.080147,0.983914,1.055509,1.063456,0.991322,1.055509,0.983914
5033,45001.0,1.058335,1.063456,1.055509,1.058335,1.058335,1,1.0,1.004839,0.99733,1.0,1.012713,1.073054,1.067886,1.067886,1.067886,1.067965,1.068365,1.032659,1.0929,1.002717,1.061211,1.066963,1.008152,1.061211,1.002717
5034,45002.0,1.061413,1.066963,1.061211,1.061413,1.061413,1,1.0,1.005228,0.999809,1.0,1.008971,1.077424,1.073054,1.067886,1.067886,1.067886,1.067965,1.029665,1.0929,1.001722,1.063242,1.073054,1.010967,1.063242,1.001722
5035,45005.0,1.067886,1.073054,1.063242,1.067886,1.067886,1,1.0,1.004839,0.995651,1.0,0.999552,1.077819,1.077424,1.073054,1.067886,1.067886,1.067886,1.023424,1.0929,1.002441,1.070492,1.078725,1.010151,1.070492,1.002441
5036,45006.0,1.071915,1.078725,1.070492,1.071915,1.071915,1,1.0,1.006354,0.998673,1.0,0.995696,1.079914,1.077819,1.077424,1.073054,1.067886,1.067886,1.019577,1.0929,1.000225,1.072156,1.080147,1.00768,1.075951,1.003766


In [58]:
targets

['MPN5P', 'MPN-1P', 'MPN-2P', 'MPN-3P', 'MPN-4P', 'MPN-5P']

In [59]:
for target in targets:
  avg_prices_list.append(df[target].astype(float).mean())

avg_prices_list

[1.2437507782876713,
 1.2437790077496527,
 1.2438067303573557,
 1.2438343623605321,
 1.2438610626186222,
 1.2438870680583682]

In [60]:
# dncp_train = dohlcav_mpnxp_data[train_mask]['DNCP_day_number_current_period'].replace(',','', regex=True)[int(model_case_version_time_steps)-1+5:]
# dncp_train

In [61]:
main_target_code_integer = int(model_case_version_main_target_code)

In [62]:
main_target_code_integer

5

In [63]:
# Compute the day-number positions used in the training section.
# The DNCP column of the training rows is taken, commas are stripped from its
# values (presumably thousands separators - TODO confirm), and the first
# (time_steps - 1 + horizon) entries are skipped.
dncp_train = dohlcav_mpnxp_data[train_mask]['DNCP_day_number_current_period'].replace(',','', regex=True)[int(model_case_version_time_steps)-1+main_target_code_integer:]
dncp_train= dncp_train.astype(int).to_numpy()
# Inclusive number of day numbers between the first and the last kept row.
span_dncp_train=dncp_train[-1] - dncp_train[0] + 1
# 1-based offset of every kept row from the first kept day number.
positions_day_number_train= dncp_train-dncp_train[0]+1
positions_day_number_train

array([   1,    4,    5, ..., 6943, 6944, 6945])

In [64]:
len(positions_day_number_train)

4961

In [65]:
# Compute the day-number positions used in the prediction section.
# Same procedure as for the training positions, but over every row of
# dohlcav_mpnxp_data (no train mask): strip commas from the DNCP values and
# skip the first (time_steps - 1 + horizon) entries.
dncp = dohlcav_mpnxp_data['DNCP_day_number_current_period'].replace(',','', regex=True)[int(model_case_version_time_steps)-1+main_target_code_integer:]
dncp = dncp.astype(int).to_numpy()
# Inclusive number of day numbers between the first and the last kept row.
span_dncp=dncp[-1] - dncp[0] + 1
# 1-based offset of every kept row from the first kept day number.
positions_day_number = dncp-dncp[0]+1
positions_day_number

array([   1,    4,    5, ..., 7036, 7038, 7039])

In [66]:
len(positions_day_number)

5028

In [67]:
'''The padding point value is calculated for computing the value at the end of the trend line. We'll see better during the training and the application of vertical padding '''
padding_point = positions_day_number[0]
padding_point

1

#### Preprocessing

In [68]:
df.head()

Unnamed: 0,DNCP,OPCP,HPCP,LPCP,CPCP,ACPCP,VTCP,OPCP_Ratio,HPCP_Ratio,LPCP_Ratio,ACPCP_Ratio,MPN5P_Ratio,MPN5P,MPN-1P,MPN-2P,MPN-3P,MPN-4P,MPN-5P,HPN5P_Ratio,HPN5P,LPN5P_Ratio,LPN5P,hpn1p,hpn1p_Ratio,lpn1p,lpn1p_Ratio
5,37963.0,1.216797,1.224005,1.215407,1.222001,1.222001,1,0.995741,1.00164,0.994604,1.0,1.012036,1.222703,1.222105,1.219552,1.219096,1.214403,1.210903,1.00859,1.232498,0.992059,1.212298,1.227702,1.004665,1.219795,0.998195
6,37964.0,1.222105,1.227702,1.219795,1.224995,1.224995,1,0.997641,1.00221,0.995755,1.0,1.004554,1.224905,1.222703,1.222105,1.219552,1.219096,1.214403,1.008898,1.235895,0.989635,1.212298,1.226603,1.001312,1.216205,0.992824
7,37965.0,1.224905,1.226603,1.216205,1.219096,1.219096,1,1.004765,1.006158,0.997628,1.0,1.005608,1.229997,1.224905,1.222703,1.222105,1.219552,1.219096,1.018791,1.242005,0.994423,1.212298,1.223496,1.003609,1.212298,0.994423
8,37966.0,1.219096,1.223496,1.212298,1.222404,1.222404,1,0.997294,1.000893,0.991732,1.0,1.009485,1.231998,1.229997,1.224905,1.222703,1.222105,1.219552,1.017829,1.244199,0.996237,1.217804,1.230603,1.006707,1.221299,0.999096
9,37967.0,1.222703,1.230603,1.221299,1.227898,1.227898,1,0.995769,1.002203,0.994626,1.0,1.002799,1.235895,1.231998,1.229997,1.224905,1.222703,1.222105,1.013276,1.244199,0.99178,1.217804,1.232498,1.003747,1.217804,0.99178


In [69]:
# Separate the training dataset; the pre-treatment is fitted on these rows only.
# Rows are selected by position, from `main_target_code_integer` up to and
# including `new_idx`.
# NOTE(review): `df` has already lost its first `horizon` rows at this point,
# so this slice skips a further `main_target_code_integer` rows - confirm that
# this is intended and that `new_idx` refers to positions in the trimmed frame.
train_dataset=df.iloc[main_target_code_integer:new_idx+1,:]

In [70]:
train_df_array=train_dataset.to_numpy(dtype='float64')
train_df_array

array([[3.79700000e+04, 1.21869470e+00, 1.23249850e+00, ...,
        1.00316381e+00, 1.23049660e+00, 9.98781735e-01],
       [3.79710000e+04, 1.23140570e+00, 1.23589530e+00, ...,
        1.00796124e+00, 1.22999720e+00, 9.98216516e-01],
       [3.79720000e+04, 1.23199750e+00, 1.24200460e+00, ...,
        1.00322246e+00, 1.23659840e+00, 9.97094023e-01],
       ...,
       [4.49230000e+04, 1.06371670e+00, 1.06742950e+00, ...,
        1.00310496e+00, 1.06123320e+00, 9.97665262e-01],
       [4.49240000e+04, 1.06292510e+00, 1.06701950e+00, ...,
        1.00646160e+00, 1.06392040e+00, 1.00093638e+00],
       [4.49250000e+04, 1.06607530e+00, 1.06979330e+00, ...,
        1.00484206e+00, 1.06532570e+00, 9.99296860e-01]])

In [71]:
train_df_array[:,:12]

array([[3.79700000e+04, 1.21869470e+00, 1.23249850e+00, ...,
        9.88479522e-01, 1.00000000e+00, 1.00057455e+00],
       [3.79710000e+04, 1.23140570e+00, 1.23589530e+00, ...,
        9.98621809e-01, 1.00000000e+00, 9.99926530e-01],
       [3.79720000e+04, 1.23199750e+00, 1.24200460e+00, ...,
        9.91771343e-01, 1.00000000e+00, 1.00894204e+00],
       ...,
       [4.49230000e+04, 1.06371670e+00, 1.06742950e+00, ...,
        9.97813516e-01, 1.00000000e+00, 1.00095737e+00],
       [4.49240000e+04, 1.06292510e+00, 1.06701950e+00, ...,
        9.98408260e-01, 1.00000000e+00, 1.00255295e+00],
       [4.49250000e+04, 1.06607530e+00, 1.06979330e+00, ...,
        9.97978661e-01, 1.00000000e+00, 1.00347928e+00]])

In [72]:
train_df_array=train_dataset.to_numpy(dtype='float64')
#application of logarithmic detrending
# The logarithm is only finite for strictly positive inputs, so the guard has
# to reject zeros as well as negatives: a single 0 would become -inf and then
# flow into the smoothing and scaling steps.
if not (train_df_array <= 0).any():
    print('logarithmic detrending applied')
    train_df_array = np.log(train_df_array)
else:
    # Make the skipped transformation visible instead of failing silently.
    print('logarithmic detrending skipped: non-positive values found')

logarithmic detrending applied


In [73]:
# True enables the Kalman smoothing step that is guarded by `if kalman_train:`.
kalman_train=True

In [74]:
# Training pre-processing, smoothing step. In this notebook the logarithm is
# applied before this cell and the RobustScaler after it, so the effective
# order is: logarithm -> Kalman smoothing -> RobustScaler.
'''Preprocessing consists, in this case, to transform the dataset through 3 filters: Kalman, E_logrithmic and RobustScaler (in this order)'''
if kalman_train: # only smooth when the flag is set
  '''Application of the Kalman filter (rounding data)'''
  # Smoother parameters are taken from an example shipped with the library.
  kalman_smoother=tsmoothie.KalmanSmoother(component='level_trend',  component_noise={'level':0.1, 'trend':0.1})
  temp_df=pd.DataFrame(train_df_array,columns=train_dataset.columns)
  # Every column of the training array (features and targets alike) is
  # smoothed on its own and written back into train_df_array in place.
  # NOTE(review): a smoother usually looks at observations on both sides of
  # each point - confirm this does not leak future values into the features.
  for i in range(len(temp_df.columns)):
    kalman_smoother.smooth(temp_df[temp_df.columns[i]])
    train_df_array[:,i] = kalman_smoother.smooth_data

In [75]:
train_df_array

array([[ 1.03068879e+01,  1.97220594e-01,  2.05136234e-01, ...,
         4.00581072e-03,  2.02534251e-01, -1.93558391e-03],
       [ 1.04084493e+01,  2.03127639e-01,  2.09931832e-01, ...,
         4.77034153e-03,  2.05898830e-01, -2.31433096e-03],
       [ 1.04792731e+01,  2.07819604e-01,  2.14128683e-01, ...,
         5.12086673e-03,  2.08842495e-01, -2.66780093e-03],
       ...,
       [ 1.07127029e+01,  6.21223063e-02,  6.47003942e-02, ...,
         3.65800869e-03,  6.05448266e-02, -1.57747964e-03],
       [ 1.07127292e+01,  6.24658613e-02,  6.55854873e-02, ...,
         4.56733665e-03,  6.16142537e-02, -8.51607595e-04],
       [ 1.07127552e+01,  6.30472136e-02,  6.66261360e-02, ...,
         5.24759461e-03,  6.27064600e-02, -3.40753671e-04]])

In [76]:
# Fit two separate RobustScalers on the training array: one for the features
# and one for the targets.
# Columns 0-11 are the features (DNCP ... ACPCP_Ratio, MPNxP_Ratio); columns
# 12 .. 12+horizon are the base target followed by its `horizon` lagged copies.
# NOTE(review): the index 12 is tied to the current column layout of `df`;
# it must be updated if columns are added or removed before the base target.
train_robust_scaler_features= RobustScaler().fit(train_df_array[:,:12])
train_robust_scaler_target=RobustScaler().fit(train_df_array[:,12:12+int(model_case_version_main_target_code)+1])

In [77]:
#performing transform on the features
train_df_features=train_robust_scaler_features.transform(train_df_array[:,:12])

In [78]:
train_df_features

array([[-3.87341469e+000, -8.44810445e-002, -5.65825428e-002, ...,
        -2.14292011e+000,  8.20781323e-047,  2.21076629e-001],
       [-2.66399305e+000, -4.81953214e-002, -2.71846869e-002, ...,
        -1.37675360e+000,  6.35072350e-047,  4.27780765e-001],
       [-1.82060370e+000, -1.93735805e-002, -1.45724788e-003, ...,
        -9.67755006e-001,  4.26425073e-047,  6.63692064e-001],
       ...,
       [ 9.59142224e-001, -9.14361063e-001, -9.17478925e-001, ...,
         4.06650831e-001,  0.00000000e+000,  1.72063788e-001],
       [ 9.59455847e-001, -9.12250677e-001, -9.12053149e-001, ...,
         5.48835619e-001,  0.00000000e+000,  2.81858979e-001],
       [ 9.59765057e-001, -9.08679554e-001, -9.05673789e-001, ...,
         6.60814489e-001, -4.94065646e-324,  3.92929508e-001]])

In [79]:
#performing transform on the target
train_df_target=train_robust_scaler_target.transform(train_df_array[:,12:12+int(model_case_version_main_target_code)+1])

In [80]:
train_df_array

array([[ 1.03068879e+01,  1.97220594e-01,  2.05136234e-01, ...,
         4.00581072e-03,  2.02534251e-01, -1.93558391e-03],
       [ 1.04084493e+01,  2.03127639e-01,  2.09931832e-01, ...,
         4.77034153e-03,  2.05898830e-01, -2.31433096e-03],
       [ 1.04792731e+01,  2.07819604e-01,  2.14128683e-01, ...,
         5.12086673e-03,  2.08842495e-01, -2.66780093e-03],
       ...,
       [ 1.07127029e+01,  6.21223063e-02,  6.47003942e-02, ...,
         3.65800869e-03,  6.05448266e-02, -1.57747964e-03],
       [ 1.07127292e+01,  6.24658613e-02,  6.55854873e-02, ...,
         4.56733665e-03,  6.16142537e-02, -8.51607595e-04],
       [ 1.07127552e+01,  6.30472136e-02,  6.66261360e-02, ...,
         5.24759461e-03,  6.27064600e-02, -3.40753671e-04]])

In [81]:
df_array=df.to_numpy(dtype='float64')

In [82]:
# Warn when the raw full dataset contains negative values.
# NOTE(review): this check runs BEFORE the logarithm is applied to df_array,
# so the wording "after the logarithmic detrending" looks inaccurate, and the
# 'MSFT' label is hard-coded rather than taken from the dataset - confirm.
if (df_array < 0).any():
  # df_array = no_kalman_df_array.copy()
  print('MSFT'+" has negative values after the logarithmic detrending")

In [83]:

# df_array = df_array[5:]
#applying logarithmic detrending to the prediction/full dataset
# The logarithm is only finite for strictly positive inputs, so zeros must be
# rejected together with negatives (log(0) is -inf).
# NOTE(review): this decision is taken independently of the one made for the
# training array, although the scalers fitted on the training array are later
# applied to df_array - confirm both datasets always take the same branch.
if not (df_array <= 0).any():
    df_array = np.log(df_array)
else:
    # Make the skipped transformation visible instead of failing silently.
    print('logarithmic detrending skipped on the full dataset: non-positive values found')


In [84]:
kalman_predict=True

In [85]:
# Smoothing step for the full (prediction) dataset; mirrors the training cell.
if kalman_predict:
  '''Preprocessing consists, in this case, to transform the dataset through 3 filters: Kalman, E_logrithmic and RobustScaler (in this order)'''

  '''Application of the Kalman filter (rounding data)'''
  # Smoother parameters are taken from an example shipped with the library.
  kalman_smoother=tsmoothie.KalmanSmoother(component='level_trend',  component_noise={'level':0.1, 'trend':0.1})
  temp_df=pd.DataFrame(df_array,columns=df.columns)
  # Every column of df_array is smoothed on its own and written back in place.
  # NOTE(review): the smoother is run over the whole dataset here, which
  # includes the rows after the training range - confirm this is acceptable
  # for the prediction section.
  for i in range(len(df.columns)):
    kalman_smoother.smooth(temp_df[temp_df.columns[i]])
    df_array[:,i] = kalman_smoother.smooth_data



In [None]:
#robust scaler transform on the features
prediction_df_features=train_robust_scaler_features.transform(df_array[:,:12])

In [None]:
prediction_df_features

array([[-3.87556079e+000, -1.10638913e-001, -1.03944217e-001, ...,
        -5.21089776e-001, -3.91689853e-048,  1.21000252e+000],
       [-2.66616022e+000, -9.08453207e-002, -8.49179549e-002, ...,
        -6.50984280e-001,  2.28465493e-048,  9.86974767e-001],
       [-1.82278554e+000, -7.64195818e-002, -7.06099685e-002, ...,
        -8.37562127e-001,  9.14846698e-048,  8.35514937e-001],
       ...,
       [ 9.82056326e-001, -8.17959107e-001, -8.22570692e-001, ...,
         6.20711085e-001,  0.00000000e+000,  1.17031841e+000],
       [ 9.82405157e-001, -8.15807207e-001, -8.25033177e-001, ...,
         6.56770313e-001,  0.00000000e+000,  1.27206758e+000],
       [ 9.82746326e-001, -8.12706190e-001, -8.26507833e-001, ...,
         7.62865737e-001, -4.94065646e-324,  1.34233359e+000]])

In [None]:
#robust scaler transform on the target
prediction_df_targets=train_robust_scaler_target.transform(df_array[:,12:12+int(model_case_version_main_target_code)+1])

In [None]:
print(prediction_df_features)

[[-3.87556079e+000 -1.10638913e-001 -1.03944217e-001 ... -5.21089776e-001
  -3.91689853e-048  1.21000252e+000]
 [-2.66616022e+000 -9.08453207e-002 -8.49179549e-002 ... -6.50984280e-001
   2.28465493e-048  9.86974767e-001]
 [-1.82278554e+000 -7.64195818e-002 -7.06099685e-002 ... -8.37562127e-001
   9.14846698e-048  8.35514937e-001]
 ...
 [ 9.82056326e-001 -8.17959107e-001 -8.22570692e-001 ...  6.20711085e-001
   0.00000000e+000  1.17031841e+000]
 [ 9.82405157e-001 -8.15807207e-001 -8.25033177e-001 ...  6.56770313e-001
   0.00000000e+000  1.27206758e+000]
 [ 9.82746326e-001 -8.12706190e-001 -8.26507833e-001 ...  7.62865737e-001
  -4.94065646e-324  1.34233359e+000]]


#### Splitting the Data

In [None]:
'''Creating the input blocks for the models. The timestep value must be changed according to the targets and dataset we are working with'''
X_train, y_train =building_data_sequences(train_df_features,train_df_target,timesteps=int(model_case_version_time_steps))
X_predict,y_predict=building_data_sequences(prediction_df_features,prediction_df_targets,timesteps=int(model_case_version_time_steps))#see Functions section above;
#X_test, y_test = building_data_sequences(test_data, timesteps=int(model_case_version_time_steps))

In [None]:
print(X_train.shape)
print(X_predict.shape)

(4961, 10, 12)
(5028, 10, 12)


In [None]:
print(y_train)

[array([[ 0.10344583,  0.08736546,  0.07389774,  0.06335437,  0.05515523,
         0.04838234],
       [ 0.11932   ,  0.10266089,  0.0871957 ,  0.07367948,  0.06293948,
         0.05493976],
       [ 0.1352367 ,  0.11860877,  0.10252804,  0.08704415,  0.07329629,
         0.06279993],
       ...,
       [-0.90624631, -0.90380514, -0.90524089, -0.90799982, -0.91200005,
        -0.91446701],
       [-0.91076115, -0.90328647, -0.90189224, -0.90464628, -0.90998749,
        -0.9120118 ],
       [-0.91662558, -0.90351469, -0.89924035, -0.90054386, -0.90813625,
        -0.90961947]])]


#### Configuration parameters

In [None]:
'''In this section, we're going to define some variables that will be useful during the training and testing of the model'''

input_shape=((X_train).shape[1],(X_train).shape[2])
print("Input shape obtained is:",input_shape)

Input shape obtained is: (10, 12)


In [None]:
'''We need actual values (true values) to compute some analytical parameters during the training'''

df_actual = df.iloc[int(model_case_version_time_steps)-1:,:].reset_index(drop=True)
print(df_actual)

         DNCP      OPCP      HPCP      LPCP      CPCP     ACPCP  VTCP  \
0     37974.0  1.242205  1.243503  1.235102  1.238497  1.238497     1   
1     37977.0  1.235697  1.244601  1.235697  1.239895  1.239895     1   
2     37978.0  1.239895  1.267202  1.238804  1.240002  1.240002     1   
3     37979.0  1.240095  1.247194  1.240002  1.245299  1.245299     1   
4     37980.0  1.245702  1.247007  1.244307  1.244803  1.244803     1   
...       ...       ...       ...       ...       ...       ...   ...   
5023  45007.0  1.077424  1.080147  1.075951  1.077424  1.077424     1   
5024  45008.0  1.087039  1.092900  1.086862  1.087039  1.087039     1   
5025  45009.0  1.083776  1.084246  1.072156  1.076426  1.076426     1   
5026  45011.0  1.077819  1.079600  1.074807  1.077819  1.077819     1   
5027  45012.0  1.080030  1.081081  1.079914  1.080964  1.080964     1   

      OPCP_Ratio  HPCP_Ratio  LPCP_Ratio  ACPCP_Ratio  MPN5P_Ratio     MPN5P  \
0       1.002994    1.004041    0.997258   

In [None]:
actuals_cols = [] #simply put the targets columns into a list (in order to be iterated during training)

for target in targets:
  actuals_cols.append(np.array(df_actual[target].astype(float)))

actuals_cols

[array([1.2443073 , 1.244803  , 1.24545492, ..., 1.0799136 , 1.0799136 ,
        1.0799136 ]),
 array([1.2422051, 1.2443073, 1.244803 , ..., 1.0799136, 1.0799136,
        1.0799136]),
 array([1.2400948, 1.2422051, 1.2443073, ..., 1.0799136, 1.0799136,
        1.0799136]),
 array([1.2398949, 1.2400948, 1.2422051, ..., 1.0799136, 1.0799136,
        1.0799136]),
 array([1.2384974, 1.2398949, 1.2400948, ..., 1.0778185, 1.0799136,
        1.0799136]),
 array([1.2358953, 1.2384974, 1.2398949, ..., 1.0774237, 1.0778185,
        1.0799136])]

In [None]:
'''this indicates the index from which we start to replace the actual target with the predicted target to be used in the prediction section'''
stop_actual=stop_target-int(model_case_version_time_steps)+1-main_target_code_integer

In [None]:
best_models_path = model_path + '/best_models/' #this path is used just to make everything more orderly

#### Training

In [None]:
# Training rows from position (time_steps - 1) up to and including new_idx,
# re-indexed from 0.
df_cpcp_train=df.iloc[(int(model_case_version_time_steps)-1):new_idx+1,:].reset_index(drop=True)
# One-element list holding the CPCP column of those rows as a float array.
CPCP_cols_train=[np.array(df_cpcp_train['CPCP'].astype(float))]

In [None]:
best_prediction_run=0
best_correction_run=0

In [None]:
prediction_runs=100
correction_runs=1

In [None]:
past_corr_train_stat=True
past_corr_pred_stat=True

In [None]:
apply_correction_lstm=True
apply_vertical_swing_corr=True

In [None]:
'''This is the custom loss function we defined in order to apply a sort of padding during the training of the model. It consists in a normal mean squared error loss fnuction except for the application of
a multiplicative factor on the prediction of the model'''

def custom_loss_function(attenuated_padding_value):
  """Build a mean-squared-error loss that rescales the predictions first.

  The returned callable multiplies `y_pred` by `attenuated_padding_value`,
  then returns the mean of the squared differences to `y_true` over the last
  axis.
  """

  def padding_loss_function(y_true, y_pred):
    # Rescale the predictions by the padding factor before measuring the error.
    scaled_pred = tf.multiply(y_pred, attenuated_padding_value)
    # Mean squared error over the last axis.
    return tf.reduce_mean(tf.square(y_true - scaled_pred), axis=-1)

  return padding_loss_function

intercepts_and_slopes = {'intercepts': [],
                         'slopes':[],
                         'end_intercepts':[]
                         }

In [None]:
'''This is about a part that must be inserted in the analytical paramters file and which is referred to the weights that are used to compute the compound_run_term (see later in the code)'''

# One-row table of the weighting exponents, laid out on the same 23 columns
# (0-22) that are named in the rename below.
# Columns 0-8 are the nine raw (non-normalized) parameters and carry no weight,
# so NINE leading None entries are required. With only eight, every exponent
# would be labelled with the name of the preceding column and column 22 would
# never be created (rename silently ignores a label that is not present).
weights_df = pd.DataFrame([None, None, None, None, None, None, None, None, None,  # columns 0-8: raw parameters, no weight
                           slope_weighting_exponent_ratio, intercept_weighting_exponent_ratio, end_intercept_weighting_exponent_ratio, rsqr_weighting_exponent_ratio, dispersion_weighting_exponent_ratio,  # columns 9-13: ratio-versus-period exponents
                           slope_weighting_exponent_predicted_actual, intercept_weighting_exponent_predicted_actual, rsqr_weighting_exponent_predicted_actual, dispersion_weighting_exponent_predicted_actual,  # columns 14-17: predicted-versus-calculated exponents
                           None, None, None, None, None]).T  # columns 18-22: run-level values, no weight

# The single row is labelled 'weights'.
weights_df=weights_df.rename({0:'weights'})
weights_df=weights_df.rename(columns={0:'slope_predicted_calculated_target_ratio_versus_period',
                           1: 'intercept_predicted_calculated_target_ratio_versus_period',
                           2: 'end_intercept_predicted_calculated_target_ratio_versus_period',
                           3: 'rsqr_predicted_calculated_target_ratio_versus_period',
                           4: 'average_tld_predicted_calculated_target_ratio_versus_period',
                           5: 'slope_predicted_versus_calculated_target',
                           6: 'intercept_predicted_versus_calculated_target',
                           7: 'rsqr_predicted_versus_calculated_target',
                           8: 'average_tld_predicted_versus_calculated_target',
                           9: 'normalized_slope_predicted_calculated_target_ratio_versus_period',
                           10: 'normalized_intercept_predicted_calculated_target_ratio_versus_period',
                           11: 'normalized_end_intercept_predicted_calculated_target_ratio_versus_period',
                           12: 'normalized_rsqr_predicted_calculated_target_ratio_versus_period',
                           13: 'rescaled_normalized_average_tld_predicted_calculated_target_ratio_versus_period',
                           14: 'normalized_slope_predicted_versus_calculated_target',
                           15: 'rescaled_normalized_intercept_predicted_versus_calculated_target',
                           16: 'normalized_rsqr_predicted_versus_calculated_target',
                           17: 'rescaled_normalized_average_tld_predicted_versus_calculated_target',
                           18: 'compound_run_term',
                           19: 'best_run',
                           20: 'adjusted_compound_run_term',
                           21: 'padding_correction_factor',
                           22: 'padding_correction_factor_attenuation'
                           })

In [None]:
weights_df.columns

Index(['slope_predicted_calculated_target_ratio_versus_period',
       'intercept_predicted_calculated_target_ratio_versus_period',
       'end_intercept_predicted_calculated_target_ratio_versus_period',
       'rsqr_predicted_calculated_target_ratio_versus_period',
       'average_tld_predicted_calculated_target_ratio_versus_period',
       'slope_predicted_versus_calculated_target',
       'intercept_predicted_versus_calculated_target',
       'rsqr_predicted_versus_calculated_target',
       'average_tld_predicted_versus_calculated_target',
       'normalized_slope_predicted_calculated_target_ratio_versus_period',
       'normalized_intercept_predicted_calculated_target_ratio_versus_period',
       'normalized_end_intercept_predicted_calculated_target_ratio_versus_period',
       'normalized_rsqr_predicted_calculated_target_ratio_versus_period',
       'rescaled_normalized_average_tld_predicted_calculated_target_ratio_versus_period',
       'normalized_slope_predicted_versus_calcula

In [None]:
'''The creation of the new analytical parameters output file '''

# One-row table (row label 'weights') with 14 positional values that are
# renamed to the 14 column names below.
# NOTE(review): positions 0-4 hold the *_weighting_exponent_ratio values but
# are labelled 'normalized_trend_*', while positions 5-9 are labelled
# '*_weighting_exponent' and hold the *_predicted_actual exponents - confirm
# this pairing is intended.
# NOTE(review): position 7 reuses end_intercept_weighting_exponent_ratio,
# the same value as position 2 - confirm.
new_weights_df = pd.DataFrame([slope_weighting_exponent_ratio, intercept_weighting_exponent_ratio,
                               end_intercept_weighting_exponent_ratio, rsqr_weighting_exponent_ratio,
                               dispersion_weighting_exponent_ratio, slope_weighting_exponent_predicted_actual,
                               intercept_weighting_exponent_predicted_actual, end_intercept_weighting_exponent_ratio,
                               rsqr_weighting_exponent_predicted_actual, dispersion_weighting_exponent_predicted_actual,
                               None, None, None, None]).T
new_weights_df = new_weights_df.rename({0:'weights'})
new_weights_df = new_weights_df.rename(columns={0:'normalized_trend_slope',
                               1: 'normalized_trend_intercept',
                               2: 'normalized_trend_end_intercept',
                               3: 'normalized_trend_rsqr',
                               4: 'rescaled_normalized_trend_dispersion',
                               5: 'trend_slope_weighting_exponent',
                               6: 'trend_intercept_weighting_exponent',
                               7: 'trend_end_intercept_weighting_exponent',
                               8: 'trend_rsqr_weighting_exponent',
                               9: 'trend_dispersion_weighting_exponent',
                               10: 'adjusted_compound_run_value',
                               11: 'number_of_runs_performed',
                               12: 'best_run_number',
                               13: 'best_model_.h5_file_link'
                               })

In [None]:
#this prepare the correction lstm table consisting of three columns:period_day_number, raw_predicted_targets,actual
def preprare_correction_lstm_table(period_day_number,raw_predicted_targets,volume,actual):
  """Build the five-column table used by the correction LSTM.

  With h = int(model_case_version_main_target_code) and
  ratio = actual / raw_predicted_targets, the returned DataFrame has the
  columns, in this order:

    period_day_number          period_day_number[h:]
    raw_predicted_targets      raw_predicted_targets[h:]
    new_feature                ratio without its last h entries (ratio lagged by h)
    volume                     volume[h:]
    actual_raw_predited_ratio  ratio[h:]

  The inputs are sliced positionally and are expected to be equally long
  one-dimensional sequences (presumably NumPy arrays - TODO confirm callers).
  """
  horizon = int(model_case_version_main_target_code)
  ratio = actual/raw_predicted_targets
  # `ratio[:-horizon]` would be empty for horizon == 0, so the end of the
  # lagged slice is computed explicitly (and never allowed to go negative).
  lagged_end = max(len(ratio) - horizon, 0)
  correction_lstm_table=pd.DataFrame()
  correction_lstm_table['period_day_number']=period_day_number[horizon:]
  correction_lstm_table['raw_predicted_targets']=raw_predicted_targets[horizon:]
  correction_lstm_table['new_feature'] = ratio[:lagged_end]
  correction_lstm_table['volume'] = volume[horizon:]
  correction_lstm_table['actual_raw_predited_ratio']=ratio[horizon:]
  return correction_lstm_table

In [None]:
# Builds the five-column correction-LSTM table (period_day_number,
# raw_predicted_targets, new_feature, volume, actual_raw_predited_ratio) and
# scales it with already-fitted scalers.
def preprare_correction_lstm_table_predict(period_day_number,raw_predicted_targets,volume,actual,lstm_scaler_feat,lstm_scaler_target):
  """Build and scale the correction-LSTM table for prediction.

  The table layout matches ``preprare_correction_lstm_table_train``: the first
  four columns are features and the fifth (``actual_raw_predited_ratio``) is
  the target. The horizon is read from the global
  ``model_case_version_main_target_code``.

  Args:
    period_day_number: sequence of period day numbers.
    raw_predicted_targets: sequence of predicted targets (the divisor of the ratio).
    volume: sequence of volumes.
    actual: sequence of actual targets (the dividend of the ratio).
    lstm_scaler_feat: fitted scaler exposing ``transform`` for the four feature columns.
    lstm_scaler_target: fitted scaler exposing ``transform`` for the single target column.

  Returns:
    2-D numpy array: the four scaled feature columns followed by the scaled
    target column.
  """
  # NOTE(review): columns are filled by assignment, so pandas Series inputs are
  # aligned on their index labels rather than by position - confirm what
  # callers pass.
  # NOTE(review): the train variant additionally drops the first `horizon` rows
  # of the table before scaling; that is not done here - confirm whether the
  # prediction path needs the same trimming.
  horizon = int(model_case_version_main_target_code)
  ratio = (actual/raw_predicted_targets)

  correction_lstm_table=pd.DataFrame()
  correction_lstm_table['period_day_number']=period_day_number[horizon:]
  correction_lstm_table['raw_predicted_targets']=raw_predicted_targets[horizon:]
  # Explicit stop index instead of [:-horizon]: with horizon == 0 the negative
  # form becomes [:0] (empty) and the assignment would fail.
  correction_lstm_table['new_feature'] = ratio[:len(ratio)-horizon]
  correction_lstm_table['volume'] = volume[horizon:]
  correction_lstm_table['actual_raw_predited_ratio']=ratio[horizon:]

  table_values=correction_lstm_table.to_numpy()
  # Same column split as the train variant, whose scalers are fitted on four
  # feature columns and on column 4 as target. The previous [:, :2] / [:, 2]
  # split predates the new_feature and volume columns.
  train_df_lstm_features=lstm_scaler_feat.transform(table_values[:,:4])
  train_df_lstm_target=lstm_scaler_target.transform(table_values[:,4].reshape(-1,1))
  final_correction_lstm=np.concatenate((train_df_lstm_features,train_df_lstm_target),axis=1)
  return final_correction_lstm

In [None]:
# Builds the five-column correction-LSTM table (period_day_number,
# raw_predicted_targets, new_feature, volume, actual_raw_predited_ratio),
# fits the scalers on it and returns the scaled table.
def preprare_correction_lstm_table_train(period_day_number, raw_predicted_targets,volume,actual):
  """Build the correction-LSTM training table and fit its scalers.

  The horizon is read from the global ``model_case_version_main_target_code``
  and the window length from the global ``model_case_version_time_steps``.
  Every input is cut to its entries from position ``horizon`` onwards, except
  ``new_feature``, which holds the actual/raw-predicted ratio lagged by
  ``horizon`` positions. After conversion to numpy the first ``horizon`` rows
  of the table are dropped before the scalers are fitted.

  Args:
    period_day_number: sequence of period day numbers.
    raw_predicted_targets: sequence of predicted targets (the divisor of the ratio).
    volume: sequence of volumes.
    actual: sequence of actual targets (the dividend of the ratio).

  Returns:
    Tuple ``(scaled_table, feature_scaler, target_scaler, unscaled_ratio_tail)``:
      scaled_table: 2-D numpy array, four scaled feature columns followed by
        the scaled target column.
      feature_scaler: RobustScaler fitted on the four feature columns.
      target_scaler: RobustScaler fitted on the target column.
      unscaled_ratio_tail: the ``actual_raw_predited_ratio`` column of the
        untrimmed table from position ``time_steps - 1`` onwards, as numpy.

  Side effects:
    Prints the unscaled table and resets pandas' ``display.max_rows`` option.
  """
  # NOTE(review): columns are filled by assignment, so pandas Series inputs are
  # aligned on their index labels rather than by position - confirm what
  # callers pass.
  horizon = int(model_case_version_main_target_code)
  ratio = (actual/raw_predicted_targets)

  correction_lstm_table_temp=pd.DataFrame()
  correction_lstm_table_temp['period_day_number']=period_day_number[horizon:]
  correction_lstm_table_temp['raw_predicted_targets']=raw_predicted_targets[horizon:]
  # Explicit stop index instead of [:-horizon]: with horizon == 0 the negative
  # form becomes [:0] (empty) and the assignment would fail.
  correction_lstm_table_temp['new_feature'] = ratio[:len(ratio)-horizon]
  correction_lstm_table_temp['volume'] = volume[horizon:]
  correction_lstm_table_temp['actual_raw_predited_ratio']=ratio[horizon:]
  print(correction_lstm_table_temp)
  # NOTE(review): nothing in this function changes display.max_rows any more,
  # so this reset only affects a value set elsewhere - confirm it is still wanted.
  pd.reset_option('display.max_rows')

  # Drop the first `horizon` rows, then split into features (columns 0-3) and
  # target (column 4).
  correction_lstm_table=correction_lstm_table_temp.to_numpy()[horizon:]
  feature_values=correction_lstm_table[:,:4]
  target_values=correction_lstm_table[:,4].reshape(-1,1)

  robust_scaler_LSTM_features=RobustScaler().fit(feature_values)
  robust_scaler_LSTM_target=RobustScaler().fit(target_values)
  final_correction_lstm=np.concatenate(
      (robust_scaler_LSTM_features.transform(feature_values),
       robust_scaler_LSTM_target.transform(target_values)),
      axis=1)

  unscaled_ratio_tail=correction_lstm_table_temp['actual_raw_predited_ratio'][int(model_case_version_time_steps)-1:].to_numpy()
  return final_correction_lstm,robust_scaler_LSTM_features,robust_scaler_LSTM_target,unscaled_ratio_tail

In [None]:
# Creates the data sequences (time windows) for the correction LSTM.
def correction_data_sequences(data, timesteps):
    """Slice ``data`` into overlapping windows of ``timesteps`` consecutive rows.

    Each window takes columns 0-3 of its rows as features; its target is the
    value in column 4 of the window's last row.

    Args:
        data: 2-D array with at least five columns.
        timesteps: number of rows (e.g. days) in each window.

    Returns:
        Tuple ``(X, y)`` of numpy arrays holding one feature window and one
        target per window start, in order. Both are empty when ``data`` has
        fewer than ``timesteps`` rows.
    """
    window_starts = range(len(data) - timesteps + 1)
    feature_windows = [data[start:start + timesteps, :4] for start in window_starts]
    window_targets = [data[start + timesteps - 1, 4] for start in window_starts]
    return np.array(feature_windows), np.array(window_targets)

In [None]:
def _annotate_trend_parameters(x_value,y_value):
    """Write the sir_parameters of (x_value, y_value) in the right margin of the current figure."""
    trend_slope,trend_intercept,trend_r2,dispersion=sir_parameters(x_value,y_value)
    plt.figtext(.92,.85,['trend_slope:',round(trend_slope,4)])
    plt.figtext(.92,.80,['trend_intercept:',round(trend_intercept,4)])
    plt.figtext(.92,.75,['trend_r2:',round(trend_r2,4)])
    # The standard deviation shown is that of the x series.
    plt.figtext(.92,.70,['trend_standard_deviation:',round(np.std(x_value),4)])
    plt.figtext(.92,.65,['trend_dispersion:',round(dispersion,4)])


def plot_me(scatter_x_value,scatter_y_value,scatter_x_label,scatter_y_label,
            ratio_x_value,ratio_y_value,ratio_x_label,ratio_y_label,
            comparison_x_value,comparison_y_value,comparison_x_label,comparison_y_label,
            legend_fontsize=13,series_line_style='dotted',trend_line_style='solid',series_line_width=2,trend_line_width=2):
    """Draw three figures: a scatter plot, a ratio plot and a comparison plot.

    1. Scatter of ``scatter_y_value`` against ``scatter_x_value`` with a
       first-degree polynomial trend line.
    2. ``ratio_y_value`` drawn as a series, with a first-degree trend line
       fitted against the positions 0..n-1.
    3. ``comparison_x_value`` and ``comparison_y_value`` drawn as two series.

    Each figure is annotated with the values returned by ``sir_parameters``
    for its x/y pair.

    Reads the module-level globals ``figure_size``, ``plot_title``,
    ``series_dot_color``, ``series_trend_color`` and ``dates_num``; they must
    be set before calling (``plot_target`` does this).

    Args:
        scatter_x_value, scatter_y_value: data of the scatter plot.
        scatter_x_label, scatter_y_label: legend entries of the scatter plot.
        ratio_x_value, ratio_y_value: data of the ratio plot; ``ratio_x_value``
            is used for the annotations and for the number of positions.
        ratio_x_label, ratio_y_label: legend entries of the ratio plot.
        comparison_x_value, comparison_y_value: the two compared series.
        comparison_x_label, comparison_y_label: legend entries of the comparison plot.
        legend_fontsize: font size of all legends.
        series_line_style: line style of the ratio series.
        trend_line_style: currently unused.
        series_line_width: line width of the trend lines.
        trend_line_width: currently unused.
    """
    # --- Figure 1: scatter with linear trend line ---
    plt.figure(figsize=figure_size[0])
    z=(np.polyfit(scatter_x_value,scatter_y_value,1))
    p=np.poly1d(z)
    plt.scatter(scatter_x_value,scatter_y_value,color=series_dot_color)
    plt.plot(scatter_x_value,p(scatter_x_value),color=series_trend_color,linewidth=series_line_width)
    plt.legend([scatter_x_label,scatter_y_label], fontsize=legend_fontsize)
    plt.title(plot_title)
    _annotate_trend_parameters(scatter_x_value,scatter_y_value)
    plt.show()

    # --- Figure 2: ratio series with linear trend line ---
    plt.figure(figsize=figure_size[1])
    # Positions 0..n-1. np.linspace(0, n, n, dtype=int) was used before, but it
    # includes n and truncates the fractional steps (n=5 gives 0,1,2,3,5), so
    # the trend line was fitted and drawn on skewed x positions.
    a=np.arange(len(ratio_x_value),dtype=np.int32)
    z=np.polyfit(a,ratio_y_value,1)
    p=np.poly1d(z)
    plt.plot(ratio_y_value,linestyle=series_line_style)
    plt.plot(a,p(a),linewidth=series_line_width)
    _annotate_trend_parameters(ratio_x_value,ratio_y_value)
    plt.xticks(range(0,len(dates_num)), dates_num)
    plt.locator_params(axis='x', nbins=4)
    plt.legend([ratio_x_label,ratio_y_label], fontsize=legend_fontsize)
    plt.title(plot_title)
    plt.show()

    # --- Figure 3: the two series side by side ---
    plt.figure(figsize=figure_size[1])
    plt.plot(comparison_x_value)
    plt.plot(comparison_y_value)
    plt.xticks(range(0,len(dates_num)), dates_num)
    plt.locator_params(axis='x', nbins=10)
    plt.legend([comparison_x_label,comparison_y_label], fontsize=legend_fontsize)
    plt.title(plot_title)
    _annotate_trend_parameters(comparison_x_value,comparison_y_value)
    plt.show()

In [None]:
def plot_target(scatter_x_value,scatter_y_value,scatter_x_label,scatter_y_label,
                ratio_x_value,ratio_y_value,ratio_x_label,ratio_y_label,
                comparison_x_value,comparison_y_value,comparison_x_label,comparison_y_label):
  """Set the global plot settings read by ``plot_me`` and draw the three plots.

  Assigns the module-level globals ``figure_size``, ``plot_title``,
  ``series_dot_color``, ``series_trend_color`` and ``dates_num``, then passes
  all twelve arguments unchanged to ``plot_me``.

  Reads the globals ``dates`` and ``multiple_run_predictions``: ``dates_num``
  is ``dates`` cut to the length of ``multiple_run_predictions``.
  """
  global figure_size,plot_title,series_dot_color,series_trend_color,dates_num

  # Two random RGB triples, drawn in this order: dots first, trend line second.
  series_dot_color=np.random.rand(3)
  series_trend_color=np.random.rand(3)

  # Sizes of the scatter figure and of the ratio/comparison figures.
  figure_size=[(10,10),(15,8)]
  plot_title='test'
  dates_num=dates[:len(multiple_run_predictions)]

  plot_me(
      scatter_x_value,scatter_y_value,scatter_x_label,scatter_y_label,
      ratio_x_value,ratio_y_value,ratio_x_label,ratio_y_label,
      comparison_x_value,comparison_y_value,comparison_x_label,comparison_y_label,
  )

In [None]:
# List the GPU devices TensorFlow reports and print them, to check whether
# a GPU is available in this session.
physical_devices = tf.config.list_physical_devices('GPU')

print(physical_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
'''This is the core part, where the model is defined and trained. In particular, we use an approach we call 'Multiple Run': we repeat the training of our model a certain number of times,
until we obtain the best model (according to a quality value) among all the runs. In more detail: we train a model and compute its compound_run_term (the value used to evaluate the quality
of the model itself). At the beginning, the first model we obtain is trivially the best one. In the next iteration, we train another model (with the same parameters, layers, etc.) and compute
the compound_run_term for this second model as well (to assess its quality). If the compound_run_term of the second model is better ('better' here means closer to 1) than that of the best model obtained so far (the first one), then the second
model becomes the new best model. We proceed in this way until either we reach the maximum number of iterations or we obtain a compound_run_term with the required precision. This approach was introduced because of an instability problem
that affects our model.'''

after_training_predictions = []

# intercepts_and_slopes = {'intercepts': [],
#                          'slopes':[],
#                          'end_intercepts':[]
#                          }

for target in range(len(targets)): #for each target

  '''This is the initialization of the analytical parameters file. Analytical parameters are values referring to two graphs (the ratio between predicted and actual, and actual vs. predicted) that we use to assess the quality of the predictions.
  In particular, in the file we also store the normalized/rescaled version of the parameters, because they are used to compute the compound_run_term (see later in the code), as well as other information such as the number of the best iteration
  (best_run), the padding values and so on. '''

  analytical_parameters = {
    'slope_predicted_calculated_target_ratio_versus_period':[], #slope of the trend line obtained by considering the ratio between predicted values and actual values.
    'intercept_predicted_calculated_target_ratio_versus_period':[], #intercept of the trend line obtained by considering the ratio between predicted values and actual values.
    'end_intercept_predicted_calculated_target_ratio_versus_period':[],
    'rsqr_predicted_calculated_target_ratio_versus_period':[], #r2 of the trend line obtained by considering the ratio between predicted values and actual values.
    'average_tld_predicted_calculated_target_ratio_versus_period':[], #average trend line distance of the trend line obtained by considering the ratio between predicted values and actual values.

    'slope_predicted_versus_calculated_target':[], #slope of the trend line obtained by considering the actual values as x and the predicted values as y.
    'intercept_predicted_versus_calculated_target':[], #intercept of the trend line obtained by considering the actual values as x and the predicted values as y.
    'rsqr_predicted_versus_calculated_target':[], #r2 of the trend line obtained by considering the actual values as x and the predicted values as y.
    'average_tld_predicted_versus_calculated_target':[], #average trend line distance of the trend line obtained by considering the actual values as x and the predicted values as y.

    'normalized_slope_predicted_calculated_target_ratio_versus_period': [],
    'normalized_intercept_predicted_calculated_target_ratio_versus_period':[],
    'normalized_end_intercept_predicted_calculated_target_ratio_versus_period':[],
    'normalized_rsqr_predicted_calculated_target_ratio_versus_period': [],
    'rescaled_normalized_average_tld_predicted_calculated_target_ratio_versus_period': [],

    'normalized_slope_predicted_versus_calculated_target': [],
    'rescaled_normalized_intercept_predicted_versus_calculated_target': [],
    'normalized_rsqr_predicted_versus_calculated_target': [],
    'rescaled_normalized_average_tld_predicted_versus_calculated_target': [],

    'compound_run_term': [],
    'best_run': [],
    'adjusted_compound_run_term':[],
    'padding_correction_factor':[],
    'padding_correction_factor_attenuation':[],
    'absolute_difference_normalized_intercept': [],
    'vertical_padding_correction_factor':[],
    'train_vertical_trend_slope':[]
    }
  
  new_analytical_parameters = {
    'normalized_trend_slope':[],
    'normalized_trend_intercept':[],
    'normalized_trend_end_intercept':[],
    'normalized_trend_rsqr':[],
    'rescaled_normalized_trend_dispersion':[],

    'trend_slope_weighting_exponent':[],
    'trend_intercept_weighting_exponent':[],
    'trend_end_intercept_weighting_exponent':[],
    'trend_rsqr_weighting_exponent':[],
    'trend_dispersion_weighting_exponent':[],

    'adjusted_compound_run_value':[],
    'number_of_runs_performed': [],
    'best_run_number':[],
    'best_model_.h5_file_link':[],
    'best_correction_model_.h5_file_link':[]

    }


  '''Inzialization of some values'''
  y_target = y_train #the current target we're considering
  diff = 1e3 #this is used to determine whether the current best model is still the best or not.
  best_run = -1 #this is used to take trace of the best iteration so far
  adjusted_compound_run_term = 1e3 #it is initialized with a very big value just to iterate at least one time the while loop
  iteration = 0 #counter for iterations
  attenuated_padding_value = 1 #initial value for the attenuating padding value
  best_model_adjusted_compound_run_term=0#initial value of the best_model adjusted compound run term
  # last_best_model_run_number=0# initial value for the last best model run number
  # current_best_model_run_number=0#initial value for the current best model run number
  last_best_model_path_h5='dummy.h5'#initial dummy value for the last best model path
  last_best_correction_model_path_h5='correction.h5'


  '''As long as the compound_run_term has not reached the precision we want or the maximum number of iterations is not reached, the code inside the while loop is executed.'''
  max_iterations=max(prediction_runs,correction_runs) if prediction_runs!=0 or correction_runs!=0 else 1
  while abs(adjusted_compound_run_term - 1.0) >= precision and iteration < max_iterations:


    print('Iteration', iteration, 'for target',targets[target])
    print('Attenuated padding value', attenuated_padding_value)

    if prediction_runs==0:
      # default_path_prediction=(best_models_path+targets[target]+'/proceedit '+today+' SPP-'+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Best model for '+targets[target]+'run_'+str(best_prediction_run)+'.h5')
      default_path_prediction=(best_models_path+targets[target]+'/proceedit 20231230 SPP-AAPL_20000103-20221130_MPN5P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-11013L0.75_ME - Best model for MPN5Prun_0.h5')
      # default_path_prediction=(best_models_path+targets[target]+'/proceedit 20230826 SPP-^GSPC_20200102-20230118_MPN5P_LSTM-15710760-B64E100L0.0005T8-DOHLCAV-FTEKR_LVSP_LSTM-15710760-B64E100L0.0005T8-PR_11011L0.75_AH - Best model for MPN7Prun_0.h5') #TSLA
      prediction_model=load_model(default_path_prediction,compile=False)
      print(default_path_prediction)
    else:
      if  iteration < prediction_runs:
        print(y_target)
        prediction_model= tf.keras.Sequential()

        '''The layers of the model (see case_version_cat Tab)'''

        prediction_model.add(LSTM(2**twoexp_nodes_number_layer_1,input_shape=input_shape,return_sequences=True))
        prediction_model.add(LSTM(2**twoexp_nodes_number_layer_2, return_sequences=True))
        prediction_model.add(LSTM(2**twoexp_nodes_number_layer_3))
        prediction_model.add(Dense(2**twoexp_nodes_number_layer_4))
        prediction_model.add(Dense(int(model_case_version_main_target_code)+1))

        prediction_model.compile(optimizer = optimizers.Adam(learning_rate = lr), loss = custom_loss_function(attenuated_padding_value))

        '''Training of the model'''
        prediction_model.fit(X_train, y_target, batch_size=batch, epochs=100, verbose=2)

    '''Computation of the predictions'''
    train_predictions=prediction_model.predict(X_train) #predictions for training data
    print(train_predictions)
    train_predictions = train_robust_scaler_target.inverse_transform(train_predictions) #convert prediction first by inverting the Robust scaler transformation and then the e_logarithmic one.
    train_predictions = np.exp(train_predictions)
    train_prediction = [a[:1] for a in train_predictions]
    print(train_predictions)
    train_array = np.concatenate(train_prediction)
    # train_array=train_array*CPCP_cols_train[0]
    multiple_run_predictions = train_array
    actual = actuals_cols[0][:len(multiple_run_predictions)]
    pred_actual_vs_date_ratio=train_array/ actual
    _, _, _, pred_actual_vs_date_ratio_dispersion= sir_parameters(positions_day_number_train,pred_actual_vs_date_ratio)
    print("pred_actual_vs_date_ratio_dispersion",pred_actual_vs_date_ratio_dispersion)
    _, _, _, pred_vs_actual_dispersion= sir_parameters(actual,train_array)
    print("pred_actual_dispersion",pred_vs_actual_dispersion)
    # plt.figure(figsize=(10,10))
    # a, b = np.polyfit(actual,train_array, 1)
    # plt.scatter(actual, train_array, color='purple')
    # plt.plot(actual, a*actual+b,color='steelblue', linewidth=2)
    # plt.title('predicted_vs_actual_plot')
    # plt.show()

    # print("Plot Function")
    # plot_target(actual,train_array,"Actual","Predicted",actual,pred_actual_vs_date_ratio,"actual","ratios",actual,train_array,"actual","predictions")


    '''implementation of the correction phases for the LSTM corrected_predicted_targets'''
    '''PHASE 1 CORRECTION'''
    #vertical_correction
    raw_predicted_targets=train_array
    train_raw_targets_ratio=raw_predicted_targets/actual
    # '''plot of train_raw_targets_ratios against period'''
    # x = np.linspace(0,len(raw_predicted_targets), len(raw_predicted_targets), dtype= np.int32)
    # z = np.polyfit(x,  train_raw_targets_ratio, 1)
    # p = np.poly1d(z)
    # plt.figure(figsize=(10,10))
    # plt.plot(train_raw_targets_ratio, linestyle = 'dotted')
    # plt.plot(x, p(x), linewidth = 2.5)
    # plt.title('train_raw_targets_ratio')
    # plt.show()

    if apply_vertical_swing_corr:

      train_raw_trend_slope, train_raw_trend_intercept, _, train_raw_trend_dispersion= sir_parameters(positions_day_number_train,train_raw_targets_ratio)
      print('train_raw_trend_slope:',train_raw_trend_slope)
      print('train_raw_trend_intercept:',train_raw_trend_intercept)
      print('train_raw_trend_dispersion',train_raw_trend_dispersion)
      vertical_padding_correction_factor = 1/train_raw_trend_intercept
      print("vertical_padding_correction_factor:",vertical_padding_correction_factor)
      corrected_train_raw_targets_ratios = train_raw_targets_ratio * vertical_padding_correction_factor
      print("corrected_train_raw_targets_ratios", corrected_train_raw_targets_ratios)
      train_vertical_corrected_targets = corrected_train_raw_targets_ratios * actual#this is the formula for vertical padding
      print("back_vertical_corrected_targets", train_vertical_corrected_targets)
      # '''plot of corrected_train_raw_targets_ratios against period'''
      # x = np.linspace(0,len(train_vertical_corrected_targets), len(train_vertical_corrected_targets), dtype= np.int32)
      # z = np.polyfit(x, corrected_train_raw_targets_ratios, 1) #changed from ratio_array to vertical_ratio_array
      # p = np.poly1d(z)
      # plt.figure(figsize=(10,10))
      # plt.plot(corrected_train_raw_targets_ratios, linestyle = 'dotted')
      # plt.plot(x, p(x), linewidth = 2.5)
      # plt.title('corrected_train_raw_targets_ratios')
      # plt.show()

      '''swing_padding_correction'''
      swing_padding_correction_factors = []
      train_vertical_targets_ratios=train_vertical_corrected_targets/actual
      train_vertical_trend_slope, train_vertical_trend_intercept, _, train_vertical_trend_dispersion= sir_parameters(positions_day_number_train, train_vertical_targets_ratios)

      for i in positions_day_number_train:
        swing_padding_correction_factor = 1 / (1 + i * train_vertical_trend_slope)
        swing_padding_correction_factors.append(swing_padding_correction_factor)

      print('swing_padding_correction_factors',swing_padding_correction_factors)
      print('period_day_numbers', positions_day_number_train)

      swing_targets_ratios = train_raw_targets_ratio* vertical_padding_correction_factor*swing_padding_correction_factors
      swing_predicted_targets = swing_targets_ratios * actual

      # '''plot of swing_targets_ratios against period'''
      # x = np.linspace(0,len(swing_predicted_targets), len(swing_predicted_targets), dtype= np.int32)
      # z = np.polyfit(x, swing_targets_ratios, 1) #changed from ratio_array to vertical_ratio_array
      # p = np.poly1d(z)
      # plt.figure(figsize=(10,10))
      # plt.plot(swing_targets_ratios, linestyle = 'dotted')
      # plt.plot(x, p(x), linewidth = 2.5)
      # plt.title('swing_targets_ratios')
      # plt.show()
    else:
      swing_predicted_targets=raw_predicted_targets

    '''implementation of correction lstm'''
    if True:
      volume = df_actual['VTCP']
      correction_lstm_table,lstm_robust_feat,lstm_robust_target,actual_correct=preprare_correction_lstm_table_train(positions_day_number_train,swing_predicted_targets,volume,actual)
      correction_train_data=correction_lstm_table
      X_correction_train, y_correction_train =correction_data_sequences(correction_train_data, int(model_case_version_time_steps))
      print(correction_train_data)
      # sys.exit()
      correction_input_shape=((X_correction_train).shape[1],(X_correction_train).shape[2])
      print(correction_input_shape)
      # sys.exit()
      if correction_runs==0:
        # default_path_correction=(best_models_path+targets[target]+'/proceedit '+today+' SPP-'+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Best model for '+targets[target]+'run_'+str(best_correction_run)+'correction_model.h5')
        default_path_correction=(best_models_path+targets[target]+'/proceedit 20230827 SPP-TSLA_20200102-20230531_MPN5P_LSTM-15710760-B64E100L0.0005T8-DOHLCAV-FTEKR_LVSP_LSTM-15710760-B64E100L0.0005T8-PR_11011L0.75_DM - Best model for MPN5Prun_5correction_model.h5') #TSLA
        correction_model=load_model(default_path_correction,compile=False)
      else:
        if  iteration < correction_runs:
          '''correction lstm implementation'''
          correction_model= tf.keras.Sequential()
          correction_model.add(LSTM(2**twoexp_nodes_number_layer_1,input_shape=correction_input_shape,return_sequences=True))
          correction_model.add(LSTM(2**twoexp_nodes_number_layer_2, return_sequences=True))
          correction_model.add(LSTM(2**twoexp_nodes_number_layer_3))
          correction_model.add(Dense(2**twoexp_nodes_number_layer_4))
          correction_model.add(Dense(2**twoexp_nodes_number_layer_5))
          correction_model.compile(optimizer = optimizers.Adam(learning_rate =correction_lr), loss = custom_loss_function(attenuated_padding_value))
          correction_model.fit(X_correction_train, y_correction_train, batch_size=correction_batch, epochs=100, verbose=2)

      LSTM_corrections=correction_model.predict(X_correction_train)
      print("raw_LSTM_corrections",LSTM_corrections)
      LSTM_corrections=lstm_robust_target.inverse_transform(LSTM_corrections.reshape(-1,1))
      print("transformed_LSTM_corrections",LSTM_corrections)
      LSTM_corrections=LSTM_corrections.reshape(1,-1)[0]
      # actual_corrections= correction_lstm_table['actual_raw_predited_ratio'][int(model_case_version_time_steps)-1:].to_numpy()
      compare_corrections=lstm_robust_target.inverse_transform(correction_lstm_table[:,2][int(model_case_version_time_steps)-1 + int(model_case_version_main_target_code):].reshape(1,-1))
      actual_corrections=actual_correct
      # raw_predicted_targets=train_array
      corrected_predicted_targets=np.multiply(swing_predicted_targets[int(model_case_version_time_steps)-1 + (2*int(model_case_version_main_target_code)):],LSTM_corrections)
      print("LSTM_corrections",LSTM_corrections)
      print("raw_predicted_targets_for_LSTM",swing_predicted_targets[int(model_case_version_time_steps)-1 + (2*int(model_case_version_main_target_code)):])
      print("corrected_predicted_targets",corrected_predicted_targets)
      _, _, _, correction_dispersion= sir_parameters(actual_corrections[int(model_case_version_main_target_code):],LSTM_corrections)
      print("pred_vs_actual_correction_dispersion",correction_dispersion)
      # plt.figure(figsize=(10,10))
      # a, b = np.polyfit(actual_corrections,LSTM_corrections, 1)
      # plt.scatter(actual_corrections, LSTM_corrections, color='purple')
      # plt.plot(actual_corrections, a*actual_corrections+b,color='steelblue', linewidth=2)
      # plt.title('predicted_vs_actual_correction_plot')
      # plt.show()

      period_day_number_temp=positions_day_number_train[int(model_case_version_time_steps)-1 + (2*int(model_case_version_main_target_code)):]
      period_day_number=period_day_number_temp-period_day_number_temp[0]+1
      actual_moving_average=actual[int(model_case_version_time_steps)-1 + (2*int(model_case_version_main_target_code)):]
    else:
      period_day_number=positions_day_number_train
      corrected_predicted_targets=swing_predicted_targets
      actual_moving_average=actual

    if past_corr_train_stat:
      ''' Implementation of last padding correction'''
      last_padding_correction_factors = []
      last_padding_correction_factors.append(1)
      last_padding_correction_factors.append(1)
      last_padding_correction_factors.append(1)
      last_padding_correction_factors.append(1)
      # last_padding_correction_factors.append(1)
      # last_padding_correction_factors.append(1)
      for i in range(4,len(period_day_number)):
        last_padding_correction_factors.append(((actual_moving_average[i-1]+actual_moving_average[i-2]+actual_moving_average[i-3]+actual_moving_average[i-4])/4)/((corrected_predicted_targets[i-1]+corrected_predicted_targets[i-2]+corrected_predicted_targets[i-3]+corrected_predicted_targets[i-4])/4))

      print(len(last_padding_correction_factors))
      # initial_last_ratio=swing_targets_ratios
      # initial_last_ratio=LSTM_corrections
      initial_last_ratio=corrected_predicted_targets/actual_moving_average

      last_targets_ratios = initial_last_ratio * last_padding_correction_factors
      last_slope,last_start_intercept,_,last_dispersion=sir_parameters(period_day_number,last_targets_ratios)
      last_predicted_targets = last_targets_ratios*actual_moving_average
      print('last_padding_correction_factors:',last_padding_correction_factors)
      print('last_targets_ratios:',last_targets_ratios)
      print('last_slope:', last_slope)
      print('last_start_intercept:',last_start_intercept)
      print('last_dispersion:',last_dispersion)
      print('last_predicted_targets:',last_predicted_targets)
      print()

      # '''plot of swing_targets_ratios against period'''
      # x = np.linspace(0,len(last_predicted_targets), len(last_predicted_targets), dtype= np.int32)
      # z = np.polyfit(x,last_targets_ratios, 1) #changed from ratio_array to vertical_ratio_array
      # p = np.poly1d(z)
      # plt.figure(figsize=(10,10))
      # plt.plot(last_targets_ratios, linestyle = 'dotted')
      # plt.plot(x, p(x), linewidth = 2.5)
      # plt.title('last_targets_ratio')
      # plt.show()
    else:
      last_predicted_targets=corrected_predicted_targets

    '''Computation of the analytical parameter values '''
    predicted_actual_ratio = (last_predicted_targets /actual_moving_average) #compute the ratio between predicted values and actual values

    slope_first, intercept_first, r_squared_first, avg_tld_first = sir_parameters(period_day_number, predicted_actual_ratio) # computation of the analytical parameters values for the ratio graph (see Functions section above for more details)
    slope_second, intercept_second, r_squared_second, avg_tld_second = sir_parameters(actual_moving_average, last_predicted_targets) # computation of the analytical parameters values for the actual vs predicted graph (see Functions section above for more details)
    # end_intercept_first = slope_first * padding_point + intercept_first
    end_intercept_first = slope_first * positions_day_number_train[-1] + intercept_first##CONFIRM THESE

    if end_intercept_first < 0:    # end_intercept shouldn't be negative condition
       end_intercept_first = 0.1

    print(' Predicted actual ratio', predicted_actual_ratio) #Added by me just to check the ratio
    print(' Intercept', intercept_first)
    print(' Slope', slope_first)
    print(' End Intercept', end_intercept_first)
    print(' Postion day number',padding_point)

    '''Just adding each value to the correspondent list'''
    analytical_parameters['slope_predicted_calculated_target_ratio_versus_period'].append(slope_first)
    analytical_parameters['intercept_predicted_calculated_target_ratio_versus_period'].append(intercept_first)
    analytical_parameters['end_intercept_predicted_calculated_target_ratio_versus_period'].append(end_intercept_first)
    analytical_parameters['rsqr_predicted_calculated_target_ratio_versus_period'].append(r_squared_first)
    analytical_parameters['average_tld_predicted_calculated_target_ratio_versus_period'].append(avg_tld_first)
    analytical_parameters['slope_predicted_versus_calculated_target'].append(slope_second)
    analytical_parameters['intercept_predicted_versus_calculated_target'].append(intercept_second)
    analytical_parameters['rsqr_predicted_versus_calculated_target'].append(r_squared_second)
    analytical_parameters['average_tld_predicted_versus_calculated_target'].append(avg_tld_second)

    '''Now we normalize/rescale the analytical parameter values according to our reference value (that is, 1). This is why we add 1 to some values or divide by the average value of that specific target.
     Afterwards, we put the values in the corresponding lists. '''
    norm_slope_ratio = slope_first+1
    norm_intercept_ratio = intercept_first #I added this line
    norm_end_intercept_ratio = end_intercept_first #I added this line
    norm_rsqr_ratio = r_squared_first+1
    resc_norm_avg_tld_ratio = (avg_tld_first)+1 #here you can see the avg_price_list that we computeed at the beginning of the code
    resc_norm_inter_predact = (intercept_second)+1
    resc_norm_avg_tld_predact = (avg_tld_second)+1
    # resc_norm_avg_tld_ratio = (avg_tld_first/avg_prices_list[target])+1 #here you can see the avg_price_list that we computeed at the beginning of the code
    # resc_norm_inter_predact = (intercept_second/avg_prices_list[target])+1
    # resc_norm_avg_tld_predact = (avg_tld_second/avg_prices_list[target])+1

    analytical_parameters['normalized_slope_predicted_calculated_target_ratio_versus_period'].append(norm_slope_ratio)
    analytical_parameters['normalized_intercept_predicted_calculated_target_ratio_versus_period'].append(norm_intercept_ratio) #I changed this line
    analytical_parameters['normalized_end_intercept_predicted_calculated_target_ratio_versus_period'].append(norm_end_intercept_ratio) #I changed this line
    analytical_parameters['normalized_rsqr_predicted_calculated_target_ratio_versus_period'].append(norm_rsqr_ratio)
    analytical_parameters['rescaled_normalized_average_tld_predicted_calculated_target_ratio_versus_period'].append(resc_norm_avg_tld_ratio)
    analytical_parameters['normalized_slope_predicted_versus_calculated_target'].append(slope_second)
    analytical_parameters['rescaled_normalized_intercept_predicted_versus_calculated_target'].append(resc_norm_inter_predact)
    analytical_parameters['normalized_rsqr_predicted_versus_calculated_target'].append(r_squared_second)
    analytical_parameters['rescaled_normalized_average_tld_predicted_versus_calculated_target'].append(resc_norm_avg_tld_predact)
    analytical_parameters['vertical_padding_correction_factor'].append(vertical_padding_correction_factor)
    analytical_parameters['train_vertical_trend_slope'].append(train_vertical_trend_slope)
    #Here we append the new analytical parameters
    new_analytical_parameters['normalized_trend_slope'].append(norm_slope_ratio)
    new_analytical_parameters['normalized_trend_intercept'].append(norm_intercept_ratio)
    new_analytical_parameters['normalized_trend_end_intercept'].append(norm_end_intercept_ratio)
    new_analytical_parameters['normalized_trend_rsqr'].append(norm_rsqr_ratio)
    new_analytical_parameters['rescaled_normalized_trend_dispersion'].append(resc_norm_avg_tld_ratio)

    new_analytical_parameters['trend_slope_weighting_exponent'].append(slope_weighting_exponent_ratio)
    new_analytical_parameters['trend_intercept_weighting_exponent'].append(intercept_weighting_exponent_ratio)
    new_analytical_parameters['trend_end_intercept_weighting_exponent'].append(end_intercept_weighting_exponent_ratio)
    new_analytical_parameters['trend_rsqr_weighting_exponent'].append(rsqr_weighting_exponent_ratio)
    new_analytical_parameters['trend_dispersion_weighting_exponent'].append(dispersion_weighting_exponent_ratio)

    '''Here, there's the computation of the compound_run_term. The idea is, in fact, to multiply all the normalized\rescaled analytical parameters among themselves. In addition to that, each value is weighted with a certain exponent.
    The best model is considered as the one whose compound_run_term is the closest to our rederence value (that is 1)'''
    compound_run_term = (norm_slope_ratio)**slope_weighting_exponent_ratio * (norm_intercept_ratio)**intercept_weighting_exponent_ratio * (norm_end_intercept_ratio)**end_intercept_weighting_exponent_ratio * (norm_rsqr_ratio)**rsqr_weighting_exponent_ratio * (resc_norm_avg_tld_ratio)**dispersion_weighting_exponent_ratio * (slope_second)**(slope_weighting_exponent_predicted_actual) * (resc_norm_inter_predact)**(intercept_weighting_exponent_predicted_actual) * (r_squared_second)**(rsqr_weighting_exponent_predicted_actual) * (resc_norm_avg_tld_predact)**(dispersion_weighting_exponent_predicted_actual)

    analytical_parameters['compound_run_term'].append(compound_run_term)

    '''The adjusted_compound_run term is the version of the compound_run_term that can be used for comparing compound run terms of different cases. Remember to change them according to the weights you're working with'''
    #adjusted_compound_run_term = (compound_run_term)**(8/(slope_weighting_exponent_ratio+intercept_weighting_exponent_ratio+rsqr_weighting_exponent_ratio+dispersion_weighting_exponent_ratio+slope_weighting_exponent_predicted_actual+intercept_weighting_exponent_predicted_actual+rsqr_weighting_exponent_predicted_actual+dispersion_weighting_exponent_predicted_actual))
    adjusted_compound_run_term = (compound_run_term)
    print(' Adjusted Compound run term:', adjusted_compound_run_term)

    analytical_parameters['adjusted_compound_run_term'].append(adjusted_compound_run_term)
    new_analytical_parameters['adjusted_compound_run_value'].append(adjusted_compound_run_term)
    new_analytical_parameters['number_of_runs_performed'].append(max_iterations)

    '''This is te if branch in which the best model is determined'''
    if abs(adjusted_compound_run_term - 1) < diff: #if the compound_run_term of the current model is nearer to one than the compound_run_term of the best model, then we have a new best model
      # last_best_model_run_number=current_best_model_run_number
      # current_best_model_run_number=iteration
      analytical_parameters['best_run'].append(iteration) #store the best run so far
      new_analytical_parameters['best_run_number'].append(iteration)
      diff = abs(adjusted_compound_run_term - 1) #store the new difference (for successive comparisons)
      best_run = iteration

      #change the value for the best_run (in such a way it works also if we are in the else branch)
      if prediction_runs==0:
        current_best_model_path_h5=default_path_prediction
      else:
        best_prediction_run=iteration
        if os.path.exists(last_best_model_path_h5):#checks the exitence of the last best saved model
          os.remove(last_best_model_path_h5)#deletes the previous best saved model
        prediction_model.save(best_models_path+targets[target]+'/proceedit '+today+' SPP-'+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Best model for '+targets[target]+'run_'+str(iteration)+'.h5') #save the best model
        prediction_model.save_weights(best_models_path+targets[target]+'/proceedit '+today+' SPP-'+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Best model for '+targets[target]+'run_'+str(iteration)+'weights.h5')
        current_best_model_path_h5=(best_models_path+targets[target]+'/proceedit '+today+' SPP-'+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Best model for '+targets[target]+'run_'+str(iteration)+'.h5')
      if correction_runs==0:
        current_best_correction_model_path_h5="No"
      else:
        best_correction_run=iteration
        if os.path.exists(last_best_correction_model_path_h5):#checks the exitence of the last best saved model
          os.remove(last_best_correction_model_path_h5)#deletes the previous best saved model
        correction_model.save(best_models_path+targets[target]+'/proceedit '+today+' SPP-'+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Best model for '+targets[target]+'run_'+str(iteration)+'correction_model.h5') #save the best model
        correction_model.save_weights(best_models_path+targets[target]+'/proceedit '+today+' SPP-'+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Best model for '+targets[target]+'run_'+str(iteration)+'correction_model_weights.h5') #save the best model
        current_best_correction_model_path_h5=(best_models_path+targets[target]+'/proceedit '+today+' SPP-'+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Best model for '+targets[target]+'run_'+str(iteration)+'correction_model.h5')
      # model.save(best_models_path+targets[target]+'/proceedit '+today+' SPP-'+case+'_200001-202207_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Best model for '+targets[target]+'.h5') #save the best model
      # path_h5=(best_models_path+targets[target]+'/proceedit '+today+' SPP-'+case+'_200001-202207_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Best model for '+targets[target]+'.h5')
      after_training_predictions = []
      after_training_predictions.append(last_predicted_targets)
      new_analytical_parameters['best_model_.h5_file_link'].append(current_best_model_path_h5)
      new_analytical_parameters['best_correction_model_.h5_file_link'].append(current_best_correction_model_path_h5)

      intercepts_and_slopes['intercepts'].clear()
      intercepts_and_slopes['intercepts'].append(intercept_first)
      intercepts_and_slopes['slopes'].clear()
      intercepts_and_slopes['slopes'].append(slope_first)
      intercepts_and_slopes['end_intercepts'].clear()
      intercepts_and_slopes['end_intercepts'].append(end_intercept_first)
      best_model_intercept_first = intercept_first   #save the the intercept for the best model
      best_model_slope_first = slope_first     #save the the slope for the best model
      best_model_end_intercept_first = end_intercept_first #save the the end intercept for the best model
      best_model_adjusted_compound_run_term=adjusted_compound_run_term
      print(' Predictions during training:', multiple_run_predictions)    #print the predictions during the training of every run

      # analytical_parameters_updated['normalized_trend_slope'].append(norm_slope_ratio)
      # analytical_parameters_updated['normalized_trend_intercept'].append(norm_intercept_ratio)
      # analytical_parameters_updated['normalized_trend_end_intercept'].append(norm_end_intercept_ratio)
      # analytical_parameters_updated['normalized_trend_correlation'].append(norm_rsqr_ratio)
      # analytical_parameters_updated['rescaled_normalized_trend_dispersion'].append(resc_norm_avg_tld_ratio)
      # analytical_parameters_updated['trend_slope_weighting_exponent'].append(slope_second)
      # analytical_parameters_updated['trend_intercept_weighting_exponent'].append(resc_norm_inter_predact)
      # analytical_parameters_updated['trend_end_intercept_weighting_exponent'].append("")
      # analytical_parameters_updated['trend_correlation_weighting_exponent'].append(r_squared_second)
      # analytical_parameters_updated['trend_dispersion_weighting_exponent'].append(resc_norm_avg_tld_predact)

    else: #otherwise, we simply take trace of the best_run so far
      analytical_parameters['best_run'].append(best_run)
      new_analytical_parameters['best_run_number'].append(best_run)
      new_analytical_parameters['best_model_.h5_file_link'].append(current_best_model_path_h5)
      new_analytical_parameters['best_correction_model_.h5_file_link'].append(current_best_correction_model_path_h5)

    last_best_model_path_h5=current_best_model_path_h5
    last_best_correction_model_path_h5=current_best_correction_model_path_h5
    print(' Best model so far: ', best_run)
    print(' Best Model Adjusted compound run term:', best_model_adjusted_compound_run_term) #Just to print the best model compound run term
    print(' Best Model Intercept:', best_model_intercept_first)
    print(' Best model Slope:', best_model_slope_first)
    print(' Best model end Intercept:', best_model_end_intercept_first)
    #print(' Predictions during training:', multiple_run_predictions)


    '''The following block of lines is related to the way we compute the attenuated_padding_value (for the custom loss function). In fact, according to the position of this block, we can have two approaches:

          - if the block is inside the if branch (the one where the best model is determined), then we are in the B approach: the attenuated_padding_value is computed according to the best model obtained so far
          - if the block is outside the if branch (the one where the best model is determined), then we are in the L approach: the attenuated_padding_value is computed according to the last model obtained so far

    So in order to switch from an approach to the other, you can just cut the following 5 lines of code and paste them in the position you desire.'''

    intercept = intercept_first
    slope = slope_first
    padding_ratio_value = slope * padding_point + intercept #here we have the padding point we computed at the beginning
    padding_value = 1/padding_ratio_value
    attenuated_padding_value = ((padding_value-1)*attenuation_factor)+1 #formula for computing the attenuated_padding_value.. we have eliminated the -1 after paddinf_value
    print(' padding_ratio_value',padding_ratio_value) #I added these print here to check the calculation before the next iteration
    print(' padding_value',padding_value) #I added these print here to check the calculation before the next iteration
    print(' Attenuated padding value at end',attenuated_padding_value) #I added these print here to check the calculation before the next iteration

    if ((attenuated_padding_value< 0.8) or (attenuated_padding_value> 1.2)): #the attenuated_padding_value shouldn't be negative and shouldn't exceed 1
        attenuated_padding_value = 1

    print(' Attenuated padding value at end after the limit',attenuated_padding_value)
    print()


    '''Appending the values referred to the padding'''
    analytical_parameters['padding_correction_factor'].append(padding_value)
    analytical_parameters['padding_correction_factor_attenuation'].append(attenuated_padding_value)

    analytical_parameters['absolute_difference_normalized_intercept'].append(abs(intercept_first-1)) #this value is used for certain graphs (I don't know if it is still relevant for our analysis)

    '''Here , we store the analytical parameters file inside the loop (and not at the end when we have all the results) because in this way we can monitor step by step where we are. Of course, the file is overwritten everytime .'''
    analytical_params_df = pd.DataFrame(analytical_parameters)
    analytical_params_df.to_csv(best_models_path+targets[target]+'/proceedit '+today+' '+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_'+str(prediction_runs)+'_'+str(correction_runs)+'_DM - Analytical parameters.csv')
    new_analytical_params_df = pd.DataFrame(new_analytical_parameters)
    new_analytical_params_df.to_csv(best_models_path+targets[target]+'/proceedit '+today+' '+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_'+str(prediction_runs)+'_'+str(correction_runs)+'_DM - New Analytical parameters.csv')
    iteration += 1
  '''
  fit = np.polyfit(np.arange(0, analytical_params_df['absolute_difference_normalized_intercept'].shape[0],1), np.log(analytical_params_df['absolute_difference_normalized_intercept']),1) #this value is used for certain graphs (I don't know if it is still relevant for our analysis.
  alpha = fit[0]
  beta = fit[1]
  exponent_points = np.exp(beta + alpha*np.arange(0, analytical_params_df['absolute_difference_normalized_intercept'].shape[0],1))
  analytical_params_df['exponent_points'] = exponent_points'''

  '''Computation of slope and intercept for the best model predcitions'''
  last_model_predicted_actual_ratio = (last_predicted_targets/actual_moving_average)
  #best_model_predicted_actual_ratio = upload_predictions / actual
  last_model_slope_first, last_model_intercept_first, last_model_r_squared_first, last_model_avg_tld_first = sir_parameters(period_day_number, last_model_predicted_actual_ratio)
  #print('Last Model Intercept:', last_model_intercept_first)
  #print('Last model Slope:', last_model_slope_first)
  print('Best Model Intercept:', best_model_intercept_first)
  print('Best model Slope:', best_model_slope_first)
  print('Best model End Intercept:', best_model_end_intercept_first)
  print('The Predictions:', last_predicted_targets)

  '''Saving the final analytical parameters file for a specific target'''
  analytical_params_df = pd.concat([weights_df, analytical_params_df])
  analytical_params_df.to_csv(best_models_path+targets[target]+'/proceedit '+today+' '+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_'+str(prediction_runs)+'_'+str(correction_runs)+'_DM - Analytical parameters.csv')
  new_analytical_params_df = pd.concat([new_weights_df, new_analytical_params_df])
  new_analytical_params_df.to_csv(best_models_path+targets[target]+'/proceedit '+today+' '+case+'_20000103-20221130_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_'+str(prediction_runs)+'_'+str(correction_runs)+'_DM - New Analytical parameters.csv')
  break
  # analytical_parameters_updated_df = pd.DataFrame(analytical_parameters_updated)
  # analytical_parameters_updated_df.to_excel(best_models_path+targets[target]+'/proceedit '+today+' '+case+'_200001-202207_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_ME - Updated Names Analytical parameters.xlsx')

Iteration 0 for target MPN5P
Attenuated padding value 1
[array([[ 0.10344583,  0.08736546,  0.07389774,  0.06335437,  0.05515523,
         0.04838234],
       [ 0.11932   ,  0.10266089,  0.0871957 ,  0.07367948,  0.06293948,
         0.05493976],
       [ 0.1352367 ,  0.11860877,  0.10252804,  0.08704415,  0.07329629,
         0.06279993],
       ...,
       [-0.90624631, -0.90380514, -0.90524089, -0.90799982, -0.91200005,
        -0.91446701],
       [-0.91076115, -0.90328647, -0.90189224, -0.90464628, -0.90998749,
        -0.9120118 ],
       [-0.91662558, -0.90351469, -0.89924035, -0.90054386, -0.90813625,
        -0.90961947]])]


2024-01-08 08:13:58.099616: W external/local_tsl/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 16.00MiB (rounded to 16777216)requested by op StatelessRandomNormalV2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-01-08 08:13:58.099698: I external/local_tsl/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2024-01-08 08:13:58.099712: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 26, Chunks in use: 26. 6.5KiB allocated for chunks. 6.5KiB in use in bin. 408B client-requested in use in bin.
2024-01-08 08:13:58.099721: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 1, Chunks in use: 0. 512B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2024-01-08 08:13:58.099728

ResourceExhaustedError: {{function_node __wrapped__StatelessRandomNormalV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[4096,1024] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:StatelessRandomNormalV2] name: 

In [None]:
def curve_fit_exponential_regression(dates,yields,dates_num,yield_mode,pred_type):
    """Fit ``factor * exp(exponent * t)`` to the yields and return the mean 30-day yield.

    The fit is done with ``scipy.optimize.curve_fit``; its starting point comes
    from a degree-1 ``np.polyfit`` of ``log(yields)`` against ``dates_num``.
    The raw points and the fitted curve are drawn on a new 10x10 figure.

    The fitted curve is then evaluated on every integer day number from
    ``dates_num[0]`` to ``dates_num[-1]`` inclusive. Each value is divided by
    the value 30 positions earlier; the first 31 positions are divided by the
    first value instead. These ratios are handed to
    ``plot_polynomial_3_regression`` together with one calendar day per day
    between ``dates[0]`` and ``dates[-1]``.

    Args:
        dates: x values used for plotting; the first and last entries are
            parsed with ``pd.to_datetime``.
        yields: observed values to fit (passed through ``np.log``).
        dates_num: numeric day numbers, one per observation.
        yield_mode: text inserted into the plot titles.
        pred_type: text inserted into the plot titles.

    Returns:
        ``np.mean`` of the 30-day yield ratios.
    """
    def exponential_model(t, amplitude, rate):
        return amplitude * np.exp(rate * t)

    print("day numbers",dates_num)
    print("yields",yields)

    # A straight-line fit in log space supplies the starting point for curve_fit.
    log_slope, log_intercept = np.polyfit(dates_num, np.log(yields), 1)
    starting_point = (np.exp(log_intercept), log_slope)
    popt, pcov = curve_fit(exponential_model, dates_num, yields, p0=starting_point, maxfev=10000)
    print("popt",popt)
    print("pocv",pcov)
    factor, exponent = popt[0], popt[1]

    print("factor from curve fit",factor)
    print("exponent from curve fit",exponent)

    # Fitted curve at the observed day numbers (kept as a plain list).
    y_fitted = [factor*np.exp(exponent* day_number) for day_number in dates_num]
    plt.figure(figsize=(10,10))
    plt.scatter(dates,yields,color='purple',label='Raw data')
    plt.plot(dates,y_fitted,color='red',label='fitted curve')
    plt.title('exponential regression using curve fit function for '+yield_mode+" for "+pred_type)
    print("yields value produced from curve fitted function",y_fitted)

    # Fitted curve on every consecutive day number of the covered range.
    day_number_cont = np.arange(dates_num[0], dates_num[-1]+1)
    y_thirty_days_yield = [factor*np.exp(exponent* day_number) for day_number in day_number_cont]

    # Ratio against the value 30 positions back; positions 0..30 use the first value.
    thirty_days_yield = [
        y_thirty_days_yield[position]/y_thirty_days_yield[0 if position <= 30 else position-30]
        for position in range(len(day_number_cont))
    ]
    print("thirty days yield from curve fit function",thirty_days_yield)

    # One calendar day per day between the first and the last date, inclusive.
    delta = pd.to_datetime(dates[-1]) - pd.to_datetime(dates[0])
    dates_thirty_days = [
        pd.to_datetime(dates[0]) + timedelta(days=offset)
        for offset in range(delta.days + 1)
    ]
    plot_polynomial_3_regression(dates_thirty_days,thirty_days_yield,day_number_cont,yield_mode,pred_type)
    return np.mean(thirty_days_yield)

100

In [None]:
def plot_exponential_regression(dates,yields,dates_num):
  """Fit an exponential through a log-linear regression, plot it and return the fit.

  ``log(yields)`` is regressed on ``dates_num`` twice: ``np.polyfit`` gives the
  slope and intercept used for the curve, and ``linregress`` gives the
  correlation coefficient. The raw points and the curve
  ``exp(intercept) * exp(slope) ** dates_num`` are drawn on a new 10x10 figure.

  Each curve value is then divided by the value 30 positions earlier (the
  first 31 positions are divided by the first value). The ratios are passed to
  ``plot_polynomial_3_regression`` with the fixed labels 'forward' and 'swing'.

  Args:
    dates: x values for plotting; its length sets how many ratios are built.
    yields: observed values (passed through ``np.log``).
    dates_num: numeric day numbers used for the regression.

  Returns:
    Tuple ``(exp(intercept), slope, rvalue ** 2)`` of the log-linear fit.
  """
  print("dates",dates)
  print("yields",yields)
  log_slope, log_intercept = np.polyfit(dates_num, np.log(yields), 1)
  log_linear_fit = linregress(dates_num, np.log(yields))
  print("exponent from the previously used function",log_slope)
  print("factor from the previously used function",np.exp(log_intercept))

  # Curve written as factor * base ** t, with base = exp(slope).
  factor = np.exp(log_intercept)
  exponent = np.exp(log_slope)
  y_yield = factor*(exponent**dates_num)

  plt.figure(figsize=(10,10))
  plt.scatter(dates,yields,color='purple')
  plt.plot(dates,y_yield,color='red')
  plt.title('exponential regression using old function')
  print("yields value produced from old function",y_yield)

  # Ratio against the value 30 positions back; positions 0..30 use the first value.
  thirty_days_yield = [
    y_yield[position]/y_yield[0 if position <= 30 else position-30]
    for position in range(len(dates))
  ]
  print("thirty days yield from old function",thirty_days_yield)
  plot_polynomial_3_regression(dates,thirty_days_yield,dates_num,'forward','swing')
  return np.exp(log_intercept), log_slope, (log_linear_fit.rvalue)**2

In [None]:
def plot_polynomial_3_regression(dates,yields,dates_num,yield_mode,pred_type):
  """Scatter the yields and overlay a degree-3 polynomial fitted on ``dates_num``.

  Args:
    dates: x values used for both the scatter and the fitted line.
    yields: y values to plot and to fit.
    dates_num: numeric x values the polynomial is fitted on and evaluated at.
    yield_mode: text inserted into the plot title.
    pred_type: text inserted into the plot title.

  Returns:
    None; the function prints its inputs and draws on a new 10x10 figure.
  """
  print("dates",dates)
  print("yields",yields)
  # Least-squares cubic wrapped as a callable polynomial.
  cubic_trend = np.poly1d(np.polyfit(dates_num, yields, 3))
  plt.figure(figsize=(10,10))
  plt.scatter(dates, yields, color='purple')
  fitted_values = cubic_trend(dates_num)
  plt.plot(dates, fitted_values, linewidth = 2.5)
  plt.title("polynomial regression curve for "+yield_mode+" for "+pred_type)

In [None]:
# Retrieval of the analytical parameters for the best run (method 1).
# Take the best run number stored in the last row of the 'best_run_number' column.
best_model_run_number= int(new_analytical_params_df['best_run_number'].iloc[-1])
# Select the row whose index LABEL equals that run number and turn it back into a
# one-row DataFrame (Series -> DataFrame -> transpose).
# NOTE(review): this rebinds new_analytical_params_df, so the full per-run table is no
# longer available after this cell. The lookup is by label, not position; it presumably
# expects index labels to be unique and equal to the run numbers — confirm, especially
# if the frame was built with pd.concat without resetting the index.
new_analytical_params_df =pd.DataFrame(new_analytical_params_df.loc[best_model_run_number]).T
# Export the selected row to Excel; `filename`, `best_models_path`, `targets` and `target`
# are not defined in this cell and must exist already in the kernel.
new_analytical_params_df.to_excel(best_models_path+targets[target]+'/'+filename+' - New Analytical parameters.xlsx')
# new_analytical_params_df.to_excel(best_models_path+targets[target]+'/proceedit '+today+' '+case+'_20000103-20230831_MPN'+model_case_version_main_target_code+'P_LSTM15_DOHLCAV-10-S-KER-710760-64-100-0.0005-10-'+str(analytical_parametrs)+'L'+str(attenuation_factor)+'_AH - New Analytical parameters.xlsx')