In [1]:
import os
import math
import random #to set the seed to replicate results
from datetime import datetime,timedelta #for today's date
from dateutil.relativedelta import relativedelta
import sys
import requests
import optuna

#from targets_plot_generator.src import generate_plot

import psycopg2
import psycopg2.extras
from psycopg2.extensions import AsIs

import pandas as pd 
import numpy as np #for the e_logarithmic filter (and also some other mathematical operations)
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error,mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import RobustScaler #for preprocessing, it scales features using statistics that are robust to outliers.
from sklearn.linear_model import LinearRegression
from scipy.stats import linregress #for the slope and the value of Y at X=0 of the linear trend line
from scipy.optimize import curve_fit
import tsmoothie #for the Kalman filter, it is an efficient recursive filter that evaluates the state of a dynamic system starting from a series of measurements subject to noise.

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM,Conv1D,MaxPooling1D,Flatten,Bidirectional,Input #the two main layers of the model
from tensorflow.keras.optimizers import Adam#for the training of the model
from tensorflow.keras.models import load_model

random.seed(42)  #set the seed to replicate results

pd.options.display.max_columns = None

plt.style.use('ggplot')
plt.rc(
    'figure',
    autolayout=True,
    figsize=(11,4)
)

plt.rc(
    'axes',
    labelweight='bold',
    labelsize='large',
    titleweight='bold',
    titlesize=20,
    titlepad=10
)

print(tf.config.list_physical_devices('GPU'))

  from .autonotebook import tqdm as notebook_tqdm
2024-01-25 09:51:01.079617: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-25 09:51:01.079745: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-25 09:51:01.328583: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-25 09:51:01.806152: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2024-01-25 09:51:06.862455: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-25 09:51:08.020825: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-25 09:51:08.021120: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

### Preprocess Config

In [2]:
'''This is the timestep which indicates the window size'''
#model_case_version_time_steps= '10'
'''This is the number of periods defined in the target(MPNxP) where x is the number of periods'''
model_case_version_main_target_code='5'

#today = '20220706'
today = datetime.today().strftime('%Y%m%d') #just for names of files (for now)
print('Today is', today)

''' Here, we define the case name'''
case = 'TSLA'
print(case)

'''Here, we define the list of targets we are going to work on and also the average for each target (this value is used during the training for normalization\rescaling of some analytical parameters)'''
targets =['MPN'+model_case_version_main_target_code+'P']  #this must be changed whenever tha targets change

dataset_start_date = '2020-01-01'
train_end_date = "2023-09-29"
validation_end_date="2023-12-31"
#test_end_date="2023-12-31"

Today is 20240125
TSLA


## LBM 0.02 Before the Train

### Utils

In [3]:
'''Function for making sequences (blocks) of test and train data'''
def building_data_sequences(data_X,data_Y, timesteps): #timesteps means how many days we consider for each block

    X=[]
    y_MPNxP = []
    for i in range(len(data_X)-timesteps+1):  #how it works: every timesteps (e.g. 10 days) a block is constituted and for each block data and true values are stored
        X.append(data_X[i:(i+timesteps),:])
        y_MPNxP.append(data_Y[i+timesteps-1])
    return np.array(X), [np.array(y_MPNxP)]

'''Function for computing the analytical parameters'''
def sir_parameters(x,y): #sir stands for slope, intercept, rvalue (actually there's also the average trend line distance or avg_tld, but it came later)

  analytical_params = linregress(x, y)
  slope = analytical_params.slope
  intercept = analytical_params.intercept
  rvalue = analytical_params.rvalue #pay attention that here we have the correlaton coefficient (so not r2 that is the coefficient of determination)
  x_trend_line = slope*x + intercept #this is computed just for the avg_tld
  avg_trend_line_distance = np.mean(np.abs(x_trend_line-y))
  return slope,intercept,rvalue**2,avg_trend_line_distance

def custom_loss_function(attenuated_padding_value):

  def padding_loss_function(y_true, y_pred):

    y_pred = tf.multiply(y_pred, attenuated_padding_value) #this is the multiplication between the predictions and the attenuated_padding_value

    squared_difference = tf.square(y_true - y_pred)

    return tf.reduce_mean(squared_difference, axis=-1) #mse

  return padding_loss_function

def train_validation_test_split(dataframe,train_end_date,validation_end_date,test_end_date=None):
    # Define train period
    dataframe_start_date = dataframe['DATE'].min()

    train_end_date = pd.Timestamp(train_end_date)
    train_end_idx = dataframe.index[dataframe['DATE'] == train_end_date].values[0]
    train_end_idx = train_end_idx-int(model_case_version_main_target_code)

    # Define Validation period
    validation_start_date = (train_end_date + pd.Timedelta(days=1))
    validation_end_date = pd.Timestamp(validation_end_date)

    # Define Test Period
    #test_start_date = (validation_end_date + pd.Timedelta(days=1))
    #test_end_date = pd.Timestamp(test_end_date)

    #Split
    train_df = dataframe.iloc[int(model_case_version_main_target_code):train_end_idx+1]
    validation_df = dataframe.loc[(dataframe['DATE'] >= validation_start_date) & (dataframe['DATE'] <= validation_end_date)]
    #test_df = dataframe.loc[(dataframe['DATE'] >= test_start_date) & (dataframe['DATE'] <= test_end_date)]

    train_period = str(train_df['DATE'].min().date()) + ' / ' + str(train_df['DATE'].max().date())
    print('Train Period: ', train_period)

    validation_period = str(validation_df['DATE'].min().date()) + ' / ' + str(validation_df['DATE'].max().date())
    print('Validation Period: ', validation_period)

    #test_period = str(test_df['DATE'].min().date()) + ' / ' + str(test_df['DATE'].max().date())
    #print('Test Period: ', test_period)

    print('\n')
    print('Train shape: ', train_df.shape)
    print('Validation shape: ', validation_df.shape)
    #print('Test shape: ', test_df.shape)

    return train_df,validation_df


def plot_model_history(history: pd.DataFrame, limit_x = [None,None], limit_y = [None,None]):
   ax = history['loss'].plot(label='Train Loss')
   history['val_loss'].plot(ax=ax, label='Validation Loss')

   ax.set_xlim(limit_x)
   ax.set_ylim(limit_y)
   plt.legend()
   plt.show()   

### Analytical Parameters

In [4]:
#1L = dispersion = 1
#4L = slope + intercept + resqr + dispersion = 1
#5L = all the weighting = 1
#new paramaters case = intercept + slope + end intercept + correlation + dispersion
slope_weighting_exponent_ratio = 1
intercept_weighting_exponent_ratio = 1
end_intercept_weighting_exponent_ratio = 0
rsqr_weighting_exponent_ratio = 1
dispersion_weighting_exponent_ratio = 3

slope_weighting_exponent_predicted_actual = 0
intercept_weighting_exponent_predicted_actual = 0
rsqr_weighting_exponent_predicted_actual = 0
dispersion_weighting_exponent_predicted_actual = 0

analytical_parametrs = str(intercept_weighting_exponent_ratio)+str(slope_weighting_exponent_ratio)+str(end_intercept_weighting_exponent_ratio)+str(rsqr_weighting_exponent_ratio)+str(dispersion_weighting_exponent_ratio)
print(analytical_parametrs)

11013


### Extracting Dataset

In [5]:
''' Conection to PostgreSQL '''
# The credentials to conect to the database
hostname = 'database-1.ctzm0hf7fhri.eu-central-1.rds.amazonaws.com'
database = 'dyDATA_new'
username = 'postgres'
pwd = 'Proc2023awsrdspostgresql'
port_id = 5432
conn = None

#this helps to retreive the data for a particular asset from the database
asset_script="SELECT * FROM "+'\"'+"ASSET_"+case+'\"'+".features_targets_input_view WHERE features_targets_input_view."+'\"'+"cleaned_raw_features_environment_PK"+'\"'+ "= 4"
asset_script

''' The active financial assets '''
# Here we select the active financial asset from the financial asset list table
try:
  with psycopg2.connect(
      host = hostname,
      dbname = database,
      user = username,
      password = pwd,
      port = port_id
  ) as conn:

    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        print('You are connect to the Database:',database)
        select_script = asset_script # do not forget to set to asset_script when data has been uploaded
        cur.execute(select_script)
        data = cur.fetchall()
        cols = []
        # loop to create the dataframe that contains the active financial assets
        for rec in cur.description:
            cols.append(rec[0])
        dohlcav_mpnxp_data= pd.DataFrame(data = data, columns = cols)
        print('Your SQL has executed successfully')

except Exception as error:
  print(error)

finally:
  if conn is not None:
     conn.close()

if case=='TSLA':
  dohlcav_mpnxp_data = dohlcav_mpnxp_data.loc[dohlcav_mpnxp_data['cleaned_raw_features_DCP_date_current_period'] >= dataset_start_date].reset_index(drop=True)

dohlcav_mpnxp_data.head()

You are connect to the Database: dyDATA_new
Your SQL has executed successfully


Unnamed: 0,cleaned_raw_features_id,cleaned_raw_features_DCP_date_current_period,calculated_features_DNCP,cleaned_raw_features_OPCP_open_price_current_period,cleaned_raw_features_HPCP_high_price_current_period,cleaned_raw_features_LPCP_low_price_current_period,cleaned_raw_features_CPCP_close_price_current_period,cleaned_raw_features_ACPCP_adjusted_close_price_current_period,cleaned_raw_features_VTCP_volume_of_transactions_current_period,cleaned_raw_features_environment_PK,calculated_targets_MPN1P,calculated_targets_HPN1P,calculated_targets_LPN1P,calculated_targets_MPN2P,calculated_targets_HPN2P,calculated_targets_LPN2P,calculated_targets_MPN3P,calculated_targets_HPN3P,calculated_targets_LPN3P,calculated_targets_MPN4P,calculated_targets_HPN4P,calculated_targets_LPN4P,calculated_targets_MPN5P,calculated_targets_HPN5P,calculated_targets_LPN5P,calculated_targets_MPN6P,calculated_targets_HPN6P,calculated_targets_LPN6P,calculated_targets_MPN7P,calculated_targets_HPN7P,calculated_targets_LPN7P,calculated_targets_MPN10P,calculated_targets_HPN10P,calculated_targets_LPN10P,calculated_targets_MPN20P,calculated_targets_HPN20P,calculated_targets_LPN20P,calculated_targets_environment_PK
0,2330,2020-01-02,43832.0,28.3,28.713333,28.114,28.684,28.684,142981500.0,4,29.534,30.266666,29.128,29.553917,30.266666,29.128,30.102667,31.442,29.128,30.245333,33.232666,29.128,30.924166,33.253334,29.128,31.356333,33.253334,29.128,31.58,35.042,29.128,32.415584,36.494,29.128,35.272333,43.533333,29.128,4
1,2331,2020-01-03,43833.0,29.366667,30.266666,29.128,29.534,29.534,266677500.0,4,29.726167,30.104,29.333332,30.164,31.442,29.333332,30.924166,33.232666,29.333332,31.356333,33.253334,29.333332,31.58,33.253334,29.333332,31.9265,35.042,29.333332,32.11933,36.494,29.333332,32.908334,36.494,29.333332,35.90775,52.409332,29.333332,4
2,2332,2020-01-06,43836.0,29.364668,30.104,29.333332,30.102667,30.102667,151995010.0,4,30.924166,31.442,30.224,31.356333,33.232666,30.224,31.58,33.253334,30.224,31.9265,33.253334,30.224,32.11933,35.042,30.224,32.415584,36.494,30.224,32.809334,36.494,30.224,33.55425,36.572,30.224,36.487,64.599335,30.224,4
3,2333,2020-01-07,43837.0,30.76,31.442,30.224,31.270666,31.270666,268231500.0,4,32.209333,33.232666,31.215334,32.355584,33.253334,31.215334,32.11933,33.253334,31.215334,32.415584,35.042,31.215334,32.809334,36.494,31.215334,33.186333,36.494,31.215334,33.232666,36.494,31.215334,34.132999,39.63333,31.215334,37.120666,64.599335,31.215334,4
4,2334,2020-01-08,43838.0,31.58,33.232666,31.215334,32.809334,32.809334,467164500.0,4,32.501834,33.253334,31.524668,32.104332,33.253334,31.524668,32.501834,35.042,31.524668,33.02,36.494,31.524668,33.933166,36.494,31.524668,33.748833,36.494,31.524668,33.933166,36.494,31.524668,34.778666,39.63333,31.524668,37.642,64.599335,31.524668,4


### Filtering and Cleaning the Data

In [6]:
# FILTERING
filtered_columns_1=list(dohlcav_mpnxp_data.columns[:9])#to filter out the dates columns and features columns
filtered_columns_2=[x for x in dohlcav_mpnxp_data.columns if  targets[0][3:] in x ]#feature out the main target columns

if model_case_version_main_target_code=='1':
  temp=filtered_columns_2[0]
  temp_2=filtered_columns_2[1]
  filtered_columns_2[0]=filtered_columns_2[2]
  filtered_columns_2[1]=temp
  filtered_columns_2[2]=temp_2

#to add the last two constant columns to the table
filtered_columns_3=['calculated_targets_HPN1P','calculated_targets_LPN1P']
filtered_columns=filtered_columns_1+filtered_columns_2+filtered_columns_3
print(filtered_columns)
dohlcav_mpnxp_data=dohlcav_mpnxp_data[filtered_columns]

# RENAMING
if model_case_version_main_target_code=='1':
  dohlcav_mpnxp_data.columns=["ID","DCP_date_current_period","DNCP_day_number_current_period","OPCP_open_price_current_period","HPCP_high_price_current_period","LPCP_low_price_current_period"
,"CPCP_close_price_current_period","ACPCP_adjusted_close_price_current_period","VTCP_volume_of_transactions_current_period","MPN"+model_case_version_main_target_code+"P_median_price_next_"+model_case_version_main_target_code+"_periods","HPN"+model_case_version_main_target_code+"P_highest_price_next_"+model_case_version_main_target_code+"_periods","LPN"+model_case_version_main_target_code+"P_lowest_price_next_"+model_case_version_main_target_code+"_periods","HPN1P_high_price_next_1_period",
"LPN1P_low_price_next_1_period"
]
else:
  dohlcav_mpnxp_data = dohlcav_mpnxp_data.rename(columns={"cleaned_raw_features_id":"ID",
                                "cleaned_raw_features_DCP_date_current_period": "DCP_date_current_period",
                                "calculated_features_DNCP":"DNCP_day_number_current_period",
                                "cleaned_raw_features_OPCP_open_price_current_period":"OPCP_open_price_current_period",
                                "cleaned_raw_features_HPCP_high_price_current_period":"HPCP_high_price_current_period",
                                "cleaned_raw_features_LPCP_low_price_current_period":"LPCP_low_price_current_period",
                                "cleaned_raw_features_CPCP_close_price_current_period": "CPCP_close_price_current_period",
                                "cleaned_raw_features_ACPCP_adjusted_close_price_current_period":"ACPCP_adjusted_close_price_current_period",
                                "cleaned_raw_features_VTCP_volume_of_transactions_current_period":"VTCP_volume_of_transactions_current_period",
                                filtered_columns_2[0]:"MPN"+model_case_version_main_target_code+"P_median_price_next_"+model_case_version_main_target_code+"_periods",
                                filtered_columns_2[1]:"HPN"+model_case_version_main_target_code+"P_highest_price_next_"+model_case_version_main_target_code+"_periods",
                                filtered_columns_2[2]:"LPN"+model_case_version_main_target_code+"P_lowest_price_next_"+model_case_version_main_target_code+"_periods",
                                filtered_columns_3[0]:"HPN1P_high_price_next_1_period",
                                filtered_columns_3[1]:"LPN1P_low_price_next_1_period",
                                })
  
dohlcav_mpnxp_data.head()

['cleaned_raw_features_id', 'cleaned_raw_features_DCP_date_current_period', 'calculated_features_DNCP', 'cleaned_raw_features_OPCP_open_price_current_period', 'cleaned_raw_features_HPCP_high_price_current_period', 'cleaned_raw_features_LPCP_low_price_current_period', 'cleaned_raw_features_CPCP_close_price_current_period', 'cleaned_raw_features_ACPCP_adjusted_close_price_current_period', 'cleaned_raw_features_VTCP_volume_of_transactions_current_period', 'calculated_targets_MPN5P', 'calculated_targets_HPN5P', 'calculated_targets_LPN5P', 'calculated_targets_HPN1P', 'calculated_targets_LPN1P']


Unnamed: 0,ID,DCP_date_current_period,DNCP_day_number_current_period,OPCP_open_price_current_period,HPCP_high_price_current_period,LPCP_low_price_current_period,CPCP_close_price_current_period,ACPCP_adjusted_close_price_current_period,VTCP_volume_of_transactions_current_period,MPN5P_median_price_next_5_periods,HPN5P_highest_price_next_5_periods,LPN5P_lowest_price_next_5_periods,HPN1P_high_price_next_1_period,LPN1P_low_price_next_1_period
0,2330,2020-01-02,43832.0,28.3,28.713333,28.114,28.684,28.684,142981500.0,30.924166,33.253334,29.128,30.266666,29.128
1,2331,2020-01-03,43833.0,29.366667,30.266666,29.128,29.534,29.534,266677500.0,31.58,33.253334,29.333332,30.104,29.333332
2,2332,2020-01-06,43836.0,29.364668,30.104,29.333332,30.102667,30.102667,151995010.0,32.11933,35.042,30.224,31.442,30.224
3,2333,2020-01-07,43837.0,30.76,31.442,30.224,31.270666,31.270666,268231500.0,32.809334,36.494,31.215334,33.232666,31.215334
4,2334,2020-01-08,43838.0,31.58,33.232666,31.215334,32.809334,32.809334,467164500.0,33.933166,36.494,31.524668,33.253334,31.524668


In [7]:
''' This comprises the list of target in our datasets'''
targets_list=["MPN"+model_case_version_main_target_code+"P_median_price_next_"+model_case_version_main_target_code+"_periods",
                        "HPN"+model_case_version_main_target_code+"P_highest_price_next_"+model_case_version_main_target_code+"_periods",
                        "LPN"+model_case_version_main_target_code+"P_lowest_price_next_"+model_case_version_main_target_code+"_periods",
                        'HPN1P_high_price_next_1_period','LPN1P_low_price_next_1_period']

for i in targets_list:
  if targets[0] in i:
    main_target_column=i

''' What is stop target?'''
stop_target=dohlcav_mpnxp_data.index[-1]+1

#pay attention here because everytime targets change, also the name of the columns change
df = dohlcav_mpnxp_data.drop(["ID"], axis=1)

#pay attention here because everytime targets change, also the name of the columns change
df = df.rename(columns={"DCP_date_current_period": "DATE",
                        "DNCP_day_number_current_period": "DNCP",
                        "OPCP_open_price_current_period":"OPCP",
                        "HPCP_high_price_current_period":"HPCP",
                        "LPCP_low_price_current_period":"LPCP",
                        "CPCP_close_price_current_period":"CPCP",
                        "ACPCP_adjusted_close_price_current_period": "ACPCP",
                        "VTCP_volume_of_transactions_current_period":"VTCP",
                        "MPN"+model_case_version_main_target_code+"P_median_price_next_"+model_case_version_main_target_code+"_periods":"MPN"+model_case_version_main_target_code+"P",
                        "HPN"+model_case_version_main_target_code+"P_highest_price_next_"+model_case_version_main_target_code+"_periods":"HPN"+model_case_version_main_target_code+"P",
                        "LPN"+model_case_version_main_target_code+"P_lowest_price_next_"+model_case_version_main_target_code+"_periods":"LPN"+model_case_version_main_target_code+"P",
                        'HPN1P_high_price_next_1_period':'hpn1p',
                        'LPN1P_low_price_next_1_period':'lpn1p'})

#df = df.set_index('DATE')
#df.index = pd.to_datetime(df.index)
df['DATE'] = pd.to_datetime(df['DATE'])

df.head()

Unnamed: 0,DATE,DNCP,OPCP,HPCP,LPCP,CPCP,ACPCP,VTCP,MPN5P,HPN5P,LPN5P,hpn1p,lpn1p
0,2020-01-02,43832.0,28.3,28.713333,28.114,28.684,28.684,142981500.0,30.924166,33.253334,29.128,30.266666,29.128
1,2020-01-03,43833.0,29.366667,30.266666,29.128,29.534,29.534,266677500.0,31.58,33.253334,29.333332,30.104,29.333332
2,2020-01-06,43836.0,29.364668,30.104,29.333332,30.102667,30.102667,151995010.0,32.11933,35.042,30.224,31.442,30.224
3,2020-01-07,43837.0,30.76,31.442,30.224,31.270666,31.270666,268231500.0,32.809334,36.494,31.215334,33.232666,31.215334
4,2020-01-08,43838.0,31.58,33.232666,31.215334,32.809334,32.809334,467164500.0,33.933166,36.494,31.524668,33.253334,31.524668


### Feature Engineering

In [8]:
base_target_code = 'MPN' + model_case_version_main_target_code + 'P'
base_target_column_index = df.columns.get_loc(base_target_code)

def new_target_column(target_code , shift_back_period):
  prev_target = df[target_code]
  new_target = prev_target[:-shift_back_period]
  first_dates_handling = [0] * shift_back_period
  new_target=np.concatenate((first_dates_handling,new_target))
  return new_target

#Adding multiple targets
new_target_index = base_target_column_index
for i in range(int(model_case_version_main_target_code)):
  new_target_code = 'MPN-' + str(i+1) + 'P'
  df.insert(new_target_index+1,new_target_code,new_target_column(base_target_code,i+1))
  new_target_index = new_target_index + 1
  targets.append(new_target_code)


# '''Ratio Transformation for features'''
df.insert(7,'OPCP_Ratio',df['OPCP']/df['CPCP'])
df.insert(8,'HPCP_Ratio',df['HPCP']/df['CPCP'])
df.insert(9,'LPCP_Ratio',df['LPCP']/df['CPCP'])
df.insert(10,'ACPCP_Ratio',df['ACPCP']/df['CPCP'])
df.insert(df.columns.get_loc('MPN'+model_case_version_main_target_code+'P') ,'MPN'+ model_case_version_main_target_code +'P_Ratio',df['MPN'+ model_case_version_main_target_code +'P'].shift(5)/df['CPCP'].shift(5))
df.insert(df.columns.get_loc('HPN'+model_case_version_main_target_code+'P'),'HPN'+ model_case_version_main_target_code +'P_Ratio',df['HPN'+ model_case_version_main_target_code +'P']/df['CPCP'])
df.insert(df.columns.get_loc('LPN'+model_case_version_main_target_code+'P'),'LPN'+ model_case_version_main_target_code +'P_Ratio',df['LPN'+ model_case_version_main_target_code +'P']/df['CPCP'])
df.insert(df.columns.get_loc('hpn1p')+1,'hpn1p_Ratio',df['hpn1p']/df['CPCP'])
df.insert(df.columns.get_loc('lpn1p')+1,'lpn1p_Ratio',df['lpn1p']/df['CPCP'])

df = df.iloc[int(model_case_version_main_target_code):]

df.head()

Unnamed: 0,DATE,DNCP,OPCP,HPCP,LPCP,CPCP,ACPCP,OPCP_Ratio,HPCP_Ratio,LPCP_Ratio,ACPCP_Ratio,VTCP,MPN5P_Ratio,MPN5P,MPN-1P,MPN-2P,MPN-3P,MPN-4P,MPN-5P,HPN5P_Ratio,HPN5P,LPN5P_Ratio,LPN5P,hpn1p,hpn1p_Ratio,lpn1p,lpn1p_Ratio
5,2020-01-09,43839.0,33.14,33.253334,31.524668,32.089333,32.089333,1.032742,1.036274,0.982403,1.0,426606020.0,1.078098,34.297333,33.933166,32.809334,32.11933,31.58,30.924166,1.137263,36.494,0.984128,31.58,32.329334,1.007479,31.58,0.984128
6,2020-01-10,43840.0,32.11933,32.329334,31.58,31.876667,31.876667,1.007613,1.014201,0.990693,1.0,194392500.0,1.069276,34.378,34.297333,33.933166,32.809334,32.11933,31.58,1.14485,36.494,1.028966,32.8,35.042,1.099299,32.8,1.028966
7,2020-01-13,43843.0,32.9,35.042,32.8,34.990665,34.990665,0.940251,1.001467,0.937393,1.0,397764000.0,1.066993,34.993332,34.378,34.297333,33.933166,32.809334,32.11933,1.045193,36.572,0.937717,32.811333,36.494,1.042964,34.993332,1.000076
8,2020-01-14,43844.0,36.284,36.494,34.993332,35.861332,35.861332,1.011786,1.017642,0.975796,1.0,434943000.0,1.049205,35.048167,34.993332,34.378,34.297333,33.933166,32.809334,1.105183,39.63333,0.91495,32.811333,35.856,0.999851,34.452667,0.960719
9,2020-01-15,43845.0,35.317333,35.856,34.452667,34.566666,34.566666,1.021716,1.0373,0.996702,1.0,260532000.0,1.034253,35.907333,35.048167,34.993332,34.378,34.297333,33.933166,1.146577,39.63333,0.949219,32.811333,34.297333,0.992208,32.811333,0.949219


### Train Validation Test Split

In [9]:
train_df,validation_df = train_validation_test_split(dataframe=df,
                                                             train_end_date=train_end_date,
                                                             validation_end_date=validation_end_date,
                                                             #test_end_date=test_end_date
                                                            )

train_actual_target_list = train_df['MPN'+model_case_version_main_target_code+'P'].to_numpy(dtype='float64')
validation_actual_target_list = validation_df['MPN'+model_case_version_main_target_code+'P'].to_numpy(dtype='float64')
#test_actual_target_list = test_df['MPN'+model_case_version_main_target_code+'P'].to_numpy(dtype='float64')

Train Period:  2020-01-16 / 2023-09-29
Validation Period:  2023-10-02 / 2023-12-29


Train shape:  (933, 27)
Validation shape:  (63, 27)


### Pretreatment

In [10]:
features_start_index = 1
features_stop_index = 13

features = list(train_df.iloc[:,features_start_index:features_stop_index].columns)
targets = list(train_df.iloc[:,features_stop_index:features_stop_index + int(model_case_version_main_target_code) + 1].columns)

features_and_targets = features + targets

print('Features and Targets: ', features_and_targets)
print('Features List: ', features)
print('Targets List: ',targets)


train_dates_np_arr = train_df['DATE'].values
validation_dates_np_arr = validation_df['DATE'].values
#test_dates_np_arr = test_df['DATE'].values


train_df = train_df.set_index('DATE')
validation_df = validation_df.set_index('DATE')
#test_df = test_df.set_index('DATE')

Features and Targets:  ['DNCP', 'OPCP', 'HPCP', 'LPCP', 'CPCP', 'ACPCP', 'OPCP_Ratio', 'HPCP_Ratio', 'LPCP_Ratio', 'ACPCP_Ratio', 'VTCP', 'MPN5P_Ratio', 'MPN5P', 'MPN-1P', 'MPN-2P', 'MPN-3P', 'MPN-4P', 'MPN-5P']
Features List:  ['DNCP', 'OPCP', 'HPCP', 'LPCP', 'CPCP', 'ACPCP', 'OPCP_Ratio', 'HPCP_Ratio', 'LPCP_Ratio', 'ACPCP_Ratio', 'VTCP', 'MPN5P_Ratio']
Targets List:  ['MPN5P', 'MPN-1P', 'MPN-2P', 'MPN-3P', 'MPN-4P', 'MPN-5P']


#### Log Transform

In [11]:
train_df = np.log(train_df[features_and_targets])
validation_df = np.log(validation_df[features_and_targets])
#test_df = np.log(test_df[features_and_targets])

#### Kalman Filter

In [12]:
def kalman_filter(dataframe):
    kalman_smoother=tsmoothie.KalmanSmoother(component='level_trend',  component_noise={'level':0.1, 'trend':0.1})
    kalman_smoother.smooth(dataframe)
    dataframe = pd.DataFrame(kalman_smoother.smooth_data,columns=dataframe.columns,index=dataframe.index)
    return dataframe

train_df = kalman_filter(train_df[features_and_targets])
validation_df = kalman_filter(validation_df[features_and_targets])
#test_df = kalman_filter(test_df[features_and_targets])

#### Robust Scaling

In [13]:
'''
features = list(train_df.iloc[:,:12].columns)
targets = list(train_df.iloc[:,12:12+int(model_case_version_main_target_code)+1].columns)

print('Feaure List: ', features)
print('Target list: ', targets)

robust_scaler = RobustScaler()
train_df = pd.DataFrame(robust_scaler.fit_transform(train_df),columns=train_df.columns,index=train_df.index)
validation_df = pd.DataFrame(robust_scaler.transform(validation_df),columns=validation_df.columns, index=validation_df.index)
test_df = pd.DataFrame(robust_scaler.transform(test_df),columns=test_df.columns,index=test_df.index)
'''

# Fit transform train
robust_scaler_features = RobustScaler().fit(train_df[features])
robust_scaler_targets = RobustScaler().fit(train_df[targets])
 
train_df_features = pd.DataFrame(robust_scaler_features.transform(train_df[features]),
             columns=train_df[features].columns, index=train_df.index)


train_df_targets = pd.DataFrame(robust_scaler_targets.transform(train_df[targets]),
                                columns=train_df[targets].columns, index=train_df.index)

#Transform val
validation_df_features = pd.DataFrame(robust_scaler_features.transform(validation_df[features]),
                             columns=validation_df[features].columns, index=validation_df.index)

validation_df_targets = pd.DataFrame(robust_scaler_targets.transform(validation_df[targets]),
                             columns=validation_df[targets].columns, index=validation_df.index)

# Transform test
'''
test_df_features = pd.DataFrame(robust_scaler_features.transform(test_df[features]),
                             columns=test_df[features].columns, index=test_df.index)

test_df_targets = pd.DataFrame(robust_scaler_targets.transform(test_df[targets]),
                             columns=test_df[targets].columns, index=test_df.index)
'''

'\ntest_df_features = pd.DataFrame(robust_scaler_features.transform(test_df[features]),\n                             columns=test_df[features].columns, index=test_df.index)\n\ntest_df_targets = pd.DataFrame(robust_scaler_targets.transform(test_df[targets]),\n                             columns=test_df[targets].columns, index=test_df.index)\n'

## LMB 0.02 Model

In [14]:
timesteps = 10

twoexp_nodes_number_layer_1 = 7
twoexp_nodes_number_layer_2 = 10
twoexp_nodes_number_layer_3 = 7
twoexp_nodes_number_layer_4 = 6
twoexp_nodes_number_layer_5 = 0

In [15]:
train_features_arr = train_df_features.to_numpy(dtype='float64')
train_targets_arr = train_df_targets.to_numpy(dtype='float64')

validation_arr_features = validation_df_features.to_numpy(dtype='float64')
validation_arr_targets = validation_df_targets.to_numpy(dtype='float64')

'''
test_arr_features = test_df_features.to_numpy(dtype='float64')
test_arr_targets = test_df_targets.to_numpy(dtype='float64')
'''

X_train, y_train = building_data_sequences(train_features_arr,train_targets_arr,timesteps=timesteps)
X_validation, y_validation =  building_data_sequences(validation_arr_features,validation_arr_targets,timesteps=timesteps)

# X_test, y_test = building_data_sequences(test_arr_features,test_arr_targets,timesteps=timesteps)

print(f'X train shape : {X_train.shape}')
print(f'y train shape : {y_train[0].shape}')
print('\n')
print(f'X val shape : {X_validation.shape}')
print(f'y val shape : {y_validation[0].shape}')
print('\n')

'''
print(f'X test shape : {X_test.shape}')
print(f'y test shape : {y_test[0].shape}')
'''

input_shape=((X_train).shape[1],(X_train).shape[2])
print("Input shape obtained is:",input_shape)

X train shape : (924, 10, 12)
y train shape : (924, 6)


X val shape : (54, 10, 12)
y val shape : (54, 6)


Input shape obtained is: (10, 12)


### Define Model

In [18]:
tf.keras.backend.clear_session()

x0 = tf.keras.layers.Input(shape=input_shape)
lstm_layers = 4
lstm_units = np.zeros(lstm_layers, dtype=int)

lstm_units[0] = 700
lstm = LSTM(lstm_units[0], return_sequences=True)(x0)

for i in range(lstm_layers - 1):
    lstm_units[i+1] = lstm_units[i] // 2
    lstm = LSTM(lstm_units[i+1],return_sequences=True)(lstm)

lstm = LSTM(50,return_sequences=False)(lstm)

dense_units = lstm_units[-1]
lstm = Dense(dense_units, activation='relu')(lstm)
lstm = Dense(6)(lstm)

model = tf.keras.Model(inputs=x0,outputs=lstm)
metrics = ["mae"]
model.compile(optimizer="adam", loss="mae", metrics=metrics)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10, 12)]          0         
                                                                 
 lstm (LSTM)                 (None, 10, 700)           1996400   
                                                                 
 lstm_1 (LSTM)               (None, 10, 350)           1471400   
                                                                 
 lstm_2 (LSTM)               (None, 10, 175)           368200    
                                                                 
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10, 12)]          0         
                                                                 
 lstm (LSTM)                 (None, 10, 700)           199640

In [68]:


'''
x0 = Input(shape=input_shape)
lstm = LSTM(2**twoexp_nodes_number_layer_1,return_sequences=True)(x0)
lstm = LSTM(2**twoexp_nodes_number_layer_2,return_sequences=True)(lstm)
lstm = LSTM(2**twoexp_nodes_number_layer_3)(lstm)
lsmt = Dense(2**twoexp_nodes_number_layer_4)(lstm)
lstm = Dense(int(model_case_version_main_target_code)+1)(lstm)


model = tf.keras.Model(inputs=x0,outputs=lstm)
metrics = ["mse"]
model.compile(optimizer='adam',
              loss='mae',
              metrics=metrics
              )

model.summary()
'''

2024-01-25 09:27:15.281621: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-01-25 09:27:15.283590: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-01-25 09:27:15.302935: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10, 12)]          0         
                                                                 
 lstm (LSTM)                 (None, 10, 128)           72192     
                                                                 
 lstm_1 (LSTM)               (None, 10, 1024)          4722688   
                                                                 
 lstm_2 (LSTM)               (None, 128)               590336    
                                                                 
 dense_1 (Dense)             (None, 6)                 774       
                                                                 
Total params: 5,385,990
Trainable params: 5,385,990
Non-trainable params: 0
_________________________________________________________________


In [19]:
history = model.fit(
    X_train,
    y_train,
    batch_size = 64,
    epochs=10,
    validation_data = (X_validation,y_validation)
)

score = model.evaluate(X_validation, y_validation, verbose=0)
print(score)

Epoch 1/10


2024-01-25 09:52:42.763709: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f8d100180f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-25 09:52:42.763765: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Tesla V100-SXM2-16GB, Compute Capability 7.0
2024-01-25 09:52:42.807329: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1706176363.002242    3189 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.047820623964071274, 0.047820623964071274]


### Run Optimization

In [36]:
x0.shape

TensorShape([None, 10, 12])