In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

from sklearn.preprocessing import MinMaxScaler, StandardScaler

import math
import random
import pickle
import pandas as pd
import numpy as np
from matplotlib.pyplot import *

try:
    import wandb
except:
    !pip install wandb -qqq
    import wandb

import time
from datetime import datetime

In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of crashing here.
DEVICE = torch.device( "cuda" if torch.cuda.is_available() else "cpu" )
print("DEVICE: ", torch.cuda.get_device_name(DEVICE) if DEVICE.type == "cuda" else "cpu")

def from_numpy( x ):
    """Convert a numpy array into a float32 tensor living on DEVICE."""
    return torch.from_numpy( x ).type( torch.float ).to( DEVICE )

def to_numpy( x ):
    """Detach a tensor from the autograd graph and return it as a numpy array on the host."""
    return x.detach().cpu().numpy()

DEVICE:  Tesla P100-PCIE-16GB


In [None]:
class SingleInstanceMetaClass(type):
    """
    Metaclass turning every class that uses it into a singleton.

    The first call ``Cls(*args, **kwargs)`` builds and caches the instance;
    every later call returns that cached object and ignores its arguments.
    Each class created with this metaclass gets its own cache slot.
    """

    def __init__(self, name, bases, dic):
        # per-class cache slot, empty until the class is first instantiated
        self.__single_instance = None
        super().__init__(name, bases, dic)

    def __call__(cls, *args, **kwargs):
        # compare against None explicitly: an instance that evaluates as falsy
        # (e.g. it defines __len__ or __bool__) must still count as "already built"
        if cls.__single_instance is not None:
            return cls.__single_instance
        single_obj = cls.__new__(cls)
        single_obj.__init__(*args, **kwargs)
        cls.__single_instance = single_obj
        return single_obj

In [None]:
class Dataloader( metaclass=SingleInstanceMetaClass ):
    """
    Loads prices and returns from a previously built dataset: a DataFrame with
    two column levels (asset, feature) saved in pickle format.

    The class is a singleton (see its metaclass): constructing it a second time
    returns the first instance and ignores the new arguments.
    """

    def __init__( self, file_path, moving_average ):
        """
        file path:
            the path of the Dataframe in pickle format
        moving average:
            moving average window size applied to data
        """
        #dataframe is loaded (and smoothed)
        self.data_df = self.load_df( file_path, moving_average )

        #asset labels: unique values of the first column level
        self.assets = self.data_df.columns.get_level_values(0).unique()

        #feature labels: unique values of the second column level
        self.features = self.data_df.columns.get_level_values(1).unique()

    def load_df( self, file_path: str, moving_average: int ) -> pd.DataFrame :
        """
        file path:
            the path of the Dataframe in pickle format
        moving average:
            moving average window size applied to data

        returns:
            dataframe from file_path, smoothed with a rolling mean of
            `moving_average` rows; rows left incomplete by the window are dropped
        """
        # NOTE(security): unpickling can execute arbitrary code; only load files you trust
        data_df = pd.read_pickle( file_path )
        data_df =  data_df.rolling( moving_average ).mean().dropna()
        return data_df

    def _stack_by_asset( self, df: pd.DataFrame ) -> np.ndarray:
        """Stack the per-asset feature tables of `df` into one array [ days, assets, features ]."""
        per_asset = [ df[asset][self.features].values for asset in self.assets ]
        return np.stack( per_asset, axis = 1 )

    def load_prices( self, ) -> np.ndarray:
        """
        returns numpy array of shape (number of days, number of assets, number of features)
        containing the (smoothed) values stored in the dataframe
        """
        return self._stack_by_asset( self.data_df )

    def load_returns( self, ) -> np.ndarray:
        """
        returns numpy array of shape (number of days, number of assets, number of features)
        containing the row-over-row percentage change of every column.
        The first row has no predecessor and is filled with 0.
        The intermediate DataFrame is kept in self.returns_df.
        """
        self.returns_df = self.data_df.pct_change().fillna(0)
        return self._stack_by_asset( self.returns_df )

class Dataset():
    """
    Holds the return series used for training and testing the regression transformer.

    self.returns       clipped and standardized returns [ days, assets, features ] (model input)
    self.true_returns  plain percentage returns          [ days, assets, features ] (targets)
    """

    def __init__( self, params ):
        # dataloader instance
        self.loader = Dataloader( params.file_path, params.moving_average )

        # we store the data for env accessibility
        self.true_returns = self.loader.load_returns() # returns are pct variations
        # working copy; scale_data() replaces it with the clipped and standardized version
        self.returns = self.true_returns.copy()

        # split the indices for training, validation and testing
        self.split_indices( params.test_portion, params.val_portion )

        # clipping and normalizing the data
        self.scale_data( params.feature_clip, params.vol_clip)

        # other useful parameters
        self.episode_length = params.b_size
        self.encoder_sequence_length = params.encoder_sequence_length

    def split_indices( self, test_portion, val_portion ):
        """
        test_portion: float
            test portion of the dataset
        val_portion: float
            validation portion of the dataset

        split dataset indices in self.train_indices, self.val_indices, self.test_indices : np.ndarray
        in accord with the portions. validation portion and test portion are at the end of dataset
        i.e. closer to present. self.train_val_indices is the union of train and validation.
        """

        num_periods = self.returns.shape[0]
        start_train_set_index = 0
        start_val_set_index = int( num_periods *( 1 - (test_portion + val_portion) ) )
        # the test block is the LAST test_portion of the data, so it starts at 1 - test_portion
        start_test_set_index = int( num_periods * ( 1 - test_portion ) )

        self.train_indices = np.arange(start_train_set_index, start_val_set_index)
        self.val_indices = np.arange(start_val_set_index, start_test_set_index)
        self.train_val_indices = np.arange(start_train_set_index, start_test_set_index)
        self.test_indices = np.arange(start_test_set_index, num_periods)


    def scale_data( self, feature_clip = .02, vol_clip = .8):
        """
        feature_clip: float
            clipping value for all features except the last one
        vol_clip: float
            clipping value for the last feature (Volume)

        Clips the returns and then, for each asset, standardizes them with two StandardScaler:
        one for all features but the last, one for the last feature (Volume).
        The scalers are fitted on train + validation rows and only applied to the test rows.
        Standardized data is stored in self.returns
        Non standardized data is in self.true_returns
        """

        feature_returns = self.returns[:,:,:-1]
        volumes_returns = self.returns[:,:,-1:]

        # np.clip returns new arrays, so self.true_returns is never modified here
        features_std = np.clip( feature_returns, - feature_clip , feature_clip )
        volumes_std = np.clip( volumes_returns, - vol_clip, vol_clip )

        for i in range(features_std.shape[1]):
            feature_scaler = StandardScaler()
            volume_scaler = StandardScaler()

            #fit on train + validation rows and scale them
            features_std[self.train_val_indices, i, :] = feature_scaler.fit_transform(features_std[self.train_val_indices, i, :])
            volumes_std[self.train_val_indices,i,:] = volume_scaler.fit_transform( volumes_std[self.train_val_indices,i,:])

            #scale test rows with the statistics fitted above
            features_std[self.test_indices, i, :] = feature_scaler.transform(features_std[self.test_indices, i, :])
            volumes_std[self.test_indices,i,:] = volume_scaler.transform( volumes_std[self.test_indices,i,:])

        self.returns = np.concatenate([features_std, volumes_std], axis = -1)


    def _encoder_windows( self, anchor_indices ):
        """
        For every anchor index t returns the encoder_sequence_length consecutive
        indices ending at t (inclusive): [ t - L + 1, ..., t ].
        Result has shape (number of anchors, encoder_sequence_length).
        """
        offsets = np.arange( 1 - self.encoder_sequence_length, 1 )
        return np.asarray( anchor_indices )[:, None] + offsets[None, :]


    def _sample_training_indices( self, ):
        """
        Draws one random episode of episode_length consecutive days from the
        train + validation range and returns (encoder windows, next-day indices).
        """
        # leave room for a full encoder window before and a full episode (plus target) after
        candidates = self.train_val_indices[ self.encoder_sequence_length : - self.episode_length ]

        starting_index = np.random.choice( candidates )
        sequence_indices = np.arange( starting_index, starting_index + self.episode_length )

        return self._encoder_windows( sequence_indices ), sequence_indices + 1


    def load_sequence_indices( self, ):
        """
        return sequence_indices_encoder, reward_returns_indices
        sequence_indices_encoder are used for selecting transformer input from self.returns or self.true_returns in the training phase
        reward_returns_indices are used for reward or target
        """
        # this method is used by the RL part, so that load_sequence_and_targets can be modified
        # (e.g. for multi step forecasting or other changes to the transformer regression) independently
        return self._sample_training_indices()


    def load_sequence_and_targets( self, ):
        """
        return transformer_input_sequence, regression_target_sequence
        transformer_input_sequence is standardized data [ episode, encoder_sequence_length, assets, features ]
        regression_target_sequence is the second-to-last feature (closing returns, per the
        original documentation) of the day following each window, taken from the
        non standardized self.true_returns [ episode, assets ]
        """
        sequence_indices_transformer, target_returns_indices = self._sample_training_indices()

        transformer_input_sequence = self.returns[ sequence_indices_transformer ]

        #targets are read from the NON standardized returns
        regression_target_sequence = self.true_returns[ target_returns_indices, :, -2 ]

        return transformer_input_sequence, regression_target_sequence

    def load_test_sequence_and_targets( self, ):
        """
        return transformer_input_sequence, regression_target_sequence
        Same layout as load_sequence_and_targets, but deterministic and covering the whole
        test range: one window for every test day that has encoder_sequence_length test days
        before it, except the last one (its target would fall outside the data).
        """
        anchors = self.test_indices[ self.encoder_sequence_length : ][:-1]

        sequence_indices_transformer = self._encoder_windows( anchors )
        target_returns_indices = anchors + 1

        transformer_input_sequence = self.returns[ sequence_indices_transformer ]

        regression_target_sequence = self.true_returns[ target_returns_indices, :, -2 ]

        return transformer_input_sequence, regression_target_sequence

In [None]:
class NormalizationLayer( nn.Module ):
    """
    Normalizes the last dimension of the input to zero mean and unit spread,
    then applies a learnable per-feature gain (w) and bias (b).

    epsilon is added to the standard deviation (not to the variance) to avoid
    division by zero. The standard deviation is the one returned by
    torch.Tensor.std with its default settings.
    """

    def __init__( self, d_model, epsilon = 1e-6 ):
        super().__init__()
        self.epsilon = epsilon
        # gain starts at 1 and bias at 0, i.e. the layer starts as a pure normalization
        self.w = nn.Parameter( torch.ones( d_model ) )
        self.b = nn.Parameter( torch.zeros( d_model ) )

    def forward( self, x ):
        centered = x - x.mean( dim = -1, keepdim = True )
        spread = x.std( dim = -1, keepdim = True ) + self.epsilon
        return self.w * centered / spread + self.b

# as in https://timeseriestransformer.readthedocs.io/en/latest/README.html#installation the embedding layer is replaced by a generic linear layer
class EmbeddingLayer( nn.Module ):
    """Projects the last dimension from in_features to out_features with a single linear layer."""

    def __init__( self, in_features, out_features ):
        super().__init__()
        self.embedding = nn.Linear( in_features, out_features )

    def forward( self, x ):
        embedded = self.embedding( x )
        return embedded


class Time2Vec( nn.Module ):
    """
    Time2Vec implementation

    parameters
    in_features: int
        number of features of the data
    out_features: int
        number of out features (k in the original paper); one of them is the
        linear term, the remaining out_features - 1 are periodic terms
    activation_function: function or function like
        the activation applied to the periodic terms. If None, torch.sin is used
    """

    def __init__( self, in_features, out_features, activation_function = None ):
        super(Time2Vec, self).__init__()

        #i = 0
        self.linear_transformation = nn.Linear( in_features, 1, bias = True )

        #1 <= i <= k
        self.periodic_transformation = nn.Linear( in_features, out_features - 1, bias = True)

        # always set the attribute: a caller-supplied activation must be stored too,
        # otherwise forward() fails with AttributeError
        if activation_function is None:
            self.activation_function = torch.sin
        else:
            self.activation_function = activation_function

    def forward( self, x ):
        # x has shape (..., in_features); both layers act on the last dimension

        # linear_x has shape (..., 1)
        linear_x = self.linear_transformation( x )

        # periodic_x has shape (..., out_features - 1)
        periodic_x = self.activation_function( self.periodic_transformation(x) )

        # out has shape (..., out_features)
        out = torch.cat( [linear_x, periodic_x], dim = -1 )

        return out


class Query( nn.Module ):
    """Linear projection producing the attention queries."""

    def __init__( self, in_features, out_features ):
        super().__init__()
        self.linear_layer = nn.Linear( in_features, out_features )

    def forward( self, x ):
        return self.linear_layer( x )



class Key( nn.Module ):
    """Linear projection producing the attention keys."""

    def __init__( self, in_features, out_features ):
        super().__init__()
        self.linear_layer = nn.Linear( in_features, out_features )

    def forward( self, x ):
        return self.linear_layer( x )



class Value( nn.Module ):
    """Linear projection producing the attention values."""

    def __init__( self, in_features, out_features ):
        super().__init__()
        self.linear_layer = nn.Linear( in_features, out_features )

    def forward( self, x ):
        return self.linear_layer( x )


class MultiHeadAttention( nn.Module ):
    """
    Scaled dot-product attention split over num_heads heads.

    Inputs are projected to d_model with separate Query / Key / Value layers,
    split in num_heads chunks of size depth = d_model // num_heads, attended
    independently and concatenated back to d_model. No mask is applied and
    there is no output projection after the concatenation.
    """

    def __init__( self, in_features, d_model, num_heads ):
        super().__init__()

        # every head must receive the same number of channels
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads

        self.query = Query( in_features, d_model )
        self.key = Key( in_features, d_model )
        self.value = Value( in_features, d_model )

    def attention( self, query, key, value ):
        """Returns (attended values, attention weights) for already-split heads."""
        scores = torch.matmul( query, key.transpose(-2, -1) ) / math.sqrt( self.depth )
        attention_weights = F.softmax( scores, dim = -1 )
        return torch.matmul( attention_weights, value ), attention_weights

    def _split_heads( self, projected, n_assets ):
        """[ assets, sequence_length, d_model ] -> [ assets, num_heads, sequence_length, depth ]"""
        per_head = projected.contiguous().view( n_assets, -1, self.num_heads, self.depth )
        return per_head.transpose( 2, 1 )

    def forward( self, query, key, value ):
        # the leading dimension (assets) is treated as the batch dimension
        n_assets = query.size(0)

        # linear projections, each [ assets, sequence_length, d_model ]
        projected_q = self.query( query )
        projected_k = self.key( key )
        projected_v = self.value( value )

        # attention per head; the weights are not needed by the callers
        context, _ = self.attention( self._split_heads( projected_q, n_assets ),
                                     self._split_heads( projected_k, n_assets ),
                                     self._split_heads( projected_v, n_assets ) )

        # [ assets, num_heads, sequence_length, depth ] -> [ assets, sequence_length, d_model ]
        merged = context.transpose( 2, 1 ).contiguous()
        return merged.view( n_assets, -1, self.d_model )


class FeedForward( nn.Module ):
    """
    Multi layer perceptron with LeakyReLU activations.

    n_layers > 1 : Linear(in, d) -> LeakyReLU
                   -> (n_layers - 1) x [ Linear(d, d) -> LeakyReLU -> Dropout ]
                   -> Linear(d, out)
    otherwise    : a single Linear(in, out)
    """

    def __init__( self, in_features, n_layers, d_layers, out_features, dropout ):
        super().__init__()

        if n_layers > 1:
            stack = [ nn.Linear( in_features, d_layers ), nn.LeakyReLU() ]
            for _ in range( n_layers - 1 ):
                stack += [ nn.Linear( d_layers, d_layers ),
                           nn.LeakyReLU(),
                           nn.Dropout( dropout ) ]
            stack.append( nn.Linear( d_layers, out_features ) )
        else:
            stack = [ nn.Linear( in_features, out_features ) ]

        self.net = nn.Sequential( *stack )

    def forward( self, x ):
        return self.net( x )


class EncoderLayer( nn.Module ):
    """
    One encoder block: self attention followed by a feed forward network,
    each wrapped in dropout + residual connection + normalization (post-norm).
    """

    def __init__( self, d_model, in_features, n_layers_ff, d_layers_ff, num_heads, dropout ):
        super().__init__()
        self.norm_layer1 = NormalizationLayer( d_model )
        self.norm_layer2 = NormalizationLayer( d_model )
        self.dropout_layer1 = nn.Dropout( dropout )
        self.dropout_layer2 = nn.Dropout( dropout )
        self.mha = MultiHeadAttention( in_features = in_features, d_model = d_model, num_heads = num_heads )
        self.ffnn = FeedForward( in_features = d_model, n_layers = n_layers_ff, d_layers = d_layers_ff,
                                 out_features = d_model, dropout = dropout )

    def forward( self, x ):
        # self attention sub-layer: query, key and value are all the layer input
        attended = self.mha( x, x, x )
        x = self.norm_layer1( x + self.dropout_layer1( attended ) )

        # position-wise feed forward sub-layer
        transformed = self.ffnn( x )
        return self.norm_layer2( x + self.dropout_layer2( transformed ) )


class DecoderLayer( nn.Module ):
    """
    One decoder block: self attention, attention over the encoder output and a
    feed forward network, each wrapped in dropout + residual connection +
    normalization (post-norm). The self attention is not masked.
    """

    def __init__( self, d_model, in_features, n_layers_ff, d_layers_ff, num_heads, dropout  ):
        super().__init__()
        self.norm_layer1 = NormalizationLayer( d_model )
        self.norm_layer2 = NormalizationLayer( d_model )
        self.norm_layer3 = NormalizationLayer( d_model )
        self.dropout_layer1 = nn.Dropout( dropout )
        self.dropout_layer2 = nn.Dropout( dropout )
        self.dropout_layer3 = nn.Dropout( dropout )
        # mha1: decoder self attention, mha2: decoder -> encoder attention
        self.mha1 = MultiHeadAttention( in_features = in_features, d_model = d_model, num_heads = num_heads )
        self.mha2 = MultiHeadAttention( in_features = in_features, d_model = d_model, num_heads = num_heads )
        self.ffnn = FeedForward( in_features = d_model, n_layers = n_layers_ff, d_layers = d_layers_ff,
                                 out_features = d_model, dropout = dropout )

    def forward( self, x, encoder_output ):
        # self attention over the decoder sequence
        self_attended = self.mha1( x, x, x )
        x = self.norm_layer1( x + self.dropout_layer1( self_attended ) )

        # queries come from the decoder, keys and values from the encoder
        cross_attended = self.mha2( query = x, key = encoder_output, value = encoder_output )
        x = self.norm_layer2( x + self.dropout_layer2( cross_attended ) )

        # position-wise feed forward sub-layer
        transformed = self.ffnn( x )
        return self.norm_layer3( x + self.dropout_layer3( transformed ) )



class Encoder( nn.Module ):
    """
    Transformer encoder: the raw features are concatenated with their Time2Vec
    encoding, projected to d_model and passed through num_layers EncoderLayer.

    sequence_length is accepted for interface symmetry but not used.
    """

    def __init__( self,
                 d_model,
                 num_layers,
                 num_heads,
                 t2v_units,
                 sequence_length,
                 num_features,
                 num_ff_layers,
                 dim_ff_layers,
                 dropout
                ):
        super().__init__()

        self.t2v_layer = Time2Vec( in_features = num_features, out_features = t2v_units )
        # the embedding sees the raw features AND the t2v features
        self.embedding_layer = EmbeddingLayer( num_features + t2v_units, d_model )
        self.encoder_layers = self.get_layers( num_layers = num_layers,
                                               d_model = d_model,
                                               num_ff_layers = num_ff_layers,
                                               dim_ff_layers = dim_ff_layers,
                                               num_heads = num_heads,
                                               dropout = dropout )

    def get_layers( self, num_layers, d_model, num_ff_layers, dim_ff_layers, num_heads, dropout ):
        """Builds the stack of num_layers identically configured EncoderLayer."""
        stack = nn.ModuleList()
        for _ in range( num_layers ):
            stack.append( EncoderLayer( d_model = d_model,
                                        in_features = d_model,
                                        n_layers_ff = num_ff_layers,
                                        d_layers_ff = dim_ff_layers,
                                        num_heads = num_heads,
                                        dropout = dropout ) )
        return stack

    def forward( self, x ):
        #input is [companies, sequence_length, features]

        #time encoding is [companies, sequence_length, t2v_units]
        time_encoding = self.t2v_layer( x )

        #hidden is [companies, sequence_length, features + t2v_units] -> [companies, sequence_length, d_model]
        hidden = self.embedding_layer( torch.cat( [ x, time_encoding ], dim = -1 ) )

        for encoder_layer in self.encoder_layers:
            hidden = encoder_layer( hidden )

        return hidden


class Decoder( nn.Module ):
    """
    Transformer decoder: the raw features are concatenated with their Time2Vec
    encoding, projected to d_model and passed through num_layers DecoderLayer,
    each of which also attends to the encoder output.

    sequence_length is accepted for interface symmetry but not used.
    """

    def __init__( self,
                  d_model,
                  num_layers,
                  num_heads,
                  t2v_units,
                  sequence_length,
                  num_features,
                  num_ff_layers,
                  dim_ff_layers,
                  dropout ):
        super().__init__()
        self.t2v_layer = Time2Vec( in_features = num_features, out_features = t2v_units )
        # the embedding sees the raw features AND the t2v features
        self.embedding_layer = EmbeddingLayer( num_features + t2v_units, d_model )
        self.decoder_layers = self.get_layers( num_layers = num_layers,
                                               d_model = d_model,
                                               num_ff_layers = num_ff_layers,
                                               dim_ff_layers = dim_ff_layers,
                                               num_heads = num_heads,
                                               dropout = dropout )

    def get_layers( self, num_layers, d_model, num_ff_layers, dim_ff_layers, num_heads, dropout ):
        """Builds the stack of num_layers identically configured DecoderLayer."""
        stack = nn.ModuleList()
        for _ in range( num_layers ):
            stack.append( DecoderLayer( d_model = d_model,
                                        in_features = d_model,
                                        n_layers_ff = num_ff_layers,
                                        d_layers_ff = dim_ff_layers,
                                        num_heads = num_heads,
                                        dropout = dropout ) )
        return stack

    def forward( self, x, encoder_output ):
        #input x is [companies, sequence_length, features]
        #encoder output is [companies, encoder sequence_length, encoder dimension]

        #time encoding is [companies, sequence_length, t2v_units]
        time_encoding = self.t2v_layer( x )

        #hidden is [companies, sequence_length, features + t2v_units] -> [companies, sequence_length, d_model]
        hidden = self.embedding_layer( torch.cat( [ x, time_encoding ], dim = -1 ) )

        for decoder_layer in self.decoder_layers:
            hidden = decoder_layer( hidden, encoder_output )

        return hidden



class Transformer( nn.Module ):
    """
    Encoder / decoder transformer that summarizes a window of returns into one
    d_model sized vector per asset.

    The encoder reads the whole window, the decoder reads its last
    decoder_sequence_length steps, and a feed forward network applied along the
    time axis collapses the decoder sequence into a single step.
    """

    def __init__( self,
                 dim_transformer,
                 encoder_sequence_length,
                 decoder_sequence_length,
                 num_layers,
                 num_heads,
                 t2v_units,
                 num_features,
                 num_ff_layers,
                 dim_ff_layers,
                 dropout
                ):
        super ( Transformer, self ).__init__()
        self.encoder = Encoder( d_model = dim_transformer,
                                num_layers = num_layers,
                                num_heads = num_heads,
                                t2v_units = t2v_units,
                                sequence_length = encoder_sequence_length,
                                num_features = num_features,
                                num_ff_layers = num_ff_layers,
                                dim_ff_layers = dim_ff_layers,
                                dropout = dropout
                               )
        self.decoder = Decoder( d_model = dim_transformer,
                                num_layers = num_layers,
                                num_heads = num_heads,
                                t2v_units = t2v_units,
                                sequence_length = decoder_sequence_length,
                                num_features = num_features,
                                num_ff_layers = num_ff_layers,
                                dim_ff_layers = dim_ff_layers,
                                dropout = dropout
                               )

        # acts on the time axis: decoder_sequence_length -> 1
        self.decoder_sequence_ffnn = FeedForward( in_features = decoder_sequence_length, n_layers = num_ff_layers, d_layers = dim_ff_layers, out_features = 1, dropout = dropout )


        self.decoder_sequence_length = decoder_sequence_length

    def forward( self, x ):

        #[  encoder_sequence_length, companies, features] -> [ companies, encoder_sequence_length, features]
        x = x.transpose(1,0)

        #input x is for encoder [ companies, encoder_sequence_length, features]
        xe = x

        #input xd is for decoder [ companies, decoder_sequence_length, features]: the most recent steps
        xd = x[:,-self.decoder_sequence_length:]

        #[ companies, encoder_sequence_length, d_model_encoder]
        encoder_output = self.encoder(xe)

        #[ companies, decoder_sequence_length, d_model_decoder]
        decoder_output = self.decoder(xd, encoder_output)

        #[ companies, decoder_sequence_length, d_model_decoder] -> [ companies, d_model_decoder, 1] -> [ companies, d_model_decoder]
        # squeeze only the collapsed time axis: a bare squeeze() would also drop
        # the companies axis when there is a single asset
        output = self.decoder_sequence_ffnn( decoder_output.transpose(-2,-1) ).squeeze(-1)

        return output


class RegressionTransformer( nn.Module ):
    """
    Transformer followed by a feed forward regression head that maps the
    per-asset transformer output (dim_transformer values) to a single value.

    All hyperparameters are read from the `params` object.
    """

    def __init__( self, params ):
        super().__init__()

        self.transformer = Transformer( dim_transformer = params.dim_transformer,
                                        encoder_sequence_length = params.encoder_sequence_length,
                                        decoder_sequence_length = params.decoder_sequence_length,
                                        num_layers = params.num_layers,
                                        num_heads = params.num_heads,
                                        t2v_units = params.t2v_units,
                                        num_features = params.num_features,
                                        num_ff_layers = params.num_ff_layers,
                                        dim_ff_layers = params.dim_ff_layers,
                                        dropout = params.dropout )

        # regression head: dim_transformer -> 1
        self.ffnn = FeedForward( in_features = params.dim_transformer,
                                 n_layers = params.regression_ff_layers,
                                 d_layers = params.dim_regression_ff_layers,
                                 out_features = 1,
                                 dropout = params.dropout )

    def forward( self, x):
        encoded = self.transformer( x )
        return self.ffnn( encoded )

In [None]:
class Parameters(metaclass=SingleInstanceMetaClass):
    """
    Hyperparameters and settings of the run, stored as plain attributes.

    The class is a singleton (see its metaclass), so every Parameters() call
    returns the same object; attributes added later (e.g. batch_per_epoch)
    are therefore shared as well.
    """

    def __init__(self,):

        # NOTE(review): the right-hand side of the next assignment is missing in this
        # copy of the notebook; it must be the path of the pickled DataFrame read by Dataloader
        self.file_path =
        self.etfs = ['XLB', 'XLC', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLRE', 'XLU', 'XLV', 'XLY']
        self.moving_average = 10 # moving average smoothing to be applied to data
        self.val_portion = .1 # validation portion in dataset
        self.test_portion = .1 # test portion in dataset
        self.num_epochs = 300 # number of epochs
        self.b_size = 64 # batch size; Dataset also uses it as episode length (consecutive days per batch)

        self.dropout = 0.1 # probability of an element to be zeroed in dropout layer

        self.feature_clip = 0.02 # returns of all features but the last are clipped to [-feature_clip, feature_clip]
        self.vol_clip = 0.8 # returns of the last feature (volume) are clipped to [-vol_clip, vol_clip]

        self.encoder_sequence_length = 60 # sequence length for encoder input
        self.decoder_sequence_length = 20 # sequence length for decoder input
        self.dim_transformer = 64  # transformer model dimension

        self.num_layers = 1 # number of stacked encoder and decoder layers
        self.num_heads = 2 # number of heads in each layer
        self.t2v_units = 16 # k in t2v paper

        self.num_features = 5 # OHLCV

        self.num_ff_layers = 3 # number of feedforward layers in encoder and decoder layers
        self.dim_ff_layers = 256 # dimension of feedforward layers in encoder and decoder layers

        self.regression_ff_layers = 4 # number of feedforward layers in ffnn that produces prediction
        self.dim_regression_ff_layers = 512 # dimension of feedforward layers in ffnn that produces prediction

        self.schedule_lr = True # use noam scheduler as in "Attention Is All You Need"
        self.noam_model_size = 256 # model size used by the scheduler formula (note: differs from dim_transformer)
        self.noam_factor = .1 # multiplicative factor of the scheduled learning rate
        self.noam_warmup = 2000 # number of warmup steps of the scheduler

        self.lr = 1e-3 # learning rate in adam optimizer if not using noam scheduler
        self.beta1 = .9 # beta1 parameter in adam optimizer
        self.beta2 = .99 # beta2 parameter in adam optimizer

        self.early_stopping = True # use or not early stopping. No min_delta is used, so every improvement counts.
        self.patience = 5 # number of epochs with no improvement after which training will be stopped.

        self.seed = 1 # seed for reproducibility

        self.loss = "lch" # loss to backpropagate: "mse", "mae" or "lch" (log-cosh)

In [None]:
# losses and metrics
# mean squared error between prediction and target
mse_loss = nn.MSELoss()
# mean absolute error between prediction and target
mae_loss = nn.L1Loss()

def log_cosh_loss( output, target ):
    """
    Mean log-cosh loss between prediction and target.

    output, target: tensors of the same (broadcastable) shape
    returns: scalar tensor, mean over all elements of log(cosh(target - output))
    """
    diff = target - output
    # log(cosh(d)) = d + softplus(-2d) - log(2).
    # Evaluating cosh directly overflows to inf for large |d| in float32;
    # this form stays finite and tends to |d| - log(2).
    return torch.mean( diff + torch.nn.functional.softplus( -2.0 * diff ) - math.log( 2.0 ) )

In [None]:
def noam_scheduler( step, factor, model_size, warmup ):
    """
    Learning rate of the "Noam" schedule for a 0-based step counter.

    The rate grows linearly for the first `warmup` steps and then decays with
    the inverse square root of the step; `factor` and `model_size ** -0.5`
    scale the whole curve.
    """
    # the formula is defined for steps starting at 1
    step += 1
    decay_branch = step ** (-0.5)
    warmup_branch = step * warmup ** (-1.5)
    return factor * (model_size ** (-0.5) * min(decay_branch, warmup_branch))

In [None]:
#initialization of parameters and dataset
p = Parameters()
d = Dataset( p )
#adding number of batches per epoch
# NOTE(review): computed from train_indices only, while Dataset samples training
# episodes from train_val_indices
p.batch_per_epoch = int(d.train_indices.shape[0] / p.b_size)

#setting seeds (torch, python and numpy) for reproducibility
torch.manual_seed( p.seed )
random.seed( p.seed )
np.random.seed( p.seed )
torch.backends.cudnn.deterministic = True

#run name for wandb and tensorboard: seed + unix timestamp
run_name = f"Transformer_Regression__{p.seed}__{int(time.time())}"

#weights and biases and tensorboard
# NOTE(review): the values of `project` and `entity` are missing in this copy of
# the notebook and must be filled in before the cell can run
wandb.init(
    project = ,
    entity = ,
    sync_tensorboard=True,
    config = vars(p),
    name = run_name,
    save_code = True
)

#tensorboard writer; the hyperparameters are logged once as a markdown table
writer = SummaryWriter()
writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(p).items()])),
    )

#save plot of scheduled lr for the run (one point per optimizer step)
steps_to_do = np.arange( p.num_epochs * p.batch_per_epoch)
fig = figure()
title("Learning rate ")
plot([noam_scheduler( step, p.noam_factor, p.noam_model_size, p.noam_warmup) for step in steps_to_do] )
xlabel("Step (#)")
ylabel("Learning rate (#)")
writer.add_figure("plots/scheduled_lr", fig, 0)


#initializing model and optimizer
# p.lr is only the starting value: when p.schedule_lr is set, the training loop
# overwrites the learning rate before every optimizer step
r = RegressionTransformer( p ).to( DEVICE )
optim = Adam( r.parameters(), lr = p.lr, betas = (p.beta1, p.beta2) )


#init values for early stopping
current_min_val_loss = 1e8
epochs_wo_improve = 0

#step counter for noam scheduler
global_step = 0

#for epoch time printing
start_time = time.time()

#training

#values of p.loss that select a training objective; checked up front because an
#unknown name would otherwise skip every backward() call and silently train nothing
SUPPORTED_LOSSES = ("mse", "mae", "lch")
if p.loss not in SUPPORTED_LOSSES:
    raise ValueError(f"p.loss must be one of {SUPPORTED_LOSSES}, got {p.loss!r}")

#number of epochs after which the run is stopped and the model saved (value found via validation)
STOP_AFTER_EPOCHS = 24

#destination of the saved model
WEIGHTS_PATH = "/content/drive/MyDrive/0_Codice tesi/RUN_DEF/pesi/regression_weights2.pt"

def _mean( values ):
    """Mean of a list of floats, computed in float32 and returned as a Python float."""
    return torch.mean(torch.FloatTensor(values)).item()

for epoch in range(p.num_epochs):
    print(f"EPOCH {epoch} / {p.num_epochs}")

    #metrics and losses in epoch (one entry per batch)
    ep_mse_losses = []
    ep_mae_losses = []
    ep_lch_losses = []

    #training
    for step in range(p.batch_per_epoch):

        #the lr is updated by hand before every optimizer step, see
        #https://nlp.seas.harvard.edu/2018/04/03/attention.html#optimizer
        if p.schedule_lr:
            #update lr
            lr_now = noam_scheduler( global_step, p.noam_factor, p.noam_model_size, p.noam_warmup )
            optim.param_groups[0]['lr'] = lr_now

        #metrics and losses in batch (one entry per sample)
        mse_losses = []
        mae_losses = []
        lch_losses = []

        #data is loaded from dataset
        xs_transformer, ys = d.load_sequence_and_targets()

        #data is converted to torch tensor
        xs_transformer = from_numpy(xs_transformer)
        ys = from_numpy( ys ).unsqueeze(-1)

        #zero_grad
        optim.zero_grad()

        #model is in training phase
        r.train()

        #samples are processed one at a time; gradients accumulate (are summed)
        #over the batch until the single optimizer step below
        for x, y in zip( xs_transformer, ys ):

            #prediction
            y_pred = r( x )

            #losses and metrics
            mse = mse_loss( y_pred, y )
            mae = mae_loss( y_pred, y )
            lch = log_cosh_loss( y_pred, y )

            mse_losses.append(mse.item())
            mae_losses.append(mae.item())
            lch_losses.append(lch.item())

            #backward on the loss selected by p.loss only
            {"mse": mse, "mae": mae, "lch": lch}[p.loss].backward()

        # opt step and update counter for noam scheduler
        optim.step()
        global_step += 1

        ep_mse_losses.append(_mean(mse_losses))
        ep_mae_losses.append(_mean(mae_losses))
        ep_lch_losses.append(_mean(lch_losses))

        #register lr in writer
        writer.add_scalar("charts/learning rate", optim.param_groups[0]['lr'], global_step)


    #epoch averages of the training losses, used for both logging and printing
    train_mse = _mean(ep_mse_losses)
    train_mae = _mean(ep_mae_losses)
    train_lch = _mean(ep_lch_losses)

    #register losses in writer
    writer.add_scalar("mse_loss/train", train_mse, epoch)
    writer.add_scalar("mae_loss/train", train_mae, epoch)
    writer.add_scalar("lch_loss/train", train_lch, epoch)


    #validation
    #validation data is loaded
    xs_transformer, ys = d.load_test_sequence_and_targets()

    #data is converted to torch tensor
    val_xs_transformer = from_numpy(xs_transformer)
    val_ys = from_numpy( ys ).unsqueeze(-1)
    val_y_preds = []

    #model is in evaluating phase
    r.eval()

    #no gradients are needed for validation
    with torch.no_grad():
        for x, y in zip( val_xs_transformer, val_ys ):
            y_pred = r( x )
            val_y_preds.append( y_pred )

    #convert predictions to torch tensor
    val_y_preds = torch.stack(val_y_preds)

    #compute losses and metrics on validation set
    val_mse = mse_loss( val_y_preds, val_ys )
    val_mae = mae_loss( val_y_preds, val_ys )
    val_lch = log_cosh_loss( val_y_preds, val_ys )

    #register validation losses in writer
    writer.add_scalar("mse_loss/test", val_mse.item(), epoch)
    writer.add_scalar("mae_loss/test", val_mae.item(), epoch)
    writer.add_scalar("lch_loss/test", val_lch.item(), epoch)

    # plot predictions
    val_ys = to_numpy( val_ys )
    val_y_preds = to_numpy( val_y_preds )

    #one panel per etf; squeeze=False keeps axs an array even with a single etf
    fig, axs = subplots( len(p.etfs), 1 , figsize = (20,25), squeeze = False )
    axs = axs[:, 0]
    fig.suptitle( "Predictions on test set", fontsize = 18)
    fig.subplots_adjust(top= .95)
    last_panel = len(p.etfs) - 1
    for i in range(len(p.etfs)):
        axs[i].set_title(p.etfs[i])
        axs[i].plot(val_ys[:,i,0], label = "True")
        axs[i].plot(val_y_preds[:,i,0], label = "Predicted")
        axs[i].legend()
        axs[i].set_ylabel("Return (#)")
        #only the bottom panel carries the x axis labelling
        if i == last_panel:
            axs[i].set_xlabel("Day from start of set (#)")
        else:
            axs[i].set_xticklabels([])

    #register figure in writer
    writer.add_figure("plots/test", fig, epoch)


    #print statements at epoch end
    print("{:.2f}s".format( time.time() - start_time ) )
    print(f"mse loss {train_mse} - on test {val_mse.item()}")
    print(f"mae loss {train_mae} -  on test {val_mae.item()}")
    print(f"log cosh loss {train_lch} - on test {val_lch.item()}")
    start_time = time.time()

    #early stopping
    #validation loss matching the training objective; only read by the disabled
    #early stopping code below
    val_loss = {"mse": val_mse, "mae": val_mae, "lch": val_lch}[p.loss].item()


    #fixed-length run: the counter is never reset, so it equals the number of
    #completed epochs and training always ends after STOP_AFTER_EPOCHS epochs.
    #The model is saved only when that point is reached.
    epochs_wo_improve += 1
    if epochs_wo_improve == STOP_AFTER_EPOCHS:
        print(f"Stopping at epoch {epoch}")
        torch.save(r, WEIGHTS_PATH)
        break

    #disabled: early stopping on the validation loss with patience p.patience
    #if p.early_stopping:
    #    if val_loss < current_min_val_loss:
    #        #update value of min val loss and reset number of epochs without improvement
    #        current_min_val_loss = val_loss
    #        epochs_wo_improve = 0
    #
    #        #save model weights
    #        torch.save(r, f"/content/drive/MyDrive/Codice tesi/Regression_logs/{run_name}/regression_weights.pt")
    #        print("saved model weights")
    #
    #    else:
    #        epochs_wo_improve += 1
    #
    #    #if early stopping condition is met stop training
    #    if epochs_wo_improve == p.patience:
    #        print(f"Early stopping at epoch {epoch}")
    #        break


writer.close()

[34m[1mwandb[0m: Currently logged in as: [33m4g0[0m (use `wandb login --relogin` to force relogin)


EPOCH 0 / 300
39.22s
mse loss 0.0011333436705172062 - on test 0.0005115600652061403
mae loss 0.032831043004989624 -  on test 0.0219098050147295
log cosh loss 0.000566544127650559 - on test 0.0002557481930125505
EPOCH 1 / 300
41.21s
mse loss 0.00015285349218174815 - on test 3.445145193836652e-05
mae loss 0.009837673977017403 -  on test 0.004013993311673403
log cosh loss 7.641877164132893e-05 - on test 1.721938133414369e-05
EPOCH 2 / 300
39.82s
mse loss 4.204979632049799e-05 - on test 3.083033152506687e-05
mae loss 0.005005711689591408 -  on test 0.0034751631319522858
log cosh loss 2.10198213608237e-05 - on test 1.54077097249683e-05
EPOCH 3 / 300
39.75s
mse loss 4.0883212932385504e-05 - on test 2.7930438591283746e-05
mae loss 0.004903003573417664 -  on test 0.0031304301228374243
log cosh loss 2.0436718841665424e-05 - on test 1.3957087503513321e-05
EPOCH 4 / 300
39.93s
mse loss 3.094307248829864e-05 - on test 2.184652112191543e-05
mae loss 0.0043372781947255135 -  on test 0.00289487512782