# Note versione

In questa versione l'agente ha nello stato un parametro contenente il valore del portafoglio, e deve decidere se investire oppure no.


#Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter
from torch.distributions import Normal, Categorical
from sklearn.preprocessing import StandardScaler

import math
import copy
import random
import pickle
import pandas as pd
import numpy as np
import matplotlib
from matplotlib.pyplot import *

try:
    import wandb
except:
    !pip install wandb -qqq
    import wandb

import time
from datetime import datetime

import gym
from gym import spaces
from gym.utils import seeding

import os
from google.colab import files

# Useful functions and classes

In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU so the
# notebook can still be imported and run on machines without CUDA.
DEVICE = torch.device( "cuda" if torch.cuda.is_available() else "cpu" )
if DEVICE.type == "cuda":
    print("DEVICE: ", torch.cuda.get_device_name(DEVICE))
else:
    # get_device_name only accepts CUDA devices, so print the device itself.
    print("DEVICE: ", DEVICE)

#matplotlib.rcParams['figure.dpi'] = 200


def from_numpy( x ):
    """Wrap the NumPy array ``x`` in a ``torch.float`` tensor placed on ``DEVICE``."""
    tensor = torch.from_numpy( x )
    tensor = tensor.type( torch.float )
    return tensor.to( DEVICE )

def to_numpy( x ):
    """Return tensor ``x`` as a NumPy array, detached from autograd and moved to the CPU."""
    detached = x.detach()
    on_cpu = detached.cpu()
    return on_cpu.numpy()

DEVICE:  Tesla P100-PCIE-16GB


In [None]:
class SingleInstanceMetaClass(type):
    """
    Metaclass turning every class that uses it into a singleton.

    The first call ``Cls(*args, **kwargs)`` builds and caches the instance;
    every later call returns that same object and ignores its arguments.
    Each class (including subclasses) gets its own cache slot.
    """

    def __init__(self, name, bases, dic):
        # ``self`` is the class being created: start with an empty cache.
        self.__single_instance = None
        super().__init__(name, bases, dic)

    def __call__(cls, *args, **kwargs):
        # Identity check rather than truthiness: an instance that evaluates
        # to False (e.g. defines __len__ returning 0) must still be reused.
        if cls.__single_instance is not None:
            return cls.__single_instance
        single_obj = cls.__new__(cls)
        single_obj.__init__(*args, **kwargs)
        cls.__single_instance = single_obj
        return single_obj

In [None]:
def _ulcer_index(series):
    dd = 1. - series/series.cummax()
    ssdd = np.sum(dd**2)
    return np.sqrt(np.divide(ssdd, series.shape[0] - 1))
    if isinstance(series, pd.DataFrame):
        return series.apply(_ulcer_index)
    else:
        return _ulcer_index(series)


def get_martin_ratio(self):
    """
    Return the Martin ratio of the series passed as ``self``.

    NaN entries are dropped first. The ratio is the total return between the
    first and last remaining observations divided by the ulcer index of the
    cleaned series.
    """
    clean = self.dropna()
    first_value = clean.iloc[0]
    last_value = clean.iloc[-1]
    total_return = (last_value - first_value) / first_value
    return total_return / _ulcer_index(clean)

# Regression net

In [None]:
class NormalizationLayer( nn.Module ):
    """
    Normalization over the last dimension with a learnable scale ``w`` and
    shift ``b``, both of size ``d_model``.
    """

    def __init__( self, d_model, epsilon = 1e-6 ):
        super().__init__()
        # Added to the standard deviation to avoid dividing by zero.
        self.epsilon = epsilon
        self.w = nn.Parameter( torch.ones( d_model ) )
        self.b = nn.Parameter( torch.zeros( d_model ) )

    def forward( self, x ):
        # Statistics are computed per position, over the feature axis.
        centered = x - x.mean( dim = -1, keepdim = True )
        denominator = x.std( dim = -1, keepdim = True ) + self.epsilon
        scaled = self.w * centered
        return scaled / denominator + self.b

# as in https://timeseriestransformer.readthedocs.io/en/latest/README.html#installation the embedding layer is replaced by a generic linear layer
class EmbeddingLayer( nn.Module ):
    """Linear projection from ``in_features`` to ``out_features``, used in place of a token embedding."""

    def __init__( self, in_features, out_features ):
        super().__init__()
        self.embedding = nn.Linear( in_features = in_features, out_features = out_features )

    def forward( self, x ):
        projected = self.embedding( x )
        return projected


class Time2Vec( nn.Module ):
    """
    Time2Vec implementation

    parameters
    in_features: int
        number of features of the data
    out_features: int
        number of out features (k in the original paper)
    activation_function: function or function like
        the periodic activation function. If None, torch.sin is used
    """

    def __init__( self, in_features, out_features, activation_function = None ):
        super(Time2Vec, self).__init__()

        # i = 0: the single non-periodic (linear) component
        self.linear_transformation = nn.Linear( in_features, 1, bias = True )

        # 1 <= i <= k: the periodic components
        self.periodic_transformation = nn.Linear( in_features, out_features - 1, bias = True)

        # Always store an activation: a caller-supplied one must not be dropped,
        # otherwise forward() fails on the missing attribute.
        if activation_function is None:
            activation_function = torch.sin
        self.activation_function = activation_function

    def forward( self, x ):
        """Return the linear and periodic components concatenated on the last axis."""
        # x has shape (..., in_features)

        # linear_x has shape (..., 1)
        linear_x = self.linear_transformation( x )

        # periodic_x has shape (..., out_features - 1)
        periodic_x = self.activation_function( self.periodic_transformation(x) )

        # out has shape (..., out_features)
        out = torch.cat( [linear_x, periodic_x], dim = -1 )

        return out


class Query( nn.Module ):
    """Learned linear projection that produces the attention queries."""

    def __init__( self, in_features, out_features ):
        super().__init__()
        self.linear_layer = nn.Linear( in_features = in_features, out_features = out_features )

    def forward( self, x ):
        return self.linear_layer( x )
        
           
            
class Key( nn.Module ):
    """Learned linear projection that produces the attention keys."""

    def __init__( self, in_features, out_features ):
        super().__init__()
        self.linear_layer = nn.Linear( in_features = in_features, out_features = out_features )

    def forward( self, x ):
        return self.linear_layer( x )
    
    
            
class Value( nn.Module ):
    """Learned linear projection that produces the attention values."""

    def __init__( self, in_features, out_features ):
        super().__init__()
        self.linear_layer = nn.Linear( in_features = in_features, out_features = out_features )

    def forward( self, x ):
        return self.linear_layer( x )
    

class MultiHeadAttention( nn.Module ):
    """
    Multi-head scaled dot-product attention.

    Inputs are projected to ``d_model`` features, split into ``num_heads``
    heads of size ``d_model // num_heads``, attended independently and then
    concatenated back. No output projection is applied after the merge.
    """

    def __init__( self, in_features, d_model, num_heads ):
        super().__init__()

        # Every head must receive the same number of features.
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads

        self.query = Query( in_features, d_model )
        self.key = Key( in_features, d_model )
        self.value = Value( in_features, d_model )

    def attention( self, query, key, value ):
        """Return ``(softmax(q k^T / sqrt(depth)) v, attention weights)``."""
        scores = torch.matmul( query, key.transpose(-2, -1) ) / math.sqrt( self.depth )
        attention_weights = F.softmax( scores, dim = -1 )
        return torch.matmul( attention_weights, value ), attention_weights

    def forward( self, query, key, value ):
        # The leading axis (assets/companies in this file) is kept fixed by
        # every reshape below.
        companies = query.size(0)

        def split_heads( projected ):
            # [assets, seq_len, d_model] -> [assets, num_heads, seq_len, depth]
            per_head = projected.contiguous().view( companies, -1, self.num_heads, self.depth )
            return per_head.transpose( 2, 1 )

        heads_q = split_heads( self.query( query ) )
        heads_k = split_heads( self.key( key ) )
        heads_v = split_heads( self.value( value ) )

        # context is [assets, num_heads, seq_len_q, depth]; the weights are not returned.
        context, _ = self.attention( heads_q, heads_k, heads_v )

        # [assets, num_heads, seq_len, depth] -> [assets, seq_len, d_model]
        merged = context.transpose( 2, 1 ).contiguous()
        return merged.view( companies, -1, self.d_model )


class FeedForward( nn.Module ):
    """
    Fully connected network built as an ``nn.Sequential``.

    With ``n_layers > 1`` the stack is: an input Linear + LeakyReLU, then
    ``n_layers - 1`` blocks of Linear + LeakyReLU + Dropout of width
    ``d_layers``, then an output Linear (``n_layers + 1`` Linear layers in
    total). Otherwise it is a single Linear from ``in_features`` to
    ``out_features``.
    """

    def __init__( self, in_features, n_layers, d_layers, out_features, dropout ):
        super().__init__()

        if n_layers > 1:
            stack = [ nn.Linear( in_features, d_layers ), nn.LeakyReLU() ]
            for _ in range( n_layers - 1 ):
                stack += [ nn.Linear( d_layers, d_layers ),
                           nn.LeakyReLU(),
                           nn.Dropout( dropout ) ]
            stack.append( nn.Linear( d_layers, out_features ) )
        else:
            stack = [ nn.Linear( in_features, out_features ) ]

        self.net = nn.Sequential( *stack )

    def forward( self, x ):
        return self.net( x )


class EncoderLayer( nn.Module ):
    """
    One encoder block: self-attention followed by a feed-forward network,
    each wrapped in dropout, a residual connection and normalization.

    NOTE(review): the residual sum ``x + mha(x)`` presumably requires
    ``in_features == d_model``; the visible callers pass it that way.
    """

    def __init__( self, d_model, in_features, n_layers_ff, d_layers_ff, num_heads, dropout ):
        super().__init__()
        self.norm_layer1 = NormalizationLayer( d_model )
        self.norm_layer2 = NormalizationLayer( d_model )
        self.dropout_layer1 = nn.Dropout( dropout )
        self.dropout_layer2 = nn.Dropout( dropout )
        self.mha = MultiHeadAttention( in_features, d_model, num_heads )
        # The feed-forward network maps d_model -> d_model so it can be added back to x.
        self.ffnn = FeedForward( d_model, n_layers_ff, d_layers_ff, d_model, dropout )

    def forward( self, x ):
        # Self-attention sub-layer.
        attended = self.dropout_layer1( self.mha( x, x, x ) )
        x = self.norm_layer1( x + attended )
        # Position-wise feed-forward sub-layer.
        transformed = self.dropout_layer2( self.ffnn( x ) )
        return self.norm_layer2( x + transformed )


class DecoderLayer( nn.Module ):
    """
    One decoder block: self-attention, attention over the encoder output and
    a feed-forward network, each wrapped in dropout, a residual connection
    and normalization. No attention mask is applied.
    """

    def __init__( self, d_model, in_features, n_layers_ff, d_layers_ff, num_heads, dropout  ):
        super().__init__()
        self.norm_layer1 = NormalizationLayer( d_model )
        self.norm_layer2 = NormalizationLayer( d_model )
        self.norm_layer3 = NormalizationLayer( d_model )
        self.dropout_layer1 = nn.Dropout( dropout )
        self.dropout_layer2 = nn.Dropout( dropout )
        self.dropout_layer3 = nn.Dropout( dropout )
        # mha1 attends over the decoder input, mha2 over the encoder output.
        self.mha1 = MultiHeadAttention( in_features, d_model, num_heads )
        self.mha2 = MultiHeadAttention( in_features, d_model, num_heads )
        self.ffnn = FeedForward( d_model, n_layers_ff, d_layers_ff, d_model, dropout )

    def forward( self, x, encoder_output ):
        # Self-attention sub-layer.
        self_attended = self.dropout_layer1( self.mha1( x, x, x ) )
        x = self.norm_layer1( x + self_attended )
        # Encoder-decoder attention: queries from the decoder, keys/values from the encoder.
        cross_attended = self.dropout_layer2( self.mha2( x, encoder_output, encoder_output ) )
        x = self.norm_layer2( x + cross_attended )
        # Feed-forward sub-layer.
        transformed = self.dropout_layer3( self.ffnn( x ) )
        return self.norm_layer3( x + transformed )



class Encoder( nn.Module ):
    """
    Encoder stack: Time2Vec features are appended to the input, the result is
    projected to ``d_model`` and passed through ``num_layers`` EncoderLayer
    blocks. ``sequence_length`` is accepted but not used here.
    """

    def __init__( self,
                 d_model,
                 num_layers,
                 num_heads,
                 t2v_units,
                 sequence_length,
                 num_features,
                 num_ff_layers,
                 dim_ff_layers,
                 dropout
                ):
        super().__init__()

        self.t2v_layer = Time2Vec( in_features = num_features, out_features = t2v_units )
        # The embedding sees the raw features plus the Time2Vec features.
        self.embedding_layer = EmbeddingLayer( num_features + t2v_units, d_model )
        self.encoder_layers = self.get_layers( num_layers, d_model, num_ff_layers,
                                               dim_ff_layers, num_heads, dropout )

    def get_layers( self, num_layers, d_model, num_ff_layers, dim_ff_layers, num_heads, dropout ):
        """Build the ``nn.ModuleList`` of ``num_layers`` identical-shaped encoder blocks."""
        layer_kwargs = dict( d_model = d_model,
                             in_features = d_model,
                             n_layers_ff = num_ff_layers,
                             d_layers_ff = dim_ff_layers,
                             num_heads = num_heads,
                             dropout = dropout )
        blocks = []
        for _ in range( num_layers ):
            blocks.append( EncoderLayer( **layer_kwargs ) )
        return nn.ModuleList( blocks )

    def forward( self, x ):
        # x: [companies, sequence_length, features]
        # time_features: [companies, sequence_length, t2v_units]
        time_features = self.t2v_layer( x )

        # concatenation: [companies, sequence_length, features + t2v_units]
        # after embedding: [companies, sequence_length, d_model]
        hidden = self.embedding_layer( torch.cat( [ x, time_features ], dim = -1 ) )

        for block in self.encoder_layers:
            hidden = block( hidden )

        return hidden


class Decoder( nn.Module ):
    """
    Decoder stack: Time2Vec features are appended to the input, the result is
    projected to ``d_model`` and passed through ``num_layers`` DecoderLayer
    blocks that also attend over the encoder output. ``sequence_length`` is
    accepted but not used here.
    """

    def __init__( self,
                  d_model,
                  num_layers,
                  num_heads,
                  t2v_units,
                  sequence_length,
                  num_features,
                  num_ff_layers,
                  dim_ff_layers,
                  dropout ):
        super().__init__()
        self.t2v_layer = Time2Vec( in_features = num_features, out_features = t2v_units )
        # The embedding sees the raw features plus the Time2Vec features.
        self.embedding_layer = EmbeddingLayer( num_features + t2v_units, d_model )
        self.decoder_layers = self.get_layers( num_layers, d_model, num_ff_layers,
                                               dim_ff_layers, num_heads, dropout )

    def get_layers( self, num_layers, d_model, num_ff_layers, dim_ff_layers, num_heads, dropout ):
        """Build the ``nn.ModuleList`` of ``num_layers`` identical-shaped decoder blocks."""
        layer_kwargs = dict( d_model = d_model,
                             in_features = d_model,
                             n_layers_ff = num_ff_layers,
                             d_layers_ff = dim_ff_layers,
                             num_heads = num_heads,
                             dropout = dropout )
        blocks = []
        for _ in range( num_layers ):
            blocks.append( DecoderLayer( **layer_kwargs ) )
        return nn.ModuleList( blocks )

    def forward( self, x, encoder_output ):
        # x: [companies, sequence_length, features]
        # encoder_output: [companies, encoder_sequence_length, d_model]

        # time_features: [companies, sequence_length, t2v_units]
        time_features = self.t2v_layer( x )

        # concatenation: [companies, sequence_length, features + t2v_units]
        # after embedding: [companies, sequence_length, d_model]
        hidden = self.embedding_layer( torch.cat( [ x, time_features ], dim = -1 ) )

        for block in self.decoder_layers:
            hidden = block( hidden, encoder_output )

        return hidden



class Transformer( nn.Module ):
    """
    Encoder-decoder transformer over per-company time series.

    The encoder reads the whole input window; the decoder reads the last
    ``decoder_sequence_length`` steps of the same window and attends over the
    encoder output. A feed-forward network then collapses the decoder's time
    axis to a single value per model feature.

    The encoder and decoder share ``num_layers``, ``num_heads``, ``t2v_units``,
    ``num_features`` and the feed-forward sizes; only the sequence lengths
    differ.
    """

    def __init__( self, 
                 dim_transformer, 
                 encoder_sequence_length, 
                 decoder_sequence_length,
                 num_layers, 
                 num_heads,
                 t2v_units,
                 num_features, 
                 num_ff_layers, 
                 dim_ff_layers,
                 dropout
                ):
        super ( Transformer, self ).__init__()
        self.encoder = Encoder( d_model = dim_transformer, 
                                num_layers = num_layers,
                                num_heads = num_heads,
                                t2v_units = t2v_units,
                                sequence_length = encoder_sequence_length,
                                num_features = num_features,
                                num_ff_layers = num_ff_layers,
                                dim_ff_layers = dim_ff_layers,
                                dropout = dropout
                               )
        self.decoder = Decoder( d_model = dim_transformer, 
                                num_layers = num_layers,
                                num_heads = num_heads,
                                t2v_units = t2v_units,
                                sequence_length = decoder_sequence_length,
                                num_features = num_features,
                                num_ff_layers = num_ff_layers,
                                dim_ff_layers = dim_ff_layers,
                                dropout = dropout
                               ) 

        # Maps the decoder's time axis (length decoder_sequence_length) to one value.
        self.decoder_sequence_ffnn = FeedForward( in_features = decoder_sequence_length, n_layers = num_ff_layers, d_layers = dim_ff_layers, out_features = 1, dropout = dropout )

        self.decoder_sequence_length = decoder_sequence_length

    def forward( self, x ):
        """
        x: [encoder_sequence_length, companies, features]
        returns: [companies, dim_transformer]
        """

        # [encoder_sequence_length, companies, features] -> [companies, encoder_sequence_length, features]
        x = x.transpose(1,0) 

        # encoder input: [companies, encoder_sequence_length, features]
        xe = x

        # decoder input is the tail of the window: [companies, decoder_sequence_length, features]
        xd = x[:,-self.decoder_sequence_length:]

        # [companies, encoder_sequence_length, d_model]
        encoder_output = self.encoder(xe)

        # [companies, decoder_sequence_length, d_model]
        decoder_output = self.decoder(xd, encoder_output)

        # [companies, decoder_sequence_length, d_model] -> [companies, d_model, 1] -> [companies, d_model]
        # Only the trailing singleton axis is removed: a bare squeeze() would
        # also drop the companies (or d_model) axis whenever its size is 1.
        output = self.decoder_sequence_ffnn( decoder_output.transpose(-2,-1) ).squeeze(-1)

        return output


class RegressionTransformer( nn.Module ):
    """
    Transformer followed by a feed-forward head with a single output unit.

    ``params`` is any object exposing the hyper-parameters read below as
    attributes.
    """

    def __init__( self, params ):
        super().__init__()

        self.transformer = Transformer( dim_transformer = params.dim_transformer,
                                        encoder_sequence_length = params.encoder_sequence_length,
                                        decoder_sequence_length = params.decoder_sequence_length,
                                        num_layers = params.num_layers,
                                        num_heads = params.num_heads,
                                        t2v_units = params.t2v_units,
                                        num_features = params.num_features,
                                        num_ff_layers = params.num_ff_layers,
                                        dim_ff_layers = params.dim_ff_layers,
                                        dropout = params.dropout )

        # Regression head: dim_transformer features in, one value out.
        self.ffnn = FeedForward( in_features = params.dim_transformer,
                                 n_layers = params.regression_ff_layers,
                                 d_layers = params.dim_regression_ff_layers,
                                 out_features = 1,
                                 dropout = params.dropout )

    def forward( self, x):
        encoded = self.transformer( x )
        return self.ffnn( encoded )

# Dataloader and Dataset

In [None]:
class Dataloader( metaclass=SingleInstanceMetaClass ):
    """
    Loads returns and prices from a previously built dataset: a DataFrame
    saved in pickle format whose columns are a two-level index
    (level 0 = asset, level 1 = feature).
    """

    def __init__( self, file_path, moving_average ):
        """
        file path: 
            the path of the Dataframe in pickle format
        moving average:
            moving average window size applied to data
        """
        # smoothed dataframe
        self.data_df = self.load_df( file_path, moving_average )

        level_values = self.data_df.columns.get_level_values

        # unique asset labels (first column level)
        self.assets = level_values(0).unique()

        # unique feature labels (second column level)
        self.features = level_values(1).unique()

    def load_df( self, file_path: str, moving_average: int ) -> pd.DataFrame :
        """
        file path: 
            the path of the Dataframe in pickle format
        moving average:
            moving average window size applied to data

        returns:
            the dataframe read from file_path, smoothed with a rolling mean
            and with the rows containing NaN removed
        """
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        raw_df = pd.read_pickle( file_path )
        smoothed_df = raw_df.rolling( moving_average ).mean()
        return smoothed_df.dropna()

    def _stack_assets( self, frame ):
        """Stack the per-asset feature tables of ``frame`` along a new axis 1."""
        per_asset = [ frame[asset][self.features].values for asset in self.assets ]
        # result is [ days, assets, features ]
        return np.stack( per_asset, axis = 1 )

    def load_prices( self, ) -> np.ndarray:
        """
        returns numpy array of shape (number of days, number of assets, number of features)
        containing the smoothed values stored in the dataframe
        """
        return self._stack_assets( self.data_df )

    def load_returns( self, ):
        """
        returns numpy array of shape (number of days, number of assets, number of features)
        containing percentage changes (missing values replaced by 0); the
        intermediate dataframe is kept in self.returns_df
        """
        self.returns_df = self.data_df.pct_change().fillna(0)
        return self._stack_assets( self.returns_df )

class Dataset( metaclass=SingleInstanceMetaClass ):
    """
    Holds the returns/prices arrays, the train/validation/test index split and
    the standardized returns, and serves index windows and (input, target)
    sequences for training and testing.
    """

    def __init__( self, params ):
        """
        params: object exposing file_path, moving_average, test_portion,
            val_portion, feature_clip, vol_clip, episode_length and
            encoder_sequence_length as attributes.
        """
        # dataloader instance 
        self.loader = Dataloader( params.file_path, params.moving_average )

        # we store the data for env accessibility
        self.returns = self.loader.load_returns() # replaced by clipped and standardized returns in scale_data
        self.true_returns = self.loader.load_returns() # returns are pct variations 
        self.prices = self.loader.load_prices()

        # split the indices for training, validation and testing
        self.split_indices( params.test_portion, params.val_portion )

        # clipping and normalizing the data
        self.scale_data( params.feature_clip, params.vol_clip)

        # other useful parameters
        self.episode_length = params.episode_length
        self.encoder_sequence_length = params.encoder_sequence_length

    def split_indices( self, test_portion, val_portion ):
        """
        test_portion: float
            test portion of the dataset
        val_portion: float
            validation portion of the dataset

        split dataset indices in self.train_indices, self.val_indices, self.test_indices : np.ndarray 
        in accord with the portions. validation portion and test portion are at the end of dataset 
        i.e. closer to present. self.train_val_indices spans training plus validation.
        """

        num_periods = self.returns.shape[0]
        start_train_set_index = 0
        start_val_set_index = int( num_periods *( 1 - (test_portion + val_portion) ) )
        # The test set is the final test_portion of the data. (Using
        # val_portion here swapped the sizes of the two splits.)
        start_test_set_index = int( num_periods * ( 1 - test_portion ) )

        self.train_indices = np.arange(start_train_set_index, start_val_set_index)
        self.val_indices = np.arange(start_val_set_index, start_test_set_index)
        self.train_val_indices = np.arange(start_train_set_index, start_test_set_index)
        self.test_indices = np.arange(start_test_set_index, num_periods)

    def scale_data( self, feature_clip = .02, vol_clip = .8):
        """
        feature_clip: float
            clipping value for every feature except the last one
        vol_clip: float
            clipping value for the last feature (volume, per the original notes)

        Clip the returns and, for each asset, standardize the features with one
        StandardScaler and the last feature with another. Scalers are fitted on
        the training + validation rows and then applied to the test rows.
        Standardized data is stored in self.returns
        Non standardized data is in self.true_returns  
        """

        feature_returns = self.returns[:,:,:-1]
        volumes_returns = self.returns[:,:,-1:]

        # np.clip returns new arrays, so the loaded data is not modified in place
        features_std = np.clip( feature_returns, - feature_clip , feature_clip )
        volumes_std = np.clip( volumes_returns, - vol_clip, vol_clip )

        fit_rows = self.train_val_indices
        test_rows = self.test_indices

        for i in range(features_std.shape[1]):
            feature_scaler = StandardScaler()
            volume_scaler = StandardScaler()

            # fit on training + validation data and scale it
            features_std[fit_rows, i, :] = feature_scaler.fit_transform( features_std[fit_rows, i, :] )
            volumes_std[fit_rows, i, :] = volume_scaler.fit_transform( volumes_std[fit_rows, i, :] )

            # scale test data with the statistics fitted above; an empty test
            # split is skipped because the scaler rejects zero-row input
            if test_rows.size:
                features_std[test_rows, i, :] = feature_scaler.transform( features_std[test_rows, i, :] )
                volumes_std[test_rows, i, :] = volume_scaler.transform( volumes_std[test_rows, i, :] )

        self.returns = np.concatenate([features_std, volumes_std], axis = -1)

    def _encoder_windows( self, end_indices ):
        """Return one row per index t holding t - encoder_sequence_length + 1, ..., t."""
        offsets = np.arange( 1 - self.encoder_sequence_length, 1 )
        return np.asarray( end_indices )[:, None] + offsets

    def _sample_episode_indices( self, ):
        """Draw a random start in the train+validation range and return episode_length consecutive indices."""
        # leave room for a full encoder window before the start and a full
        # episode (plus its one-step-ahead targets) after it
        candidates = self.train_val_indices[ self.encoder_sequence_length : - self.episode_length ]
        starting_index = np.random.choice( candidates )
        return np.arange( starting_index, starting_index + self.episode_length )

    def load_sequence_indices( self, ):
        """
        return sequence_indices_encoder, reward_returns_indices
        sequence_indices_encoder are used for selecting transformer input from self.returns or self.true_returns in the training phase
        reward_returns_indices are used for reward or target 
        """
        # This method is used by the RL part, so that load_sequence can be
        # changed independently (e.g. for multi-step forecasting or other
        # changes to the transformer regression).
        sequence_indices = self._sample_episode_indices()

        # window ending at (and including) each step of the episode
        sequence_indices_encoder = self._encoder_windows( sequence_indices )

        # the reward/target is the step right after each window
        reward_returns_indices = sequence_indices + 1

        return sequence_indices_encoder, reward_returns_indices

    def load_test_indices( self, ):
        """
        return sequence_indices_transformer, reward_returns_indices, indices
        sequence_indices_transformer are used for selecting transformer input from self.returns or self.true_returns in the test phase
        reward_returns_indices are used for reward or target 
        indices are the test indices the windows were built from (one more entry than the other two outputs)
        """
        # This method is used by the RL part, so that load_sequence can be
        # changed independently (e.g. for multi-step forecasting or other
        # changes to the transformer regression).
        indices = self.test_indices[ self.encoder_sequence_length : ]

        # the last window has no following step to use as target, so it is dropped
        sequence_indices_transformer = self._encoder_windows( indices )[:-1]
        reward_returns_indices = ( indices + 1 )[:-1]

        return sequence_indices_transformer, reward_returns_indices, indices

    def load_sequence_and_targets( self, ):
        """
        return transformer_input_sequence, regression_target_sequence
        transformer_input_sequence is the standardized data to be used as transformer input
        regression_target_sequence is the second-to-last feature one step after each window
        (closing returns, per the original notes), target for regression
        """
        sequence_indices = self._sample_episode_indices()
        sequence_indices_transformer = self._encoder_windows( sequence_indices )
        target_returns_indices = sequence_indices + 1

        transformer_input_sequence = self.returns[ sequence_indices_transformer ]

        # since the prediction is passed as input for actor and critic, standardized returns are target
        regression_target_sequence = self.returns[ target_returns_indices, :, -2 ]

        return transformer_input_sequence, regression_target_sequence

    def load_test_sequence_and_targets( self, ):
        """
        return transformer_input_sequence, regression_target_sequence
        transformer_input_sequence is the standardized test data to be used as transformer input
        regression_target_sequence is the second-to-last feature one step after each window
        (closing returns, per the original notes), target for regression
        """
        indices = self.test_indices[ self.encoder_sequence_length : ]

        # the last window has no following step to use as target, so it is dropped
        sequence_indices_transformer = self._encoder_windows( indices )[:-1]
        target_returns_indices = ( indices + 1 )[:-1]

        transformer_input_sequence = self.returns[ sequence_indices_transformer ]

        regression_target_sequence = self.returns[ target_returns_indices, :, -2 ]

        return transformer_input_sequence, regression_target_sequence

# Environment

In [None]:
class Sequence():
    """
    Per-episode bookkeeping: builds observations, computes rewards and tracks
    the portfolio composition and the portfolio value.

    parameters
    num_assets: int
        length of the portfolio composition vector
    sequence_len: int
        number of entries in the sequences passed to reset()
    composition_difference_coef: float
        weight of the composition-change term in the reward
    risk_coef: float
        weight of the standard-deviation term in the reward
    """

    def __init__(self, num_assets, sequence_len, composition_difference_coef, risk_coef):
        self.num_assets = num_assets
        self.sequence_len = sequence_len
        self.composition_difference_coef = composition_difference_coef
        self.risk_coef = risk_coef

    def encode( self, ):
        """
        Return the current observation: prediction at the current index,
        followed by the portfolio composition and the portfolio value.
        """
        observation_parts = [
            self.predicted_sequence[ self._idx ],
            self.portfolio_composition,
            self.portfolio_value,
        ]
        return np.concatenate( observation_parts )

    def reset_portfolio_composition( self, ):
        """Return an equally weighted composition over num_assets entries."""
        equal_weights = np.ones((self.num_assets,))
        return equal_weights / (self.num_assets)

    def reset_portfolio_value( self, ):
        """Return the starting portfolio value, a one-element array holding 1."""
        return np.ones(1)

    def reset( self, predicted_sequence, closing_reward_returns ):
        """
        Start a new episode and return its first observation.

        parameters
        predicted_sequence: np.ndarray
            per-step predictions, indexed by the step counter in encode()
        closing_reward_returns: np.ndarray
            per-step returns, indexed by the step counter in reward()
        """
        # data driving the episode
        self.predicted_sequence = predicted_sequence
        self.closing_reward_returns = closing_reward_returns

        # episode state: position in the sequence, holdings, value, termination flag
        self._idx = 0
        self.done = False
        self.portfolio_composition = self.reset_portfolio_composition()
        self.portfolio_value = self.reset_portfolio_value()

        return self.encode()

    def step( self, action ):
        """
        Apply an action and advance by one position in the sequence.

        The caller is expected to pass to_act * action, so an all-zero vector
        means "keep the current composition" (see reward()).

        returns
        (obs, reward, done, info) where info holds the three reward components.
        """
        outcome = self.reward( action )
        new_composition, reward, portfolio_returns, pc_difference_sum, portfolio_std = outcome

        self._idx += 1
        self.portfolio_composition = new_composition

        # NOTE(review): the raw composition change is subtracted here without
        # composition_difference_coef, unlike in reward() -- confirm this is intended
        growth = 1+portfolio_returns
        self.portfolio_value = self.portfolio_value * growth - pc_difference_sum

        # the episode ends as soon as the index reaches the last sequence entry,
        # so encode() below always reads a valid position
        self.done = ( self._idx + 1 == self.sequence_len )

        info = {
            "portfolio returns": portfolio_returns,
            "portfolio difference sum": pc_difference_sum,
            "portfolio returns std": portfolio_std,
        }

        return self.encode(), reward, self.done, info

    def reward( self, action ):
        """
        Compute the reward of an action without changing the episode state.

        An action whose entries sum to zero keeps the current composition;
        any other action is turned into a composition through a softmax.

        returns
        (next_portfolio_composition, reward, portfolio_returns,
         pc_difference_sum, portfolio_std)
        """
        if action.sum() == 0:
            # do-nothing case: holdings are carried over unchanged
            next_portfolio_composition = self.portfolio_composition
        else:
            # act case: raw action values become weights that sum to one
            next_portfolio_composition = to_numpy( F.softmax(from_numpy(action), dim = -1) )

        weighted_returns = next_portfolio_composition * self.closing_reward_returns[self._idx]
        portfolio_returns = weighted_returns.sum()
        portfolio_std = weighted_returns.std()

        # L1 distance between new and current composition; zero when nothing changes
        pc_difference_sum = np.absolute(next_portfolio_composition - self.portfolio_composition).sum()

        reward = portfolio_returns - self.composition_difference_coef * pc_difference_sum - self.risk_coef * portfolio_std

        return next_portfolio_composition, reward, portfolio_returns, pc_difference_sum, portfolio_std



class CustomEnv( gym.Env ):
    """
    Gym environment for portfolio allocation.

    Observations are the output of a pre-trained regression network
    concatenated with the portfolio composition and the portfolio value;
    the episode logic itself is delegated to a Sequence object.
    """

    #required for gym.Env compatibility
    metadata = {'render.modes': ['human']}

    def __init__(self,):
        super().__init__()

        # shared hyperparameters
        self.params = Parameters()

        # the regression net predicts one value per tradable asset; the portfolio
        # has one extra slot whose return is always zero (see reset)
        self.prediction_shape = self.params.num_assets
        self.num_assets = self.prediction_shape + 1

        self.dataset = Dataset( self.params )

        # returns used for the reward, returns used as network input, closing prices
        self._true_returns = self.dataset.true_returns
        self._returns = self.dataset.returns
        self._closing_prices = self.dataset.prices[:,:,-2]

        # produces observations, computes rewards, tracks the portfolio composition
        self._sequence = Sequence( num_assets = self.num_assets,
                                   sequence_len = self.params.episode_length,
                                   composition_difference_coef = self.params.composition_difference_coef,
                                   risk_coef = self.params.risk_coef )

        self.regression_net = self.load_regression_net()

        self.action_space = spaces.Box( low = 0,
                                        high = 1.,
                                        shape = (self.num_assets,),
                                        dtype = np.float32 )

        # prediction + portfolio composition + portfolio value
        self.observation_space = spaces.Box( low = -np.inf,
                                             high = np.inf,
                                             shape = (self.prediction_shape + self.num_assets + 1,),
                                             dtype = np.float32 )

    def load_regression_net( self, trained_regression_model_path = "/content/drive/MyDrive/0_Codice tesi/RUN_DEF/pesi/regression_weights.pt" ):
        """Load and return the object stored with torch at the given path."""
        return torch.load( trained_regression_model_path )

    def _predict_sequence( self, transformer_in_sequence ):
        """
        Run the regression net on every element of the input sequence, without
        gradients, and stack the flattened outputs into one numpy array.
        """
        with torch.no_grad():
            predictions = [ to_numpy( self.regression_net( window ).flatten() )
                            for window in transformer_in_sequence ]
        return np.array( predictions )

    def step( self, action ):
        """Advance the training sequence; returns (next_obs, reward, done, info)."""
        return self._sequence.step( action )

    def reset( self, ):
        """Sample a new training episode from the dataset and return its first observation."""
        window_indices, reward_returns_indices = self.dataset.load_sequence_indices()

        network_inputs = from_numpy( self._returns[ window_indices ] )
        predicted_sequence = self._predict_sequence( network_inputs )

        # a column of zeros is prepended to the asset returns: the extra portfolio slot
        zero_column = np.zeros( (self.params.episode_length, 1) )
        asset_returns = self._true_returns[ reward_returns_indices, :, -2 ]
        closing_reward_returns = np.concatenate( [zero_column, asset_returns], axis = -1 )

        return self._sequence.reset( predicted_sequence, closing_reward_returns )

    def render( self, ):
        """Rendering is not implemented."""

    def close( self, ):
        """Nothing to release."""

    def test_reset( self, ):
        """
        Start an evaluation episode on the indices given by dataset.load_test_indices().

        NOTE(review): the original comments call this a validation run while the
        indices come from load_test_indices -- confirm which split is meant.

        returns
        (obs, closing_reward_returns, closing_prices, true_sequence, predicted_sequence)
        """
        window_indices, reward_returns_indices, indices_for_closing_prices = self.dataset.load_test_indices()

        network_inputs = from_numpy( self._returns[ window_indices ] )
        predicted_sequence = self._predict_sequence( network_inputs )
        n_steps = predicted_sequence.shape[0]

        # sequence to compare against the predicted one in plots
        true_sequence = self._returns[ reward_returns_indices, :, -2 ]

        # returns used for the reward, with the zero column for the extra slot
        zero_column = np.zeros( (n_steps, 1) )
        asset_returns = self._true_returns[ reward_returns_indices, :, -2 ]
        closing_reward_returns = np.concatenate( [zero_column, asset_returns], axis = -1 )

        # the evaluation run has its own length, hence its own Sequence object
        self._test_sequence = Sequence( num_assets = self.num_assets,
                                        sequence_len = n_steps,
                                        composition_difference_coef = self.params.composition_difference_coef,
                                        risk_coef = self.params.risk_coef )
        obs = self._test_sequence.reset( predicted_sequence, closing_reward_returns )

        # closing prices, used by the caller for the martin ratio
        closing_prices = self._closing_prices[ indices_for_closing_prices ]

        return obs, closing_reward_returns, closing_prices, true_sequence, predicted_sequence

    def test_step( self, action ):
        """Advance the evaluation sequence; returns (next_obs, reward, done, info)."""
        return self._test_sequence.step( action )

    def test( self, ):
        """Placeholder for a run on the test set; not implemented."""

In [None]:
def make_env():
    """
    Return a zero-argument factory that builds a CustomEnv wrapped in
    gym.wrappers.RecordEpisodeStatistics.

    Nothing is created until the returned callable is invoked, which is the
    form the vector environment constructor is given in this file.
    """

    def _build_wrapped_env( ):
        return gym.wrappers.RecordEpisodeStatistics( CustomEnv() )

    return _build_wrapped_env

# Actor Critic

In [None]:
def layer_init( layer, std = np.sqrt(2), bias_const = 0.0 ):
    """
    PPO-style initialization of a linear layer.

    nn.Linear layers get orthogonal weights scaled by std and a constant
    bias; any other layer is returned untouched.

    parameters

    layer: nn.Module
        the layer to initialize (modified in place)

    std: float or float-like
        gain of the orthogonal initialization, default np.sqrt(2)
        (0.01 for the actor output layer, 1. for the critic output layer)

    bias_const: float
        value written into every bias entry, default 0

    returns
    the same layer object that was passed in
    """

    if not isinstance( layer, nn.Linear ):
        return layer

    nn.init.orthogonal_( layer.weight, gain = std )
    nn.init.constant_( layer.bias, val = bias_const )
    return layer

class Critic( nn.Module ):
    """
    Value network of the Actor-Critic: maps an observation to a single scalar.

    parameters
    input_shape: int
        size of a single observation of the vector environment
        can be obtained as np.array(vec_env.single_observation_space.shape).prod()
    n_layers: int
        with n_layers > 1 the net is an input layer, n_layers - 1 hidden layers
        (all followed by Tanh) and a linear output; otherwise a single linear layer
    d_layers: int
        width of the hidden layers
    """

    def __init__( self, input_shape, n_layers, d_layers ):
        super().__init__()

        if n_layers > 1:
            # hidden layers keep the default gain sqrt(2)
            stack = [ layer_init( nn.Linear( input_shape, d_layers ) ), nn.Tanh() ]
            for _ in range( n_layers - 1 ):
                stack += [ layer_init( nn.Linear( d_layers, d_layers ) ), nn.Tanh() ]
            # output layer uses gain 1.
            stack.append( layer_init( nn.Linear( d_layers, 1 ), std = 1. ) )
        else:
            stack = [ layer_init( nn.Linear( input_shape, 1 ), std = 1. ) ]

        self.net = nn.Sequential( *stack )

    def forward( self, x):
        """Return the value estimate for x."""
        return self.net( x )




class Actor( nn.Module ):
    """
    Policy network of the Actor-Critic, made of two independent MLPs with
    the same hidden architecture and separate weights:

    pc_net  outputs action_number values (the portfolio composition action)
    act_net outputs 2 values (logits of the act / do-not-act decision)

    parameters
    input_shape: int
        size of a single observation of the vector environment
        can be obtained as np.array(vec_env.single_observation_space.shape).prod()

    action_number: int
        number of actions in the single environment of the vector environment
        can be obtained as np.prod(vec_env.single_action_space.shape)

    n_layers: int
        with n_layers > 1 each net is an input layer, n_layers - 1 hidden layers
        (all followed by Tanh) and a linear output; otherwise a single linear layer

    d_layers: int
        width of the hidden layers
    """

    def __init__( self, input_shape, action_number, n_layers, d_layers ):
        super().__init__()

        # built in this order so parameter initialization order is pc_net, then act_net
        self.pc_net = self._build_mlp( input_shape, action_number, n_layers, d_layers )
        self.act_net = self._build_mlp( input_shape, 2, n_layers, d_layers )

    @staticmethod
    def _build_mlp( input_shape, output_size, n_layers, d_layers ):
        """Build one Tanh MLP whose output layer is initialized with gain .01."""
        if n_layers > 1:
            # hidden layers keep the default gain sqrt(2)
            stack = [ layer_init( nn.Linear( input_shape, d_layers ) ), nn.Tanh() ]
            for _ in range( n_layers - 1 ):
                stack += [ layer_init( nn.Linear( d_layers, d_layers ) ), nn.Tanh() ]
            stack.append( layer_init( nn.Linear( d_layers, output_size ), std = .01 ) )
        else:
            stack = [ layer_init( nn.Linear( input_shape, output_size ), std = .01 ) ]

        return nn.Sequential( *stack )

    def forward( self, x):
        """Return (pc, to_act): the outputs of pc_net and act_net for x."""
        return self.pc_net( x ), self.act_net( x )



class Agent( nn.Module ):

    """
    The PPO agent: a critic, a two-headed actor and a learnable,
    state-independent log standard deviation for the continuous action.

    parameters
    vec_env: gym.vector.SyncVectorEnv
        the vectorized environment in use
    n_layers: int
        forwarded to Actor and Critic
    d_layers: int
        forwarded to Actor and Critic
    """

    def __init__( self, vec_env, n_layers, d_layers ):
        super().__init__()

        obs_size = np.array(vec_env.single_observation_space.shape).prod()
        self.critic = Critic( obs_size, n_layers, d_layers )

        action_size = np.array(vec_env.single_action_space.shape).prod()
        self.actor = Actor( obs_size, action_size, n_layers, d_layers )

        # one log-std per action component, shared by all observations
        self.actor_logstd = nn.Parameter( torch.zeros(1, np.prod(vec_env.single_action_space.shape)) )

    def get_value( self, x ):
        """
        Critic inference.

        x is a batch of observations, presumably
        (number of envs, features in single vec observation);
        the result has one value per row.
        """
        return self.critic( x )

    def get_action_and_value( self, x, action = None, to_act = None ):
        """
        Evaluate the policy and the critic on x.

        When both action and to_act are None (rollout phase) they are sampled
        from the current policy; otherwise the given ones are scored.

        returns
        action, to_act, logprobs, to_act_logprobs, entropy, to_act_entropy, value
        where log-probabilities and entropy of the continuous action are
        summed over dimension 1.
        """
        action_mean, to_act_logits = self.actor( x )

        # Normal over the continuous action, Categorical over the act / do-not-act choice
        action_std = self.actor_logstd.expand_as( action_mean ).exp()
        pc_distribution = Normal( action_mean, action_std )
        to_act_distribution = Categorical( logits = to_act_logits )

        if action is None and to_act is None:
            action = pc_distribution.sample()
            to_act = to_act_distribution.sample()

        return (
            action,
            to_act,
            pc_distribution.log_prob( action ).sum(1),
            to_act_distribution.log_prob( to_act ),
            pc_distribution.entropy().sum(1),
            to_act_distribution.entropy(),
            self.critic( x ),
        )

In [None]:
##

#x , x2

#-> rete_on_off( x, x2) #add this try and select adequate layer


#[10,90] -> [.1,.9]

# Parameters

In [None]:
class Parameters(metaclass=SingleInstanceMetaClass):
    """
    Container of all data, model and PPO hyperparameters.

    Built on SingleInstanceMetaClass, so every call to Parameters() after
    the first one returns the same instance.
    """

    def __init__(self,):

        # NOTE(review): the right-hand side of this assignment is missing in this
        # copy of the file (presumably the path of the data file); it must be
        # filled in before the class can be used
        self.file_path = 
        self.etfs = ['XLB', 'XLC', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLRE', 'XLU', 'XLV', 'XLY']
        self.num_assets = len(self.etfs) #number of assets
        self.moving_average = 10 # moving average smoothing to be applied to data
        self.val_portion = .1 #validation portion in dataset
        self.test_portion = .1 #test portion in dataset

        self.feature_clip = 0.02
        self.vol_clip = 0.8

        self.encoder_sequence_length = 60 # sequence length for encoder input
        self.decoder_sequence_length = 20 # sequence length for decoder input
        self.dim_transformer = 64  #transformer model dimension

        self.num_features = 5 #OHLCV (presumably open, high, low, close, volume -- TODO confirm)

        #PPO

        self.seed = 1 # seed for reproducibility

        self.num_envs = 8 #number of parallel environments
        self.episode_length = 200 #number of steps in each environment per policy rollout

        self.num_ac_layers = 3 # number of feedforward layers in actor and critic
        self.dim_ac_layers = 128 #dimension of feedforward layers in actor and critic

        self.composition_difference_coef = 0.0025 #(0.25%) #coefficient for reward calculation
        self.risk_coef = 1. #coefficient for reward calculation

        self.total_timesteps =  5000000 #total number of environment steps
        self.num_steps = self.episode_length #steps in rollout

        self.batch_size = self.num_envs * self.num_steps
        self.num_updates = self.total_timesteps // self.batch_size #number of updates

        self.anneal_lr = True

        # when True a single Adam optimizer with the three values below is used
        # for the whole agent; otherwise the actor_* / critic_* values are used
        self.use_single_lr = True
        self.learning_rate = 1e-4
        self.beta1 = .9
        self.beta2 = .9
        
        self.actor_learning_rate = 1e-4
        self.actor_beta1 = .9 # beta1 parameter in adam optimizer
        self.actor_beta2 = .99 # beta2 parameter in adam optimizer
        self.critic_learning_rate = 5e-4
        self.critic_beta1 = .9 # beta1 parameter in adam optimizer
        self.critic_beta2 = .99 # beta2 parameter in adam optimizer
       
        self.adam_eps = 1e-05

        self.gae = True #use GAE for advantage calculations
        self.gamma = .99
        self.gae_lambda = .95

        self.num_minibatches = 16
        self.minibatch_size = self.batch_size // self.num_minibatches

        self.num_update_epochs = 10 #number of times the policy is updated

        self.adv_normalization = True # normalize advantages

        self.clip_coef = .2 #log probabilities ratio clip value
        self.anneal_clip_coef = False #linear decrease of clip coef from init value to 0, but decreases performances
        self.clip_values = True # clip value function loss

        self.ent_loss_coef = 1e-6
        self.v_loss_coef = 0.5

        self.max_grad_norm = .5 #global maximum gradient clipping

        self.target_kl = None # 0.015 is the default in OpenAI Spinning Up (per the original note); otherwise None



# Main

In [None]:
# Training setup: parameters, experiment tracking, seeding, vectorized
# environment, agent, optimizer(s) and rollout storage.
p = Parameters()
d = Dataset( p )

run_name = f"PortfolioPPO_act_or_not__{p.seed}__{int(time.time())}"

# NOTE(review): the values of `project` and `entity` are missing in this copy
# of the file; they must be filled in before this cell can run
wandb.init( 
    project = ,
    entity = ,
    sync_tensorboard=True,
    config = vars(p),
    name = run_name,
    monitor_gym = True,
    save_code = True
)

# all hyperparameters are also written to tensorboard as a markdown table
writer = SummaryWriter()
writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(p).items()])),
    )


#setting the seed
random.seed( p.seed )
np.random.seed( p.seed )
torch.manual_seed( p.seed )
torch.backends.cudnn.deterministic = True

vec_env = gym.vector.SyncVectorEnv( [make_env() for i in range( p.num_envs ) ] )

agent = Agent( vec_env, p.num_ac_layers, p.dim_ac_layers ).to( DEVICE )

# either one optimizer for the whole agent, or separate ones for actor and critic
if p.use_single_lr:
    optim = Adam( agent.parameters(), lr = p.learning_rate, betas = (p.beta1, p.beta2), eps = p.adam_eps)
else:
    actor_optim = Adam( agent.actor.parameters(), lr=p.actor_learning_rate, betas=(p.actor_beta1, p.actor_beta2), eps=p.adam_eps)
    critic_optim = Adam( agent.critic.parameters(), lr=p.critic_learning_rate, betas=(p.critic_beta1, p.critic_beta2), eps=p.adam_eps)


#rollouts are stored in these plain tensors rather than in a dedicated class
#obs is ( number of steps, number of envs, features in single vec observation )
obs = torch.zeros((p.num_steps, p.num_envs) + vec_env.single_observation_space.shape).to( DEVICE )

#actions is ( number of steps, number of envs, features in single vec action )
#to_act_actions is ( number of steps, number of envs, 1 ): one value per step and env
actions = torch.zeros((p.num_steps, p.num_envs) + vec_env.single_action_space.shape).to( DEVICE )
to_act_actions = torch.zeros((p.num_steps, p.num_envs) + tuple([1])).to( DEVICE )

#these are ( number of steps, number of envs )
logprobs = torch.zeros((p.num_steps, p.num_envs)).to( DEVICE )
to_act_logprobs = torch.zeros((p.num_steps, p.num_envs)).to( DEVICE )
rewards = torch.zeros((p.num_steps, p.num_envs)).to( DEVICE )
dones = torch.zeros((p.num_steps, p.num_envs)).to( DEVICE )
values = torch.zeros((p.num_steps, p.num_envs)).to( DEVICE )

#for tracking reward components
#portfolio_returns = torch.zeros((p.num_steps, p.num_envs)).to( DEVICE )
#portfolio_differences = torch.zeros((p.num_steps, p.num_envs)).to( DEVICE )
#portfolio_returns_std = torch.zeros((p.num_steps, p.num_envs)).to( DEVICE )


# a global step counter
global_step = 0

# for time tracking
start_time = time.time()

# to store initial observation
next_obs = from_numpy( vec_env.reset() )

# to store initial termination conditions
next_done = torch.zeros( p.num_envs ).to( DEVICE )

for update in range(1, p.num_updates + 1):
    
    #un update è una iterazione del training loop così composto
    #0.update lr
    #1.policy rollouts
    #2.process rollouts data
    #3.policy training
    
    #0.update lr
    if p.anneal_lr:
        # decrescita lineare dal valore iniziale fino a 0
        # decremento ogni training loop
        frac = 1. - (update - 1)/p.num_updates
        if p.use_single_lr:
            learning_rate_now = frac * p.learning_rate
            optim.param_groups[0]['lr'] = learning_rate_now
        else:
            actor_learning_rate_now = frac * p.actor_learning_rate
            actor_optim.param_groups[0]['lr'] = actor_learning_rate_now
            critic_learning_rate_now = frac * p.critic_learning_rate
            critic_optim.param_groups[0]['lr'] = critic_learning_rate_now
        
    if p.anneal_clip_coef:
        frac = 1. - (update - 1)/p.num_updates
        clip_coef_now = frac * p.clip_coef
        p.clip_coef = clip_coef_now
        
        
    #1.policy_rollouts
    for step in range(0, p.num_steps ):
        global_step += 1 * p.num_envs
        
        obs[step] = next_obs
        dones[step] = next_done
        
        with torch.no_grad():
            action, to_act, logprob, to_act_logprob, _, _, value = agent.get_action_and_value( next_obs )
            values[step] = value.flatten()

        actions[step] = action
        to_act_actions[step] = to_act.unsqueeze(-1)
        logprobs[step] = logprob 
        to_act_logprobs[step] = to_act_logprob
        
        #step dell'environment
        next_obs, reward, done, info = vec_env.step( to_numpy( to_act.unsqueeze(-1) * action ) )

        rewards[step] = from_numpy( reward )
        
        #for tracking reward components
        #portfolio_returns[step] = from_numpy( info["portfolio returns"] )
        #portfolio_differences[step] = from_numpy( info["portfolio difference sum"] )
        #portfolio_returns_std[step] = from_numpy( info["portfolio returns std"] )
        
        next_obs = from_numpy( next_obs )
        next_done = from_numpy( done )
        
        #a print
        for item in info:
            if "episode" in item.keys():
                print(f"global step:{global_step}, episode return:{item['episode']['r']} ")
                writer.add_scalar("Charts/episode_return", item["episode"]["r"], global_step ) 
            break
                
    #2.process data
    
    #calcolo 
    with torch.no_grad():
        next_value = agent.get_value(next_obs) #(num_envs, 1)
        next_value = next_value.reshape(1, -1) #(1,num_envs)
        if p.gae: #da implementazione originale
            advantages = torch.zeros_like(rewards).to( DEVICE )
            lastgaelam = 0
            for t in reversed(range(p.num_steps)):
                if t == p.num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + p.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = delta + p.gamma * p.gae_lambda * nextnonterminal * lastgaelam
            returns = advantages + values
        else: #modo tipico di calcolare advantages
            returns = torch.zeros_like(rewards).to( DEVICE )
            for t in reversed(range(p.num_steps)):
                if t == p.num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = rewards[t] + p.gamma * nextnonterminal * next_return
            advantages = returns - values
        # i returns sono differenti nei due modi (lo posso vedere stampando returns.sum())
    
    #per usare minibatches, appiattisco i dati calcolati in modo da avere la batch per esteso
    # in generale (num_steps, num_envs, single environment shape) -> (batch_size, single environment shape)

    b_obs = obs.reshape((-1,) + vec_env.single_observation_space.shape )
    b_actions = actions.reshape((-1,) + vec_env.single_action_space.shape )
    b_to_act_actions = to_act_actions.reshape( -1 )
    b_logprobs = logprobs.reshape(-1)
    b_to_act_logprobs = to_act_logprobs.reshape(-1)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = values.reshape(-1)
    
    
    #3.policy training
    b_indices = np.arange(p.batch_size)
    #clipped_fractions = [] # tengo conto di quanto spesso il ratio viene clippato
    for epoch in range( p.num_update_epochs ):
        np.random.shuffle( b_indices )
        
        #ora itero sulla batch una minibatch alla volta
        for start in range(0, p.batch_size, p.minibatch_size):
            end = start + p.minibatch_size
            mb_indices = b_indices[ start : end ]
            
            #faccio forward pass sulle osservazioni della minibatch
            _ , _, new_logprob, new_to_act_logprob, entropy, to_act_entropy, new_values = agent.get_action_and_value( x = b_obs[ mb_indices ],
                                                                                                                      action = b_actions[ mb_indices ],
                                                                                                                      to_act = b_to_act_actions[mb_indices]  )
            
            #azione con cui calcolo il reward: to_act * pc            

            #advantages normalization
            mb_advantages = b_advantages[ mb_indices ]
            if p.adv_normalization:
                mb_advantages = (mb_advantages - mb_advantages.mean())/( mb_advantages.std() + 1e-8)

            #this time I try to compute different losses for the two networks of the actor (that have separate weights)
            log_ratio = new_logprob - b_logprobs[ mb_indices ]
            ratio = log_ratio.exp()

            #debug variables
            with torch.no_grad():
                approx_kl = ((ratio - 1) - log_ratio).mean()

            #policy loss
            surr_loss1 = - mb_advantages * ratio
            surr_loss2 = - mb_advantages * torch.clamp( ratio, 1 - p.clip_coef, 1 + p.clip_coef )
            # prendo il max siccome ho considero "- advantages "
            #policy_loss_1 = torch.max(surr_loss1, surr_loss2).mean()
            policy_loss_1 = ( b_to_act_actions[mb_indices] *  torch.max(surr_loss1, surr_loss2) ).mean()

            #now for the second net
            to_act_log_ratio = new_to_act_logprob - b_to_act_logprobs[ mb_indices ]
            to_act_ratio = to_act_log_ratio.exp()
            to_act_surr_loss1 = - mb_advantages * to_act_ratio
            to_act_surr_loss2 = - mb_advantages * torch.clamp( to_act_ratio, 1 - p.clip_coef, 1 + p.clip_coef )
            # prendo il max siccome ho considero "- advantages "
            policy_loss_2 = torch.max(to_act_surr_loss1, to_act_surr_loss2).mean()

            policy_loss = policy_loss_1 + policy_loss_2
            
            #value loss
            new_values = new_values.view(-1) # ( minibatch_size, 1) -> (minibatch_size)
            
            if p.clip_values:
                value_loss_unclipped = (new_values - b_returns[ mb_indices ]) ** 2
                #values clipped are minibatch values + or - the clipped difference 
                #between minibatch new values and minibatch values itself  
                values_clipped = b_values[ mb_indices ] + torch.clamp(
                        new_values - b_values[ mb_indices ],
                        -p.clip_coef,
                        p.clip_coef,
                    )
                value_loss_clipped = (values_clipped - b_returns[ mb_indices ]) ** 2
                value_loss = torch.max(value_loss_unclipped, value_loss_clipped)
                value_loss = 0.5 * value_loss.mean()
            else: #value loss è MSE
                value_loss = .5 * ((new_values - b_returns[ mb_indices ] )**2).mean()

            #entropy loss
            entropy_loss = entropy.mean()

            loss = policy_loss - p.ent_loss_coef * entropy_loss + p.v_loss_coef * value_loss

            if p.use_single_lr:
                optim.zero_grad()
            else:
                actor_optim.zero_grad()
                critic_optim.zero_grad()

            loss.backward()
            nn.utils.clip_grad_norm_( agent.parameters(), p.max_grad_norm )

            if p.use_single_lr:
                optim.step()
            else:
                actor_optim.step()
                critic_optim.step()
        
        #kl stop at batch level. lo potrei usare anche a minibatch level
        if p.target_kl is not None:
            if approx_kl > p.target_kl:
                break
    
  
    #if p.use_single_lr:
    #    writer.add_scalar("Charts/learning_rate", optim.param_groups[0]["lr"], global_step)
    #else:
    #    writer.add_scalar("Charts/actor_learning_rate", actor_optim.param_groups[0]["lr"], global_step)
    #    writer.add_scalar("Charts/critic_learning_rate", critic_optim.param_groups[0]["lr"], global_step)
    #writer.add_scalar("Charts/clip_coefficient", p.clip_coef, global_step)
    #writer.add_scalar("losses/value_loss", value_loss.item(), global_step)
    #writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step)
    #writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
    #writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
    
    if update % 25 == 0:

        #Critic network output and its target
        _ , _, _, _, _, _, new_values = agent.get_action_and_value( x = b_obs, action = b_actions, to_act = b_to_act_actions )
        new_values = new_values.view(-1)

        fig = figure(figsize = (13,8))
        title("Values and returns in batch")
        plot( to_numpy(b_returns[:p.episode_length]), label = "Returns")
        plot( to_numpy(new_values[:p.episode_length]), label = "Values")
        xlabel( "Batch element (#)")
        ylabel( "Values, Returns (#)")
        legend( loc = "upper right")
        writer.add_figure("media/critic_prediction_and_target", fig, global_step)
        close('all')


        # Validation: roll the current policy through one test episode on the
        # first sub-environment of `vec_env`, using its `test_reset` /
        # `test_step` methods, and record everything needed for the metrics
        # and plots computed afterwards.
        single_env = vec_env.envs[0]

        # Per-step records used for the plots.
        portfolio_compositions_val = []
        actions_val = []
        to_act_actions_val = []
        values_val = []
        rewards_val = []
        # Reward components reported by the environment in `info`.
        portfolio_returns_val = []
        portfolio_differences_val = []
        portfolio_returns_std_val = []

        # `test_reset` returns the first observation plus four arrays that are
        # used below: closing returns (np.ndarray, used to compute the
        # portfolio value), closing prices, and the true / predicted sequences.
        next_obs_val, closing_returns_val, closing_prices, true_sequence, predicted_sequence = single_env.test_reset() 
        next_obs_val =  from_numpy( next_obs_val ) 

        # Record the initial portfolio composition: the `num_assets + 1`
        # entries that precede the last element of the observation.
        # (presumably the last element is the portfolio value — TODO confirm)
        portfolio_compositions_val.append( to_numpy(next_obs_val[-single_env.num_assets-1:-1]) )
        # Value and reward get a 0 placeholder for the initial state so they
        # have the same length as the compositions; `actions_val` and
        # `to_act_actions_val` get no initial entry and end up one shorter.
        values_val.append(0)
        rewards_val.append(0)


        done_val = False

        while not done_val:
            
            with torch.no_grad():
                
                # The actor output is unpacked into (action, to_act); the
                # critic output is the state value. No gradients are needed.
                
                (action_val, to_act_val), value_val = agent.actor( next_obs_val ), agent.critic( next_obs_val )
            
            # Pick the to-act decision by argmax and multiply the action by the
            # resulting index, so index 0 zeroes the action passed to the env.
            # (presumably index 0 = "do not act", 1 = "act" — TODO confirm)
            to_act_val = torch.argmax( to_act_val )
            next_obs_val, reward_val, done_val, info_val = single_env.test_step( to_numpy( to_act_val * action_val ) )
            next_obs_val = from_numpy( next_obs_val )


            # Record the portfolio composition, value and reward for this step.
            # NOTE(review): this slice starts at `p.num_assets` from the front,
            # whereas the initial composition above is sliced from the back
            # with `single_env.num_assets`; the two agree only for a specific
            # observation layout — confirm.
            next_portfolio_composition_val = next_obs_val[ p.num_assets : -1 ]

            portfolio_compositions_val.append( to_numpy(next_portfolio_composition_val) )
            actions_val.append( to_numpy(action_val) )
            to_act_actions_val.append( to_numpy(to_act_val) )
            values_val.append( value_val.item() )
            rewards_val.append( reward_val.item() )

            # Reward components as reported by the environment.
            portfolio_returns_val.append( info_val["portfolio returns"]  )
            portfolio_differences_val.append( info_val["portfolio difference sum"] )
            portfolio_returns_std_val.append( info_val["portfolio returns std"] ) 


        # Convert every record list to a NumPy array for the vectorised
        # computations and plots that follow.
        portfolio_compositions_val = np.array( portfolio_compositions_val )
        actions_val = np.array( actions_val )
        to_act_actions_val = np.array( to_act_actions_val )
        values_val = np.array( values_val )
        rewards_val = np.array( rewards_val )
        portfolio_returns_val = np.array( portfolio_returns_val )
        portfolio_differences_val = np.array( portfolio_differences_val )
        portfolio_returns_std_val = np.array( portfolio_returns_std_val )

        # Log the undiscounted sum of validation rewards as the episode return,
        # as in https://github.com/openai/gym/blob/master/gym/wrappers/record_episode_statistics.py#L29
        writer.add_scalar("Charts/episode_return_val", rewards_val.sum(), global_step)

        # Per-day portfolio return: composition weights times the closing
        # returns of each element, summed over the elements.
        # NOTE: this overwrites the `portfolio_returns_val` array collected
        # from the environment `info` during the rollout.
        element_returns_val = portfolio_compositions_val * closing_returns_val
        portfolio_returns_val = np.sum( element_returns_val, axis = -1 )

        # Commission proxy: total absolute change in composition from one day
        # to the next. Prepending the first row makes the first day's change
        # zero and keeps the result aligned with `portfolio_returns_val`.
        difference_portfolio_composition_val = np.absolute(
            np.diff( portfolio_compositions_val, axis = 0, prepend = portfolio_compositions_val[0:1,:] )
        ).sum( axis = 1 )

        # Net returns after the commission proxy, compounded into a value curve.
        portfolio_returns_val = portfolio_returns_val - p.composition_difference_coef * difference_portfolio_composition_val
        portfolio_values_val = np.cumprod( 1 + portfolio_returns_val )

        # Baseline: hold the initial composition for the whole period
        # (presumably the equally-weighted portfolio — TODO confirm).
        # No commission term is subtracted for the baseline.
        equi_portfolio_compositions_val = np.tile(
            portfolio_compositions_val[0], ( portfolio_compositions_val.shape[0], 1 )
        )
        equi_element_returns_val = equi_portfolio_compositions_val * closing_returns_val
        equi_portfolio_returns_val = np.sum( equi_element_returns_val, axis = -1 )
        equi_portfolio_values_val = np.cumprod( 1 + equi_portfolio_returns_val )

        # Martin ratio (final Ulcer Performance Index) of the baseline and of
        # the agent portfolio, computed with the shared helper.
        equi_martin_ratio = get_martin_ratio( pd.Series(equi_portfolio_values_val) )
        martin_ratio = get_martin_ratio( pd.Series(portfolio_values_val) )

        # Running UPI for the agent portfolio: on each day, return since day 0
        # divided by the ulcer index (root mean squared drawdown) up to that day.
        portfolio_values_val = pd.Series( portfolio_values_val )
        drawdown = 1. - portfolio_values_val/portfolio_values_val.cummax()
        drawdown = np.cumsum( drawdown**2 )
        indices = np.arange(drawdown.shape[0])
        with np.errstate( divide = "ignore", invalid = "ignore" ):
            div = np.divide( drawdown, indices )
        # Day 0 divides 0 by 0, which yields NaN (not inf): zero out every
        # non-finite entry so the mask actually covers it.
        div[~np.isfinite( div )] = 0
        ulcer_index = np.sqrt( div )
        returns = ( portfolio_values_val - portfolio_values_val.iloc[0] ) / portfolio_values_val.iloc[0]
        upis = returns / ulcer_index

        # Running UPI for the baseline portfolio, same computation.
        equi_portfolio_values_val = pd.Series( equi_portfolio_values_val )
        equi_drawdown = 1. - equi_portfolio_values_val/equi_portfolio_values_val.cummax()
        equi_drawdown = np.cumsum( equi_drawdown**2 )
        equi_indices = np.arange(equi_drawdown.shape[0])
        with np.errstate( divide = "ignore", invalid = "ignore" ):
            equi_div = np.divide( equi_drawdown, equi_indices )
        equi_div[~np.isfinite( equi_div )] = 0
        equi_ulcer_index = np.sqrt( equi_div )
        equi_returns = ( equi_portfolio_values_val - equi_portfolio_values_val.iloc[0] ) / equi_portfolio_values_val.iloc[0]
        equi_upis = equi_returns / equi_ulcer_index

        # Sanity check: the last running UPI must match the helper's result.
        # A tolerance comparison avoids spurious failures at rounding
        # boundaries, and `equal_nan` accepts a flat portfolio (0/0) where
        # both computations yield NaN instead of aborting training.
        assert np.isclose( upis.iloc[-1], martin_ratio, rtol = 0, atol = 1e-3, equal_nan = True ), "Differs from Dario's function"
        assert np.isclose( equi_upis.iloc[-1], equi_martin_ratio, rtol = 0, atol = 1e-3, equal_nan = True ), "Differs from Dario's function"


        # Log both Martin ratios.
        writer.add_scalar("Charts/agent_martin_ratio_val", martin_ratio, global_step)
        writer.add_scalar("Charts/equi_martin_ratio_val", equi_martin_ratio, global_step)


        #### PLOTS ####

        ### PORTFOLIO VALUE COMPARISON PLOT ###
        # Agent portfolio value against the baseline portfolio value.
        fig, ax = subplots( 1, 1, figsize = (13,8))
        ax.set_title("Agent and equally-weighted portfolio value comparison on test set")
        value_curves = (
            (portfolio_values_val, "agent pv"),
            (equi_portfolio_values_val, "equi pv"),
        )
        for value_curve, curve_label in value_curves:
            ax.plot(value_curve, label = curve_label)
        ax.legend()
        ax.grid(axis='both', which='both')
        ax.set_xlabel("Day from start of set (#)")
        ax.set_ylabel("Portfolio value (#)")
        writer.add_figure("media/agent_equi_portfolio_value_comparison", fig, global_step)
        close('all')
        

        ### AGENT PORTFOLIO COMPOSITION PLOT ###
        # Stacked bars, one bar per day: each element's share sits on top of
        # the cumulative share of the elements before it.
        xs = np.arange( portfolio_compositions_val.shape[0] )
        all_assets = ["cash"]+p.etfs
        width = 1

        fig, ax = subplots( 1,1, figsize = (13,8) )
        ax.set_title("Agent portfolio composition")
        for i, asset_name in enumerate( all_assets ):
            # The first element starts from the default baseline; the others
            # are stacked on the sum of all previous columns.
            stacked_on = np.sum( portfolio_compositions_val[:,:i], axis = 1 ) if i else None
            ax.bar(xs, portfolio_compositions_val[:,i], width, bottom=stacked_on, label  = asset_name )
        ax.legend()
        ax.set_ylabel("Composition (%)")
        ax.set_xlabel("Day from start of test set (#)")
        writer.add_figure("media/agent_portfolio_composition", fig, global_step)
        close('all')
        

        

        ### PLOT WITH TRANSFORMER PREDICTION ###
        # One panel per ETF: true and predicted returns on the left axis, the
        # agent's allocation to that ETF on a twin right axis.
        fig, axs = subplots( p.num_assets , 1, figsize = (13,30))
        # `subplots` returns a bare Axes (not an array) for a single row.
        axs = np.atleast_1d( axs )
        fig.suptitle("Returns, predictions and allocation in test set", fontsize = 21)
        fig.subplots_adjust(top= .95)
        last_row = p.num_assets - 1
        for i in range( p.num_assets):
            axs[i].set_title(f"{p.etfs[i]}")
            axs[i].plot( true_sequence[:,i], label = "True returns" )
            axs[i].plot( predicted_sequence[:,i], label = "Predicted returns" )
            axs[i].set_ylabel("returns (%)")
            axs2 = axs[i].twinx()
            # Column 0 of the compositions is skipped, so ETF i is column i+1.
            axs2.plot( portfolio_compositions_val[:,i+1], color = "tab:green",label = "agent allocation")
            axs2.set_ylabel( "Allocation(%) " )
            axs2.legend( loc = "lower right")
            axs[i].legend( loc = "upper right" )
            axs[i].grid(axis='x', which='both')
            # Only the bottom panel keeps its tick labels and gets the x label.
            # (The original tested `i == p.num_assets`, which is never reached
            # inside `range(p.num_assets)`, so the label was never drawn.)
            if i != last_row:
                axs[i].set_xticklabels([])
            else:
                axs[i].set_xlabel("Day from start of set(#)")
        writer.add_figure("Media/prediction_allocations", fig, global_step)
        close('all')

        ### PLOTS WITH RETURNS ###
        # One panel for cash plus one per ETF: closing returns on the left
        # axis, the agent's allocation to that element on a twin right axis.
        fig, axs = subplots( 1 + p.num_assets , 1, figsize = (13,30))
        fig.suptitle("Returns and allocation in test set", fontsize = 21)
        fig.subplots_adjust(top= .95)
        # Row 0 is cash and rows 1..num_assets are the ETFs, so the bottom
        # panel has index `p.num_assets` (the original hard-coded 11).
        last_row = p.num_assets
        for i in range( p.num_assets + 1):
            if i == 0:
                axs[i].set_title("CASH")
            else:
                axs[i].set_title(f"{p.etfs[i-1]}")
            axs[i].plot( closing_returns_val[:,i], label = "closing returns")
            # Dashed zero line as a visual reference for the returns.
            axs[i].hlines( 0, xmin = 0, xmax = closing_returns_val[:,i].shape[0], linestyles= "dashed", alpha = .3)
            axs[i].set_ylabel("returns (%)")
            axs2 = axs[i].twinx()
            axs2.plot( portfolio_compositions_val[:,i], color = "tab:orange",label = "agent allocation")
            axs2.set_ylabel( "Allocation(%) " )
            axs2.legend( loc = "lower right")
            axs[i].legend( loc = "upper right" )
            axs[i].grid(axis='x', which='both')
            # Only the bottom panel keeps its tick labels and gets the x label.
            if i != last_row:
                axs[i].set_xticklabels([])
            else:
                axs[i].set_xlabel("Day from start of set(#)")
        writer.add_figure("Media/returns_and_allocation", fig, global_step)
        close('all')

        # Closing returns (cash plus every ETF) against the agent's to-act
        # decision, which is the same curve repeated on every panel.
        # NOTE(review): `to_act_actions_val` has no entry for the initial
        # state, so it looks one element shorter than the returns series and
        # may be offset by one day on the x axis — confirm the alignment.
        fig, axs = subplots( 1 + p.num_assets , 1, figsize = (13,30))
        fig.suptitle("Returns and to act choice in test set", fontsize = 21)
        fig.subplots_adjust(top= .95)
        # Row 0 is cash and rows 1..num_assets are the ETFs, so the bottom
        # panel has index `p.num_assets` (the original hard-coded 11).
        last_row = p.num_assets
        for i in range( p.num_assets + 1):
            if i == 0:
                axs[i].set_title("CASH")
            else:
                axs[i].set_title(f"{p.etfs[i-1]}")
            axs[i].plot( closing_returns_val[:,i], label = "closing returns")
            # Dashed zero line as a visual reference for the returns.
            axs[i].hlines( 0, xmin = 0, xmax = closing_returns_val[:,i].shape[0], linestyles= "dashed", alpha = .3)
            axs[i].set_ylabel("returns (%)")
            axs2 = axs[i].twinx()
            axs2.plot( to_act_actions_val, color = "tab:orange",label = "to perform action")
            axs2.set_ylabel( "to perform action (bool) " )
            axs2.legend( loc = "lower right")
            axs[i].legend( loc = "upper right" )
            axs[i].grid(axis='x', which='both')
            # Only the bottom panel keeps its tick labels and gets the x label.
            if i != last_row:
                axs[i].set_xticklabels([])
            else:
                axs[i].set_xlabel("Day from start of set(#)")
        writer.add_figure("Media/returns_and_to_act", fig, global_step)
        close('all')


        ### PLOTS WITH CLOSING PRICES ####
        # Closing prices and allocation: one panel per ETF, price on the left
        # axis and the agent's allocation to that ETF on a twin right axis.
        fig, axs = subplots( p.num_assets , 1, figsize = (13,30))
        # `subplots` returns a bare Axes (not an array) for a single row.
        axs = np.atleast_1d( axs )
        fig.suptitle("Closing prices and allocation in test set", fontsize = 21)
        fig.subplots_adjust(top= .95)
        last_row = p.num_assets - 1
        for i in range( p.num_assets):
            axs[i].set_title(f"{p.etfs[i]}")
            axs[i].plot( closing_prices[:,i], label = "Closing prices" )
            axs[i].set_ylabel("closing prices ($)")
            axs2 = axs[i].twinx()
            # Column 0 of the compositions is skipped, so ETF i is column i+1.
            axs2.plot( portfolio_compositions_val[:,i+1], color = "tab:orange",label = "agent allocation")
            axs2.set_ylabel( "Allocation(%) " )
            axs2.legend( loc = "lower right")
            axs[i].legend( loc = "upper right" )
            axs[i].grid(axis='x', which='both')
            # Only the bottom panel keeps its tick labels and gets the x label.
            # (The original tested `i == p.num_assets`, which is never reached
            # inside `range(p.num_assets)`, so the label was never drawn.)
            if i != last_row:
                axs[i].set_xticklabels([])
            else:
                axs[i].set_xlabel("Day from start of set(#)")
        writer.add_figure("Media/prices_allocation", fig, global_step)
        #savefig( f"/content/drive/MyDrive/0_Codice tesi/PPO_logs/{run_name}/prices_allocation/{str(global_step)}.png")
        #files.download( f"/content/drive/MyDrive/0_Codice tesi/PPO_logs/{run_name}/tpa/{str(global_step)}.png")
        #show()
        close('all')

        # Closing prices and to-act decision: one panel per ETF, price on the
        # left axis and the agent's to-act decision (the same curve on every
        # panel) on a twin right axis.
        fig, axs = subplots( p.num_assets , 1, figsize = (13,30))
        # `subplots` returns a bare Axes (not an array) for a single row.
        axs = np.atleast_1d( axs )
        fig.suptitle("Closing prices and to act choice in test set", fontsize = 21)
        fig.subplots_adjust(top= .95)
        last_row = p.num_assets - 1
        for i in range( p.num_assets):
            axs[i].set_title(f"{p.etfs[i]}")
            axs[i].plot( closing_prices[:,i], label = "Closing prices" )
            axs[i].set_ylabel("closing prices ($)")
            axs2 = axs[i].twinx()
            axs2.plot( to_act_actions_val, color = "tab:orange",label = "to perform action")
            axs2.set_ylabel( "to perform action (bool) " )
            axs2.legend( loc = "lower right")
            axs[i].legend( loc = "upper right" )
            axs[i].grid(axis='x', which='both')
            # Only the bottom panel keeps its tick labels and gets the x label.
            # (The original tested `i == p.num_assets`, which is never reached
            # inside `range(p.num_assets)`, so the label was never drawn.)
            if i != last_row:
                axs[i].set_xticklabels([])
            else:
                axs[i].set_xlabel("Day from start of set(#)")
        writer.add_figure("Media/prices_to_act", fig, global_step)
        #savefig( f"/content/drive/MyDrive/0_Codice tesi/PPO_logs/{run_name}/prices_allocation/{str(global_step)}.png")
        #files.download( f"/content/drive/MyDrive/0_Codice tesi/PPO_logs/{run_name}/tpa/{str(global_step)}.png")
        #show()
        close('all')


        ##### ULCER PERFORMANCE INDEX PLOT ####
        # 2x2 grid: portfolio value (top row) and running UPI (bottom row) for
        # the agent (left column) and the baseline (right column).
        fig, ax = subplots(2,2, figsize = (13,8), sharex = True)
        fig.suptitle(f"Agent UPI {upis.iloc[-1]:.2f}")

        # Infinite and undefined UPI entries are drawn as 0.
        agent_upi_curve = upis.replace([np.inf, -np.inf], np.nan ).fillna(0).values
        equi_upi_curve = equi_upis.replace([np.inf, -np.inf], np.nan ).fillna(0).values

        # (axes, title, data, line colour, y label) for each panel.
        panel_specs = (
            ( ax[0,0], "Agent portfolio value", portfolio_values_val.values, "black", "Price ($)" ),
            ( ax[1,0], "Agent portfolio UPI", agent_upi_curve, "red", "UPI (#)" ),
            ( ax[0,1], "Equi portfolio value", equi_portfolio_values_val.values, "black", "Price ($)" ),
            ( ax[1,1], "Equi portfolio UPI", equi_upi_curve, "red", "UPI (#)" ),
        )
        for panel, heading, curve, line_color, y_label in panel_specs:
            panel.set_title(heading)
            panel.plot(curve, color = line_color)
            panel.set_ylabel( y_label )
            panel.set_xlabel( "Day from start of set (#)" )
            panel.grid(axis='both', which='both')

        writer.add_figure("Media/UPI", fig, global_step)
        close('all')


        ### REWARD AND COMPONENTS PLOT ####

        #fig, ax = subplots( 4, 1, figsize = (13,15), sharex = True)
        #fig.suptitle(f"Reward with components")
        #ax[0].set_title("Reward")
        #ax[0].plot( rewards_val, label = "reward" )
        #ax[0].plot( portfolio_returns_val, label = "closing returns", alpha = .5 )
        #ax[0].plot( p.composition_difference_coef * portfolio_differences_val, label = "commission", alpha = .5 )
        #ax[0].plot( p.risk_coef * portfolio_returns_std_val, label = "risk", alpha = .5 )
        #ax[0].legend()
        #ax[0].set_ylabel( "reward (#)" )
#
        #ax[1].set_title("Portfolio returns")
        #ax[1].plot( portfolio_returns_val )
        #ax[1].set_ylabel( "reward (#)" )
#
        #ax[2].set_title("Commission proxy term")
        #ax[2].plot( portfolio_differences_val )
        #ax[2].set_ylabel( "reward (#)" )
#
        #ax[3].set_title("Portfolio returns std")
        #ax[3].plot( portfolio_returns_std_val )
        #ax[3].set_ylabel( "reward (#)" )
        #ax[3].set_xlabel( "Day from start of set (#)" )
        #writer.add_figure("Media/Reward_and_components", fig, global_step)
        #close('all')

# Training is over: close the vectorised environment and the summary writer.
vec_env.close()
writer.close()

[34m[1mwandb[0m: Currently logged in as: [33m4g0[0m (use `wandb login --relogin` to force relogin)


global step:1592, episode return:-0.19757809439729612 
global step:3184, episode return:-0.08188520325555347 
global step:4776, episode return:-0.13872056209156525 
global step:6368, episode return:-0.040800812490922794 
global step:7960, episode return:-0.09691321786964112 
global step:9552, episode return:-0.3535958145838438 
global step:11144, episode return:-0.2910867770268129 
global step:12736, episode return:-0.22110343406406288 
global step:14328, episode return:-0.2507754849392531 
global step:15920, episode return:-0.2508657011137901 
global step:17512, episode return:-0.06459671067727529 
global step:19104, episode return:-0.20660196342805737 
global step:20696, episode return:-0.1632170386661128 
global step:22288, episode return:-0.18534275269265968 
global step:23880, episode return:0.004698081695613367 
global step:25472, episode return:-0.17799270613764048 
global step:27064, episode return:-0.01819579405829223 
global step:28656, episode return:-0.01705785401067935 
gl

  canvas.draw()


global step:81192, episode return:-0.013720819991337911 
global step:82784, episode return:0.06100357460513187 
global step:84376, episode return:0.1188404878453429 
global step:85968, episode return:0.060197645970549926 
global step:87560, episode return:0.0295771641437864 
global step:89152, episode return:-0.10739417897773958 
global step:90744, episode return:0.08533435271117569 
global step:92336, episode return:0.04141331084254153 
global step:93928, episode return:0.06587041579046111 
global step:95520, episode return:0.02754668961147485 
global step:97112, episode return:-0.15752834319008366 
global step:98704, episode return:0.15176228560752186 
global step:100296, episode return:-0.590493844050932 
global step:101888, episode return:0.03712685382372919 
global step:103480, episode return:-0.2326987528665804 
global step:105072, episode return:-0.06599941091502658 
global step:106664, episode return:-0.022934741577637915 
global step:108256, episode return:0.042198344415237864

  canvas.draw()


global step:480784, episode return:-0.10813810460721383 
global step:482376, episode return:0.08576739610485398 
global step:483968, episode return:0.09679178778058099 
global step:485560, episode return:0.01186141730095822 
global step:487152, episode return:-0.17428027738956942 
global step:488744, episode return:-0.14793476962458985 
global step:490336, episode return:0.04496498482882078 
global step:491928, episode return:-0.041767656024648415 
global step:493520, episode return:0.0007650885516951957 
global step:495112, episode return:0.0419175458726382 
global step:496704, episode return:0.02107416701738505 
global step:498296, episode return:-0.07405541002757407 
global step:499888, episode return:-0.02189802997573779 
global step:501480, episode return:0.05880227699686701 
global step:503072, episode return:-0.03357509744986907 
global step:504664, episode return:0.05532287790995407 
global step:506256, episode return:0.038294696810425535 
global step:507848, episode return:0.0

  canvas.draw()


global step:840576, episode return:0.07367443898983514 
global step:842168, episode return:-0.026329668802076957 
global step:843760, episode return:0.027780466616643542 
global step:845352, episode return:-0.024597461337907887 
global step:846944, episode return:0.357491577287806 
global step:848536, episode return:0.07901240125632669 
global step:850128, episode return:0.05977735694729876 
global step:851720, episode return:0.03526775702163389 
global step:853312, episode return:0.12059211077120431 
global step:854904, episode return:-0.07089934335960145 
global step:856496, episode return:0.0072913588862760755 
global step:858088, episode return:-0.009719384984320373 
global step:859680, episode return:0.07255477144708689 
global step:861272, episode return:0.0669112842803711 
global step:862864, episode return:-0.1308897893101882 
global step:864456, episode return:0.08659756108664779 
global step:866048, episode return:0.002693717229197029 
global step:867640, episode return:-0.03