<a href="https://colab.research.google.com/github/Alezoso/seq2seq_att/blob/main/seq2seq_att.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from torch import nn


I have a multivariate time series problem:
five time series serve as predictor variables and one as the target.
I want to use a seq2seq LSTM with Bahdanau attention.

The code runs, but I'm not sure whether the way I compute the attention is correct.

In [None]:
#Encoder
# Input shape: [sequence_length, batch_size, features]
class Encoder(nn.Module):
    """Bidirectional LSTM encoder for the source sequence.

    The module wraps a single ``nn.LSTM`` built with ``bidirectional=True``
    and exposes its results as a flat three-tuple.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )

    def forward(self, x):
        """Run the LSTM on ``x`` and return ``(outputs, hidden, cell)``.

        The three tensors are passed through exactly as ``nn.LSTM`` returns
        them; no reshaping or merging of directions happens here.
        """
        sequence_out, final_state = self.lstm(x)
        final_hidden, final_cell = final_state
        return sequence_out, final_hidden, final_cell

In [None]:
#Attention

class BahdanauAttention(nn.Module):
    """Additive (Bahdanau) attention over the encoder outputs.

    Scores are ``Va(tanh(Wa(query) + Ua(keys)))`` where the query is the
    top-layer decoder hidden state and the keys are the encoder outputs.
    All three projections work on vectors of size ``hidden_size * 2``.
    """

    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size * 2, hidden_size * 2, bias=False)
        self.Ua = nn.Linear(hidden_size * 2, hidden_size * 2, bias=False)
        self.Va = nn.Linear(hidden_size * 2, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Compute the context vector for one decoding step.

        Args:
            hidden: decoder hidden state, ``[num layers, batch, hidden_size * 2]``;
                only the last layer (``hidden[-1]``) is used as the query.
            encoder_outputs: ``[src len, batch, hidden_size * 2]``.

        Returns:
            ``(context, weights)`` with shapes
            ``[1, batch, hidden_size * 2]`` and ``[batch, src len, 1]``.
            The weights sum to one along the ``src len`` axis.
        """
        # Project the query once and let broadcasting spread it over the
        # source positions, instead of copying it src_len times and
        # projecting every copy.
        query = self.Wa(hidden[-1]).unsqueeze(1)  # [batch, 1, hidden_size * 2]
        keys = encoder_outputs.permute(1, 0, 2)  # [batch, src len, hidden_size * 2]
        scores = self.Va(torch.tanh(query + self.Ua(keys)))  # [batch, src len, 1]
        weights = F.softmax(scores, dim=1)  # normalise over source positions
        context = torch.bmm(weights.permute(0, 2, 1), keys)  # [batch, 1, hidden_size * 2]
        return context.permute(1, 0, 2), weights

In [None]:
#Decoder
# Input shape: [batch_size, 2]. The two features are the NDVI value (the fourth feature of the last encoder input, or the previous decoder output) and one additional feature taken from the encoder input.
class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    The LSTM consumes the step input concatenated with the attention
    context; its width is ``hidden_size * 2 + input_size + 1``, i.e. one
    extra input feature on top of ``input_size``. The LSTM output and the
    context are then fed through ``fc1 -> ReLU -> dropout -> fc2``.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, attention):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.attention = attention

        state_dim = hidden_size * 2  # width of the decoder state and of the context
        self.lstm = nn.LSTM(state_dim + input_size + 1, state_dim, num_layers)
        self.fc1 = nn.Linear(state_dim + state_dim, state_dim)
        self.fc2 = nn.Linear(state_dim, output_size)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode one time step.

        Args:
            input: step features, ``[batch, input_size + 1]``.
            hidden, cell: current LSTM state.
            encoder_outputs: forwarded unchanged to the attention module.

        Returns:
            ``(out, hidden, cell)`` where ``out`` is ``[batch, output_size]``
            and the state tensors are the updated LSTM state.
        """
        step_input = input.unsqueeze(0)  # add the length-1 sequence axis
        context, _ = self.attention(hidden, encoder_outputs)
        lstm_out, (hidden, cell) = self.lstm(
            torch.cat((step_input, context), dim=2), (hidden, cell)
        )
        # Drop the sequence axis again before the fully connected head.
        features = torch.cat((lstm_out.squeeze(0), context.squeeze(0)), dim=1)
        out = self.fc2(self.dropout(F.relu(self.fc1(features))))
        return out, hidden, cell

In [None]:
#Seq2Seq

class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that rolls the decoder out step by step.

    The decoder is fed two features per step: the NDVI value (column 3 of
    the last encoder input, then the previous prediction or the teacher
    forced target) and the DOY (column 0 of the last encoder input, then
    advanced by a fixed increment every step).

    Args:
        encoder: module returning ``(outputs, hidden, cell)``; its final
            states are assumed to come from a bidirectional LSTM.
        decoder: module exposing ``num_layers`` and ``output_size`` and
            called as ``decoder(x, hidden, cell, encoder_outputs)``.
        max_target_length: number of decoding steps.
        timestep: optional fixed DOY increment (number or tensor that
            broadcasts against ``[batch, 1]``). When ``None`` the
            module-level ``tensor_timestep`` is used instead.
    """

    def __init__(self, encoder, decoder, max_target_length, timestep=None):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.max_target_length = max_target_length
        self.timestep = timestep

    def _bridge_state(self, state):
        """Turn a bidirectional encoder state into the decoder's initial state.

        ``nn.LSTM`` orders the final states as (layer 0 forward, layer 0
        backward, layer 1 forward, ...). The two directions of each layer
        are concatenated along the feature axis and the top
        ``decoder.num_layers`` layers are kept.

        Raises:
            ValueError: if the state cannot come from a bidirectional LSTM
                or the decoder has more layers than the encoder.
        """
        directions = 2
        stacked, batch_size, hidden_size = state.shape
        if stacked % directions:
            raise ValueError(
                "encoder state must come from a bidirectional LSTM "
                f"(got {stacked} stacked states)"
            )
        encoder_layers = stacked // directions
        decoder_layers = self.decoder.num_layers
        if decoder_layers > encoder_layers:
            raise ValueError(
                f"decoder has {decoder_layers} layers but the encoder only "
                f"provides {encoder_layers}"
            )
        per_layer = state.reshape(encoder_layers, directions, batch_size, hidden_size)
        merged = torch.cat((per_layer[:, 0], per_layer[:, 1]), dim=2)
        return merged[-decoder_layers:].contiguous()

    def _doy_step(self, reference):
        """Return the per-step DOY increment, placed on ``reference``'s device."""
        if self.timestep is not None:
            return torch.as_tensor(
                self.timestep, dtype=reference.dtype, device=reference.device
            )
        # Fall back to the notebook-level tensor; trimming it to the batch
        # covers a final batch that is smaller than the configured size.
        return tensor_timestep[: reference.shape[0]].to(reference.device)

    def forward(self, source, target=None, teacher_forcing_ratio=0.5):
        """Predict ``max_target_length`` steps.

        Args:
            source: encoder input, ``[src len, batch, features]``.
            target: optional ground truth indexed as ``target[t]``; used for
                teacher forcing (assumed to hold ``batch`` rows per step).
            teacher_forcing_ratio: probability of feeding the ground truth
                instead of the model's own prediction at each step.

        Returns:
            Tensor ``[max_target_length, batch, decoder.output_size]``.
        """
        batch_size = source.shape[1]
        outputs = torch.zeros(
            self.max_target_length,
            batch_size,
            self.decoder.output_size,
            device=source.device,
        )
        encoder_outputs, hidden, cell = self.encoder(source)
        hidden = self._bridge_state(hidden)
        cell = self._bridge_state(cell)

        last_step = source[-1]
        x = last_step[:, 3].unsqueeze(1)  # last encoder NDVI as first decoder input
        previous_timestep = last_step[:, 0].unsqueeze(1)  # last encoder DOY
        doy_step = self._doy_step(previous_timestep)  # constant across steps
        x = torch.cat((x, previous_timestep), dim=1)

        for t in range(self.max_target_length):
            output, hidden, cell = self.decoder(x, hidden, cell, encoder_outputs)
            outputs[t] = output

            teacher_force = random.random() < teacher_forcing_ratio
            if target is not None and teacher_force:
                # reshape keeps the batch axis even when the batch size is 1
                x = target[t].reshape(batch_size, -1)
            else:
                x = output  # next input is the model's own prediction
            # DOY advances by a known, fixed amount, so it is computed
            # rather than predicted.
            previous_timestep = previous_timestep + doy_step
            x = torch.cat((x, previous_timestep), dim=1)
        return outputs

In [None]:
#model parameters

INPUT_DIM = 5  # Input dimension for the encoder
OUTPUT_DIM = 1  # Output dimension for the decoder
# The attention layer is sized from ENC_HID_DIM and the decoder from
# DEC_HID_DIM, so the two values must stay equal.
ENC_HID_DIM = 60  # Hidden dimension for the encoder
DEC_HID_DIM = 60  # Hidden dimension for the decoder
N_LAYERS_E = 2  # Number of layers encoder
N_LAYERS_D = 1  # Number of layers decoder

MAX_TARGET_LENGTH = 3
TEACHER_FORCING_RATIO = 0.5
batch_size = 64

# Fixed increment of feature 0 between two consecutive steps of the first
# training sample (presumably train_inputs is [sample, step, feature] -
# TODO confirm).
timestep = train_inputs[0, 8, 0] - train_inputs[0, 7, 0]
# One increment per batch row; sized from batch_size rather than a
# hard-coded 64 so the two cannot drift apart.
tensor_timestep = torch.ones((batch_size, 1), dtype=torch.float32) * timestep
tensor_timestep = tensor_timestep.to(device)
del timestep
# Initiate the model
encoder = Encoder(INPUT_DIM, ENC_HID_DIM, N_LAYERS_E).to(device)
attention = BahdanauAttention(ENC_HID_DIM).to(device)
decoder = Decoder(OUTPUT_DIM, DEC_HID_DIM, OUTPUT_DIM, N_LAYERS_D, attention).to(device)
model = Seq2Seq(encoder, decoder, MAX_TARGET_LENGTH).to(device)

summary(model)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=4, min_lr=0.00001
)
