In [25]:
#data is from European Parliament proceedings, download here: http://www.statmt.org/europarl/ 
PATH = '/home/tarushii/PythonNotebooks/Other_Projects/Transformer/data/' #set to the path to your data
en_data = open(PATH + 'europarl-v7.fr-en.en', encoding='utf-8').read().split('\n')
fr_data = open(PATH + 'europarl-v7.fr-en.fr', encoding='utf-8').read().split('\n')

print(en_data[0], fr_data[0])

Resumption of the session Reprise de la session


In [2]:
import spacy
import torch 
import torchtext
from torch import nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
en_field = torchtext.data.Field(sequential=True, use_vocab=True, init_token='<sos>', eos_token='<eos>', tokenize='spacy', tokenizer_language='en')
fr_field = torchtext.data.Field(sequential=True, use_vocab=True, init_token = '<sos>', eos_token = '<eos>', tokenize='spacy', tokenizer_language='fr')

In [4]:
#Now, covert the data to a csv file to take advantage of torchtext's versatile TabularDataset class
import pandas as pd
from sklearn.model_selection import train_test_split

raw_data = {'English' : [line for line in en_data], 'French': [line for line in fr_data]}
df = pd.DataFrame(raw_data, columns=["English", "French"])
# create train and validation set 
train, val = train_test_split(df, test_size=0.1)
train.to_csv(PATH + "train.csv", index=False)
val.to_csv(PATH + "val.csv", index=False)


In [28]:
train_set,val_set = torchtext.data.TabularDataset.splits(path=PATH, train='train.csv', validation='val.csv', format='csv', fields=[('English', en_field), ('French', fr_field)])

In [29]:
en_field.build_vocab(train_set, val_set, vectors='glove.6B.100d')
fr_field.build_vocab(train_set, val_set, vectors='glove.6B.100d')

.vector_cache/glove.6B.zip: 862MB [06:32, 2.20MB/s]
100%|█████████▉| 398654/400000 [00:30<00:00, 21900.10it/s]

In [59]:
train_iter, val_iter = torchtext.data.BucketIterator.splits((train_set, val_set), batch_size=2, sort_key=lambda x: len(x.French), shuffle=True)

In [70]:
class Embedder(nn.Module):
    def __init__(self, embedding):
        super().__init__()
        self.embed = nn.Embedding(embedding.shape[0], embedding.shape[1])
        self.embed.weight.data.copy_(embedding)
        self.embed.weight.requires_grad = False
    def forward(self, input_sequence):
        return self.embed(input_sequence)

class Encoder(nn.Module):
    def __init__(self, hidden_size, embedding, num_layers=1, dropout=0.0):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = Embedder(embedding)
        self.gru = nn.GRU(embedding.shape[1], hidden_size, num_layers=num_layers, dropout=dropout, bidirectional=True, batch_first=True)
    def forward(self, input_sequence):
        embedded = self.embedding(input_sequence) 
      #  x = nn.utils.rnn.pack_padded_sequence(x, lens) # unpad
        outputs, hidden_state = self.gru(embedded) # gru returns hidden state of all timesteps as well as hidden state at last timestep
        # pad the sequence to the max length in the batch
     #   output, _ = nn.utils.rnn.pad_packed_sequence(output)
        # The ouput of a GRU has shape (seq_len, batch, hidden_size * num_directions)
        # Because the Encoder is bidirectional, combine the results from the 
        # forward and reversed sequence by simply adding them together.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden_state

class Decoder(nn.Module):
    def __init__(self, batch_size, hidden_size, embedding, num_layers=1, drop_prob=0.1):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.embedding_size = embedding.shape[1]
        self.embedding = Embedder(embedding)
        self.attn = nn.Linear(hidden_size * 2, 1)
        self.gru = nn.GRU(embedding.shape[1]+hidden_size, hidden_size, num_layers=num_layers, dropout=drop_prob, batch_first=True)
        self.classifier = nn.Linear(hidden_size, embedding.shape[0])

    def forward(self, decoder_hidden, encoder_outputs, inputs):
        # Embed input words
        embedded = self.embedding(inputs)
        #Assumed size of decoder_hidden -> (num_layers, batch_size, embedding_size) size of encoder_outputs -> (batch_size, sentence_len, embedding_size)
        #need to convert length of decoder_hidden to (batch_size, sentence_len, embedding_size)
        self.sequence_len = encoder_outputs.shape[1]
        decoder_hidden = torch.sum(decoder_hidden, axis=0)
        attn_inp = decoder_hidden.unsqueeze(1).repeat(1, self.sequence_len,1)
        weights = self.attn(torch.cat((attn_inp, encoder_outputs), dim = 2)).squeeze()
        normalized_weights = F.softmax(weights)
        attn_applied = torch.bmm(normalized_weights.unsqueeze(1), encoder_outputs).sqeeze()
        cat_input = torch.cat((embedded, attn_applied), axis=2)
        output, hidden_state = self.gru(cat_input, decoder_hidden.unsqueeze(0))
        output = self.classifier(output).squeeze()
        return output, hidden_state
  
def train(encoder, decoder, encoder_opt, decoder_opt, criterion, input, target):
    #set both to train moode
    encoder.train()
    decoder.train()
    #pass through encoder
    enc_output, enc_hidden = encoder(target)
    #initialize input to '<sos>' tokends anddecoder hidden state to final encoder hidden state
    dec_input, dec_hidden = target[:, 0].unsqueeze(1), enc_hidden
    loss = 0
    for i in range(1, target.shape[1]):
        dec_input, dec_hidden = decoder(dec_hidden, enc_output, dec_input)
        loss += criterion(dec_input, target[:, i])
        topv, topi = decoder_output.topk(1)
        dec_input = topi.squeeze().detach()  # detach from history as input
    loss.backward()
    encoder_opt.step()
    decoder_opt.step()
    return loss



In [71]:
hidden_size = 200
batch_size = 2
epochs = 1
encoder = Encoder(hidden_size, en_field.vocab.vectors)#.to(device)
decoder = Decoder(batch_size, hidden_size, fr_field.vocab.vectors)#.to(device)
encoder_opt = torch.optim.Adam([param for param in encoder.parameters() if param.requires_grad == True], lr=1.0e-4)
decoder_opt = torch.optim.Adam([param for param in decoder.parameters() if param.requires_grad == True], lr=1.0e-4)
criterion = nn.CrossEntropyLoss(ignore_index=1)

In [72]:
it = iter(train_iter)
batch = next(it)
#set both to train moode
#encoder.train()
#decoder.train()
#pass through encoder
input = batch.English.t()#.to(device)
target = batch.French.t()#.to(device)
enc_output, enc_hidden = encoder(target)
#initialize input to '<sos>' tokends anddecoder hidden state to final encoder hidden state
dec_input, dec_hidden = target[:, 0].unsqueeze(1), enc_hidden
loss = 0
for i in range(1, target.shape[1]):
    dec_input, dec_hidden = decoder(dec_hidden, enc_output, dec_input)
    loss += criterion(dec_input, target[:, i])
    topv, topi = dec_input.topk(1)
    dec_input = topi.squeeze().detach()  # detach from history as input
loss.backward()
encoder_opt.step()
decoder_opt.step()
print(loss)

torch.Size([2, 1, 100]) torch.Size([2, 1, 200])
torch.Size([2, 100]) torch.Size([2, 1, 200])


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [47]:
for i in range(epochs):
    losses = []
    print('Epoch %x:' % i, end='')
    c = 0
    for batch in train_iter:
        input = batch.English.t().to(device)
        target = batch.French.t().to(device)
        losses.append(train(encoder, decoder, encoder_opt, decoder_opt, criterion, input, target))
        if (c % 10 == 0):
            print('.', end='')
        c+=1
    print(' loss: ', sum(losses)/len(losses))



Epoch 0:

TypeError: object of type 'function' has no len()