In [4]:
import warnings
import torch
from torch import nn
import numpy as np
import codecs
from torch.utils.data import DataLoader, Dataset
!pip install torchtext==0.8.0 
from torchtext.data.functional import generate_sp_model,load_sp_model, sentencepiece_numericalizer, sentencepiece_tokenizer
from torchtext import data
import spacy
import torchtext
import copy
import sys
from nltk.tokenize import word_tokenize
import nltk
import torch.nn.functional as F
import random
nltk.download('punkt')



[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# The Field class is going to be deprecated soon, so there are constant warnings about it; this is to shut them down.
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()
!nvidia-smi
torch.cuda.is_available()

Tue Dec 22 22:06:18 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 460.27.04    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    29W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

True

In [6]:
# Our tokenizer
def tokenizer(txt):
    """Lower-case ``txt`` and split it into tokens with NLTK's ``word_tokenize``.

    Args:
        txt: Raw sentence string.

    Returns:
        The token list produced by ``word_tokenize`` for the lower-cased text.
    """
    return word_tokenize(txt.lower())

In [7]:
# Defining the source and target fields with the custom tokenizer.
# Both fields are batch-first; the target field additionally wraps every
# sentence in the '<s>' (start) and '</s>' (end) markers.
src_sentence_field = data.Field(sequential=True, tokenize=tokenizer, batch_first=True)
trg_sentence_field = data.Field(sequential=True, tokenize=tokenizer,
                           init_token='<s>', eos_token='</s>', batch_first=True)

In [11]:
# Loading sentence pairs using TabularDataset: each TSV row provides the
# 'src' column followed by the 'trg' column.
dataset_translation = data.TabularDataset(path='~/../cons13411/trainset.txt', format='TSV',
                                         fields=[('src',src_sentence_field), ('trg', trg_sentence_field)],
                                         )
# Split into train / validation / test subsets with relative sizes 0.8 / 0.1 / 0.1.
# NOTE(review): no random_state is passed, so the split is presumably not
# reproducible across kernel restarts — confirm.
trainset, valset, testset = dataset_translation.split(split_ratio=[0.8,0.1,0.1])

In [12]:
# Sanity check: print the source side of every example whose target is
# missing (no 'trg' attribute) or empty, so malformed rows are visible.
# getattr with a default replaces the former bare `except:`, which would also
# have hidden unrelated errors.
for example in dataset_translation:
    if not getattr(example, 'trg', None):
        print(getattr(example, 'src', None))
       

In [13]:
# Building the vocabularies and the BucketIterator for each split.
# NOTE(review): both vocabularies are built from the full dataset_translation,
# which also contains the validation and test examples — confirm this is intended.
src_sentence_field.build_vocab(dataset_translation)
trg_sentence_field.build_vocab(dataset_translation)
# Batch sizes are 128 / 64 / 32 for train / val / test; examples are bucketed
# by the length of their source token list.
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits((trainset,valset,testset), (128,64,32),
                                                                         sort_key= lambda x:len(x.src))

In [14]:
#Encoder Class
class Encoder(nn.Module):
    """GRU encoder: embeds token ids, applies dropout and runs a batch-first GRU."""

    def __init__(self, num_layer, num_hidden_size, embed_size, vocab_size, gpu , dropout = 0.2):
        """Store the configuration and build the layers.

        Args:
            num_layer: Number of stacked GRU layers.
            num_hidden_size: Size of the GRU hidden state.
            embed_size: Size of the token embeddings.
            vocab_size: Number of rows in the embedding table.
            gpu: When truthy, ``init_hidden`` returns a CUDA tensor.
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()

        # Plain configuration values.
        self.num_layer = num_layer
        self.hidden_size = num_hidden_size
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.gpu = gpu
        self.dropout_v = dropout

        # Layers (embedding is created before the GRU, as before).
        self.dropout = nn.Dropout(self.dropout_v)
        self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
        self.GRU = nn.GRU(
            self.embed_size,
            self.hidden_size,
            self.num_layer,
            batch_first=True,
        )

    def forward(self, input):
        """Encode a batch of token ids.

        Args:
            input: Integer tensor of token ids; the GRU is batch-first.

        Returns:
            The ``(output, hidden)`` pair returned by the GRU.
        """
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.GRU(embedded)
        return output, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape (num_layer, batch_size, hidden_size).

        The tensor is moved to CUDA when the encoder was created with ``gpu`` set.
        """
        zeros = torch.zeros(self.num_layer, batch_size, self.hidden_size)
        return zeros.cuda() if self.gpu else zeros

In [15]:
#Decoder Class
class Decoder(nn.Module):
    """GRU decoder: embeds the previous token, runs a GRU step, projects to vocab logits."""

    def __init__(self, num_layer, num_hidden_size, embed_size, vocab_size, gpu, dropout=0.0):
        """Store the configuration and build the layers.

        Args:
            num_layer: Number of stacked GRU layers.
            num_hidden_size: Size of the GRU hidden state.
            embed_size: Size of the token embeddings (the GRU input size).
            vocab_size: Size of the target vocabulary (embedding rows and output logits).
            gpu: Stored on the instance; not used by the decoder itself.
            dropout: Accepted for interface compatibility; currently unused.
        """
        super(Decoder, self).__init__()

        self.num_layer = num_layer
        self.hidden_size = num_hidden_size
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.gpu = gpu
        self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
        # The GRU consumes the embeddings, so its input size must be embed_size.
        # (It used to be hidden_size, which only worked when both were equal.)
        self.GRU = nn.GRU(self.embed_size, self.hidden_size, self.num_layer,
                          batch_first=True)
        self.linear = nn.Linear(self.hidden_size, self.vocab_size)

    def forward(self, input, hidden):
        """Run the decoder on the given input tokens.

        Args:
            input: Token ids, either 1-D (one token per batch element) or 2-D
                batch-first. A 1-D input gets a length-1 time axis added.
            hidden: Hidden state passed to the GRU.

        Returns:
            ``(decoder_output, hidden)``: the vocabulary logits for every time
            step and the updated GRU hidden state.
        """
        if input.dim() == 1:
            input = input.unsqueeze(1)

        embeddings = F.relu(self.embedding(input))
        output, hidden = self.GRU(embeddings, hidden)
        decoder_output = self.linear(output)
        return decoder_output, hidden

In [53]:
def train(src_seq, encoder, trg_seq, decoder, criterion, trg_vocab, gpu=False, teacher_forcing_ratio = 0.5):
    """Run one forward pass over a batch and return the summed decoding loss.

    The source batch is encoded, the encoder's final hidden state initialises
    the decoder, and the decoder is unrolled one token at a time. At step ``t``
    the decoder is trained to predict ``trg_seq[:, t + 1]``, for every
    ``t`` in ``0 .. trg_len - 2`` (i.e. every target position after the first).

    Args:
        src_seq: Batch-first tensor of source token ids.
        encoder: Module returning ``(output, hidden)`` for ``src_seq``.
        trg_seq: Batch-first tensor of target token ids (batch, trg_len).
        decoder: Module called as ``decoder(input, hidden)`` and returning
            ``(logits, hidden)`` with logits of shape (batch, 1, vocab).
        criterion: Loss taking ``(batch, vocab)`` logits and ``(batch,)`` targets.
        trg_vocab: Target vocabulary; ``trg_vocab.stoi['<s>']`` is the start token id.
        gpu: Kept for backward compatibility. The start tokens are now created
            on ``trg_seq``'s device, so no explicit transfer is needed.
        teacher_forcing_ratio: Probability that the whole batch is decoded with
            teacher forcing (ground-truth tokens fed back) instead of the
            decoder's own predictions.

    Returns:
        The sum of the per-step losses (a tensor once at least one step ran;
        the integer ``0`` if ``trg_seq`` has fewer than two time steps).
    """
    _, encoder_hidden = encoder(src_seq)
    decoder_hidden = encoder_hidden

    batch_size, trg_len = trg_seq.shape[0], trg_seq.shape[1]
    decoder_input = torch.full((batch_size, 1), trg_vocab.stoi['<s>'],
                               dtype=torch.long, device=trg_seq.device)

    # One decision per batch, as before.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    batch_loss = 0
    # Iterate over time steps. The previous loop iterated over
    # len(target_in_label), which is the batch size, and stopped one step
    # before the final target position.
    for step in range(trg_len - 1):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        tg_word = trg_seq[:, step + 1]
        # squeeze(1) removes only the time axis, so a batch of size 1 keeps
        # its batch dimension.
        batch_loss += criterion(decoder_output.squeeze(1), tg_word)

        if use_teacher_forcing:
            decoder_input = tg_word.unsqueeze(1)
        else:
            # Greedy choice; log_softmax is monotonic, so the top-1 index of
            # the raw logits is the same token.
            decoder_input = decoder_output.detach().topk(1, dim=2)[1].squeeze(2)

    return batch_loss


In [54]:
def evaluate( encoder, decoder, eval_iterator, criterion, trg_vocab, epoch, gpu=False, max_len=40):
    """Greedy-decode every batch of ``eval_iterator`` and report loss and accuracy.

    Both models are switched to eval mode for the duration of the call and
    back to train mode before returning. Decoding is free-running (the
    decoder's own prediction is fed back) for at most ``max_len`` steps and
    never beyond the target length.

    Args:
        encoder: Module returning ``(output, hidden)`` for a source batch.
        decoder: Module called as ``decoder(input, hidden)`` and returning
            ``(logits, hidden)`` with logits of shape (batch, 1, vocab).
        eval_iterator: Iterable of batches exposing ``.src`` and ``.trg``
            batch-first tensors.
        criterion: Loss taking ``(batch, vocab)`` logits and ``(batch,)`` targets.
        trg_vocab: Target vocabulary; ``stoi`` must contain '<s>' and '<pad>'.
        epoch: Current epoch number; token accuracy is only computed when
            ``epoch % 5 == 0``.
        gpu: When truthy, each batch is moved to CUDA before decoding.
        max_len: Maximum number of decoding steps per batch.

    Returns:
        ``(mean_loss, accuracy, predictions)`` where ``mean_loss`` is the mean
        over batches of the per-step average loss (NaN if no batch was
        evaluated), ``accuracy`` is the fraction of non-padding target tokens
        predicted correctly (``np.inf`` when it was not computed), and
        ``predictions`` is the list of per-step predicted id tensors of the
        last evaluated batch.
    """
    encoder.eval()
    decoder.eval()

    compute_accuracy = epoch % 5 == 0
    sos_index = trg_vocab.stoi['<s>']
    # torchtext's default padding token is lower-case '<pad>'. The former
    # '<PAD>' key is not the default token, so padding was not excluded.
    pad_index = trg_vocab.stoi['<pad>']

    losses = []
    predictions = []
    total_words = 0
    total_correct = 0
    accu = np.inf

    with torch.no_grad():
        for batch in eval_iterator:
            src_batch = batch.src
            trg_batch = batch.trg

            if gpu:
                src_batch = src_batch.cuda()
                trg_batch = trg_batch.cuda()

            # A 1-D batch has no time axis to decode over.
            if src_batch.dim() == 1:
                continue

            target_out_label = trg_batch[:, 1:]
            num_steps = min(max_len, target_out_label.shape[1])
            if num_steps <= 0:
                continue

            _, decoder_hidden = encoder(src_batch)
            decoder_input = torch.full((trg_batch.shape[0], 1), sos_index,
                                       dtype=torch.long, device=trg_batch.device)

            predictions = []
            batch_loss = 0.0
            for step in range(num_steps):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                tg_word = target_out_label[:, step]
                # squeeze(1) keeps the batch axis even for a batch of size 1.
                batch_loss += criterion(decoder_output.squeeze(1), tg_word).item()

                # Greedy choice, fed back as the next decoder input.
                decoder_input = decoder_output.topk(1, dim=2)[1].squeeze(2)
                predicted = decoder_input.squeeze(1)
                predictions.append(predicted)

                if compute_accuracy:
                    not_pad = tg_word != pad_index
                    total_words += int(not_pad.sum())
                    total_correct += int(((predicted == tg_word) & not_pad).sum())

            # Average over the steps actually decoded, not over max_len.
            losses.append(batch_loss / num_steps)

    if compute_accuracy and total_words > 0:
        accu = total_correct / total_words

    encoder.train()
    decoder.train()

    mean_loss = sum(losses) / len(losses) if losses else float('nan')
    return mean_loss, accu, predictions


In [16]:
# Hyperparameter settings shared by the Encoder and the Decoder.
# Number of stacked GRU layers.
num_layer = 1
# Size of the GRU hidden state.
num_hidden_size = 64
# Size of the token embeddings.
embed_size = 64
# Vocabulary sizes, taken from the vocabularies built on the two fields.
encode_vocab_size = len(src_sentence_field.vocab)
decoder_vocab_size = len(trg_sentence_field.vocab)
#encoder_dropout = 0.2

In [1]:
#Train Procedure
# Builds a fresh encoder/decoder pair, trains it for `epoch` epochs and keeps
# the checkpoint with the lowest validation loss.
gpu_device = torch.cuda.is_available()

# Seed every RNG involved; Python's `random` decides teacher forcing in train().
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed(1)
encoder = Encoder(num_layer,num_hidden_size,embed_size,encode_vocab_size, gpu_device)
decoder = Decoder(num_layer,num_hidden_size,embed_size,decoder_vocab_size, gpu_device)

if gpu_device:
    encoder.cuda()
    decoder.cuda()

# The loss is computed on target tokens, so the ignored index must be the
# target field's padding index (the previous lookup used the source vocabulary
# with the key '<PAD>', which is not the field's padding token).
pad_index = trg_sentence_field.vocab.stoi[trg_sentence_field.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

optimencoder = torch.optim.Adam(encoder.parameters(),lr=0.001, weight_decay=0.0, betas=(0.9, 0.999),
                         eps=1e-8, amsgrad=False)
optimdecoder = torch.optim.Adam(decoder.parameters(),lr=0.001, weight_decay=0.0, betas=(0.9, 0.999),
                         eps=1e-8, amsgrad=False)

epoch = 100

least_loss = np.inf
train_loss = []   # per-epoch mean training loss
eval_losses = []  # per-epoch validation loss

for i in range(epoch):
    print('Ep {:4d}'.format(i), end='')
    losses = []

    # evaluate() also restores train mode; set it explicitly for the first epoch.
    encoder.train(mode = True)
    decoder.train(mode = True)

    for batch in train_iterator:

        src_batch = batch.src
        trg_batch = batch.trg

        # A 1-D batch has no time axis to decode over.
        if len(src_batch.size()) == 1:
            continue

        if gpu_device:
            src_batch = src_batch.cuda()
            trg_batch = trg_batch.cuda()

        encoder.zero_grad()
        decoder.zero_grad()

        batch_loss = train(src_batch, encoder, trg_batch, decoder, criterion,
                           trg_sentence_field.vocab, gpu=gpu_device, teacher_forcing_ratio=0.5)

        batch_loss.backward()
        optimencoder.step()
        optimdecoder.step()

        # Normalise the summed loss by the target length of this batch.
        losses.append(batch_loss.item()/trg_batch.size(1))

    epoch_loss = np.sum(losses)/len(losses) if losses else float('nan')
    train_loss.append(epoch_loss)
    print(' |Train loss {:4f}'.format(epoch_loss), end='')

    eval_loss, eval_accu, predictions = evaluate(encoder, decoder, val_iterator,
                                                       criterion, trg_sentence_field.vocab,
                                                       i, gpu=gpu_device)
    eval_losses.append(eval_loss)
    print(' |Evaluation loss {:4f}'.format(eval_loss), end='')
    print(' |Eval Acc {:4f}'.format(eval_accu), end='')

    if least_loss > eval_loss :
        least_loss = eval_loss
        # NOTE(review): the checkpoint-loading cell later in this notebook reads
        # './encoder_noattention.pth' / './decoder_noattention.pth', not these
        # '_sgd' files — confirm which name is intended.
        torch.save(encoder.state_dict(), './encoder_noattention_sgd.pth')
        torch.save(decoder.state_dict(), './decoder_noattention_sgd.pth')
        best_encoder = copy.deepcopy(encoder)
        best_decoder = copy.deepcopy(decoder)
        print('|Saved\n')
    else:
        print('\n')

NameError: name 'torch' is not defined

In [21]:
# Transferring a tensor to a sentence (BPE encoding)
def denumericalization(vocab, tensor):
    """Turn a sequence of token ids into a space-separated sentence.

    Ids 1, 2 and 3 are skipped; every other id is looked up in ``vocab.itos``.

    Args:
        vocab: Object exposing an ``itos`` index-to-string sequence.
        tensor: Iterable of token ids.

    Returns:
        The looked-up tokens joined by single spaces, stripped of surrounding
        whitespace.
    """
    skipped_ids = (1, 2, 3)
    words = [vocab.itos[index] for index in tensor if index not in skipped_ids]
    return ' '.join(words).strip()

In [22]:
def sampling(encoder,decoder, input_sequence, src_sentence_field, trg_sentence_field, temperature = 0.0 ):
    """Translate one raw sentence and return the predicted target token ids.

    The sentence is tokenized and numericalized with the source field, then
    decoded by ``prediction`` starting from the target '<s>' token until the
    target '</s>' token is produced. The predicted ids are also printed.

    Args:
        encoder: Trained encoder module.
        decoder: Trained decoder module.
        input_sequence: Raw source sentence.
        src_sentence_field: Field used to tokenize and numericalize the source.
        trg_sentence_field: Field whose vocabulary supplies '<s>' and '</s>'.
        temperature: Sampling temperature forwarded to ``prediction``.

    Returns:
        The list of predicted target token ids.
    """
    trg_stoi = trg_sentence_field.vocab.stoi

    source_tokens = src_sentence_field.tokenize(input_sequence)
    source_ids = src_sentence_field.numericalize([source_tokens])

    # A 1x1 tensor holding the start-of-sentence id primes the decoder.
    start_input = torch.tensor(np.full((1, 1), trg_stoi['<s>']))
    end_id = trg_stoi['</s>']

    decoded = prediction(encoder, decoder, source_ids, start_input, end_id,
                         temperature=temperature)
    print(decoded)
    return decoded

In [23]:
    def prediction(encoder, decoder, input_seq, target_seq, EOS, temperature=0.0, max_len=100):
        """Decode a single source sequence token by token.

        Args:
            encoder: Encoder module returning ``(output, hidden)``.
            decoder: Decoder module called as ``decoder(input, hidden)``.
            input_seq: Source token ids for one sentence (batch of size 1).
            target_seq: 1x1 tensor holding the first decoder input token.
            EOS: Id of the end-of-sentence token; decoding stops once it is produced.
            temperature: Temperature forwarded to ``logsoftmax_sample``.
            max_len: Number of additional decoding steps allowed after the
                first one (the previous hard-coded limit was 100).

        Returns:
            The list of predicted token ids, including the EOS id when it was
            produced.
        """
        encoder.eval()
        decoder.eval()

        # Run on the device the model lives on rather than on CUDA whenever it
        # merely happens to be available.
        first_param = next(encoder.parameters(), None)
        device = first_param.device if first_param is not None else input_seq.device

        target_tokens = []
        with torch.no_grad():
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            _, decoder_hidden = encoder(input_seq)

            # One initial step plus at most max_len further steps.
            for _ in range(max_len + 1):
                decoder_output, decoder_hidden = decoder(target_seq, decoder_hidden)
                symbol = logsoftmax_sample(decoder_output, temperature=temperature)
                token = symbol.view(-1)[0].item()
                target_tokens.append(token)
                if token == EOS:
                    break
                # Feed the chosen token back as the next decoder input.
                target_seq = torch.full((1, 1), token, dtype=torch.long, device=device)

        return target_tokens
        

    def logsoftmax_sample(logits, temperature=1.0):
        """Pick one index per position by adding scaled Gumbel noise to the log-probabilities.

        Noise ``g = -log(-log(u))`` with ``u`` uniform in (1e-6, 1 - 1e-6) is
        drawn with NumPy, multiplied by ``temperature`` and added to
        ``log_softmax(logits)``; the top-1 index along the last dimension is
        returned. With ``temperature=0`` the noise vanishes and the result is
        the plain arg-max.

        Args:
            logits: Tensor of unnormalised scores; the last dimension is the
                one sampled over.
            temperature: Scale applied to the noise.

        Returns:
            Integer tensor of chosen indices, shaped like ``logits`` with a
            last dimension of size 1.
        """
        u = np.random.uniform(low=1e-6, high=1.0 - 1e-6, size=logits.shape)
        # Keep the noise on the same device as the logits (previously it was
        # moved to CUDA whenever CUDA was available, even for CPU logits).
        g = torch.from_numpy(-np.log(-np.log(u))).to(logits.device)
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        perturbed = log_probs + g * temperature

        return perturbed.topk(1)[1]

In [17]:
# Rebuild the models and load the saved checkpoints for inference.
gpu_device = torch.cuda.is_available()
device = torch.device('cuda' if gpu_device else 'cpu')

np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed(1)
encoder = Encoder(num_layer,num_hidden_size,embed_size,encode_vocab_size, gpu_device)
decoder = Decoder(num_layer,num_hidden_size,embed_size,decoder_vocab_size, gpu_device)

if gpu_device:
    encoder.cuda()
    decoder.cuda()

# The loss is computed on target tokens, so the ignored index must be the
# target field's padding index (the previous lookup used the source vocabulary
# with the key '<PAD>', which is not the field's padding token).
pad_index = trg_sentence_field.vocab.stoi[trg_sentence_field.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
encoder.load_state_dict(torch.load('./encoder_noattention.pth', map_location=device))
decoder.load_state_dict(torch.load('./decoder_noattention.pth', map_location=device))

<All keys matched successfully>

In [36]:
# Translate one example sentence with temperature 0.3, then convert the
# predicted token ids back into text with the target vocabulary.
predicted = sampling(encoder, decoder, 'We do not know what is happening.', src_sentence_field, trg_sentence_field, temperature=0.3)
denumericalization(trg_sentence_field.vocab, predicted)

[52, 14, 19, 4, 101, 18, 4, 6, 3]


'wenn wir nicht , sondern daß , .'

In [37]:
from nltk.translate.bleu_score import sentence_bleu

In [52]:
# Score a hand-tokenized model translation against one reference translation.
# NOTE(review): weights=(0,1,0,0) presumably puts all weight on the second
# n-gram order (bigrams) — confirm this is the intended BLEU variant.
trans = ['wenn', 'wir', 'nicht',  ',', 'sondern', 'daß', ',' ,'.']
ref = [['wir', 'wissen', 'nicht', ',', 'was', 'passiert', '.']]
sentence_bleu(ref, trans, weights=(0,1,0,0))

0.14285714285714285