# text2rdf

## Imports

In [1]:
# General purpose
import os
import glob
import random
import xml.etree.ElementTree as ET

# PyTorch 
import torch
import torch.nn as nn
from torch import optim

# Bert
from transformers import BertTokenizer, BertModel, BertConfig
#Bleu score
from nltk.translate.bleu_score import corpus_bleu

## Preprocessing

### Select Data-sub-set (given restricted nr of triples per sentence)

In [2]:
# How many triples to train and test system on (min: 1, max: 7)
MIN_NUM_TRIPLES = 1
MAX_NUM_TRIPLES = 2

In [3]:
# Set paths where to retrieve data from
DS_BASE_PATH = './WebNLG/'

TRAIN_PATH = DS_BASE_PATH + 'train/'
TEST_PATH = DS_BASE_PATH + 'dev/'

TRAIN_DIRS = [ TRAIN_PATH + str(i) + 'triples/' for i in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1) ]
TEST_DIRS  = [ TEST_PATH  + str(i) + 'triples/' for i in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1) ]

In [4]:
# Print selected directories
print('Train dirs:', TRAIN_DIRS)
print('Test  dirs:', TEST_DIRS)

Train dirs: ['./WebNLG/train/1triples/', './WebNLG/train/2triples/']
Test  dirs: ['./WebNLG/dev/1triples/', './WebNLG/dev/2triples/']


### Load Data

#### Load Settings (do not touch)

In [5]:
originaltripleset_index = 0  # Index of 'originaltripleset' attribute in XML entry
modifiedtripleset_index = 1  # Index of 'modifiedtripleset' attribute in XML entry
first_lexical_index = 2      # Index as of which verbalizations of RDF triples start in entry

#### Train Data

In [6]:
# Usage of train: train[target_nr_triples][element_id]['target_attribute']
train = [[] for i in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)]

# Documents how many entries there are per number of triples
train_stats = [0 for i in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)]

# Iterate through all files per number of triples and per category and load data
for i, d in enumerate(TRAIN_DIRS):
    nr_triples = list(range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1))[i]
    
    for filename in glob.iglob(d+'/**', recursive=False):
        if os.path.isfile(filename): # Filter dirs
            #print('File:', filename)
            
            tree = ET.parse(filename)
            root = tree.getroot()
            
            entries = root[0]
            train_stats[nr_triples-MIN_NUM_TRIPLES] += len(entries)
            
            for entry in entries:
                #print('Original triple set: ', entry[originaltripleset_index])
                #print('Modified triple set: ', entry[modifiedtripleset_index])
                
                modified_triple_set = entry[modifiedtripleset_index]
                
                for triple in modified_triple_set:
                    
                    verbalizations = entry[first_lexical_index:]
                
                    for verbalization in verbalizations:
                        #print('Text:', verbalization, verbalization.tag, verbalization.attrib, verbalization.text)
                        #print('Trip:', triple, triple.tag, triple.attrib, triple.text)
                        
                        train[i].append({ 'category': entry.attrib['category'],
                                          'id': entry.attrib['eid'],
                                          'triple_cnt': nr_triples,
                                          'text': verbalization.text,
                                          'triple': [text.strip() for text in triple.text.split('|')]
                                        })
                        
print(train)
print(train_stats)

[[{'category': 'Airport', 'id': 'Id1', 'triple_cnt': 1, 'text': 'The Aarhus is the airport of Aarhus, Denmark.', 'triple': ['Aarhus_Airport', 'cityServed', '"Aarhus, Denmark"']}, {'category': 'Airport', 'id': 'Id1', 'triple_cnt': 1, 'text': 'Aarhus Airport serves the city of Aarhus, Denmark.', 'triple': ['Aarhus_Airport', 'cityServed', '"Aarhus, Denmark"']}, {'category': 'Airport', 'id': 'Id2', 'triple_cnt': 1, 'text': 'Aarhus airport serves the city of Aarhus.', 'triple': ['Aarhus_Airport', 'cityServed', 'Aarhus']}, {'category': 'Airport', 'id': 'Id3', 'triple_cnt': 1, 'text': 'Aarhus Airport is 25 metres above sea level.', 'triple': ['Aarhus_Airport', 'elevationAboveTheSeaLevel_(in_metres)', '25.0']}, {'category': 'Airport', 'id': 'Id3', 'triple_cnt': 1, 'text': 'Aarhus airport is at an elevation of 25 metres above seal level.', 'triple': ['Aarhus_Airport', 'elevationAboveTheSeaLevel_(in_metres)', '25.0']}, {'category': 'Airport', 'id': 'Id3', 'triple_cnt': 1, 'text': 'Aarhus Airport


[1788, 1208]


#### Test Data

In [7]:
# Usage of test: test[target_nr_triples][element_id]['target_attribute']
test = [[] for i in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)]

# Documents how many entries there are per number of triples
test_stats = [0 for i in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)]

# Iterate through all files per number of triples and per category and load data
for i, d in enumerate(TEST_DIRS):
    nr_triples = list(range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1))[i]
    
    for filename in glob.iglob(d+'/**', recursive=False):
        if os.path.isfile(filename): # Filter dirs
            #print('File:', filename)
            
            tree = ET.parse(filename)
            root = tree.getroot()
            
            entries = root[0]
            test_stats[nr_triples-MIN_NUM_TRIPLES] += len(entries)
            
            for entry in entries:
                #print('Original triple set: ', entry[originaltripleset_index])
                #print('Modified triple set: ', entry[modifiedtripleset_index])
                
                modified_triple_set = entry[modifiedtripleset_index]
                
                for triple in modified_triple_set:
                    
                    verbalizations = entry[first_lexical_index:]
                
                    for verbalization in verbalizations:
                        #print('Text:', verbalization, verbalization.tag, verbalization.attrib, verbalization.text)
                        #print('Trip:', triple, triple.tag, triple.attrib, triple.text)
                        
                        test[i].append({ 'category': entry.attrib['category'],
                                          'id': entry.attrib['eid'],
                                          'triple_cnt': nr_triples,
                                          'text': verbalization.text,
                                          'triple': [text.strip() for text in triple.text.split('|')]
                                        })
                        
print(test)
print(test_stats)

[[{'category': 'Airport', 'id': 'Id1', 'triple_cnt': 1, 'text': 'The leader of Aarhus is Jacob Bundsgaard.', 'triple': ['Aarhus', 'leaderName', 'Jacob_Bundsgaard']}, {'category': 'Airport', 'id': 'Id2', 'triple_cnt': 1, 'text': "Aarhus Airport's runway length is 2702.0.", 'triple': ['Aarhus_Airport', 'runwayLength', '2702.0']}, {'category': 'Airport', 'id': 'Id2', 'triple_cnt': 1, 'text': 'The Aarhus Airport has a runway length of 2702.0.', 'triple': ['Aarhus_Airport', 'runwayLength', '2702.0']}, {'category': 'Airport', 'id': 'Id3', 'triple_cnt': 1, 'text': '\n        ', 'triple': ['Adirondack_Regional_Airport', 'elevation', '"507.0"^^xsd:double']}, {'category': 'Airport', 'id': 'Id3', 'triple_cnt': 1, 'text': 'Adirondack Regional Airport is 507 metres above sea level.', 'triple': ['Adirondack_Regional_Airport', 'elevation', '"507.0"^^xsd:double']}, {'category': 'Airport', 'id': 'Id4', 'triple_cnt': 1, 'text': 'Adirondack Regional airport is located at Harrietstown, New York.', 'triple


[226, 152]


#### Spilt Train Data into Train and Dev (for intermindiate validation throughout training)

In [8]:
# Percentage of train data reserved for validation throughout training
dev_percentage = 0.15

In [9]:
# Init dev dataset
dev = [[] for i in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)]

# Sample number of dev instances per number of triples
dev_stats = [int(dev_percentage * train_stats[i]) for i in range(0, MAX_NUM_TRIPLES+1-MIN_NUM_TRIPLES)]

print('Samples per nr of triples:', dev_stats)

# Sample indices to be reserved for dev dataset for each nr of triples
dev_indices = [random.sample(range(0, len(train[i])), dev_stats[i]) for i in range(0, MAX_NUM_TRIPLES+1-MIN_NUM_TRIPLES)]
for i in range(len(dev_indices)):
    dev_indices[i].sort(reverse=True)

# Copy selected dev-entries into dev & delete all duplicates/related entries from train dataset
for nr_triples in range(0, MAX_NUM_TRIPLES+1-MIN_NUM_TRIPLES):
    
    # Iterate through all indices reserved for validation set (per nr of triples)
    for index in dev_indices[nr_triples]:
        
        # Select index'th train entry (to become dev/validation data)
        selected_entry = train[nr_triples][index]
        
        # Extract indentifying attributes
        entrys_category = selected_entry['category']
        entrys_idx = selected_entry['id']
        
        # Put selected entry into dev set
        dev[nr_triples].append(selected_entry)
        
        # Find all entries of matching index & category and remove them from train data
        for entry in train[nr_triples]:
            if entry['id'] == entrys_idx and entry['category'] == entrys_category:
                train[nr_triples].remove(entry)
                
print(dev)

Samples per nr of triples: [268, 181]


[[{'category': 'WrittenWork', 'id': 'Id215', 'triple_cnt': 1, 'text': "Weymouth Sands' author was John Cowper Powys.", 'triple': ['Weymouth_Sands', 'author', 'John_Cowper_Powys']}, {'category': 'WrittenWork', 'id': 'Id211', 'triple_cnt': 1, 'text': '\n        ', 'triple': ['United_States', 'leader', 'Barack_Obama']}, {'category': 'WrittenWork', 'id': 'Id208', 'triple_cnt': 1, 'text': 'In the United States one of the ethnic groups is African American.', 'triple': ['United_States', 'ethnicGroup', 'African_Americans']}, {'category': 'WrittenWork', 'id': 'Id206', 'triple_cnt': 1, 'text': 'The capital city of the United States is Washington D.C.', 'triple': ['United_States', 'capital', 'Washington,_D.C.']}, {'category': 'WrittenWork', 'id': 'Id205', 'triple_cnt': 1, 'text': '\n        ', 'triple': ['United_Kingdom', 'leaderName', 'David_Cameron']}, {'category': 'WrittenWork', 'id': 'Id194', 'triple_cnt': 1, 'text': 'The Polish Academy of Sciences serves Poland.', 'triple': ['Polish_Academy_

#### Print Stats

In [10]:
print('Minimal number of triples:', MIN_NUM_TRIPLES)
print('Maximal number of triples:', MAX_NUM_TRIPLES)

print()

print('Training: ')
for nr_triples in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1):
    print('Given %i triples per sentence: ' % nr_triples)
    print('Number of combinations of triples and verbalizations:', len(train[nr_triples-MIN_NUM_TRIPLES]))

print()

print('Dev: ')
for nr_triples in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1):
    print('Given %i triples per sentence: ' % nr_triples)
    print('Number of combinations of triples and verbalizations:', len(dev[nr_triples-MIN_NUM_TRIPLES]))

print()

print('Testing: ')
for nr_triples in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1):
    print('Given %i triples per sentence: ' % nr_triples)
    print('Number of combinations of triples and verbalizations:', len(test[nr_triples-MIN_NUM_TRIPLES]))

Minimal number of triples: 1
Maximal number of triples: 2

Training: 
Given 1 triples per sentence: 
Number of combinations of triples and verbalizations: 4191
Given 2 triples per sentence: 
Number of combinations of triples and verbalizations: 7025

Dev: 
Given 1 triples per sentence: 
Number of combinations of triples and verbalizations: 268
Given 2 triples per sentence: 
Number of combinations of triples and verbalizations: 181

Testing: 
Given 1 triples per sentence: 
Number of combinations of triples and verbalizations: 575
Given 2 triples per sentence: 
Number of combinations of triples and verbalizations: 924


# Neural Machine Translation (NMT) Model Definition 

## TODO: needs updating

## Idea:
1. Encoder: 
1.1 Input==Word Embedding; 
1.2 Output==Context Vector (that is: Encoding of sentence; contained in hidden state after having observed last embedding)

2. Decoder:
2.1 Input==Context Vector
2.2 Output==Probability distribution over output vocab

3. Seq2Seq model: Combining the two

## BERT Encoder

#### Tokenizer

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#### Model

In [12]:
bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)#.to('cuda')

## Decoder

### Soft Attention Model
This model implements the Soft Attention model presented in http://proceedings.mlr.press/v37/xuc15.pdf. 
1. Attention energies (i.e. energy per annotation vector) get computed: $e_{ti}=f_{att}(a_i,h_{t−1})$. Note that this formula implies that the Decoder's previous hidden state $h_{t-1}$ needs to be appended to each individual annotation vector $a_i$ before feeding their concatenation through a fully-connected layer $f_{att}$. 
2. Attention weights $\alpha$ get computed from the aforementioned energies: $\alpha_t = softmax(e_t)$, where $\alpha_{ti} = \frac{exp(e_{ti})}{\sum^L_{k=1} exp(e_{tk})}$.

Note: $t$ stands for time, while $i$ identifies the particular annotation vector currently under consideration.

In [13]:
class SoftAttention(nn.Module):
    
    def __init__(self, 
                 annotation_size,  # Tuple: (num_annotations, num_features_per_annotation)
                 hidden_len        # Number of nodes in Decoder's hidden state weight matrix
                ):
        
        super(SoftAttention, self).__init__()
        print('SA INIT')
        # Variables
        self.num_annotations = annotation_size[0]
        self.annotation_features = annotation_size[1]
        self.hidden_size = hidden_len
        
        # Layers
        self.attn = nn.Linear(self.annotation_features + self.hidden_size, 1, bias=True)
        self.softmax = nn.Softmax(dim=1)
        print('an size:', annotation_size) # 8x96
        print('an features (96?):', self.annotation_features) # 96
        print('hid size:', hidden_len)     # 96
        
    def forward(self, annotations, prev_hidden):
        
        # Repeat prev_hidded X times to append it to each of the annotation vectors (per batch element)
        repeated_hidden = torch.cat(
            [
                torch.repeat_interleave(hid, repeats=self.num_annotations, dim=0).unsqueeze(0)
                for hid in prev_hidden.split(1)
            ]
        )
        
        # Append previous hidden state to all annotation vectors (for each individual batch element)
        # Input to attention weight calculation
        print('SA:', annotations.size(), repeated_hidden.size())
        input = torch.cat((annotations, repeated_hidden), dim=2)
        print('Input size:', input.size())
        print(self.attn)
        
        # Compute the relative attention scores per feaure (e_{ti}=f_{att}(a_i,h_{t−1}) from paper)
        energies = self.attn(input)
        
        print('energies...')
        
        # Compute final attention weights (i.e. alpha)
        attn_weights = self.softmax(energies)
        print('attn_weights:', attn_weights.size())
        return attn_weights


### Decoder itself (employing Soft Attention)

In [13]:
class Decoder(nn.Module):
    
    def __init__(self, 
                 annotation_size,      # Size of annotation vectors produced by Encoder
                 out_vocab_size,       # How many words there are in the RDF-output language
                 embedding_dim,        # Length of a word embedding
                 hidden_dim,           # Nr hidden nodes
                 output_dim,           # Vocab size
                 bidirectional=False,  # Whether to have bidirectional GRU
                 n_layers=1,           # Nr layers in GRU
                 drop_prob=0.2         # Percent of node-dropouts
                ):
        
        super(Decoder, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.n_directions = 1 if not bidirectional else 2  # TODO: make use of it...
        
        self.attn = SoftAttention(annotation_size=annotation_size, hidden_len=hidden_dim)
        self.gru = nn.GRU(annotation_size[1]+embedding_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        
        
    def forward(self, 
                annotations,  # Static annotation vectors (for each batch element)
                embeddings,   # Word embeddings of most recently generated word (per batch element)
                h_old         # Previous hidden state per batch element
               ):
        print('Decoder forward:')
        print('embeddings:\t', embeddings.size())
        print('h_old:\t\t', h_old.size())
        
        annotation_weights = self.attn(annotations, h_old.squeeze())#.unsqueeze(2)
        print('annotations:', annotations.size())
        print('annotation_weights:', annotation_weights.size())
        weighted_annotations = annotations * annotation_weights
        print('weighted_annotations:', weighted_annotations.size())
        context_vectors = torch.sum(weighted_annotations, dim=1)
        print('context_vectors:', context_vectors.size())
        
        x = torch.cat((context_vectors, embeddings), dim=1)
        print('x:', x.size())
        x = x.unsqueeze(1) # Add une dimension for 'sequence'
        
        print('Decoder x:', x.size(), 'h_old:', h_old.size())
        print(self.gru)
        out, h = self.gru(x, h_old)
        out = out.squeeze()
        out = self.softmax(self.fc(self.relu(out)))
        print('h:', h.size())
        return out, h
    
    
    def init_hidden(self, annotation_vectors):
        # Mean of annotation vector per batch element
        # Assumes that number of hidden nodes == number annotation features
        hidden = torch.mean(annotation_vectors, dim=1)#.to(device)
        return hidden

## Encoder for seq2seq

In [18]:
# use it only if required otherwise can be removed.


class Encoder(nn.Module):
    
    def __init__(self, 
                 input_dim,
                 embedding_dim,        # Length of a word embedding
                 hidden_dim,           # Nr hidden nodes
                 n_layers = 1,         # Nr layers in GRU
                 dropout =0.2
                ):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):        
        embedded = self.dropout(self.embedding(src))
        #embedded = [src len, batch size, emb dim] 
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs =
        #hidden =
        #cell =
        
        #outputs are always from the top hidden layer
        
        return hidden, cell

## Train loop

In [14]:
def predict(x, 
            word_embeddings,        # Decoder's word embeddings
            word2idx,               # 
            idx2word,               # 
            encoder,                # 
            decoder,                # 
            tokenizer,              # 
            loss_fn,                # 
            max_len=7,              # 
            batch_size=32,          # 
            compute_grads=False,    # 
            targets=None,           # 
            return_textual=False    # Whether to return predictions in index-form (default) or as textual strings
           ):
    
    accumulated_loss = 0.
    
    # Init documentation of predictions
    predicted_indices = torch.zeros([batch_size, max_len]) # Numeric
    if return_textual:
        predicted_words = ['']*batch_size
    
    # Tokenize sampled minibatch sentences
    inputs = tokenizer(x, 
                       return_tensors="pt",     # Return tensors in pt=PyTorch format
                       padding=True,            # Pad all sentences in mini-batch to have the same length
                       add_special_tokens=True) # Add "Start of sequence", "End of sequence", ... tokens. 

    # Encode sentences: Pass tokenization output-dict-contents to model
    outputs = encoder(**inputs) 

    # Retrieve hidden state to be passed into Decoder as annotation vectors
    # Reshape to get a set of 8 feature vectors from last hidden state
    annotations = outputs.last_hidden_state[:, -1, :].reshape(batch_size,8,-1)
    print('Annotations size after cropping & reshape:', annotations.size())

    # Init Decoder's hidden state
    hidden = decoder.init_hidden(annotations).unsqueeze(0)
    print('Initial hidden:', hidden.size(), 'given annotations:', annotations.size())
    
    # Construct initial embeddings (start tokens)
    embeddings = word_embeddings(torch.zeros([batch_size], dtype=int))
    
    for t in range(max_len):
        # Get decodings (aka prob distrib. over output vocab per batch element) for time step t
        prob_dist, hidden = decoder(annotations, # Static vector containing annotations per batch element 
                                    embeddings,  # Word embedding predicted last iteration (per batch element)
                                    hidden       # Decoder's hidden state of last iteratipn per batch element
                                    )

        # Get predicted word index from predicted probability distribution (per batch element)
        word_index = torch.max(prob_dist, dim=1).indices
        print('word_index:', word_index)
        
        # Get corresponding word embedding (by index; per batch element)
        embedding = word_embeddings(word_index)
        
        # TODO: optional teacher forcing?

        # Record predicted words
        predicted_indices[:, t] = word_index
        
        # Record textual words if required
        if return_textual:
            
            # Get predicted word (per batch element)
            predicted_word = [idx2word(batch_element) for batch_element in word_index]
        
            for e in range(batch_size):
                predicted_words[e] += predicted_word[e]

        if compute_grads:
            
            print('prob_dist:', prob_dist.size())
            print('targets:', targets[:, t].size(), targets[:, t])
            
            # Compute (averaged over all batch elements given current time step t)
            loss = loss_fn(prob_dist, targets[:, t])

            # Compute & back-propagate gradients
            loss.backward()
            
            # Document loss
            #accumulated_loss += loss.item()
            
    ret_object = {
        'predicted_indices': predicted_indices,
    }
    
    if targets:
        ret_object['loss'] = accumulated_loss
    if return_textual:
        ret_object['predicted_words'] = predicted_words
        
    return ret_object 

In [15]:
def rdf_vocab_constructor(raw_vocab):
    print(raw_vocab)
    vocab_count, word2idx, idx2word = 3, {'START': 0, 'PAD': 1, 'END': 2}, {0: 'START', 1: 'PAD', 2: 'END'}
    
    for partition in raw_vocab: # Different partitions with respect to nr or triples per sentence
        for train_instance in partition:
            triple = train_instance['triple']
            for token in triple:
                if token not in word2idx:
                    word2idx[token] = vocab_count
                    idx2word[vocab_count] = token
                    vocab_count += 1
    return vocab_count, word2idx, idx2word

In [16]:
def training(train_data, 
          val_data,  
          epochs, 
          minibatch_size=32,
          embedding_dim=300,
          eval_frequency=10, # Every how many epochs to run intermediate evaluation
          learning_rate=0.00001
         ):
    
    # Construct RDF vocab
    vocab_count, word2idx, idx2word = rdf_vocab_constructor(train_data)
    
    # Construct embeddings
    rdf_vocab = nn.Embedding(num_embeddings=vocab_count, embedding_dim=embedding_dim, padding_idx=0)
    
    # Define model
    encoder = bert_model
    decoder = Decoder(
        annotation_size=(8,96),    # Size of annotation vectors produced by Encoder
        out_vocab_size=vocab_count, # How many words there are in the RDF-output language
        embedding_dim=300,          # Length of a word embedding
        hidden_dim=96,             # Nr hidden nodes
        output_dim=vocab_count,             # Vocab size
    )
    
    # Optimizer
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    loss = nn.CrossEntropyLoss()
    
    # For both train and validation data & for all number of tuples per sentence 
    # (in [MIN_NUM_TRIPLES, MAX_NUM_TRIPLES]), get the nr of train-/test instances
    len_x_train = [len(train_set) for train_set in train]
    len_x_val = [len(val_set) for val_set in dev]
    
    # Development of both train- and validation losses over course of training
    train_losses, val_losses = [], []
    
    # Train
    for epoch in range(epochs):
        
        train_loss, eval_loss = 0., 0.
        
        # Reset gradients
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        # Perform own train step for each nr of triples per sentence separately
        for i, nt in enumerate(range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)):
            
            # Sample minibatch indices
            minibatch_idx = random.sample(population=range(len_x_train[i]), k=minibatch_size)
            
            # Number of tokens to be predicted (per batch element)
            num_preds = nt*3+1 # = nr triples * 3 + stop_token 
            
            # Construct proper minibatch
            inputs = [train_data[i][idx]['text'] for idx in minibatch_idx]
            targets = torch.ones([minibatch_size, num_preds], dtype=int)
            for mb_i, idx in enumerate(minibatch_idx):
                for t, token in enumerate(train_data[i][idx]['triple']):
                    targets[mb_i, t] = word2idx[token]
            targets[:, -1] = 2  # 2 = Stop word index
            print(inputs)
            print(targets)
            # Predict
            ret_object = predict(inputs,
                                 rdf_vocab,              # Decoder's word embeddings
                                 word2idx,               # 
                                 idx2word,               # 
                                 encoder,                # 
                                 decoder,                # 
                                 tokenizer,              # 
                                 loss,                   # 
                                 max_len=num_preds,      # Nr of tokens to be predicted
                                 batch_size=32,          # 
                                 compute_grads=True,     # 
                                 targets=targets,        # 
                                 return_textual=False    # Whether to return predictions in index-form (default) or as textual strings
                                )
            
            train_loss += ret_object['loss']
            print("New loss:", ret_object['loss'])
            
        # Apply gradients
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        # Intermediate evaluation
        
        # Save losses
        
    return loss_train, loss_eval, encoder, decoder


# Train

In [17]:
train_loss, eval_loss, encoder, decoder = training(train, dev, epochs=20)

[[{'category': 'Airport', 'id': 'Id1', 'triple_cnt': 1, 'text': 'The Aarhus is the airport of Aarhus, Denmark.', 'triple': ['Aarhus_Airport', 'cityServed', '"Aarhus, Denmark"']}, {'category': 'Airport', 'id': 'Id1', 'triple_cnt': 1, 'text': 'Aarhus Airport serves the city of Aarhus, Denmark.', 'triple': ['Aarhus_Airport', 'cityServed', '"Aarhus, Denmark"']}, {'category': 'Airport', 'id': 'Id2', 'triple_cnt': 1, 'text': 'Aarhus airport serves the city of Aarhus.', 'triple': ['Aarhus_Airport', 'cityServed', 'Aarhus']}, {'category': 'Airport', 'id': 'Id3', 'triple_cnt': 1, 'text': 'Aarhus Airport is 25 metres above sea level.', 'triple': ['Aarhus_Airport', 'elevationAboveTheSeaLevel_(in_metres)', '25.0']}, {'category': 'Airport', 'id': 'Id3', 'triple_cnt': 1, 'text': 'Aarhus airport is at an elevation of 25 metres above seal level.', 'triple': ['Aarhus_Airport', 'elevationAboveTheSeaLevel_(in_metres)', '25.0']}, {'category': 'Airport', 'id': 'Id3', 'triple_cnt': 1, 'text': 'Aarhus Airport




NameError: name 'SoftAttention' is not defined

# Test
### Used exclusively for evaluation on test data after training is fuly finished

## Evaluation - Bleu Score

In [19]:
# Function for calculating the BLEU score for multiple sentence.
def calculate_bleu(data, train, dev, model, max_len = 7):
    
    trgs = []
    pred_trgs = []       
    src = dev
    trg = test
    # Get the data and feed it into pred_trg after Seq2seq
    #pred_trg = pred_trg[:-1]      
    #pred_trgs.append(pred_trg)
    #trgs.append([trg])
        
    return corpus_bleu(pred_trgs, trgs)