# text2rdf

## Imports

In [None]:
# General purpose
import os
import glob
import random
import xml.etree.ElementTree as ET

# PyTorch 
import torch
import torch.nn as nn

# Bert
from transformers import BertTokenizer, BertModel, BertConfig

## Preprocessing

### Select Data-sub-set (given restricted nr of triples per sentence)

In [None]:
# How many triples to train and test system on (min: 1, max: 7)
# Both bounds are inclusive: the cells below iterate over
# range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1) to build one data bucket per triple count.
MIN_NUM_TRIPLES = 1
MAX_NUM_TRIPLES = 2

In [None]:
# Root folder of the dataset; all other paths are derived from it
DS_BASE_PATH = './WebNLG/'

# Split folders (note: the dataset's 'dev/' folder is what this notebook uses as test data)
TRAIN_PATH = f'{DS_BASE_PATH}train/'
TEST_PATH = f'{DS_BASE_PATH}dev/'

# One '<n>triples/' directory per selected number of triples (both bounds inclusive)
TRAIN_DIRS = [f'{TRAIN_PATH}{n}triples/' for n in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)]
TEST_DIRS  = [f'{TEST_PATH}{n}triples/' for n in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)]

In [None]:
# Print selected directories (one per number of triples) as a sanity check
# before any data is loaded from them
print('Train dirs:', TRAIN_DIRS)
print('Test  dirs:', TEST_DIRS)

### Load Data

#### Load Settings (do not touch)

In [None]:
# Positions of the child elements inside each XML entry. The loading cells index the
# entry element directly with these values (e.g. entry[modifiedtripleset_index]),
# so they address child elements by position, not XML attributes.
# NOTE(review): this assumes every entry has exactly this child order — TODO confirm against the dataset.
originaltripleset_index = 0  # Position of the 'originaltripleset' child element in an XML entry
modifiedtripleset_index = 1  # Position of the 'modifiedtripleset' child element in an XML entry
first_lexical_index = 2      # Position from which on the verbalizations of the RDF triples start in an entry

#### Train Data

In [None]:
# Usage of train: train[bucket][element_id]['target_attribute']
# where bucket = nr_triples - MIN_NUM_TRIPLES (one bucket per number of triples)
train = [[] for _ in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)]

# Number of XML entries seen per bucket. Note that this counts entries, whereas
# train[bucket] holds one element per (triple, verbalization) combination.
train_stats = [0] * (MAX_NUM_TRIPLES + 1 - MIN_NUM_TRIPLES)

# Walk over every file of every selected 'Ntriples' directory and collect its data
for bucket, directory in enumerate(TRAIN_DIRS):
    nr_triples = MIN_NUM_TRIPLES + bucket

    for path in glob.iglob(directory+'/**', recursive=False):
        if not os.path.isfile(path):  # Skip sub-directories
            continue

        # The first child of the XML root holds the list of entries
        entries = ET.parse(path).getroot()[0]
        train_stats[bucket] += len(entries)

        for entry in entries:
            # NOTE(review): children are addressed by position; this assumes the
            # modified triple set is the second child and everything after it is
            # a verbalization — TODO confirm against the dataset.
            verbalizations = entry[first_lexical_index:]

            # Pair every triple of the entry with every verbalization of the entry
            train[bucket].extend(
                {
                    'category': entry.attrib['category'],
                    'id': entry.attrib['eid'],
                    'triple_cnt': nr_triples,
                    'text': lex.text,
                    'triple': [part.strip() for part in triple.text.split('|')],
                }
                for triple in entry[modifiedtripleset_index]
                for lex in verbalizations
            )

print(train)
print(train_stats)

#### Test Data

In [None]:
# Usage of test: test[bucket][element_id]['target_attribute']
# where bucket = nr_triples - MIN_NUM_TRIPLES (one bucket per number of triples)
test = [[] for _ in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)]

# Number of XML entries seen per bucket. Note that this counts entries, whereas
# test[bucket] holds one element per (triple, verbalization) combination.
test_stats = [0] * (MAX_NUM_TRIPLES + 1 - MIN_NUM_TRIPLES)

# Walk over every file of every selected 'Ntriples' directory and collect its data
for bucket, directory in enumerate(TEST_DIRS):
    nr_triples = MIN_NUM_TRIPLES + bucket

    for path in glob.iglob(directory+'/**', recursive=False):
        if not os.path.isfile(path):  # Skip sub-directories
            continue

        # The first child of the XML root holds the list of entries
        entries = ET.parse(path).getroot()[0]
        test_stats[bucket] += len(entries)

        for entry in entries:
            # NOTE(review): children are addressed by position; this assumes the
            # modified triple set is the second child and everything after it is
            # a verbalization — TODO confirm against the dataset.
            verbalizations = entry[first_lexical_index:]

            # Pair every triple of the entry with every verbalization of the entry
            test[bucket].extend(
                {
                    'category': entry.attrib['category'],
                    'id': entry.attrib['eid'],
                    'triple_cnt': nr_triples,
                    'text': lex.text,
                    'triple': [part.strip() for part in triple.text.split('|')],
                }
                for triple in entry[modifiedtripleset_index]
                for lex in verbalizations
            )

print(test)
print(test_stats)

#### Split Train Data into Train and Dev (for intermediate validation throughout training)

In [None]:
# Share of the train data reserved for validation throughout training,
# given as a fraction (0.15 = 15%). It is multiplied with the per-bucket entry
# count (train_stats) to get the number of dev samples per number of triples.
dev_percentage = 0.15

In [None]:
# Init dev dataset (one bucket per number of triples, same layout as train)
dev = [[] for _ in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1)]

# Number of dev instances to sample per number of triples
dev_stats = [int(dev_percentage * train_stats[i]) for i in range(0, MAX_NUM_TRIPLES+1-MIN_NUM_TRIPLES)]

print('Samples per nr of triples:', dev_stats)

# Sample the train indices reserved for the dev dataset, per number of triples.
# The sample size is clamped to the bucket size so that a bucket with fewer
# elements than requested does not make random.sample raise a ValueError.
dev_indices = [
    sorted(
        random.sample(range(0, len(train[i])), min(dev_stats[i], len(train[i]))),
        reverse=True,
    )
    for i in range(0, MAX_NUM_TRIPLES+1-MIN_NUM_TRIPLES)
]

# Copy selected dev-entries into dev & delete all duplicates/related entries from train dataset
for bucket_idx in range(0, MAX_NUM_TRIPLES+1-MIN_NUM_TRIPLES):
    bucket = train[bucket_idx]

    # Resolve all sampled indices BEFORE anything is removed from train. Removing
    # while still indexing would shift later elements, so that a sampled index
    # could point at a different entry or beyond the end of the list.
    selected_entries = [bucket[index] for index in dev_indices[bucket_idx]]
    dev[bucket_idx].extend(selected_entries)

    # Identifying attributes (category + id) of everything that moved to dev
    dev_keys = {(entry['category'], entry['id']) for entry in selected_entries}

    # Drop every train element related to a dev entry in one pass. A list must not
    # be mutated with .remove() while it is being iterated: the iterator skips the
    # element following each removal, which left related entries in train.
    # Slice assignment keeps the identity of the train[bucket_idx] list object.
    bucket[:] = [
        entry for entry in bucket
        if (entry['category'], entry['id']) not in dev_keys
    ]

print(dev)

#### Print Stats

In [None]:
# Selected range of triples per sentence
print('Minimal number of triples:', MIN_NUM_TRIPLES)
print('Maximal number of triples:', MAX_NUM_TRIPLES)

# Same report for each of the three datasets: a blank line, the dataset title and
# then, per number of triples, the amount of (triple, verbalization) combinations
for title, dataset in (('Training: ', train), ('Dev: ', dev), ('Testing: ', test)):
    print()
    print(title)
    for nr_triples in range(MIN_NUM_TRIPLES, MAX_NUM_TRIPLES+1):
        bucket = dataset[nr_triples-MIN_NUM_TRIPLES]
        print('Given %i triples per sentence: ' % nr_triples)
        print('Number of combinations of triples and verbalizations:', len(bucket))

# Neural Machine Translation (NMT) Model Definition 

## TODO: needs updating

## Idea:
1. Encoder: 
1.1 Input==Word Embedding (or Letter embedding); 
1.2 Output==Context Vector (that is: Encoding of sentence; contained in hidden state after having observed last embedding)

2. Decoder:
2.1 Input==Context Vector
2.2 Output==Probability distribution over output vocab (might be words or letters alternatively)

3. Seq2Seq model: Combining the two

## BERT Encoder

#### Tokenizer

In [None]:
# Load the pretrained tokenizer belonging to the 'bert-base-uncased' checkpoint
# (must match the checkpoint the BERT encoder model is loaded from)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#### Model

In [None]:
# Load the pretrained BERT encoder. return_dict=True makes the model return an output
# object with named fields; the train-loop cell reads outputs.last_hidden_state from it.
# The trailing commented-out .to('cuda') is the (currently disabled) move to the GPU.
model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)#.to('cuda')

## Decoder

### Soft Attention Model
This model implements the Soft Attention model presented in http://proceedings.mlr.press/v37/xuc15.pdf. 
1. Attention energies (i.e. one energy per annotation vector) are computed: $e_{ti}=f_{att}(a_i,h_{t-1})$. This formula implies that the Decoder's previous hidden state $h_{t-1}$ has to be concatenated with each individual annotation vector $a_i$ before the concatenation is fed through a fully-connected layer $f_{att}$.
2. Attention weights $\alpha$ are computed from the energies of step 1: $\alpha_t = \mathrm{softmax}(e_t)$, where $\alpha_{ti} = \frac{\exp(e_{ti})}{\sum^L_{k=1} \exp(e_{tk})}$.

Note: $t$ stands for time, while $i$ identifies the particular annotation vector currently under consideration.

In [None]:
class SoftAttention(nn.Module):
    """Soft attention over the Encoder's annotation vectors.

    For every annotation vector a_i an energy e_{ti} = f_att(a_i, h_{t-1}) is
    computed by one linear layer applied to the concatenation of a_i and the
    Decoder's previous hidden state h_{t-1}. The energies are then normalised
    with a softmax over the annotations to obtain the attention weights alpha_t.
    """

    def __init__(self,
                 annotation_size,  # Tuple: (num_annotations, num_features_per_annotation)
                 hidden_len        # Number of nodes in Decoder's hidden state weight matrix
                ):

        super(SoftAttention, self).__init__()

        # Variables
        # num_annotations is kept for callers that read it; forward() takes the
        # actual number of annotations from its input tensor instead.
        self.num_annotations = annotation_size[0]
        self.annotation_features = annotation_size[1]
        self.hidden_size = hidden_len

        # Layers
        # f_att: maps [a_i ; h_{t-1}] to one scalar energy per annotation
        self.attn = nn.Linear(self.annotation_features + self.hidden_size, 1, bias=True)
        # Normalise over dim 1, i.e. over the annotations of each batch element
        self.softmax = nn.Softmax(dim=1)

    def forward(self, annotations, prev_hidden):
        """Compute the attention weights for one decoding step.

        Args:
            annotations: Tensor of shape (batch, num_annotations, annotation_features).
                The number of annotations may differ from call to call (e.g. when
                mini-batches are padded to different lengths).
            prev_hidden: Tensor of shape (batch, hidden_size) holding the
                Decoder's previous hidden state per batch element.

        Returns:
            Tensor of shape (batch, num_annotations, 1) with the attention
            weights; for each batch element they sum to 1 over the annotations.
        """
        # Take the number of annotations from the input itself rather than from the
        # constructor argument, so that variable-length inputs are supported.
        num_annotations = annotations.size(1)

        # Repeat the previous hidden state once per annotation vector:
        # (batch, hidden) -> (batch, 1, hidden) -> (batch, num_annotations, hidden).
        # expand() creates a view, so no Python loop over the batch is needed.
        repeated_hidden = prev_hidden.unsqueeze(1).expand(-1, num_annotations, -1)

        # Append the previous hidden state to every annotation vector
        # (input to the attention energy computation)
        attn_input = torch.cat((annotations, repeated_hidden), dim=2)

        # Energy per annotation (e_{ti} = f_att(a_i, h_{t-1}) from the paper)
        energies = self.attn(attn_input)

        # Final attention weights (i.e. alpha)
        return self.softmax(energies)


### Decoder itself (employing Soft Attention)

In [132]:
class Decoder(nn.Module):
    """Decoder employing Soft Attention (work in progress).

    Only the layer set-up exists so far: forward() and init_hidden() are
    placeholders that return None.
    """

    def __init__(self, 
                 annotation_size, # Tuple (num_annotations, num_features_per_annotation); passed on to SoftAttention
                 hidden_nodes,    # Size of this module's (i.e. Decoder's) hidden state
                 out_vocab_size,  # How many words there are in the RDF-output language (not used yet)
                 dropout_p=0.1    # Dropout probability handed to nn.Dropout (0.1 = 10%)
                ):
        
        super(Decoder, self).__init__()
        
        # Variables
        self.dropout_p = dropout_p
        
        
        # Layers
        # Attention over the Encoder's annotation vectors, conditioned on the hidden state
        self.attn = SoftAttention(annotation_size=annotation_size, hidden_len=hidden_nodes)
        # NOTE(review): nn.GRU is called without input_size / hidden_size here, which is
        # expected to raise a TypeError as soon as a Decoder is instantiated — confirm
        # and fill in the sizes once the Decoder's input layout is decided.
        self.rnn = nn.GRU() # TODO
        self.dropout = nn.Dropout(self.dropout_p)
        
        
    def forward(self, annotations, prev_hidden):
        """Placeholder for one decoding step; currently always returns None."""
        
        # TODO (don't forget about Dropout)
        
        return None
    
    def init_hidden(self, annotation_vecs):
        """Placeholder for the initial hidden state; currently always returns None."""
        return None # TODO: Mean over annotation_vectors

## seq2seq model

In [None]:
# Takes input and feeds it through both Encoder and (subsequently) Decoder

## Train loop

In [None]:
# TODO: These are just some examples of functionalities to be used inside the train loop
# Documentation: https://huggingface.co/transformers/model_doc/bert.html

# Sample mini-batch (hard-coded example sentences, not taken from the dataset)
mini_batch = ["Hello, my cat is cute", 
              "It is not", 
              "HERE I Am, Bye"]

# Tokenize sampled mini-batch sentences
inputs = tokenizer(mini_batch, 
                   return_tensors="pt",     # Return tensors in pt=PyTorch format
                   padding=True,            # Pad all sentences in mini-batch to have the same length
                   add_special_tokens=True) # Add "Start of sequence", "End of sequence", ... tokens. 

# Encode sentences: Pass tokenization output-dict-contents to model as keyword arguments
outputs = model(**inputs) 

# Retrieve hidden state to be passed into Decoder 
# (presumably one vector per token, i.e. the annotation vectors — TODO confirm via the printed shape)
last_hidden_states = outputs.last_hidden_state  

# Print the tokenizer output and the shape of the encoder output for inspection
print(inputs)
print(last_hidden_states.shape)

# Train