# SPARQL generation with pre-trained GPT for KG Question Answering

## Data processing and Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!gdown 1sl_YdyiucWmk8Lx2x-Qn5ALcr2bLFUb0
!unzip DBLP-QuAD.zip

In [2]:
%%capture
import json
import requests
import re
import pickle
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.modules.module import T
from random import shuffle

torch.manual_seed(1706)

def repl_func(match):
    return match.group(1).lower()

def get_entities(question, label_generator="t5-small", embedding_reranker="distmult"):
    base_url = "https://ltdemos.informatik.uni-hamburg.de/dblplinkapi/api/entitylinker"
    endpoint_url = f"{base_url}/{label_generator}/{embedding_reranker}"
    payload = {"question": question}
    headers = {"Content-Type": "application/json"}
    response = requests.post(endpoint_url, data=json.dumps(payload), headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None

def process_question(entities, question):
  new_entities = get_entities(question)["entitylinkingresults"]
  found = 0
  for entity in new_entities:
    label = entity["label"]
    if label:
      if ": " in label: label = label.split(": ")[1] # prune authors
      if ". (" in label: label = label.split(". (")[0] # prune year
      if label[-1] == ".": label = label[:-1] # prune last dot
      if  (type(entity["result"]) is list and entity["result"] and
          type(entity["result"][0]) is list and len(entity["result"][0]) > 1 and
          type(entity["result"][0][1]) is list and entity["result"][0][1]):
        iri = entity["result"][0][1][0]
        if ("'" + label + "'" in question) and (iri in entities):
          found += 1
          question = question.replace("'" + label + "'", iri)
        elif (label in question) and (iri in entities):
          found += 1
          question = question.replace(label, iri)
        elif (len(label.split(" ")) == 2) and (iri in entities): # If name has given and last name, try 4 combinations
          given, last = label.split(" ")
          if given and last:
            if last + ", " + given in question:
              found += 1
              question = question.replace(last + ", " + given, iri)
            elif last + ", " + given[0] + "." in question:
              found += 1
              question = question.replace(last + ", " + given[0] + ".", iri)
            elif given[0] + "." + last in question:
              found += 1
              question = question.replace(given[0] + "." + last, iri)
            elif given + last[0] + "." in question:
              found += 1
              question = question.replace(given + last[0] + ".", iri)

  if found == len(entities): return question
  else: return ""

def format_question(question): # punctuation can damage the undestanding when attached to a word
  question = re.sub(r'^(.)', repl_func, question)
  if question[-1] == ".": question = question[:-1] # eliminate end dot
  question = question.replace("?", "") # eliminate question mark
  return question

In [3]:
def process_question_entities(question):
  new_entities = get_entities(question)["entitylinkingresults"]
  entities = []
  for entity in new_entities:
    label = entity["label"]
    if label:
      if ": " in label: label = label.split(": ")[1] # prune authors
      if ". (" in label: label = label.split(". (")[0] # prune year
      if label[-1] == ".": label = label[:-1] # prune last dot
      if  (type(entity["result"]) is list and entity["result"] and
          type(entity["result"][0]) is list and len(entity["result"][0]) > 1 and
          type(entity["result"][0][1]) is list and entity["result"][0][1]):
        iri = entity["result"][0][1][0]
        if ("'" + label + "'" in question):
          question = question.replace("'" + label + "'", iri)
          entities.append(iri)
        elif (label in question):
          question = question.replace(label, iri)
          entities.append(iri)
        elif (len(label.split(" ")) == 2): # If name has given and last name, try 4 combinations
          given, last = label.split(" ")
          if given and last:
            if last + ", " + given in question:
              question = question.replace(last + ", " + given, iri)
              entities.append(iri)
            elif last + ", " + given[0] + "." in question:
              question = question.replace(last + ", " + given[0] + ".", iri)
              entities.append(iri)
            elif given[0] + "." + last in question:
              question = question.replace(given[0] + "." + last, iri)
              entities.append(iri)
            elif given + last[0] + "." in question:
              question = question.replace(given + last[0] + ".", iri)
              entities.append(iri)

  return question, entities

In [None]:

questions = []
archives = ["DBLP-QuAD/train/questions.json"] #"DBLP-QuAD/valid/questions.json", "DBLP-QuAD/test/questions.json"]
for archive in archives:
  with open(archive, 'r', encoding='utf-8') as file:
    data = json.load(file)
    index = 0
    for entry in data["questions"][4000:]:
      print(index)
      index += 1
      if entry["template_id"] != "TP61":
        query = entry["query"]["sparql"]

        question = format_question(entry["question"]["string"])
        question = process_question(entry["entities"], question)
        if question:
          questions.append((question, query))

        paraphrased = format_question(entry["paraphrased_question"]["string"])
        paraphrased = process_question(entry["entities"], paraphrased)
        if paraphrased:
          questions.append((paraphrased, query))

with open("/content/drive/MyDrive/DLBP-QuAD-train2.txt", 'wb') as file:
  pickle.dump(questions, file)
print(len(questions))

In [4]:
archives = ["/content/drive/MyDrive/DLBP-QuAD-valid1.txt", "/content/drive/MyDrive/DLBP-QuAD-valid2.txt",
            "/content/drive/MyDrive/DLBP-QuAD-test.txt", "/content/drive/MyDrive/DLBP-QuAD-train1.txt",
            "/content/drive/MyDrive/DLBP-QuAD-train2.txt"]
questions = []
for archive in archives:
  with open(archive, 'rb') as file:
    data = pickle.load(file)
    questions += data

In [5]:
questions2 = []
for w, q in questions:
  if "'" not in w:
    w = w.replace("(", "( ").replace(")", " )")
    q = q.replace("(", "( ").replace(")", " )")
    questions2.append((w, q))
questions = questions2
print("Size of new entity linked dataset:", len(questions))

Size of new entity linked dataset: 9289


In [None]:
n_entities = 0
for q, a in questions:
  for word in q:
    if "<" in word: n_entities += 1
print("Average number of entities per query:", round(n_entities / len(questions), 3))

Average number of entities per query: 1.231


In [None]:
linked_q = []
with open("/content/drive/MyDrive/dblp_heldout_500_questions.json", 'r', encoding='utf-8') as file:
  data = json.load(file)

  for entry in data:
    question = format_question(entry["question"])
    question, q_entities = process_question_entities(question)

    paraphrased = format_question(entry["paraphrase"])
    paraphrased, p_entities = process_question_entities(paraphrased)

    linked, entities = [], []
    if len(q_entities) >= len(p_entities):
      entities = q_entities
      linked = question
    else:
      entities = p_entities
      linked = paraphrased

    _answer = {
      'id': entry["id"],
      'question': entry["question"],
      'linked': linked,
      'entities': entities,
      'answer': [],
    }
    linked_q.append(_answer)

with open("/content/drive/MyDrive/dblp_linked_500_questions.json", 'w', encoding='utf-8') as file:
  json.dump(linked_q, file)

len(linked_q)

500

In [6]:
linked_q = []
with open("/content/drive/MyDrive/dblp_linked_500_questions.json", 'r', encoding='utf-8') as file:
  linked_q = json.load(file)

## Encoding and decoding

In [None]:
# Encoder
text = [a['linked'] for a in linked_q]
text += [a + " " + q for a, q in questions]
words = list(set((" ".join(text)).split(" ")))
words.append('¿') # Sequence init character
words.append('¡') # Sequence end character
words.append('<https://dblp.org/pid/27/4034-1>')
words = sorted(words) # unique characters
word_vocab_size = len(words) # amount of unique characters
word_stoi = { w:i for i,w in enumerate(words) } # map char to int
word_itos = { i:w for i,w in enumerate(words) } # map int to char
word_encoder = lambda s: [word_stoi[w] for w in s.split(" ")] # encoder
word_decoder = lambda l: ' '.join([word_itos[i] for i in l]) # decoder
print("Vocabulary size:", len(words))

Vocabulary size: 10399


In [8]:
old_words = list(set((" ".join([a + " " + q for a, q in questions])).split(" ")))
print("Old vocabulary size:", len(old_words))
entities = []
for w in old_words:
  if '<' in w and len(w) > 1: entities.append(w)
print("Old number of entities:",  len(entities))

Old vocabulary size: 9339
Old number of entities: 7143


## Model

In [None]:
# hyperparams
batch_size = 32
max_iters = 6000
text_sample = max_iters // 6
eval_interval = 200
learning_rate = 7e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 128
n_head = 8
n_layer = 4

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size, b_size, masked=True):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False, device=device)
        self.query = nn.Linear(n_embd, head_size, bias=False, device=device)
        self.value = nn.Linear(n_embd, head_size, bias=False, device=device)
        self.register_buffer('tril', torch.tril(torch.ones(b_size, b_size)))

        self.dropout = nn.Dropout(dropout)
        self.masked = masked

    def forward(self, x, y=None):
        B,T,C = x.shape
        q = self.query(x) # (B,T,C)
        if y is None:
          k = self.key(x)   # (B,T,C)
        else:
          k = self.key(y)
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        if self.masked:
          mask = self.tril[:T, :T] == 0
          wei = wei.masked_fill(mask.to(device), float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        if y is None:
          v = self.value(x) # (B,T,C)
        else:
          v = self.value(y)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size, b_size, masked=True):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, b_size, masked) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd, device=device)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, y=None):
      if y is None:
        out = torch.cat([h(x) for h in self.heads], dim=-1)
      else:
        out = torch.cat([h(x, y) for h in self.heads], dim=-1)
      out = self.dropout(self.proj(out))
      return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd, device=device),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd, device=device),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class DecoderBlock(nn.Module):
    """ Transformer block: communication followed by computation """
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head

        self.sa_decoder1 = MultiHeadAttention(n_head, head_size, MAX_OUT_BLOCK, masked=True)
        self.sa_decoder2 = MultiHeadAttention(n_head, head_size, MAX_OUT_BLOCK, masked=False)
        self.ffwd_decoder = FeedFoward(n_embd)
        self.ln_decoder1 = nn.LayerNorm(n_embd, device=device)
        self.ln_decoder2 = nn.LayerNorm(n_embd, device=device)
        self.ln_decoder3 = nn.LayerNorm(n_embd, device=device)

    def forward(self, x, y):
        x = x + self.sa_decoder1(self.ln_decoder1(x))
        x = x + self.sa_decoder2(self.ln_decoder2(x), y)
        x = x + self.ffwd_decoder(self.ln_decoder3(x))
        return x

class EncoderBlock(nn.Module):
  """ Transformer block: communication followed by computation """
  def __init__(self, n_embd, n_head):
      super().__init__()
      head_size = n_embd // n_head

      self.sa_encoder = MultiHeadAttention(n_head, head_size, MAX_IN_BLOCK, masked=False)
      self.ffwd_encoder = FeedFoward(n_embd)
      self.ln_encoder1 = nn.LayerNorm(n_embd, device=device)
      self.ln_encoder2 = nn.LayerNorm(n_embd, device=device)

  def forward(self, x):
      x = x + self.sa_encoder(self.ln_encoder1(x))
      x = x + self.ffwd_encoder(self.ln_encoder2(x))
      return x

class TransformerModel(nn.Module):
    def __init__(self, encoder_blocks, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(MAX_IN_BLOCK, n_embd)
        self.position_embedding_table_dec = nn.Embedding(MAX_OUT_BLOCK, n_embd)
        self.decoder_blocks = [DecoderBlock(n_embd, n_head) for _ in range(n_layer)]
        self.encoder_blocks = encoder_blocks
        self.ln_f = nn.LayerNorm(n_embd, device=device) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size, device=device)

    def forward(self, idx, idy, targets=None):
        Bx, Tx = idx.shape
        By, Ty = idy.shape
        # Encoder
        tok_emb_enc = self.token_embedding_table(idx) # (B,T,C)
        pos_emb_enc = self.position_embedding_table(torch.arange(Tx, device=device)) # (T,C)
        enc_x = tok_emb_enc + pos_emb_enc # (B,T,C)
        enc_x = self.encoder_blocks(enc_x) # (B,T,C)
        # Decoder
        tok_emb = self.token_embedding_table(idy) # (B,T,C)
        pos_emb = self.position_embedding_table_dec(torch.arange(Ty, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        for decoder_block in self.decoder_blocks:
          x = decoder_block(x, enc_x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
          loss = None
        else:
          B, T, C = logits.shape
          logits = logits.view(B*T, C)
          targets = targets.view(B*T)
          loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, start_token_idx, max_new_tokens):
      idy = torch.full((idx.size(0), 1), start_token_idx[0], dtype=torch.long, device=idx.device)

      for _ in range(max_new_tokens):
        # get the predictions
        logits, _ = self(idx, idy)
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) # (B, C)
        # sample from the distribution
        idy_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        # append sampled index to the running sequence
        idy = torch.cat((idy, idy_next), dim=1) # (B, T+1)
      return idy

    def save(self, path, optimizer):
      torch.save({
          'state_dict': self.state_dict(),
          'optimizer': optimizer.state_dict()
      }, path)

    def load(self, checkpoint_path, optimizer):
      checkpoint = torch.load(checkpoint_path)
      self.load_state_dict(checkpoint['state_dict'])
      optimizer.load_state_dict(checkpoint['optimizer'])

## Split data and loss estimation

In [None]:
def split_and_encode_text(input_array, encoder):
  result = []
  max_in_block = -1
  max_out_block = -1
  shuffle(input_array)
  for item in input_array:
    in_task = torch.tensor(encoder(item[0]), dtype=torch.long)
    out_task = torch.tensor(encoder(item[1]), dtype=torch.long)
    result.append((in_task, out_task))

    len_in_block = len(in_task)
    len_out_block = len(out_task)

    if max_in_block < len_in_block: max_in_block = len_in_block
    if max_out_block < len_out_block: max_out_block = len_out_block

  return result, max_in_block + 1, max_out_block + 1

In [None]:
# Split vectorized data
data, MAX_IN_BLOCK, MAX_OUT_BLOCK = split_and_encode_text(questions, word_encoder)

entities = []
for w in words:
  if '<' in w and len(w) > 1: entities.append(w)
print("Pretrain data size:",  len(entities))
entities = [(" ".join([e]*MAX_IN_BLOCK), " ".join([e]*MAX_OUT_BLOCK)) for e in entities]

data_pre, MAX_IN_BLOCK_pre, MAX_OUT_BLOCx_pre = split_and_encode_text(entities, word_encoder)

Pretrain data size: 7617


## Auxiliary train and test methods

In [None]:
@torch.no_grad()
def estimate_loss():
  out = {}
  m.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y, targets = get_batch(split, word_encoder)
      logits, loss = m(X, Y, targets)
      losses[k] = loss.item()
    out[split] = losses.mean()
  m.train()
  return out

def get_batch(split, encoder):
    data = train_data if split == 'train' else val_data # elige un dataset dependiendo de la etapa
    ix = torch.randint(len(data)-1, (batch_size,))
    batch = [data[i] for i in ix]
    start_token = torch.tensor(encoder('¿'))
    end_token = torch.tensor(encoder('¡'))
    x = torch.stack([F.pad(in_v, (0, MAX_IN_BLOCK - len(in_v)), value=1) for (in_v, _) in batch])
    y = torch.stack([F.pad(torch.cat([start_token, out_v]), (0, MAX_OUT_BLOCK - len(out_v) - 1), value=1) for (_, out_v) in batch])
    t = torch.stack([F.pad(torch.cat([out_v, end_token]), (0, MAX_OUT_BLOCK - len(out_v) - 1), value=1) for (_, out_v) in batch])
    x, y, t = x.to(device), y.to(device), t.to(device)
    return x, y, t

def train_model(m, estimate_loss, optimizer, encoder, decoder):
  intermediate_prints = []
  for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
      losses = estimate_loss()
      print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb, targets = get_batch('train', encoder)

    # evaluate the loss
    logits, loss = m(xb, yb, targets)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if iter % text_sample == 0 or iter == max_iters - 1:
      input_tokens = torch.zeros((1, 1), dtype=torch.long)
      xb, yb, targets = get_batch('val', encoder)
      m.eval()
      intermediate_prints.append(decoder(m.generate(xb, encoder('¿'), MAX_OUT_BLOCK)[0].tolist()))
      m.train()
    del xb
    del yb
    del targets

  return intermediate_prints

def test_model(samples, m, encoder, decoder, test_data):
  one_shot_hits = 0
  three_shot_hits = 0
  hamming_distance = 0
  m.eval()
  for i, test in enumerate(test_data[:samples]):
    if i % 20 == 0: print("Sample: ", str(i))
    in_t, out_t = test
    n_tokens = len(out_t)
    padded_text = F.pad(in_t, (0, MAX_IN_BLOCK - len(in_t)), value=1)

    misses = 0
    out_list = out_t.tolist()
    shot = m.generate(padded_text.unsqueeze(0).to(device), encoder('¿'), n_tokens)[0].tolist()[1:]
    for a,b in zip(shot, out_list):
      if a != b: misses += 1

    if misses:
      hamming_distance += misses
      hit = False
      for _ in range(2):
        shot = m.generate(padded_text.unsqueeze(0).to(device), encoder('¿'), n_tokens)[0].tolist()[1:]
        if shot == out_list: hit = True
      if hit: three_shot_hits += 1
    else:
      one_shot_hits += 1
      three_shot_hits += 1


  acc1 = round(one_shot_hits / samples, 5)
  acc3 = round(three_shot_hits / samples, 5)
  mhd = round(hamming_distance / samples, 5)
  m.train()
  return acc1, acc3, mhd

## Model with pretrain

### Pretrain

In [None]:
# Experiment hyperparams
dropout = 0.01
split = 1

# Data division
n_test = int(len(data_pre))
test_data = data_pre
print("Test cases train:", n_test)

n_train = int(len(data_pre))
print("Train cases:", n_train)
train_data = data_pre
val_data = data_pre
print("Validation cases:", len(val_data))

Test cases train: 7617
Train cases: 7617
Validation cases: 7617


In [None]:
# Model init
encoder_blocks = nn.Sequential(*[EncoderBlock(n_embd, n_head) for _ in range(n_layer)])
full_model = TransformerModel(encoder_blocks, word_vocab_size)
m = full_model.to(device)

# Print init loss and hyperparams
x1, y1, targets = get_batch('train', word_encoder)
print("Device:", device)
logits, loss = m(x1, y1, targets)
print(logits.shape)
print("Loss:", loss)

print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# Instanciar optimizador
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

Device: cuda
torch.Size([1568, 10399])
Loss: tensor(9.3208, device='cuda:0', grad_fn=<NllLossBackward0>)
3.474975 M parameters


In [None]:
max_iters = 14400
text_sample = max_iters // 6
intermediate_prints = train_model(m, estimate_loss, optimizer, word_encoder, word_decoder)

for idx, intermediate_print in enumerate(intermediate_prints):
  print("Sample {}:\n".format(idx), intermediate_print)

m.save("/content/drive/MyDrive/transformer-pretrain-final.pth", optimizer)

step 0: train loss 9.4087, val loss 9.4129
step 200: train loss 8.8382, val loss 8.8434
step 400: train loss 8.0246, val loss 8.0579
step 600: train loss 7.0254, val loss 7.0388
step 800: train loss 5.7848, val loss 5.7535
step 1000: train loss 4.3185, val loss 4.3537
step 1200: train loss 2.9968, val loss 2.9963
step 1400: train loss 1.9129, val loss 1.8723
step 1600: train loss 1.1284, val loss 1.1320
step 1800: train loss 0.5990, val loss 0.6161
step 2000: train loss 0.3469, val loss 0.3598
step 2200: train loss 0.1924, val loss 0.2017
step 2400: train loss 0.1050, val loss 0.1294
step 2600: train loss 0.0854, val loss 0.0748
step 2800: train loss 0.0480, val loss 0.0459
step 3000: train loss 0.0344, val loss 0.0365
step 3200: train loss 0.0231, val loss 0.0240
step 3400: train loss 0.0183, val loss 0.0194
step 3600: train loss 0.0145, val loss 0.0152
step 3800: train loss 0.0123, val loss 0.0126
step 4000: train loss 0.0106, val loss 0.0106
step 4200: train loss 0.0093, val loss 0.

In [None]:
acc1, acc3, mhd = test_model(200, m, word_encoder, word_decoder, test_data)
print('Accuracy@1 on test: {}%'.format(acc1 * 100))
print('Accuracy@3 on test: {}%'.format(acc3 * 100))
print('Mean Hamming distance on test: {}'.format(mhd))

Sample:  0
Sample:  20
Sample:  40
Sample:  60
Sample:  80
Sample:  100
Sample:  120
Sample:  140
Sample:  160
Sample:  180
Accuracy@1 on test: 99.5%
Accuracy@3 on test: 100.0%
Mean Hamming distance on test: 0.005


### Train

In [None]:
# Experiment hyperparams
dropout = 0.01
split = 0.95

# Data division
n_test = int(0.02*len(data))
test_data = data[:n_test]
print("Test cases train:", n_test)

data = data[n_test:]
n_train = int(split*len(data))
print("Train cases:", n_train)
train_data = data[:n_train]
val_data = data[n_train:]
print("Validation cases:", len(val_data))

Test cases train: 185
Train cases: 8648
Validation cases: 456


In [None]:
max_iters = 4800
text_sample = max_iters // 6

intermediate_prints = train_model(m, estimate_loss, optimizer, word_encoder, word_decoder)

for idx, intermediate_print in enumerate(intermediate_prints):
  print("Sample {}:\n".format(idx), intermediate_print)

step 0: train loss 13.5342, val loss 13.5327
step 200: train loss 0.7309, val loss 0.8128
step 400: train loss 0.3958, val loss 0.5117
step 600: train loss 0.2311, val loss 0.3818
step 800: train loss 0.1507, val loss 0.3148
step 1000: train loss 0.0951, val loss 0.2639
step 1200: train loss 0.0708, val loss 0.2352
step 1400: train loss 0.0490, val loss 0.2083
step 1600: train loss 0.0378, val loss 0.2050
step 1800: train loss 0.0300, val loss 0.1912
step 2000: train loss 0.0218, val loss 0.1894
step 2200: train loss 0.0191, val loss 0.1848
step 2400: train loss 0.0147, val loss 0.1751
step 2600: train loss 0.0107, val loss 0.1802
step 2800: train loss 0.0092, val loss 0.1843
step 3000: train loss 0.0079, val loss 0.1762
step 3200: train loss 0.0070, val loss 0.1765
step 3400: train loss 0.0064, val loss 0.1728
step 3600: train loss 0.0060, val loss 0.1776
step 3800: train loss 0.0048, val loss 0.1688
step 4000: train loss 0.0037, val loss 0.1756
step 4200: train loss 0.0038, val loss 

In [None]:
m.save("/content/drive/MyDrive/transformer-pretrained-final.pth", optimizer)

In [None]:
acc1, acc3, mhd = test_model(n_test, m, word_encoder, word_decoder, test_data)
print('Accuracy@1 on test: {}%'.format(acc1 * 100))
print('Accuracy@3 on test: {}%'.format(acc3 * 100))
print('Mean Hamming distance on test: {}'.format(mhd))

Sample:  0
Sample:  20
Sample:  40
Sample:  60
Sample:  80
Sample:  100
Sample:  120
Sample:  140
Sample:  160
Sample:  180
Accuracy@1 on test: 49.189%
Accuracy@3 on test: 62.702999999999996%
Mean Hamming distance on test: 2.01081


In [None]:
final_queries = []
for entry in linked_q:
  n_tokens = MAX_OUT_BLOCK
  in_t = torch.tensor(word_encoder(entry["linked"]), dtype=torch.long)
  padded_text = F.pad(in_t, (0, MAX_IN_BLOCK - len(in_t)), value=1)
  query = word_decoder(m.generate(padded_text.unsqueeze(0).to(device), word_encoder('¿'), n_tokens)[0].tolist()[1:])
  query = query.split("¡")[0]
  entry["sparql"] = query
  final_queries.append(entry)

with open("/content/drive/MyDrive/dblp_pretrained_queries_500.json", 'w', encoding='utf-8') as file:
  json.dump(final_queries, file)

## Model without pretrain

### Train

In [None]:
# Model init
encoder_blocks = nn.Sequential(*[EncoderBlock(n_embd, n_head) for _ in range(n_layer)])
full_model = TransformerModel(encoder_blocks, word_vocab_size)
m = full_model.to(device)

# Print init loss and hyperparams
x1, y1, targets = get_batch('train', word_encoder)
print("Device:", device)
logits, loss = m(x1, y1, targets)
print(logits.shape)
print("Loss:", loss)

print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# Instanciar optimizador
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

Device: cuda
torch.Size([1568, 10399])
Loss: tensor(9.1485, device='cuda:0', grad_fn=<NllLossBackward0>)
3.474975 M parameters


In [None]:
max_iters = 19200
text_sample = max_iters // 6
intermediate_prints = train_model(m, estimate_loss, optimizer, word_encoder, word_decoder)

for idx, intermediate_print in enumerate(intermediate_prints):
  print("Sample {}:\n".format(idx), intermediate_print)

m.save("/content/drive/MyDrive/transformer-final.pth", optimizer)

step 0: train loss 9.1270, val loss 9.1256
step 200: train loss 1.1205, val loss 1.1495
step 400: train loss 0.6651, val loss 0.7350
step 600: train loss 0.4713, val loss 0.5500
step 800: train loss 0.3828, val loss 0.4890
step 1000: train loss 0.3221, val loss 0.4391
step 1200: train loss 0.2912, val loss 0.4207
step 1400: train loss 0.2626, val loss 0.4134
step 1600: train loss 0.2433, val loss 0.4012
step 1800: train loss 0.2249, val loss 0.3964
step 2000: train loss 0.2096, val loss 0.3790
step 2200: train loss 0.1980, val loss 0.3821
step 2400: train loss 0.1818, val loss 0.3615
step 2600: train loss 0.1642, val loss 0.3618
step 2800: train loss 0.1465, val loss 0.3632
step 3000: train loss 0.1336, val loss 0.3605
step 3200: train loss 0.1155, val loss 0.3533
step 3400: train loss 0.1019, val loss 0.3400
step 3600: train loss 0.0837, val loss 0.3427
step 3800: train loss 0.0709, val loss 0.3281
step 4000: train loss 0.0568, val loss 0.3261
step 4200: train loss 0.0470, val loss 0.

In [None]:
acc1, acc3, mhd = test_model(n_test, m, word_encoder, word_decoder, test_data)
print('Accuracy@1 on test: {}%'.format(acc1 * 100))
print('Accuracy@3 on test: {}%'.format(acc3 * 100))
print('Mean Hamming distance on test: {}'.format(mhd))

Sample:  0
Sample:  20
Sample:  40
Sample:  60
Sample:  80
Sample:  100
Sample:  120
Sample:  140
Sample:  160
Sample:  180
Accuracy@1 on test: 31.892%
Accuracy@3 on test: 43.784%
Mean Hamming distance on test: 1.72432


In [None]:
final_queries = []
for entry in linked_q:
  n_tokens = MAX_OUT_BLOCK
  in_t = torch.tensor(word_encoder(entry["linked"]), dtype=torch.long)
  padded_text = F.pad(in_t, (0, MAX_IN_BLOCK - len(in_t)), value=1)
  query = word_decoder(m.generate(padded_text.unsqueeze(0).to(device), word_encoder('¿'), n_tokens)[0].tolist()[1:])
  query = query.split("¡")[0]
  entry["sparql"] = query
  final_queries.append(entry)

with open("/content/drive/MyDrive/dblp_not_pretrained_queries_500.json", 'w', encoding='utf-8') as file:
  json.dump(final_queries, file)

## Qualitative testing

In [None]:
q = "What are the papers written by the person Hideaki Takeda?"
entities = ["<https://dblp.org/pid/27/4034-1>"]
q = format_question(q)
print(q)
q = process_question(entities, q)
print(q)

what are the papers written by the person Hideaki Takeda
what are the papers written by the person <https://dblp.org/pid/27/4034-1>


In [None]:
n_tokens = 15
in_t = torch.tensor(word_encoder(q), dtype=torch.long)
padded_text = F.pad(in_t, (0, MAX_IN_BLOCK - len(in_t)), value=1)
word_decoder(m.generate(padded_text.unsqueeze(0).to(device), word_encoder('¿'), n_tokens)[0].tolist()[1:])

"K. <https://dblp.org/rec/conf/cms/MerkelDV11> <https://dblp.org/pid/17/5121> <https://dblp.org/rec/conf/adc/LiuRB10> written <https://dblp.org/pid/81/930> <https://dblp.org/pid/149/6694> 'ICUIMC' <https://dblp.org/rec/journals/corr/abs-1712-00175> <https://dblp.org/pid/93/4244> NIPS <https://dblp.org/pid/14/9539> <https://dblp.org/rec/journals/winet/HojjatiEAN17> <https://dblp.org/rec/journals/rairo/AlrefaeiD15> <https://dblp.org/pid/258/3087>"

# Challenge

In [None]:
!pip install SPARQLWrapper

In [None]:
import json, re
from SPARQLWrapper import SPARQLWrapper, JSON

def get_triples(query):
  try:
    sparql = SPARQLWrapper("https://dblp-kg.ltdemos.informatik.uni-hamburg.de/sparql")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    triples = [a["answer"]["value"] for a in results["results"]["bindings"]]
    return triples
  except:
    return []

In [None]:
answers = []
with open("/content/drive/MyDrive/dblp_pretrained_queries_500.json", 'r', encoding='utf-8') as file:
  data = json.load(file)
  for entry in data:
    if entry["entities"]:
      query = entry["sparql"]
      answer = get_triples(query)
      new_entry = entry.copy()
      new_entry["answer"] = answer
      answers.append(new_entry)

with open("/content/drive/MyDrive/dblp_answers_pretrained_500.json", 'w', encoding='utf-8') as file:
  json.dump(answers, file)

In [None]:
answers = []
with open("/content/drive/MyDrive/dblp_not_pretrained_queries_500.json", 'r', encoding='utf-8') as file:
  data = json.load(file)
  for entry in data:
    if entry["entities"]:
      query = entry["sparql"]
      answer = get_triples(query)
      new_entry = entry.copy()
      new_entry["answer"] = answer
      answers.append(new_entry)

with open("/content/drive/MyDrive/dblp_answers_not_pretrained_500.json", 'w', encoding='utf-8') as file:
  json.dump(answers, file)