In [1]:
from collections.__init__ import OrderedDict
import json
import re
import spacy
from spacy.tokens import DocBin
from fairseq.models import BaseFairseqModel

In [14]:
# Token name -> SPARQL symbol/prefix. encode() and decode() walk this table in
# insertion order (via the `rules` alias), so the order of the entries matters.
WIKIDATA_REPLACE_RULES = OrderedDict([
    ("brack_open", '{'),
    ("brack_close", '}'),
    ("attr_open", '('),
    ("attr_close", ')'),
    ("var_", '?'),
    ("sep_dot", '.'),
    ("sep_comma", ','),
    ("_oba_", 'order by asc'),
    ("_obd_", 'order by desc'),
    ("_grb_", 'group by'),
    ("wd_", 'wd:'),
    ("wdt_", 'wdt:'),
    ("rdfs_", 'rdfs:'),
    ("rdf_", 'rdf:'),
    ("foaf_", 'foaf:'),
    ("p_", 'p:'),
    ("ps_", 'ps:'),
    ("pq_", 'pq:'),
    ("bd_", 'bd:'),
])

# Regex pattern -> replacement, applied with re.sub in insertion order while encoding.
WIKIDATA_REGEX_REPLACE_RULES = OrderedDict([
    (r"<([\w\d_]+)>", r'placeholder_\1'),      # <slot>  -> placeholder_slot
    (r"(\d)[.](\d)", r"\1_dot_\2"),            # 1.5     -> 1_dot_5
    (r"'(.*?)'", r"apstrph_\1_apstrph"),       # 'text'  -> apstrph_text_apstrph
    (r"\s*([}{)(.,><=])\s*", r" \1 "),         # pad punctuation/operators with single spaces
    (">", 'math_gt'),
    ("<", 'math_lt'),
    ("=", 'math_eq'),
    (r"\s{2,}", ' '),                          # squeeze runs of whitespace
])
# (Rules for open_quote / close_quote tokens were sketched here and are disabled.)

# Regex pattern -> replacement, applied with re.sub in insertion order while decoding.
WIKIDATA_REGEX_BACK_REPLACE_RULES = OrderedDict([
    (r"placeholder_(\w+)", r'<\1>'),
    (r"(\d)_dot_(\d)", r"\1.\2"),
    (r"apstrph_(.*?)_apstrph", r"'\1'"),
    ("math_gt", ' > '),
    ("math_lt", ' < '),
    ("math_eq", ' = '),
])

# Namespace prefix -> IRI.
WIKIDATA_PREFIXES = OrderedDict([
    ("wd", "http://www.wikidata.org/entity/"),
    ("wdt", "http://www.wikidata.org/prop/direct/"),
    ("wiki", "https://en.wikipedia.org/wiki/"),
    ("wikibase", "http://wikiba.se/ontology#"),
    ("ps", "http://www.wikidata.org/prop/statement/"),
    ("pq", "http://www.wikidata.org/prop/qualifier/"),
    ("p", "http://www.wikidata.org/prop/"),
    ("rdfs", "http://www.w3.org/2000/01/rdf-schema#"),
    ("bd", "http://www.bigdata.com/rdf#"),
    ("schema", "http://schema.org/"),
    ("skos", "http://www.w3.org/2004/02/skos/core#"),
])

In [3]:
# Short module-level aliases for the rule tables; encode() and decode() read these names.
rules, regex_rules, regex_back_rules = (
    WIKIDATA_REPLACE_RULES,
    WIKIDATA_REGEX_REPLACE_RULES,
    WIKIDATA_REGEX_BACK_REPLACE_RULES,
)

# copied and modified from query_tools.base_tokenizer
def encode(query: str) -> str:
    """
    Turn a SPARQL query string into a tokenized query string.

    The query is lower-cased, then every pattern in ``regex_rules`` is substituted
    with ``re.sub`` (insertion order), and finally every SPARQL symbol in ``rules``
    is replaced by its token name using plain substring replacement (insertion order).

    :param query: SPARQL query (or template) text.
    :return: the encoded query with leading/trailing whitespace removed.
    """
    encoded = query.lower()

    # Pass 1: regex substitutions.
    for regex, substitution in regex_rules.items():
        encoded = re.sub(regex, substitution, encoded)

    # Pass 2: `rules` maps token name -> symbol, so the symbol is what gets searched for.
    for token, symbol in rules.items():
        encoded = encoded.replace(symbol, token)

    return encoded.strip()

def decode(query_string: str) -> str:
    """
    Turn a tokenized query string back into SPARQL text.

    Every pattern in ``regex_back_rules`` is applied first, then every token name in
    ``rules`` is substituted by its SPARQL symbol. Both passes use ``re.sub`` and follow
    insertion order. The input is assumed to be tokenized already; letter case is not changed.

    Example (with the WIKIDATA_* rule tables):
        tokenized -> "select distinct var_uri where brack_open wd_q4072104 wdt_p184 var_uri brack_close"
        decoded   -> "select distinct ?uri where { wd:q4072104 wdt:p184 ?uri }"

    :param query_string: encoded query string.
    :return: the decoded query as a plain string.
    """
    decoded = query_string
    # Regex back-rules run before the token table, exactly one re.sub per entry.
    substitutions = [*regex_back_rules.items(), *rules.items()]
    for pattern, replacement in substitutions:
        decoded = re.sub(pattern, replacement, decoded)
    return decoded

# Explore data

In [2]:
# Load the dataset. The encoding is given explicitly so the file is decoded as UTF-8
# regardless of the platform's locale default.
with open('data/dataset_lcquad2.json', encoding='utf-8') as dataset_file:
    lcquad2_data = json.load(dataset_file)
# Last expression of the cell: display the top-level keys.
lcquad2_data.keys()

dict_keys(['questions', 'train', 'valid'])

In [25]:
# Print one record: the question text, then selected fields of its first query/answer entry.
qid = 13007
sample = lcquad2_data['questions'][qid]
first_answer = sample['query_answer'][0]

print(sample['natural_language_question'])
for field in ('sparql_query', 'entities', 'slots', 'sparql_template'):
    print(first_answer[field])

What field does Fernand Maillaud work in?
SELECT ?answer WHERE { wd:Q3069252 wdt:P106 ?obj . ?obj wdt:P425 ?answer}
[{'label': 'Fernand Maillaud', 'entity': 'wd:Q3069252'}]
[{'slot': '<sbj_1>', 'label': 'Fernand Maillaud'}]
SELECT ?answer WHERE { <sbj_1> wdt:P106 ?obj . ?obj wdt:P425 ?answer}


In [None]:
# The cell previously held this SPARQL template as bare text, which is not valid Python
# and raises SyntaxError when the notebook is run top to bottom. Pass it to encode() as a
# string so the cell executes and shows the tokenized form.
encode('select ?value where { <sbj_1> p:p106 ?s . ?s ps:p106 <obj_2> . ?s pq:p101 ?value }')

In [18]:
# Quick check of encode() on a small ASK template; the cell output shows the tokenized string.
encode('ASK WHERE { <sbj_1> wdt:P106 <obj_1> }')

'ask where brack_open placeholder_sbj_1 wdt_p106 placeholder_obj_1 brack_close'

# Build source and target datasets

In [6]:
# Load the dataset for corpus building. The encoding is given explicitly so the file is
# decoded as UTF-8 regardless of the platform's locale default.
with open('data/dataset_lcquad2.json', encoding='utf-8') as dataset_file:
    lcquad2_data = json.load(dataset_file)
# Show the top-level keys and the number of question records.
print(lcquad2_data.keys())
print(len(lcquad2_data['questions']))

dict_keys(['questions', 'train', 'valid'])
27706


In [7]:
# Index ranges (into lcquad2_data['questions']) used for the two splits.
train_start, train_end = 0, 400
val_start, val_end = 400, 500


def _single_line(text: str) -> str:
    """Collapse every whitespace run (newlines included) into one space.

    The .en and .sparql files are aligned line by line, so an embedded newline in
    either side would shift every following pair.
    """
    return ' '.join(text.split())


def _write_parallel_split(questions, prefix: str) -> None:
    """Write ``<prefix>.en`` and ``<prefix>.sparql`` with one example per line.

    :param questions: iterable of question records; each must provide
        ``natural_language_question`` and ``query_answer[0]['sparql_template']``.
    :param prefix: output path without extension, e.g. ``models/nmt/train``.
    """
    # Explicit UTF-8 so the written files do not depend on the platform's locale default.
    with open(prefix + '.en', 'w', encoding='utf-8') as en_file, \
            open(prefix + '.sparql', 'w', encoding='utf-8') as sparql_file:
        for q in questions:
            en_file.write(_single_line(q['natural_language_question']) + '\n')
            sparql_file.write(_single_line(encode(q['query_answer'][0]['sparql_template'])) + '\n')


_write_parallel_split(lcquad2_data['questions'][train_start:train_end], 'models/nmt/train')
_write_parallel_split(lcquad2_data['questions'][val_start:val_end], 'models/nmt/val')
    

In [8]:
# Run fairseq-preprocess on the text files with prefixes models/nmt/train and models/nmt/val
# (source language "en", target language "sparql"); results are written to models/nmt/data-bin.
!fairseq-preprocess --source-lang en --target-lang sparql --trainpref models/nmt/train --validpref models/nmt/val --destdir models/nmt/data-bin

2021-09-12 09:55:31 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, bf16=False, bpe=None, checkpoint_shard_count=1, checkpoint_suffix='', cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='models/nmt/data-bin', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer=None, padding_factor=8, profile=False, quantization_config_path=None, scoring='bleu', seed=1, source_lang='en', srcdict=None, target_lang='sparql', task='translation', tensorboard_logdir=None, testpref=None, tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtgt=0, tokenizer=None, tpu=False, trainpr

In [9]:
# Train the fconv_wmt_en_de architecture on models/nmt/data-bin (en -> sparql) with Adam and
# label-smoothed cross entropy, for at most 500 epochs.
# Checkpoints are written to the directory given by --save-dir (fairseq_fconv_wmt_en_de).
# Option not currently passed, kept for reference: --valid-subset valid,test
!fairseq-train models/nmt/data-bin -s en -t sparql \
--lr 0.001 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--arch fconv_wmt_en_de --lr-scheduler fixed --force-anneal 50 \
--max-epoch 500 --save-interval 100 \
--save-dir fairseq_fconv_wmt_en_de \
--optimizer adam \
--log-format simple

2021-09-12 09:55:53 | INFO | fairseq_cli.train | Namespace(adam_betas='(0.9, 0.999)', adam_eps=1e-08, all_gather_list_size=16384, arch='fconv_wmt_en_de', batch_size=None, batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.1, cpu=False, criterion='label_smoothed_cross_entropy', curriculum=0, data='models/nmt/data-bin', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', decoder_attention='True', decoder_embed_dim=768, decoder_embed_path=None, decoder_layers='[(512, 3)] * 9 + [(1024, 3)] * 4 + [(2048, 1)] * 2', decoder_out_embed_dim=512, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=0, distributed_port=-1, distributed_rank=0, distributed_world_size=1, distributed_wrapper='DDP', dropout=0.2, empty_cache_freq=0, encoder_embed_dim=768, encoder_embed_path=Non

# Modelling

In [4]:
# Load checkpoint_best.pt from models/nmt/ through fairseq's from_pretrained.
# NOTE(review): the training command in this notebook passes --save-dir fairseq_fconv_wmt_en_de,
# whereas this cell reads from models/nmt/ — confirm the checkpoint (and the dictionaries it needs)
# in models/nmt/ actually come from that training run.
translator = BaseFairseqModel.from_pretrained('models/nmt/', checkpoint_file='checkpoint_best.pt')

In [5]:
# Display the loaded object's repr (the module tree of the wrapped model).
translator

GeneratorHubInterface(
  (models): ModuleList(
    (0): FConvModel(
      (encoder): FConvEncoder(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(6416, 768, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(1024, 768, padding_idx=1)
        (fc1): Linear(in_features=768, out_features=512, bias=True)
        (projections): ModuleList(
          (0): None
          (1): None
          (2): None
          (3): None
          (4): None
          (5): None
          (6): None
          (7): None
          (8): None
          (9): Linear(in_features=512, out_features=1024, bias=True)
          (10): None
          (11): None
          (12): None
          (13): Linear(in_features=1024, out_features=2048, bias=True)
          (14): None
        )
        (convolutions): ModuleList(
          (0): ConvTBC(512, 1024, kernel_size=(3,), padding=(1,))
          (1): ConvTBC(512, 1024, kernel_size=(3,), padding=(1,))
          (2): ConvTBC(512, 1024,

In [7]:
# Translate one question. The returned string is still in the tokenized vocabulary
# (e.g. var_uri, brack_open); decode() is not applied here.
translator.translate('Did Alexander Hamilton practice law?')

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


'SELECT DISTINCT var_uri WHERE brack_open <dbr_Grey_Goose_ attr_open soft_drink attr_close math_gt <dbp_origin> var_uri brack_close'

In [9]:
# Translate a second question; as above, the result is the raw tokenized output, not decoded SPARQL.
translator.translate('Is Harrelson the real family name of Woody Harrelson?')

'SELECT DISTINCT var_uri WHERE brack_open <dbr_Grey_Goose_ attr_open song attr_close math_gt <dbp_artist> var_uri brack_close'