In [None]:
!pip install simpletransformers
!pip install rouge

In [2]:
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args
import json
import re
from rouge import Rouge 
from nltk.translate.bleu_score import sentence_bleu
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
def read_json(json_file):
    file = json.load(open(json_file))

    prefix = []
    questions = []
    sparql = []

    for instance in file:
      for k, v in instance.items():
        if k == 'corrected_question':
          prefix.append('generate_sparql')
          questions.append(v)
        elif k == 'sparql_query':
          sparql.append(v)
    return prefix, questions, sparql

In [4]:
def convert_sparql(sparql):
  sent = ""
  labels = []
  for n in sparql:
    n = n.lower()
    n = n.replace('?uri.', '?uri .').replace('?uri}', '?uri }').replace('{?uri', '{ ?uri').replace('count(?uri)', 'count ( ?uri )')
    n = n.split(' ')
    for word in n:
        if word.startswith("<"):
            if "resource" in word:
                word = word.split('/')
                dbr = '<dbr_' + word[4]
                sent += dbr + ' '

            elif "property" in word:
                word = word.split('/')
                dbp = '<dbp_' + word[4]
                sent += dbp + ' '

            elif "ontology" in word:
                word = word.split('/')
                dbo = '<dbo_' + word[4]
                sent += dbo + ' '
        elif word == '{':
            sent += 'bracket_open '
        elif word == '}':
            sent += 'bracket_close'
        elif word == '?uri':
            sent += 'var_uri '
        elif word == '?x':
            sent += 'var_x '
        elif word == '.':
            sent += 'sep_dot '
        elif word == '(':
            sent += 'attr_open '
        elif word == ')':
            sent += 'attr_close '
        else:
            sent += word + ' '
    sent = re.sub(' +', ' ', sent)
    labels.append(sent)
    sent = ""
  return labels

In [5]:
def create_df(json_file):
    prefix, questions, sparql = read_json(json_file)
    labels = convert_sparql(sparql)

    input_text = []
    target_text = []
    for text, label in zip(questions, labels):
      label = label.rstrip()
      if label[-1] != '}':
        target_text.append(label)
        input_text.append(text)

    df = pd.DataFrame()
    df['prefix'] = prefix[:len(input_text)]
    df['input_text'] = input_text
    df['input_text'] = df['input_text'].str.lower()
    df['target_text'] = target_text

    return df, sparql

In [None]:
!wget https://www.dropbox.com/s/j5di3g5jm3e72p8/train-data.json?dl=0
!wget https://www.dropbox.com/s/8kil1x0pkf6c40p/test-data.json?dl=0

In [7]:
df1, sparql_train = create_df('train-data.json?dl=0')
df2, sparql_test = create_df('test-data.json?dl=0')
df = df1.append(df2, ignore_index=True)

In [8]:
df_train, df_test = train_test_split(df, test_size=0.1)

In [9]:
for index, row in df_train[:5].iterrows():
  print("question: {}\nsparql: {}\n".format(row['input_text'], row['target_text']))

question: which producer of  the eristoff is also the  distributor of  bombay sapphire?
sparql:  select distinct var_uri where bracket_open <dbr_eristoff> <dbp_manufacturer> var_uri sep_dot <dbr_bombay_sapphire> <dbp_distributor> var_uri bracket_close

question: who has been married to both penny lancaster and alana stewart?
sparql: select distinct var_uri where bracket_open <dbr_penny_lancaster> <dbo_spouse> var_uri sep_dot <dbr_alana_stewart> <dbo_spouse> var_uri sep_dot bracket_close

question: which city's governing body is led by bruce harrell?
sparql: select distinct var_uri where bracket_open var_x <dbo_leader> <dbr_bruce_harrell> sep_dot var_uri <dbp_governingbody> var_x sep_dot bracket_close

question: whose children died in north bend, ohio?
sparql: select distinct var_uri where bracket_open var_x <dbo_deathplace> <dbr_north_bend,_ohio> sep_dot var_uri <dbo_child> var_x sep_dot bracket_close

question: to which nation did david animle hanses owe his allegiance?
sparql:  selec

In [10]:
for index, row in df_test[:5].iterrows():
  print("question: {}\nsparql: {}\n".format(row['input_text'], row['target_text']))

question: what is the academic discipline of the journal of cerebral blood flow & metabolism and also an ingredient of the ragout fin ?
sparql:  select distinct var_uri where bracket_open <dbr_journal_of_cerebral_blood_flow_&_metabolism> <dbo_academicdiscipline> var_uri sep_dot <dbr_ragout_fin> <dbo_ingredient> var_uri bracket_close

question: what are the regions in which the distributor of secrets and lies serves ? 
sparql: select distinct var_uri where bracket_open <dbr_secrets_and_lies_(u.s._tv_series)> <dbo_distributor> var_x sep_dot var_x <dbo_regionserved> var_uri sep_dot bracket_close

question: which relative of george madison is the appointer of john drayton ?
sparql:  select distinct var_uri where bracket_open <dbr_john_drayton> <dbp_appointer> var_uri sep_dot <dbr_george_madison> <dbo_relation> var_uri bracket_close

question: which things have been located, where governance is houston city council?
sparql: select distinct var_uri where bracket_open var_x <dbp_governingbody

In [None]:
# Specifying model arguments, e.g. learn rate, batch size, epochs
model_args = T5Args()
model_args.num_train_epochs = 25
model_args.overwrite_output_dir = True
model_args.learning_rate = 0.001
model_args.batch_size = 32

model = T5Model(
    't5',
    't5-small',
    args=model_args,
    early_stopping=True,
    use_cuda=True
)

In [None]:
# Training the model with the specified model arguments
model.train_model(df_train)

In [13]:
# Converting test dataframe to separate X_test and Y_test
X_test = df_test['input_text'].tolist()
Y_test = df_test['target_text'].tolist()

In [14]:
# Let the trained model predict on test questions
predict = model.predict(X_test)

Generating outputs:   0%|          | 0/39 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/308 [00:00<?, ?it/s]

In [15]:
rouge = Rouge()
rouge.get_scores(predict, Y_test, avg=True)

{'rouge-1': {'r': 0.5137132798658773,
  'p': 0.8111742424242453,
  'f': 0.6255455510168814},
 'rouge-2': {'r': 0.38910163503841866,
  'p': 0.7673082869511468,
  'f': 0.5112829782892215},
 'rouge-l': {'r': 0.5134427170953146,
  'p': 0.8107104205318519,
  'f': 0.6252037875172232}}

## Model prediction on simple questions

In [None]:
questions = ["what is the population of Groningen?", "what is the color of an elephant?", "what color is the flag of Groningen?"]
sparql_pred = model.predict(questions)

In [17]:
for sparql, question in zip(sparql_pred, questions):
  print("question: {}\tSparql: {}".format(question, sparql))

question: what is the population of Groningen?	Sparql: select distinct var_uri where bracket_open dbr_groning
question: what is the color of an elephant?	Sparql: select distinct var_uri where bracket_open dbr_an_ele
question: what color is the flag of Groningen?	Sparql: select distinct var_uri where bracket_open dbr_groning
