In [None]:
!pip install simpletransformers
!pip install rouge

In [2]:
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args
import json
import re
from rouge import Rouge 
from nltk.translate.bleu_score import sentence_bleu
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [None]:
# Downloading LC-QuAD v2 from Dropbox link
!wget https://www.dropbox.com/s/pygr3g3ansoj043/lcquad_v2.json?dl=0

In [4]:
def read_json(json_file):
    file = json.load(open(json_file))

    prefix = []
    questions = []
    sparql = []

    for instance in file:
      for k, v in instance.items():
        if k == 'paraphrased_question':
          prefix.append('generate sparql')
          questions.append(v)
        elif k == 'sparql_wikidata':
          sparql.append(v)
    return prefix, questions, sparql

In [19]:
prefix, text, label = read_json('lcquad_v2.json?dl=0')

In [20]:
input_text = []
target_text = []
for input, target in zip(text, label):
  target_length = target.split(' ')
  if len(target_length) < 10:
      target = target.replace('{', 'open_bracket').replace('}', ' close_bracket').replace('.', 'sep_dot').replace('?answer', 'var_uri')
      target = re.sub(' +', ' ', target)
      input_text.append(input)
      target_text.append(target)

In [21]:
df = pd.DataFrame()
df['prefix'] = prefix[:len(input_text)]
df['input_text'] = input_text
df['target_text'] = target_text

In [22]:
# Splitting dataframe in train and test set, ratio 90% train, 10% test
df_train, df_test = train_test_split(df, test_size=0.1)

In [23]:
# Converting dataframes to string values and lowercasing 
df_train = df_train.astype(str)
df_test = df_test.astype(str)
df_train['input_text'] = df_train['input_text'].str.lower()
df_test['input_text'] = df_test['input_text'].str.lower()
df_train['target_text'] = df_train['target_text'].str.lower()
df_test['target_text'] = df_test['target_text'].str.lower()

In [24]:
for index, row in df_train[:5].iterrows():
  print("question: {}\nsparql: {}\n".format(row['input_text'], row['target_text']))
print("Total train size: {}".format(len(df_train)))

question: what is z39.5 truncation for diary of the american chemical society ?
sparql: select distinct var_uri where open_bracket wd:q898902 wdt:p1161 var_uri close_bracket

question: what is the enacting neurotransmitter of the y-aminobutyric acid?
sparql: select distinct var_uri where open_bracket var_uri wdt:p928 wd:q210021 close_bracket

question: what is urho kekkonen's finnish priests database id?
sparql: select distinct var_uri where open_bracket wd:q179858 wdt:p2182 var_uri close_bracket

question: what is the irming id of oestridae?
sparql: select distinct var_uri where open_bracket wd:q27485 wdt:p5055 var_uri close_bracket

question: which is conseil constitutionnel id for jacques chirac?
sparql: select distinct var_uri where open_bracket wd:q2105 wdt:p5457 var_uri close_bracket

Total train size: 4089


In [25]:
for index, row in df_test[:5].iterrows():
  print("question: {}\nsparql: {}\n".format(row['input_text'], row['target_text']))
print("Total test size: {}".format(len(df_test)))

question: ululate what is amelie's dnf film id?
sparql: select distinct var_uri where open_bracket wd:q484048 wdt:p1804 var_uri close_bracket

question: which is the mac address piece huge id for nokia?
sparql: select distinct var_uri where open_bracket wd:q1418 wdt:p4776 var_uri close_bracket

question: did eric clapton play the guitar?
sparql: ask where open_bracket wd:q48187 wdt:p1303 wd:q6607 close_bracket

question: what is sally ride's astronaut mission?
sparql: select distinct var_uri where open_bracket wd:q49285 wdt:p450 var_uri close_bracket

question: would phil collins music be in the genre of power pop?
sparql: ask where open_bracket wd:q144622 wdt:p136 wd:q837837 close_bracket

Total test size: 455


#T5-small

In [26]:
# Specifying model arguments, e.g. learn rate, batch size, epochs
model_args = T5Args()
model_args.num_train_epochs = 10
model_args.overwrite_output_dir = True
model_args.learning_rate = 0.001
model_args.batch_size = 32

model = T5Model(
    't5',
    't5-small',
    args=model_args,
    early_stopping=True,
    use_cuda=True
)

In [27]:
# Training the model with the specified model arguments
model.train_model(df_train)

  0%|          | 0/4089 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/512 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/512 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/512 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/512 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/512 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/512 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/512 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/512 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/512 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/512 [00:00<?, ?it/s]

(5120, 0.6510413661773782)

In [28]:
# Converting test dataframe to separate X_test and Y_test
X_test = df_test['input_text'].tolist()
Y_test = df_test['target_text'].tolist()

In [29]:
# Let the trained model predict on test questions
predict = model.predict(X_test)

Generating outputs:   0%|          | 0/57 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/455 [00:00<?, ?it/s]

In [30]:
rouge = Rouge()
rouge.get_scores(predict, Y_test, avg=True)

{'rouge-1': {'r': 0.5962323390894813,
  'p': 0.7983882783882763,
  'f': 0.6825745633965472},
 'rouge-2': {'r': 0.48296703296703336,
  'p': 0.7500366300366349,
  'f': 0.5869257434856141},
 'rouge-l': {'r': 0.5962323390894813,
  'p': 0.7983882783882763,
  'f': 0.6825745633965472}}

## ROUGE score if prediction contains empty value

In [31]:
count  = 0
ROUGEL = []
for pred, gold in zip(predict, Y_test):
  count += 1
  try:
    ROUGEL.append(rouge.get_scores(gold, pred))
  except ValueError:
    pass

In [None]:
rougel_f1 = []
for scores in ROUGEL:
  for score in scores:
    for key, value in score.items():
      if key == 'rouge-l':
        for k, v in value.items():
          if k == 'f':
            rougel_f1.append(v)

# Printing ROUGEL f1
rougel_f1 = sum(rougel_f1) / len(rougel_f1)
print("ROUGEL f1-score: {}".format(round(rougel_f1, 4)))