# Project Dataset Loading

In [None]:
import zipfile
import os
import numpy as np
import xml.etree.ElementTree as ET
import glob
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# Clone the dataset repository from GitHub into the current working directory.
# NOTE: the path constants defined in a later cell expect the clone to end up at
# /content/score-freetext-answer, i.e. this cell is expected to run from /content.
!git clone https://github.com/leocomelli/score-freetext-answer.git

Cloning into 'score-freetext-answer'...
remote: Enumerating objects: 511, done.[K
remote: Total 511 (delta 0), reused 0 (delta 0), pack-reused 511[K
Receiving objects: 100% (511/511), 478.34 KiB | 3.65 MiB/s, done.
Resolving deltas: 100% (263/263), done.


In [None]:
# Every corpus folder lives under the SemEval-2013 Task 7 directory of the cloned repository.
_corpus_root = '/content/score-freetext-answer/src/main/resources/corpus/semeval2013-task7'
_test_root = f'{_corpus_root}/test/2way/sciEntsBank'

# 2-way SciEntsBank training split.
training_data_directory = f'{_corpus_root}/training/2way/sciEntsBank'

# 2-way SciEntsBank test splits: unseen answers, unseen questions, unseen domains.
test_data_unseen_ans_directory = f'{_test_root}/test-unseen-answers'
test_data_unseen_que_directory = f'{_test_root}/test-unseen-questions'
test_data_unseen_dom_directory = f'{_test_root}/test-unseen-domains'

In [None]:
def parse_xml_file(xml_file_path):
  """Parse one question XML file into a flat list of response records.

  The direct children of the document root are visited in order. The text of
  a `questionText` child becomes the current question. Inside every child, a
  `referenceAnswer` sub-element updates the current reference answer, and any
  other sub-element is recorded as a response paired with the question and
  reference answer seen so far.

  Args:
    xml_file_path: Path of the XML file to read.

  Returns:
    A list of dicts with the keys 'question', 'ref', 'response', 'score'
    (the sub-element's `accuracy` attribute) and 'aug' (always False).

  Raises:
    KeyError: If a response sub-element has no `accuracy` attribute.
  """
  current_question = ""
  current_reference = ""
  records = []

  root = ET.parse(xml_file_path).getroot()
  for section in root:
    if section.tag == 'questionText':
      current_question = section.text

    for node in section:
      # Reference answers only update the running state; they are not records.
      if node.tag == 'referenceAnswer':
        current_reference = node.text
        continue

      records.append(dict(
          question=current_question,
          ref=current_reference,
          response=node.text,
          score=node.attrib['accuracy'],
          aug=False,
      ))

  return records

In [None]:
def _load_split(directory):
  """Parse every file directly inside `directory` with `parse_xml_file`.

  Files are visited in sorted path order. `glob.glob` returns its matches in
  arbitrary order, so without sorting the record order (and therefore the ids
  assigned below and the index ranges used for augmentation) could differ
  from one run or machine to the next.

  Args:
    directory: Folder whose entries are each parsed as one question file.

  Returns:
    A `(records, num_files)` tuple: the concatenated records of all files and
    the number of files that were parsed.
  """
  file_paths = sorted(glob.glob(directory + '/*'))
  records = []
  for file_path in file_paths:
    records += parse_xml_file(file_path)
  return records, len(file_paths)


# One file is counted as one question.
training_data, num_training_questions = _load_split(training_data_directory)
test_data_unseen_ans, num_test_questions_unseen_ans = _load_split(test_data_unseen_ans_directory)
test_data_unseen_que, num_test_questions_unseen_que = _load_split(test_data_unseen_que_directory)
test_data_unseen_dom, num_test_questions_unseen_dom = _load_split(test_data_unseen_dom_directory)


# Give every record a sequential id that is unique across all four splits.
# `id_idx` is left at the next free id; backtranslate_example continues
# numbering augmented entries from it.
id_idx = 0
for tmp_ds in [training_data, test_data_unseen_ans, test_data_unseen_que, test_data_unseen_dom]:
  for data_item in tmp_ds:
    data_item['id'] = id_idx
    id_idx += 1


print("Number of Training Questions:", num_training_questions)
print("Number of Training Responses:", len(training_data))
print("")
print("Number of Test Questions (Unseen Answers):", num_test_questions_unseen_ans)
print("Number of Test Questions (Unseen Questions):", num_test_questions_unseen_que)
print("Number of Test Questions (Unseen Domains):", num_test_questions_unseen_dom)
print("")
print("Number of Test Responses (Unseen Answers):", len(test_data_unseen_ans))
print("Number of Test Responses (Unseen Questions):", len(test_data_unseen_que))
print("Number of Test Responses (Unseen Domains):", len(test_data_unseen_dom))

Number of Training Questions: 135
Number of Training Responses: 4969

Number of Test Questions (Unseen Answers): 135
Number of Test Questions (Unseen Questions): 15
Number of Test Questions (Unseen Domains): 46

Number of Test Responses (Unseen Answers): 540
Number of Test Responses (Unseen Questions): 733
Number of Test Responses (Unseen Domains): 4562


In [None]:
# Show the first parsed training record as a quick check of the record fields.
print(training_data[0])

{'question': 'The sand and flour in the gray material from mock rocks is separated by mixing with water and allowing the mixture to settle. Explain why the sand and flour separate.', 'ref': 'The sand particles are larger and settle first. The flour particles are smaller and therefore settle more slowly.', 'response': 'One is heavier than another see it settles in how many layers there is ingredients.', 'score': 'correct', 'aug': False, 'id': 0}


## Dataset Augmentation

Using Google Translate, we can augment the relatively small amount of training data via Backtranslation.

In [None]:
!pip install -q googletrans==3.1.0a0

In [None]:
import googletrans
from googletrans import Translator
import json

#### Question Response Map Creation

We create a map from each question to the set of its existing responses, so that we can ensure that any generated response does not match an existing response for the same question. This prevents duplication and improves the quality of the dataset.

In [None]:
# Maps each training question to the set of responses already recorded for it.
question_response_map = {}

for entry in training_data:
  # setdefault creates the empty set the first time a question is seen.
  question_response_map.setdefault(entry['question'], set()).add(entry['response'])

#### Translation Functions

These methods will perform the augmentation of the dataset via backtranslation.

In [None]:
# Single shared googletrans client; backtranslate_example issues all of its translate calls through it.
translator = Translator()

In [None]:
def backtranslate_example(input_item):
  """Create augmented copies of one dataset entry via backtranslation.

  The entry's response is translated along the chain
  en -> es -> fr -> de -> cs -> ru, and the text reached at each of the five
  stages is translated back to English, giving five candidate responses. A
  candidate is dropped when it equals the original response or is already
  recorded for the entry's question in `question_response_map`.

  Side effects: every accepted candidate is added to `question_response_map`
  and consumes one id from the global `id_idx` counter.

  Args:
    input_item: Dict providing the keys 'question', 'ref', 'response',
      'score' and 'id'.

  Returns:
    A list (possibly empty) of new entry dicts. Each copies 'question', 'ref'
    and 'score' from `input_item`, has 'aug' set to True, a fresh 'id', and an
    'aug_metadata' dict holding the parent's id and the language sequence
    that produced the response.
  """
  global id_idx

  pivot_languages = ['es', 'fr', 'de', 'cs', 'ru']
  source_response = input_item['response']

  # Forward pass: walk the whole pivot chain first, remembering each stage.
  pivot_texts = []
  stage_text, stage_lang = source_response, 'en'
  for lang in pivot_languages:
    stage_text = translator.translate(stage_text, src=stage_lang, dest=lang).text
    pivot_texts.append(stage_text)
    stage_lang = lang

  # Backward pass: bring every stage back to English, in chain order.
  candidates = []
  for hops, (lang, pivot_text) in enumerate(zip(pivot_languages, pivot_texts), start=1):
    english_text = translator.translate(pivot_text, src=lang, dest='en').text
    candidates.append((english_text, ['en'] + pivot_languages[:hops] + ['en']))

  accepted_entries = []
  for candidate_text, translation_path in candidates:

    # Translation round-tripped to exactly the original text: nothing new.
    if candidate_text == source_response:
      continue

    # Some response with this exact text is already known for the question.
    known_responses = question_response_map[input_item['question']]
    if candidate_text in known_responses:
      continue

    # Remember the new text so later candidates cannot duplicate it.
    known_responses.add(candidate_text)

    accepted_entries.append({
      'question': input_item['question'],
      'ref': input_item['ref'],
      'response': candidate_text,
      'score': input_item['score'],
      'aug': True,
      'id': id_idx,
      'aug_metadata': {
          'parent_id': input_item['id'],
          'translation_seq': translation_path
      }
    })
    id_idx += 1

  return accepted_entries

In [None]:
def process_generation(starting_idx, limit):
  """Backtranslate a contiguous run of `training_data` items.

  Items at indices `starting_idx` through `starting_idx + limit` (inclusive)
  are passed to `backtranslate_example`, so up to `limit + 1` items are
  processed. A progress line is printed for every processed index that is a
  multiple of 50.

  Args:
    starting_idx: Index of the first training item to process.
    limit: Offset, relative to `starting_idx`, of the last index to process.

  Returns:
    The list of all augmented entries generated for the processed items. The
    list covers fewer items when `training_data` ends before the last index
    is reached.
  """
  generated = []

  for position, example in enumerate(training_data):
    if position >= starting_idx:
      new_entries = backtranslate_example(example)
      if new_entries:
        generated.extend(new_entries)

      if position % 50 == 0:
        print('Processed', position)

      # Reached the last index of the requested run: stop early.
      if position == starting_idx + limit:
        print('Finished Processing To Index', position)
        return generated

  print('Finished Processing All Data')
  return generated

#### Data Augmentation Implementation

Note that we break this work into multiple cell executions. This is due to the rate limiting of the translate method: the augmentation benefits from having each cell called individually, with down-time in between cell executions.

In [None]:
# Accumulates every augmented entry produced by the generation loop below.
augmented_data = []

In [None]:
# Number of training items handled per chunk (and per checkpoint file).
chunk_size = 500

# NOTE(review): the range starts at chunk 3, so indices 0-1499 are not
# processed by this cell - confirm they are handled elsewhere.
for chunk_number in range(3, 10):

  first_idx = chunk_size * chunk_number
  last_idx = first_idx + chunk_size - 1

  print(f'Processing Indices {first_idx} to {last_idx}...')
  chunk_entries = process_generation(first_idx, chunk_size - 1)
  print(f'Finished Processing Indices {first_idx} to {last_idx}.')

  # An empty chunk is taken to mean the end of the training data was passed.
  if not chunk_entries:
    print("Nothing More To Process... Terminating.")
    break

  augmented_data += chunk_entries  # Save Results

  # Checkpoint everything generated so far after each chunk
  with open(f'/content/train_to_idx_{last_idx}.json', 'w') as checkpoint_file:
    json.dump(augmented_data, checkpoint_file, indent=4)

  print('[Saved] Size of Augmented Dataset', len(augmented_data))

Processing Indices 1500 to 1999...
Processed 1500
Processed 1550
Processed 1600
Processed 1650
Processed 1700
Processed 1750
Processed 1800
Processed 1850
Processed 1900
Processed 1950
Finished Processing To Index 1999
Finished Processing Indices 1500 to 1999.
[Saved] Size of Augmented Dataset 1543
Processing Indices 2000 to 2499...
Processed 2000
Processed 2050
Processed 2100
Processed 2150
Processed 2200
Processed 2250
Processed 2300
Processed 2350
Processed 2400
Processed 2450
Finished Processing To Index 2499
Finished Processing Indices 2000 to 2499.
[Saved] Size of Augmented Dataset 3113
Processing Indices 2500 to 2999...
Processed 2500
Processed 2550
Processed 2600
Processed 2650
Processed 2700
Processed 2750
Processed 2800
Processed 2850
Processed 2900
Processed 2950
Finished Processing To Index 2999
Finished Processing Indices 2500 to 2999.
[Saved] Size of Augmented Dataset 4722
Processing Indices 3000 to 3499...
Processed 3000
Processed 3050
Processed 3100
Processed 3150
Proce

In [None]:
# Write the original training examples followed by the augmented ones to one indented JSON file.
with open('/content/train_augmented_complete.json', 'w') as out_file:
  combined_examples = training_data + augmented_data
  json.dump(combined_examples, out_file, indent=4)

In [None]:
# Report how much the augmentation grew the training set.
original_size = len(training_data)
augmented_size = len(augmented_data)

print('Size of Original  Dataset', original_size)

print('Size of Combined  Dataset', original_size + augmented_size)

# Growth is expressed as augmented entries relative to the original entries.
print(f'Dataset Size Increase {100*augmented_size/original_size}%')

Size of Original  Dataset 4969
Size of Combined  Dataset 16265
Dataset Size Increase 227.3294425437714%


## Results

We have increased the size of our training set by roughly 227% (from 4,969 to 16,265 examples)! This is a big improvement that we hope will lead to better results in model training and fine-tuning.

Now, save all of the datasets as .json files for easier use in the future.

In [None]:
# Save the full training set (augmented entries first, then the originals)
with open('/content/train.json', 'w') as out_file:
  json.dump(augmented_data + training_data, out_file)

In [None]:
# Save the "unseen answers" test split
with open('/content/test-unseen-answers.json', 'w') as out_file:
  json.dump(test_data_unseen_ans, out_file)

In [None]:
# Save the "unseen questions" test split
with open('/content/test-unseen-questions.json', 'w') as out_file:
  json.dump(test_data_unseen_que, out_file)

In [None]:
# Save the "unseen domains" test split
with open('/content/test-unseen-domains.json', 'w') as out_file:
  json.dump(test_data_unseen_dom, out_file)