# Project Dataset Loading

In [None]:
import zipfile
import os
import numpy as np
import xml.etree.ElementTree as ET
import glob
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# Clone the dataset repository from GitHub into the current working directory.
# The corpus paths used later are rooted at /content/score-freetext-answer, so
# this presumably expects the notebook's working directory to be /content — TODO confirm.
!git clone https://github.com/leocomelli/score-freetext-answer.git

Cloning into 'score-freetext-answer'...
remote: Enumerating objects: 511, done.[K
remote: Total 511 (delta 0), reused 0 (delta 0), pack-reused 511[K
Receiving objects: 100% (511/511), 478.34 KiB | 16.49 MiB/s, done.
Resolving deltas: 100% (263/263), done.


In [None]:
# Root of the SemEval-2013 Task 7 corpus inside the cloned repository.
_semeval_root = '/content/score-freetext-answer/src/main/resources/corpus/semeval2013-task7'

# 2-way SciEntsBank splits: the training set and the "unseen answers" test set.
training_data_directory = _semeval_root + '/training/2way/sciEntsBank'
test_data_directory = _semeval_root + '/test/2way/sciEntsBank/test-unseen-answers'

In [None]:
def parse_xml_file(xml_file_path):
  """Parse one question XML file into a list of graded-response records.

  The root element's children are scanned in document order. A
  ``questionText`` child supplies the question text; inside any child, a
  ``referenceAnswer`` sub-element supplies the reference answer, and every
  other sub-element that carries an ``accuracy`` attribute is recorded as a
  response. Each record uses the question/reference seen so far.

  Args:
    xml_file_path: Path (or file object) accepted by ``ET.parse``.

  Returns:
    A list of dicts with keys ``'question'``, ``'ref'``, ``'response'``,
    ``'score'`` (the raw ``accuracy`` attribute) and ``'aug'`` (always
    ``False`` for parsed, non-augmented data).
  """
  question = ""
  ref = ""
  results = []

  for elem in ET.parse(xml_file_path).getroot():
    if elem.tag == 'questionText':
      question = elem.text
    for subelem in elem:
      if subelem.tag == 'referenceAnswer':
        ref = subelem.text
        continue

      # Only graded answers carry an 'accuracy' attribute. Skip anything else
      # instead of letting a stray element abort the whole file with KeyError.
      score = subelem.attrib.get('accuracy')
      if score is None:
        continue

      results.append({
          'question': question,
          'ref': ref,
          'response': subelem.text,
          'score': score,
          'aug': False
      })

  return results

In [None]:
training_data = []
test_data = []
num_training_questions = 0
num_test_questions = 0

# glob returns paths in an arbitrary, filesystem-dependent order. Sorting keeps
# item indices stable across sessions, which matters because the augmentation
# cells address the training data by index range.
# Each XML file is counted as one question.
for data_file in sorted(glob.glob(training_data_directory + '/*')):
  training_data += parse_xml_file(data_file)
  num_training_questions += 1

for data_file in sorted(glob.glob(test_data_directory + '/*')):
  test_data += parse_xml_file(data_file)
  num_test_questions += 1

print("Number of Training Questions:", num_training_questions)
print("Number of Training Responses:", len(training_data))

print("Number of Test Questions:", num_test_questions)
print("Number of Test Responses:", len(test_data))

Number of Training Questions: 135
Number of Training Responses: 4969
Number of Test Questions: 46
Number of Test Responses: 4562


In [None]:
class ShortAnswerGradingDataset(Dataset):
    """Thin torch ``Dataset`` wrapper around an in-memory sequence of records.

    No parsing or transformation happens here: records are stored as given and
    returned unchanged. In this notebook each record is a dict with the keys
    'question', 'ref', 'response', 'score' and 'aug'.
    """

    def __init__(self, dataset):
        # Keep a reference to the already-parsed records (no copy is made).
        self.dataset = dataset

    def __len__(self):
        """Return the number of stored records."""
        size = len(self.dataset)
        return size

    def __getitem__(self, idx):
        """Return the record at position ``idx`` exactly as it was stored."""
        # XML parsing is done upstream when the data is loaded, so this is a
        # plain lookup into the wrapped sequence.
        record = self.dataset[idx]
        return record

In [None]:
# Wrap the parsed record lists so they expose the torch Dataset interface.
training_dataset = ShortAnswerGradingDataset(training_data)
test_dataset = ShortAnswerGradingDataset(test_data)

## Batching and Loading Data to Model

Iterate over the dataset objects as shown below to feed the train and test examples to the model of choice.

In [None]:
# Preview the first `limit` examples of each split. The loop is bounded by
# index so that exactly `limit` items are printed, or fewer if the split is
# smaller. Counting down and testing for zero after printing showed limit + 1.
limit = 10

for idx in range(min(limit, len(training_dataset))):
  training_item = training_dataset[idx]
  print(training_item)

for idx in range(min(limit, len(test_dataset))):
  test_item = test_dataset[idx]
  print(test_item)

{'question': 'Look at the finger patterns below, then answer the questions. What is the same about the finger patterns?', 'ref': 'Both finger patterns are loops.', 'response': 'They are loop.', 'score': 'correct', 'aug': False}
{'question': 'Look at the finger patterns below, then answer the questions. What is the same about the finger patterns?', 'ref': 'Both finger patterns are loops.', 'response': 'One is a loop and one is arch.', 'score': 'incorrect', 'aug': False}
{'question': 'Look at the finger patterns below, then answer the questions. What is the same about the finger patterns?', 'ref': 'Both finger patterns are loops.', 'response': 'They are the same.', 'score': 'incorrect', 'aug': False}
{'question': 'Look at the finger patterns below, then answer the questions. What is the same about the finger patterns?', 'ref': 'Both finger patterns are loops.', 'response': 'The both finer patterns are loop.', 'score': 'correct', 'aug': False}
{'question': 'Look at the finger patterns bel

## Dataset Augmentation

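Using Google Translate, we can augment the relatively small amount of training data via backtranslation: each response is translated into another language and then back into English, which yields a paraphrase that keeps the original label.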

In [None]:
!pip install -q googletrans==3.1.0a0

In [None]:
import googletrans
from googletrans import Translator
import json

#### Question Response Map Creation

We create this to ensure that any generated responses do not match any of the existing responses for the question. This prevents duplication and improves the quality of the dataset.

In [None]:
# Map each question's text to the set of responses already recorded for it.
question_response_map = {}

for training_item in training_dataset:
  # setdefault creates the empty set the first time a question is seen.
  seen_responses = question_response_map.setdefault(training_item['question'], set())
  seen_responses.add(training_item['response'])

#### Translation Functions

These functions perform the augmentation of the dataset via backtranslation.

In [None]:
# Single shared googletrans client, read as a module-level global by the translation code.
translator = Translator()

In [None]:
def backtranslate_example(input_item, pivot_lang='es'):
  """Create an augmented copy of ``input_item`` by backtranslating its response.

  The response is translated from English into ``pivot_lang`` and back into
  English using the module-level ``translator``. The module-level
  ``question_response_map`` is consulted, and updated, so that the same
  response text is never produced twice for one question.

  Args:
    input_item: Dict with keys 'question', 'ref', 'response' and 'score'.
    pivot_lang: Language code of the intermediate language. Defaults to 'es'.

  Returns:
    A new dict with the backtranslated 'response' and 'aug' set to True, or
    None when the response is missing/empty, is unchanged by the round trip,
    or duplicates a response already known for the question.
  """
  original_response = input_item['response']

  # Nothing to translate for a missing or empty response.
  if not original_response:
    return None

  pivot_text = translator.translate(original_response, src='en', dest=pivot_lang).text
  new_response = translator.translate(pivot_text, src=pivot_lang, dest='en').text

  # Do not add the new response if it is the same after translation
  if new_response == original_response:
    return None

  # setdefault lets items whose question was never registered in the map be
  # processed as well, instead of failing with KeyError.
  known_responses = question_response_map.setdefault(input_item['question'], set())

  # Do not add the new response if there is already another response that is the same
  if new_response in known_responses:
    return None

  # Update the question response map with the new response
  known_responses.add(new_response)

  # Create a new dataset entry and return it
  new_dataset_entry = {
    'question': input_item['question'],
    'ref': input_item['ref'],
    'response': new_response,
    'score': input_item['score'],
    'aug': True
  }

  return new_dataset_entry

In [None]:
def process_generation(starting_idx, limit):
  """Backtranslate a contiguous slice of the global ``training_dataset``.

  Items are processed from ``starting_idx`` up to and including index
  ``starting_idx + limit`` (that is, ``limit + 1`` items), or to the end of
  the dataset if that comes first. Progress is printed after every item whose
  index is a multiple of 100.

  Args:
    starting_idx: Index of the first training item to process.
    limit: Offset from ``starting_idx`` of the last item to process (inclusive).

  Returns:
    The list of augmented entries produced for the slice. Items for which
    ``backtranslate_example`` returns a falsy value are omitted.
  """
  final_idx = starting_idx + limit
  generated = []

  # Start directly at the requested index rather than skipping over the prefix.
  for idx in range(max(starting_idx, 0), len(training_dataset)):
    candidate = backtranslate_example(training_dataset[idx])
    if candidate:
      generated.append(candidate)

    if idx % 100 == 0:
      print('Processed', idx)

    if idx == final_idx:
      print('Finished Processing To Index', idx)
      return generated

  # The dataset ended before the inclusive end index was reached.
  print('Finished Processing All Data')
  return generated

#### Data Augmentation Implementation

Note that the augmentation is split across multiple cells because the translation API is rate-limited: running each cell individually, with some downtime between executions, helps avoid hitting that limit.

In [None]:
# Accumulates the backtranslated examples produced by the chunked cells.
# NOTE: re-running this cell alone does not reset question_response_map, which
# still contains previously generated responses. Re-running the chunks would
# then reject those responses as duplicates.
augmented_data = []

In [None]:
# Chunk 1: training indices 0 through 999 (the end index start + limit is inclusive).
augmented_data.extend(process_generation(0, 999))
print('Size of Augmented Dataset', len(augmented_data))

Processed 0
Processed 100
Processed 200
Processed 300
Processed 400
Processed 500
Processed 600
Processed 700
Processed 800
Processed 900
Finished Processing To Index 999
Size of Augmented Dataset 790


In [None]:
# Chunk 2: training indices 1000 through 1999 (the end index start + limit is inclusive).
augmented_data.extend(process_generation(1000, 999))
print('Size of Augmented Dataset', len(augmented_data))

Processed 1000
Processed 1100
Processed 1200
Processed 1300
Processed 1400
Processed 1500
Processed 1600
Processed 1700
Processed 1800
Processed 1900
Finished Processing To Index 1999
Size of Augmented Dataset 1614


In [None]:
# Chunk 3: training indices 2000 through 2999 (the end index start + limit is inclusive).
augmented_data.extend(process_generation(2000, 999))
print('Size of Augmented Dataset', len(augmented_data))

Processed 2000
Processed 2100
Processed 2200
Processed 2300
Processed 2400
Processed 2500
Processed 2600
Processed 2700
Processed 2800
Processed 2900
Finished Processing To Index 2999
Size of Augmented Dataset 2406


In [None]:
# Chunk 4: training indices 3000 through 3999 (the end index start + limit is inclusive).
augmented_data.extend(process_generation(3000, 999))
print('Size of Augmented Dataset', len(augmented_data))

Processed 3000
Processed 3100
Processed 3200
Processed 3300
Processed 3400
Processed 3500
Processed 3600
Processed 3700
Processed 3800
Processed 3900
Finished Processing To Index 3999
Size of Augmented Dataset 3186


In [None]:
# Chunk 5: training index 4000 onward. Processing stops at the last training
# item if the dataset ends before index 4999.
augmented_data.extend(process_generation(4000, 999))
print('Size of Augmented Dataset', len(augmented_data))

Processed 4000
Processed 4100
Processed 4200
Processed 4300
Processed 4400
Processed 4500
Processed 4600
Processed 4700
Processed 4800
Processed 4900
Finished Processing All Data
Size of Augmented Dataset 3932


In [None]:
# Summarize how much data the augmentation added relative to the original set.
original_size = len(training_data)
augmented_size = len(augmented_data)

print('Size of Original  Dataset', original_size)
print('Size of Combined  Dataset', original_size + augmented_size)
print(f'Dataset Size Increase {100*augmented_size/original_size}%')

Size of Original  Dataset 4969
Size of Combined  Dataset 8901
Dataset Size Increase 79.13060978063997%


## Results

We have increased the size of our training set by almost 80%! This is a big improvement that we hope will lead to better results in model training and fine-tuning.

Now, save all of the data as .json files for easier use in the future.

In [None]:
# Persist the combined training set: augmented examples first, then the originals.
combined_training_data = augmented_data + training_data
with open('/content/train.json', 'w') as train_file:
    json.dump(combined_training_data, train_file)

In [None]:
# Directory holding the three 2-way SciEntsBank test splits.
_test_root = '/content/score-freetext-answer/src/main/resources/corpus/semeval2013-task7/test/2way/sciEntsBank'

# Each entry is (directory of XML files, suffix used in the output file name).
test_unseen_questions = (_test_root + '/test-unseen-questions', 'questions')
test_unseen_answers = (_test_root + '/test-unseen-answers', 'answers')
test_unseen_domains = (_test_root + '/test-unseen-domains', 'domains')

In [None]:
# Export each test split to its own JSON file.
for test_dir, test_name in [test_unseen_domains, test_unseen_answers, test_unseen_questions]:
  curr_test_data = []
  # Sort the paths: glob order is filesystem-dependent, and a fixed order keeps
  # the record order in the saved JSON reproducible.
  for data_file in sorted(glob.glob(test_dir + '/*')):
    curr_test_data += parse_xml_file(data_file)

  with open(f'/content/test-unseen-{test_name}.json', 'w') as fp:
    json.dump(curr_test_data, fp)

  print('Saved Test Set Unseen', test_name)

Saved Test Set Unseen domains
Saved Test Set Unseen answers
Saved Test Set Unseen questions
