# Project Dataset Loading

In [None]:
# Clone the dataset repository from GitHub into the current working directory.
# The loading cell below reads the JSON splits from
# /content/Automatic-Short-Answer-Grading/dataset, so this is expected to run
# with /content as the working directory (presumably Google Colab -- confirm
# if running elsewhere).
!git clone https://github.com/CodyRichter/Automatic-Short-Answer-Grading

Cloning into 'Automatic-Short-Answer-Grading'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 33 (delta 12), reused 23 (delta 8), pack-reused 0[K
Unpacking objects: 100% (33/33), done.


In [None]:
import json
from pathlib import Path

# Directory holding the JSON splits inside the cloned repository. The path is
# absolute under /content (presumably the Colab working directory); change this
# single constant if the repository was cloned somewhere else.
DATASET_DIR = Path('/content/Automatic-Short-Answer-Grading/dataset')


def load_json_split(filename):
  """Parse one JSON split file from DATASET_DIR and return its contents.

  The file is opened explicitly as UTF-8 so decoding does not depend on the
  platform's default locale encoding.

  Args:
    filename: Name of the JSON file inside DATASET_DIR.

  Returns:
    Whatever ``json.load`` produces for that file.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file is not valid JSON.
  """
  with open(DATASET_DIR / filename, 'r', encoding='utf-8') as split_file:
    return json.load(split_file)


# Training + validation pool, followed by the three held-out test splits.
training_data = load_json_split('train.json')
test_unseen_answer_data = load_json_split('test-unseen-answers.json')
test_unseen_question_data = load_json_split('test-unseen-questions.json')
test_unseen_domain_data = load_json_split('test-unseen-domains.json')

print('Number of Training + Validation Data Responses', len(training_data))

Number of Training + Validation Data Responses 16265


In [None]:
from torch.utils.data import Dataset, DataLoader

class ShortAnswerGradingDataset(Dataset):
    """Thin ``Dataset`` wrapper around an already-parsed collection of items.

    No parsing or transformation happens here; items are handed back exactly
    as they were stored. In this notebook the items are dicts: the printed
    samples show the keys 'question', 'ref', 'response', 'score', 'aug' and
    'id', plus 'aug_metadata' on augmented items.
    """

    def __init__(self, dataset):
        # Keep a reference to the caller's collection; nothing is copied.
        self.dataset = dataset

    def __len__(self):
        """Return how many items the wrapped collection holds."""
        item_count = len(self.dataset)
        return item_count

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``, untouched."""
        item = self.dataset[idx]
        return item

## Dataset Train/Validation/Test Splits

This step is very important and requires extra logic due to the nature of the dataset. The training set may contain augmented (back-translated) items, but the validation set may not. Additionally, the validation set must not contain an original (un-augmented) response while the training set contains back-translated versions of that same response, as this is essentially "cheating" due to the similarity between the back-translated response and the original one. Therefore, we apply this extra logic to the training and validation sets to make sure that the validation set gives a fair measurement.

Note that the validation set contains some questions that do not appear in the training set; however, all of the questions are in the same domain. This means that the validation set is most similar to the test set with unseen answers but known questions, except that it includes a few questions that the training set may not have seen.

In [None]:
# Wrap every parsed split in the Dataset class.
training_dataset = ShortAnswerGradingDataset(training_data)
test_dataset_unseen_answers = ShortAnswerGradingDataset(test_unseen_answer_data)
test_dataset_unseen_questions = ShortAnswerGradingDataset(test_unseen_question_data)
test_dataset_unseen_domains = ShortAnswerGradingDataset(test_unseen_domain_data)

from sklearn.model_selection import train_test_split

# Hold out 10% of the items for validation with a fixed seed. From here on both
# names are used as plain lists of item dicts (the code below relies on
# list.append and slice assignment).
training_dataset, validation_dataset = train_test_split(training_dataset, test_size=0.1, random_state=0)

validation_parent_ids = set()     # originals whose augmented copy landed in validation
validation_ids_to_remove = set()  # augmented items to drop from validation
validation_original_ids = set()   # un-augmented items the split placed in validation

# Step 1: Get IDs of original responses and mark augmented ones for deletion
for validation_item in validation_dataset:
  if validation_item['aug']:
    validation_parent_ids.add(validation_item['aug_metadata']['parent_id'])
    validation_ids_to_remove.add(validation_item['id'])
  else:
    validation_original_ids.add(validation_item['id'])

train_ids_to_remove = set()
moved_original_ids = set()  # originals relocated from training to validation

# Step 2a: If an augmented copy was drawn into validation while its original
#          stayed in training, move the original into the validation set.
for train_item in training_dataset:
  if not train_item['aug'] and train_item['id'] in validation_parent_ids:
    validation_dataset.append(train_item)
    train_ids_to_remove.add(train_item['id'])
    moved_original_ids.add(train_item['id'])

# Every original that ends up in validation: the ones the split put there plus
# the ones just moved. Checking only validation_original_ids would leave the
# other augmented copies of a *moved* original in training, i.e. a leak.
held_out_original_ids = validation_original_ids | moved_original_ids

# Step 2b: Drop every augmented training item whose original is held out.
for train_item in training_dataset:
  if train_item['aug'] and train_item['aug_metadata']['parent_id'] in held_out_original_ids:
    train_ids_to_remove.add(train_item['id'])

# Step 3: Perform removal operations
validation_dataset[:] = [x for x in validation_dataset if x['id'] not in validation_ids_to_remove]
training_dataset[:] = [x for x in training_dataset if x['id'] not in train_ids_to_remove]

# Sanity checks: validation holds only originals, and no augmented training
# item is derived from a response that is in validation.
validation_ids = {x['id'] for x in validation_dataset}
assert not any(x['aug'] for x in validation_dataset), 'augmented item left in validation set'
assert not any(
    x['aug'] and x['aug_metadata']['parent_id'] in validation_ids for x in training_dataset
), 'training set contains an augmentation of a validation response'

In [None]:
# Report how many items each split holds, one line per split.
for split_label, split_items in (
    ('Number of Training Samples', training_dataset),
    ('Number of Validation Samples', validation_dataset),
    ('Number of Test Data (New Answer) Responses', test_unseen_answer_data),
    ('Number of Test Data (New Question) Responses', test_unseen_question_data),
    ('Number of Test Data (New Domain) Responses', test_unseen_domain_data),
):
  print(split_label, len(split_items))

Number of Training Samples 12805
Number of Validation Samples 1380
Number of Test Data (New Answer) Responses 540
Number of Test Data (New Question) Responses 733
Number of Test Data (New Domain) Responses 4562


## Viewing Samples in Dataset

In [None]:
# Show the first 11 training items (positions 0 through 10).
for idx, training_item in zip(range(11), training_dataset):
  print(training_item)

{'question': "Pam and her family were planning a hike. Pam found 2 maps of the same area. Her friend recommended she use the topographic map when they went to the lake. Why would Pam's friend recommend using the topographic map for the hike?", 'ref': 'She recommended the topographic map because the map shows the elevations along the trail. Pam would know where the trail was the steepest.', 'response': 'Because both maps show the shapes of landforms, but a topographic map shows elevation and dip.', 'score': 'incorrect', 'aug': True, 'id': 18221, 'aug_metadata': {'parent_id': 3816, 'translation_seq': ['en', 'es', 'en']}}
{'question': 'When a seed germinates, why does the root grow first?', 'ref': 'The root grows first so the root can take up water for the plant.', 'response': 'Because they need roots to produce leaves.', 'score': 'incorrect', 'aug': True, 'id': 15990, 'aug_metadata': {'parent_id': 3117, 'translation_seq': ['en', 'es', 'en']}}
{'question': 'Salt crystals form when the wat

In [None]:
# Show the first 11 validation items (positions 0 through 10).
for idx, val_item in zip(range(11), validation_dataset):
  print(val_item)

{'question': 'Denise made a circuit to light a bulb or run a motor off a D-cell battery. She used a special switch. Below is the schematic diagram of her circuit. The switch is inside the dotted box. Why will the bulb light when she moves the switch to the left?', 'ref': 'There is a complete circuit connecting the bulb to the D-cell battery.', 'response': 'I learned it in class.', 'score': 'incorrect', 'aug': False, 'id': 4645}
{'question': 'Jim used a solid and water to make Mixtures one (one spoon of solid in 100 milliliters water was clear with nothing on the bottom), 3 (3 spoons of solid in 100 milliliters water was clear with nothing on the bottom), 4 (4 spoons of solid in 100 milliliters water was clear with material on the bottom), and 5 (5 spoons of solid in 100 milliliters water was clear with material on the bottom) as shown below. He stirred each one and observed the results. If Jim made Mixture 2 with 2 spoons of solid in 100 milliliters of water, what would he observe? Wha

In [None]:
# Show the first 11 items (positions 0 through 10) of the unseen-answers test split.
for idx, test_item in zip(range(11), test_dataset_unseen_answers):
  print(test_item)

{'question': 'The sand and flour in the gray material from mock rocks is separated by mixing with water and allowing the mixture to settle. Explain why the sand and flour separate.', 'ref': 'The sand particles are larger and settle first. The flour particles are smaller and therefore settle more slowly.', 'response': 'Because it will not get dry.', 'score': 'incorrect', 'aug': False, 'id': 4969}
{'question': 'The sand and flour in the gray material from mock rocks is separated by mixing with water and allowing the mixture to settle. Explain why the sand and flour separate.', 'ref': 'The sand particles are larger and settle first. The flour particles are smaller and therefore settle more slowly.', 'response': 'Because one is heavy and one is not.', 'score': 'incorrect', 'aug': False, 'id': 4970}
{'question': 'The sand and flour in the gray material from mock rocks is separated by mixing with water and allowing the mixture to settle. Explain why the sand and flour separate.', 'ref': 'The

In [None]:
# Show the first 11 items (positions 0 through 10) of the unseen-questions test split.
for idx, test_item in zip(range(11), test_dataset_unseen_questions):
  print(test_item)

{'question': 'What happens to earth materials during erosion?', 'ref': 'Earth materials are worn away and moved during erosion.', 'response': 'Earth material gets eroded and carried away.', 'score': 'incorrect', 'aug': False, 'id': 5509}
{'question': 'What happens to earth materials during erosion?', 'ref': 'Earth materials are worn away and moved during erosion.', 'response': 'They are eroded by water, wind, and or ice.', 'score': 'incorrect', 'aug': False, 'id': 5510}
{'question': 'What happens to earth materials during erosion?', 'ref': 'Earth materials are worn away and moved during erosion.', 'response': 'They form into other solids water, lava, wind.', 'score': 'incorrect', 'aug': False, 'id': 5511}
{'question': 'What happens to earth materials during erosion?', 'ref': 'Earth materials are worn away and moved during erosion.', 'response': 'They just move around.', 'score': 'incorrect', 'aug': False, 'id': 5512}
{'question': 'What happens to earth materials during erosion?', 'ref'

In [None]:
# Show the first 11 items (positions 0 through 10) of the unseen-domains test split.
for idx, test_item in zip(range(11), test_dataset_unseen_domains):
  print(test_item)

{'question': 'Ted poured the same amount of water into 2 small containers, X and Y. He placed them together where they would not be disturbed. After several days, all of the water had evaporated from one of the containers. The other container still had some water. Which container, X or Y, was empty? Explain why the water in that container evaporated more quickly than the water in the other container.', 'ref': 'Container X has more surface area or more water exposed to the air so the water evaporated faster.', 'response': 'Because X had more room to evaporate and Y did not.', 'score': 'correct', 'aug': False, 'id': 6242}
{'question': 'Ted poured the same amount of water into 2 small containers, X and Y. He placed them together where they would not be disturbed. After several days, all of the water had evaporated from one of the containers. The other container still had some water. Which container, X or Y, was empty? Explain why the water in that container evaporated more quickly than th