# Project Dataset Loading

In [None]:
import zipfile
import os
import numpy as np
import xml.etree.ElementTree as ET
import glob
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# Clone the dataset repository from GitHub into ./score-freetext-answer.
# The corpus paths defined in the next cell are absolute under /content, so this
# cell is expected to run with /content as the working directory (presumably Colab - confirm).
!git clone https://github.com/leocomelli/score-freetext-answer.git

Cloning into 'score-freetext-answer'...
remote: Enumerating objects: 511, done.[K
remote: Total 511 (delta 0), reused 0 (delta 0), pack-reused 511[K
Receiving objects: 100% (511/511), 478.34 KiB | 5.20 MiB/s, done.
Resolving deltas: 100% (263/263), done.


In [None]:
# Both splits live under the SemEval-2013 Task 7 corpus folder of the cloned repository.
_corpus_root = '/content/score-freetext-answer/src/main/resources/corpus/semeval2013-task7'

# 2-way labelled SciEntsBank files: training split and the "unseen answers" test split.
training_data_directory = _corpus_root + '/training/2way/sciEntsBank'
test_data_directory = _corpus_root + '/test/2way/sciEntsBank/test-unseen-answers'

In [None]:
def parse_xml_file(xml_file_path):
  """Parse one question XML file into a list of graded-response records.

  The file's root element is walked two levels deep:
    * a direct child tagged ``questionText`` supplies the question text;
    * a grandchild tagged ``referenceAnswer`` supplies the reference answer
      (if several appear, the most recently seen one is used for the
      responses that follow it in document order);
    * any other grandchild carrying an ``accuracy`` attribute is treated as a
      graded student response.

  Grandchildren that are neither a ``referenceAnswer`` nor carry an
  ``accuracy`` attribute are skipped, rather than raising ``KeyError``.

  Args:
    xml_file_path: Path (or file object) accepted by ``ET.parse``.

  Returns:
    A list of dicts with keys ``'question'``, ``'ref'``, ``'response'`` and
    ``'score'``, one per graded response, in document order.

  Raises:
    xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
  """
  question = ""
  ref = ""

  results = []

  for elem in ET.parse(xml_file_path).getroot():
    if elem.tag == 'questionText':
      question = elem.text
    for subelem in elem:
      if subelem.tag == 'referenceAnswer':
        ref = subelem.text
      elif 'accuracy' in subelem.attrib:
        # Only elements that actually carry a grade become records.
        results.append({
            'question': question,
            'ref': ref,
            'response': subelem.text,
            'score': subelem.attrib['accuracy']
        })

  return results

In [None]:
# Parsed response records for each split, plus a per-split count of the files
# read (each file is counted as one question).
training_data = []
test_data = []
num_training_questions = 0
num_test_questions = 0

# glob.glob returns matches in arbitrary order; sorting makes the record order
# (and therefore anything indexed by position, e.g. training_data[0])
# reproducible across runs and machines.
for data_file in sorted(glob.glob(training_data_directory + '/*')):
  training_data += parse_xml_file(data_file)
  num_training_questions += 1

for data_file in sorted(glob.glob(test_data_directory + '/*')):
  test_data += parse_xml_file(data_file)
  num_test_questions += 1

print("Number of Training Questions:", num_training_questions)
print("Number of Training Responses:", len(training_data))

print("Number of Test Questions:", num_test_questions)
print("Number of Test Responses:", len(test_data))

Number of Training Questions: 135
Number of Training Responses: 4969
Number of Test Questions: 135
Number of Test Responses: 540


In [None]:
class ShortAnswerGradingDataset(Dataset):
    """Thin ``Dataset`` adapter over an already-parsed sequence of records.

    No parsing or transformation happens here; the XML loading step is
    responsible for producing the records.
    """

    def __init__(self, dataset):
        """Keep a reference to ``dataset`` (it is not copied)."""
        self.dataset = dataset

    def __len__(self):
        """Return how many records the wrapped sequence holds."""
        record_count = len(self.dataset)
        return record_count

    def __getitem__(self, idx):
        """Return the record stored at position ``idx``, unchanged.

        When the wrapped list comes from the XML loading step, each record is
        a dict with the keys 'question', 'ref', 'response' and 'score'.
        """
        record = self.dataset[idx]
        return record

In [None]:
# Wrap each parsed split in the Dataset adapter (training first, then test).
training_dataset, test_dataset = map(
    ShortAnswerGradingDataset, (training_data, test_data)
)

## Batching and Loading Data to Model

Use this iterator to load in the train and test datasets to the model of choice.

In [None]:
# Sanity check: show the first training record, then the size of each split.
# (Either dataset can also be walked item by item with a plain `for` loop.)
first_training_item = training_dataset[0]
print(first_training_item)
for split in (training_dataset, test_dataset):
  print(len(split))

{'question': 'When a seed germinates, why does the root grow first?', 'ref': 'The root grows first so the root can take up water for the plant.', 'response': 'Because it sucks up the water as food for the plant and stem.', 'score': 'correct'}
4969
540


## Installation
---

In [None]:
# Install sentence-transformers; the import of SentenceTransformer and util in the
# next code cell depends on it.
# -U upgrades to the newest release and no version is pinned, so the installed version
# can differ between runs (the recorded output shows 2.2.0 being downloaded).
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 5.0 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 41.0 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 2.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 40.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.2 MB/s 


In [None]:
from sentence_transformers import SentenceTransformer, util

# Checkpoint to load: the same model as the BERT_baseline approach.
# Alternative checkpoint (not used): 'all-MiniLM-L6-v2'.
model_name = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

## Create inputs for similarity check
---

In [None]:
# Pool the training and test records (training first, then test) and split them
# into three parallel lists: student responses, reference answers and gold labels.
# Note: despite the "test_data_" prefix, these lists cover BOTH splits.
all_items = training_data + test_data

test_data_response = [item["response"] for item in all_items]
test_data_ref = [item["ref"] for item in all_items]
test_data_score = [item["score"] for item in all_items]

# The three lists must have the same length as the two splits combined.
for column in (test_data_response, test_data_ref, test_data_score):
  print(len(column))
print(len(training_data) + len(test_data))


5509
5509
5509
5509


## Sentence similarity calculation

Resource: https://www.sbert.net/docs/usage/semantic_textual_similarity.html

---

In [None]:
# Encode the reference answers first, then the student responses, as tensors.
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (test_data_ref, test_data_response)
)

# All-pairs cosine similarities: entry [i][j] compares reference i with response j,
# so the diagonal holds the score of each (reference, response) pair.
cosine_scores = util.cos_sim(embeddings1, embeddings2)


In [None]:
# Similarity at or above this value is predicted "correct", anything lower "incorrect".
SIMILARITY_THRESHOLD = 0.65

# cosine_scores is an all-pairs matrix; only the diagonal entry [i][i] compares
# reference i with its own response i. Thresholding the whole diagonal in one
# tensor operation gives the same decisions as comparing cosine_scores[i][i]
# element by element, without one tensor lookup per pair.
pair_is_similar = (cosine_scores.diagonal() >= SIMILARITY_THRESHOLD).tolist()
result_score = ["correct" if similar else "incorrect" for similar in pair_is_similar]

# Count predictions that agree with the gold labels.
correct_predict = sum(
    predicted == actual
    for predicted, actual in zip(result_score, test_data_score)
)

# Accuracy over every pooled (reference, response) pair.
print(correct_predict / len(result_score))

0.6514793973497912


## Conclusion

This approach converts the reference answer and the student answer into sentence embeddings, computes the cosine similarity between the two embeddings, and uses that similarity score to predict "correct" or "incorrect". We then measure the accuracy of these predictions over the entire dataset (training and test responses combined). With the threshold of 0.65 used above, the accuracy is approximately 0.6515. Adjusting the threshold that separates "correct" from "incorrect" changes the result slightly.