# Project Dataset Loading

In [None]:
import zipfile
import os
import numpy as np
import xml.etree.ElementTree as ET
import glob
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# Clone the dataset repository from github
# The clone lands in the current working directory. The corpus paths defined
# in the next cell are absolute and start with /content/score-freetext-answer,
# so this presumably has to run with /content as the working directory
# (NOTE(review): looks like a Colab assumption — confirm for other environments).
!git clone https://github.com/leocomelli/score-freetext-answer.git

Cloning into 'score-freetext-answer'...
remote: Enumerating objects: 511, done.[K
remote: Total 511 (delta 0), reused 0 (delta 0), pack-reused 511[K
Receiving objects: 100% (511/511), 478.34 KiB | 1.54 MiB/s, done.
Resolving deltas: 100% (263/263), done.


In [None]:
# Locations of the SemEval-2013 Task 7 (2-way, sciEntsBank) XML files inside
# the cloned repository. Both splits share the same corpus root.
_corpus_root = '/content/score-freetext-answer/src/main/resources/corpus/semeval2013-task7'
training_data_directory = _corpus_root + '/training/2way/sciEntsBank'
test_data_directory = _corpus_root + '/test/2way/sciEntsBank/test-unseen-answers'

In [None]:
def parse_xml_file(xml_file_path):
  """Parse one question XML file into a flat list of answer records.

  The root's direct children are visited in document order. A child tagged
  'questionText' supplies the question text. Every grandchild is then
  inspected: one tagged 'referenceAnswer' updates the current reference
  answer, and any other grandchild is treated as a graded response.

  Args:
    xml_file_path: path (or file object) accepted by ET.parse.

  Returns:
    A list of dicts with keys 'question', 'ref', 'response' and 'score'.
    'question' and 'ref' hold whatever was seen most recently before the
    response (empty string if none yet); 'score' is the response element's
    'accuracy' attribute.

  Raises:
    KeyError: if a non-'referenceAnswer' grandchild has no 'accuracy'
      attribute.
  """
  question_text = ""
  reference_text = ""
  records = []

  root = ET.parse(xml_file_path).getroot()
  for child in root:
    if child.tag == 'questionText':
      question_text = child.text

    for node in child:
      if node.tag == 'referenceAnswer':
        reference_text = node.text
        continue

      # Anything that is not a reference answer is recorded as a response.
      records.append(dict(
          question=question_text,
          ref=reference_text,
          response=node.text,
          score=node.attrib['accuracy'],
      ))

  return records

In [None]:
# Parse every XML file of both splits into flat lists of answer records.
# glob.glob returns paths in arbitrary, filesystem-dependent order, so the
# file lists are sorted to make the record order reproducible between runs.
# One file is counted as one question.
training_files = sorted(glob.glob(training_data_directory + '/*'))
test_files = sorted(glob.glob(test_data_directory + '/*'))

training_data = []
for data_file in training_files:
  training_data += parse_xml_file(data_file)

test_data = []
for data_file in test_files:
  test_data += parse_xml_file(data_file)

num_training_questions = len(training_files)
num_test_questions = len(test_files)

print("Number of Training Questions:", num_training_questions)
print("Number of Training Responses:", len(training_data))

print("Number of Test Questions:", num_test_questions)
print("Number of Test Responses:", len(test_data))

Number of Training Questions: 135
Number of Training Responses: 4969
Number of Test Questions: 15
Number of Test Responses: 733


In [None]:
class ShortAnswerGradingDataset(Dataset):
    """Thin torch Dataset wrapper around an already-parsed sequence of records.

    No parsing happens here; the XML is turned into records before the
    sequence is handed to this class.
    """

    def __init__(self, dataset):
        """Keep a reference to `dataset` (an indexable, sized collection)."""
        self.dataset = dataset

    def __len__(self):
        """Number of records in the wrapped collection."""
        n_records = len(self.dataset)
        return n_records

    def __getitem__(self, idx):
        """Return the record stored at `idx`, unchanged.

        When the records come from the XML loading section each one is a dict
        with the keys 'question', 'ref', 'response' and 'score'.
        """
        record = self.dataset[idx]
        return record

In [None]:
# Wrap the parsed record lists so both splits expose the torch Dataset interface.
training_dataset, test_dataset = (
    ShortAnswerGradingDataset(records) for records in (training_data, test_data)
)

In [None]:
# Preview only a handful of records from each split. Printing every record
# (several thousand lines) floods the notebook output and bloats the saved file.
PREVIEW_COUNT = 3

for idx in range(min(PREVIEW_COUNT, len(training_dataset))):
  print(training_dataset[idx])

for idx in range(min(PREVIEW_COUNT, len(test_dataset))):
  print(test_dataset[idx])

In [None]:
# Show the keys and each field of the first training record (nothing is
# printed when the dataset is empty).
first_item = next(iter(training_dataset), None)
if first_item is not None:
  training_item = first_item
  print(training_item.keys())
  for field in ('question', 'ref', 'response', 'score'):
    print(training_item[field])

dict_keys(['question', 'ref', 'response', 'score'])
Grace set up a science experiment. Here's what she wrote in her notebook. First I put eggshells into a cup and covered them with vinegar. After a while, small bubbles appeared on the surface of the eggshells. The next day there were more bubbles. In a couple of days, the eggshells were gone and the liquid was clear. I think I made a solution and a reaction. Explain why Grace thinks she made a solution.
A solution was created when the eggshells disappeared (dissolved) and the mixture ended up as a clear liquid.
Because first she put eggshells then the next day she made a solution.
incorrect


In [None]:
# Build the classifier inputs: one concatenated text string and one score
# label per record, for each split.
train_data_text = []
train_data_scores = []
test_data_text = []
test_data_scores = []

# The commented-out lines are the alternative input variants that were tried
# (reference + response, question + response); enable exactly one per loop.
for training_item in training_dataset:
  # train_data_text.append(training_item['ref'] + ' ' + training_item['response'])
  # train_data_text.append(training_item['question'] + ' ' + training_item['response'])
  train_data_text.append(training_item['question'] + ' ' + training_item['ref'] + ' ' + training_item['response'])
  train_data_scores.append(training_item['score'])

for test_item in test_dataset:
  # Every field must come from test_item. Reading 'ref'/'response' from
  # training_item here would reuse the last training record for every test
  # example, so the test texts would no longer contain the test responses.
  # test_data_text.append(test_item['ref'] + ' ' + test_item['response'])
  # test_data_text.append(test_item['question'] + ' ' + test_item['response'])
  test_data_text.append(test_item['question'] + ' ' + test_item['ref'] + ' ' + test_item['response'])
  test_data_scores.append(test_item['score'])


# GloVe

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2022-04-18 19:14:21--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-04-18 19:14:21--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-04-18 19:14:21--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

In [None]:
# Load the pre-trained GloVe vectors into a dict mapping token -> 1-D float32
# array. Each line of the file is a token followed by its space-separated
# coefficients.
embeddings_index = {}
# Other dimensionalities that were tried: 'glove.6B.100d.txt', 'glove.6B.200d.txt'.
# When switching files, the hard-coded 300 used for zero vectors and empty
# embedding buffers elsewhere in the notebook has to be changed to match.
# The encoding is pinned to UTF-8 so the read does not depend on the platform
# default (which is not UTF-8 everywhere and can fail on non-ASCII tokens).
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Split off the token; everything after the first whitespace is the vector.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# Embedding lookup with a fallback for out-of-vocabulary tokens.
def check_glove(token):
  """Return the GloVe vector stored for `token`.

  Tokens that are not a key of `embeddings_index` get an all-zero vector of
  length 300 instead (the length must match the loaded GloVe file; 100 and
  200 were used for the other GloVe variants).
  """
  vector = embeddings_index.get(token)
  if vector is None:
    return np.zeros(300)
  return vector


In [None]:
# Sanity check: display the vector returned for a very common word.
check_glove('the')

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [None]:
# tokenizer
# get_tokenizer('spacy') returns a callable; it is applied to a text string
# below and its result is passed to lower() as a sequence of token strings.
# NOTE(review): presumably this needs spaCy and an English model installed —
# confirm for the target environment.
from torchtext.data import get_tokenizer
tokenizer = get_tokenizer('spacy')

def lower(lists):
  """Return a new list holding the lower-cased form of every item in `lists`."""
  lowered = []
  for entry in lists:
    lowered.append(entry.lower())
  return lowered

In [None]:
# Show the first training text next to its lower-cased token list.
sample_text = train_data_text[0]
print(sample_text)
sample_tokens = lower(tokenizer(sample_text))
print(sample_tokens)

A solution was created when the eggshells disappeared (dissolved) and the mixture ended up as a clear liquid. Because first she put eggshells then the next day she made a solution.
['a', 'solution', 'was', 'created', 'when', 'the', 'eggshells', 'disappeared', '(', 'dissolved', ')', 'and', 'the', 'mixture', 'ended', 'up', 'as', 'a', 'clear', 'liquid', '.', 'because', 'first', 'she', 'put', 'eggshells', 'then', 'the', 'next', 'day', 'she', 'made', 'a', 'solution', '.']


# Encode Embeddings

In [None]:
# Length of one GloVe vector; must match the GloVe file that was loaded
# (100, 200 or 300).
EMBEDDING_DIM = 300


def embed_text(text):
  """Return the mean GloVe vector over the lower-cased tokens of `text`.

  Tokens without a GloVe entry contribute the zero vector that check_glove
  returns for them. The average is computed in float64. A text that yields
  no tokens maps to an all-zero vector of length EMBEDDING_DIM rather than
  NaNs, so the result can always be fed to a classifier.
  """
  vectors = [check_glove(token) for token in lower(tokenizer(text))]
  if not vectors:
    return np.zeros(EMBEDDING_DIM)
  # Collect the vectors first and stack once; growing an array with
  # np.append inside the loop copies the whole buffer on every token.
  return np.mean(np.asarray(vectors, dtype=np.float64), axis=0)


# The loop variables are deliberately not called train_data/test_data:
# reusing `test_data` would overwrite the parsed test records with a string.
train_data_embed = [embed_text(text) for text in train_data_text]
test_data_embed = [embed_text(text) for text in test_data_text]

print(len(train_data_embed))
print(len(test_data_embed))

4969
733


# Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbour classifier on the averaged training embeddings;
# the cell's output is its mean accuracy on the test embeddings.
knn = KNeighborsClassifier(n_neighbors=5)
knn = knn.fit(train_data_embed, train_data_scores)
knn.score(test_data_embed, test_data_scores)

0.5075034106412005

# GloVe results

## 100d

- reference and response: 0.465

- question and response: 0.572

- question, reference, and response: 0.502

## 200d

- reference and response: 0.547

- question and response: 0.592

- question, reference, and response: 0.538

## 300d

- reference and response: 0.510

- question and response: 0.540

- question, reference, and response: 0.508



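Interestingly, the 200d embeddings did better than the 300d embeddings.

The best input for GloVe was the question and the student response concatenated together, without the reference answer.

Caveat: these accuracies appear to have been recorded while every test-set text was assembled from a training example's reference answer and response rather than the test example's own, so the numbers should be re-measured before drawing conclusions from them.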