In [85]:
!pip install transformers
!pip install sentencepiece



#Get the dataset

In [88]:
import transformers
import glob
import numpy as np
import json

In [89]:
# Clone the dataset repository from github
!git clone https://github.com/leocomelli/score-freetext-answer.git

fatal: destination path 'score-freetext-answer' already exists and is not an empty directory.


In [90]:
import zipfile
import os
import xml.etree.ElementTree as ET
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer, BertModel, AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity

In [91]:
training_data_directory = '/content/score-freetext-answer/src/main/resources/corpus/semeval2013-task7/training/2way/sciEntsBank'
test_data_directory = '/content/score-freetext-answer/src/main/resources/corpus/semeval2013-task7/test/2way/sciEntsBank/test-unseen-answers'

In [92]:
def parse_xml_file(xml_file_path):

  question = ""
  ref = ""
  results = []

  for elem in ET.parse(xml_file_path).getroot():
    if elem.tag == 'questionText':
      question = elem.text
    for subelem in elem:
      if subelem.tag == 'referenceAnswer':
        ref = subelem.text
      else:
        results.append({
            'question': question,
            'ref': ref,
            'response': subelem.text,
            'score': subelem.attrib['accuracy']
        })

  return results

In [93]:
training_data_raw = []
test_data_raw = []
num_training_questions = 0
num_test_questions = 0

for data_file in glob.glob(training_data_directory + '/*'):
  training_data_raw += parse_xml_file(data_file)
  num_training_questions += 1

for data_file in glob.glob(test_data_directory + '/*'):
  test_data_raw += parse_xml_file(data_file)
  num_test_questions += 1

print("Number of Training Questions:", num_training_questions)
print("Number of Training Responses:", len(training_data_raw))

print("Number of Test Questions:", num_test_questions)
print("Number of Test Responses:", len(test_data_raw))

Number of Training Questions: 135
Number of Training Responses: 4969
Number of Test Questions: 135
Number of Test Responses: 540


In [94]:
# Concate the reference answer and student answer to create new input for both train and test set
texts_reference_untokenized = []
texts_response_untokenized = []
data_scores_numeric = []

maxl = []
for training_item in training_data_raw:    
  texts_reference_untokenized.append(training_item['ref'])
  texts_response_untokenized.append(training_item['response'])
  data_scores_numeric.append(0 if training_item["score"] == 'incorrect' else 1)

for test_item in test_data_raw:
  texts_reference_untokenized.append(test_item['ref'])
  texts_response_untokenized.append(test_item['response'])
  data_scores_numeric.append(0 if test_item["score"] == 'incorrect' else 1)

texts_reference_untokenized = np.asarray(texts_reference_untokenized)
texts_response_untokenized = np.asarray(texts_response_untokenized)
data_scores_numeric = np.asarray(data_scores_numeric)

In [95]:
print(len(texts_reference_untokenized))
print(len(texts_response_untokenized))
print(texts_reference_untokenized[0])
print(texts_response_untokenized[0])
print(data_scores_numeric[0])

5509
5509
Agree. Vibrations are movements. Vibrations produce sound.
Yes, Because if things do not move it will not make a sound. Like if you are talking your voice box has to move.
0


In [97]:
print("mnli premise: " + texts_response_untokenized[0] + " hypothesis: " + texts_reference_untokenized[0])

mnli premise: Yes, Because if things do not move it will not make a sound. Like if you are talking your voice box has to move. hypothesis: Agree. Vibrations are movements. Vibrations produce sound.


# Batch Inference

In [96]:
# batch the inputs


# input_ids = "mnli premise: " + training_data_raw[5]['question'] + " hypothesis: " + training_data_raw[5]['response']
# one = tokenizer(["mnli premise: " + training_data_raw[0]['question'] + " hypothesis: " + training_data_raw[0]['response'], "mnli premise: " + training_data_raw[1]['question'] + " hypothesis: " + training_data_raw[1]['response'], input_ids], return_tensors="pt", padding=True, truncation=True).input_ids


# outputs = model.generate(one)
# print(outputs)
# for i in range(len(outputs)):
#   res = tokenizer.decode(outputs[i], skip_special_tokens=True)
#   print(res)

In [12]:
# def batch_inputs_mnli(train_data):
#   input_sequences = []
#   for item in train_data:
#     input_sequences.append("mnli premise: " + item['question'] + " hypothesis: " + item['response'])
#   return input_sequences

In [13]:
# input_sequences = batch_inputs_mnli(training_data_raw)
# input_sequences[:5]

# T5 Model

In [116]:
# Vanilla T5-small

from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

input_ids = tokenizer("stsb sentence1: The house is wonderful. sentence2: This house is great", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


5.0


In [26]:
# Free the memory
import gc
gc.collect()

# Time the inference

import time

start_time = time.clock()
input_ids = tokenizer(input_sequences[0], return_tensors="pt", padding=True, truncation=True).input_ids
outputs = model.generate(input_ids)
res = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(time.clock() - start_time, "seconds")
print((time.clock() - start_time) * len(training_data_raw) / 60, "seconds")

367

# T5 inference on the Training Data: Textual 
##### Note: takes a long time to run.

In [44]:
# TODO Paralellize
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
entailment = 0
contra_neutral = 0
# input_ids = tokenizer(input_sequences, return_tensors="pt", padding=True, truncation=True).input_ids
# outputs = model.generate(input_ids)
# batch_size = 64
for idx, training_item in enumerate(training_data_raw):
  # print(training_item)
  input_ids = tokenizer("mnli premise: " + training_item['ref'] + " hypothesis: " + training_item['response'], return_tensors="pt").input_ids
  outputs = model.generate(input_ids)
  res = tokenizer.decode(outputs[0], skip_special_tokens=True)
  training_item["mnli"] = res
  if res == "entailment":
    entailment += 1
    if training_item["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  if res != "entailment":
    contra_neutral += 1
    if training_item["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1
  


In [45]:
print(true_pos, false_pos, true_neg, false_neg, entailment, contra_neutral)

1157 1182 1779 851 2339 2630


In [46]:
print("Accuracy: ", (true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg))

0.5908633527872812

# T5 inference on the Training Data: Sentence Similarity
##### Note: takes a long time to run

In [47]:
# TODO Paralellize
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
pred_true = 0
pred_false = 0
# input_ids = tokenizer(input_sequences, return_tensors="pt", padding=True, truncation=True).input_ids
# outputs = model.generate(input_ids)
for idx, training_item in enumerate(training_data_raw):
  # print(training_item)
  input_ids = tokenizer("stsb sentence1: " + training_item['ref'] + " sentence2: " + training_item['response'], return_tensors="pt").input_ids
  outputs = model.generate(input_ids)
  res = tokenizer.decode(outputs[0], skip_special_tokens=True)
  training_item["sts"] = res

  if float(res) > 2.5:
    pred_true += 1
    if training_item["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  else:
    pred_false += 1
    if training_item["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1
  


In [48]:
print("Accuracy: ", (true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg))

0.6081706580800966

# Error Analysis

In [49]:
import pandas as pd

In [54]:
train_data_pd = pd.DataFrame(training_data_raw)
results = train_data_pd[["score", "mnli", "sts"]]

In [60]:
results

Unnamed: 0,score,mnli,sts
0,incorrect,contradiction,2.4
1,incorrect,contradiction,2.2
2,incorrect,entailment,0.4
3,correct,entailment,3.6
4,correct,entailment,3.6
...,...,...,...
4964,incorrect,contradiction,2.8
4965,incorrect,contradiction,0.8
4966,incorrect,contradiction,0.0
4967,incorrect,entailment,2.8


In [63]:
# Compute Agreement
agree = 0
for index, row in results.iterrows():
  if row["mnli"] == "entailment" and float(row["sts"]) > 2.5:
    agree +=1
agree / len(results)

0.30307909036023345

In [64]:
# Break ties

true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
pred_true = 0
pred_false = 0



In [80]:
# Logical OR
for index, row in results.iterrows():

  if float(row["sts"]) > 2.5 or row["mnli"] == "entailment":
    pred_true += 1
    if row["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  else:
    pred_false += 1
    if row["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1

In [81]:
(true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg)

0.5995170054336889

In [78]:
# Logical AND
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
pred_true = 0
pred_false = 0


for index, row in results.iterrows():

  if float(row["sts"]) > 2.5 and row["mnli"] == "entailment":
    pred_true += 1
    if row["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  else:
    pred_false += 1
    if row["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1

In [79]:
(true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg)

0.6333266250754679

# T5 Inference on Test Data (Unseen Answers):

In [70]:
# TODO Paralellize
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
entailment = 0
contra_neutral = 0
op = 0
# input_ids = tokenizer(input_sequences, return_tensors="pt", padding=True, truncation=True).input_ids
# outputs = model.generate(input_ids)
batch_size = 64
for idx, training_item in enumerate(test_data_raw):
  # print(training_item)
  input_ids = tokenizer("mnli premise: " + training_item['ref'] + " hypothesis: " + training_item['response'], return_tensors="pt").input_ids
  outputs = model.generate(input_ids)
  res = tokenizer.decode(outputs[0], skip_special_tokens=True)
  training_item["mnli"] = res
  if res == "entailment":
    entailment += 1
    if training_item["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  if res != "entailment":
    contra_neutral += 1
    if training_item["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1

(true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg)

In [71]:
(true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg)

0.6240740740740741

In [75]:
# TODO Paralellize
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
pred_true = 0
pred_false = 0
# input_ids = tokenizer(input_sequences, return_tensors="pt", padding=True, truncation=True).input_ids
# outputs = model.generate(input_ids)
for idx, training_item in enumerate(test_data_raw):
  # print(training_item)
  input_ids = tokenizer("stsb sentence1: " + training_item['ref'] + " sentence2: " + training_item['response'], return_tensors="pt").input_ids
  outputs = model.generate(input_ids)
  res = tokenizer.decode(outputs[0], skip_special_tokens=True)
  training_item["sts"] = res

  if float(res) > 2.5:
    pred_true += 1
    if training_item["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  else:
    pred_false += 1
    if training_item["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1
  
(true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg)

0.6648148148148149

In [76]:
test_data_pd = pd.DataFrame(test_data_raw)
results_test = train_data_pd[["score", "mnli", "sts"]]

In [77]:
# Compute Agreement
agree = 0
for index, row in results_test.iterrows():
  if row["mnli"] == "entailment" and float(row["sts"]) > 2.5:
    agree +=1
agree / len(results)

0.30307909036023345

In [84]:
# Logical AND
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
pred_true = 0
pred_false = 0


for index, row in results_test.iterrows():

  if float(row["sts"]) > 2.5 and row["mnli"] == "entailment":
    pred_true += 1
    if row["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  else:
    pred_false += 1
    if row["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1

(true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg)

0.6333266250754679

In [109]:
# TODO Paralellize
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
pred_true = 0
pred_false = 0
# input_ids = tokenizer(input_sequences, return_tensors="pt", padding=True, truncation=True).input_ids
# outputs = model.generate(input_ids)
for idx, training_item in enumerate(test_data_raw):
  # print(training_item)
  input_ids = tokenizer("stsb sentence1: " + training_item['ref'] + " sentence2: " + training_item['response'], padding=True, truncation=True, return_tensors="pt").input_ids
  outputs = model.generate(input_ids)
  sentence_embeddings = mean_pooling(outputs, encoded_input['attention_mask'])
  res = tokenizer.decode(sentence_embeddings, skip_special_tokens=True)
  training_item["sts"] = res
  if idx == 20:
    break
  if float(res) > 2.5:
    pred_true += 1
    if training_item["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  else:
    pred_false += 1
    if training_item["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1
  
(true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Input length of input_ids is 34, but ``max_length`` is set to 20. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


AttributeError: ignored

# T5 Inference on Test Data (Unseen Questions):

In [110]:
# Clone the dataset repository from github
!git clone https://github.com/CodyRichter/Automatic-Short-Answer-Grading

Cloning into 'Automatic-Short-Answer-Grading'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 33 (delta 12), reused 23 (delta 8), pack-reused 0[K
Unpacking objects: 100% (33/33), done.


In [111]:
import json

with open('/content/Automatic-Short-Answer-Grading/dataset/train.json', 'r') as tf:
  training_data = json.load(tf)

with open('/content/Automatic-Short-Answer-Grading/dataset/test-unseen-answers.json', 'r') as tf:
  test_unseen_answer_data = json.load(tf)

with open('/content/Automatic-Short-Answer-Grading/dataset/test-unseen-questions.json', 'r') as tf:
  test_unseen_question_data = json.load(tf)

with open('/content/Automatic-Short-Answer-Grading/dataset/test-unseen-domains.json', 'r') as tf:
  test_unseen_domain_data = json.load(tf)

print('Number of Training + Validation Data Responses', len(training_data))

Number of Training + Validation Data Responses 16265


In [112]:
from torch.utils.data import Dataset, DataLoader

class ShortAnswerGradingDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Note: I handle the parsing in the data loading from XML section
        # Returns a dict for each item with the following keys: 'question', 'ref', 'response', 'score' all of type 'str'
        return self.dataset[idx]

In [113]:
training_dataset = ShortAnswerGradingDataset(training_data)
test_dataset_unseen_answers = ShortAnswerGradingDataset(test_unseen_answer_data)
test_dataset_unseen_questions = ShortAnswerGradingDataset(test_unseen_question_data)
test_dataset_unseen_domains = ShortAnswerGradingDataset(test_unseen_domain_data)

from sklearn.model_selection import train_test_split

training_dataset, validation_dataset = train_test_split(training_dataset, test_size=0.1, random_state=0)

validation_parent_ids = set()
validation_ids_to_remove = set()
validation_original_ids = set()

# Step 1: Get IDs of Original Responses and mark augmented ones for deletion
for validation_item in validation_dataset:
  if validation_item['aug']:
    validation_parent_ids.add(validation_item['aug_metadata']['parent_id'])
    validation_ids_to_remove.add(validation_item['id'])
  else:
    validation_original_ids.add(validation_item['id'])

train_ids_to_remove = set()

# Step 2: Obtain Original Respones for validation set and mark augmented
#         dataset items for removal if the original is in the validation set
for train_item in training_dataset:

  # If the original is in the validation set, remove from the training set
  if train_item['aug'] and train_item['aug_metadata']['parent_id'] in validation_original_ids:
    train_ids_to_remove.add(train_item['id'])

  # If the original is in the training set, add it to the validation set
  # and then mark it for deletion from the training set
  if not train_item['aug'] and train_item['id'] in validation_parent_ids:
    validation_dataset.append(train_item)
    train_ids_to_remove.add(train_item['id'])

# Step 3: Perform removal operations
validation_dataset[:] = [x for x in validation_dataset if x['id'] not in validation_ids_to_remove]
training_dataset[:] = [x for x in training_dataset if x['id'] not in train_ids_to_remove]

In [114]:
print('Number of Training Samples', len(training_dataset))
print('Number of Validation Samples', len(validation_dataset))
print('Number of Test Data (New Answer) Responses', len(test_unseen_answer_data))
print('Number of Test Data (New Question) Responses', len(test_unseen_question_data))
print('Number of Test Data (New Domain) Responses', len(test_unseen_domain_data))

Number of Training Samples 12805
Number of Validation Samples 1380
Number of Test Data (New Answer) Responses 540
Number of Test Data (New Question) Responses 733
Number of Test Data (New Domain) Responses 4562


In [117]:
# TODO Paralellize
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
pred_true = 0
pred_false = 0
# input_ids = tokenizer(input_sequences, return_tensors="pt", padding=True, truncation=True).input_ids
# outputs = model.generate(input_ids)
for idx, training_item in enumerate(test_unseen_question_data):
  # print(training_item)
  input_ids = tokenizer("stsb sentence1: " + training_item['ref'] + " sentence2: " + training_item['response'], return_tensors="pt").input_ids
  outputs = model.generate(input_ids)
  res = tokenizer.decode(outputs[0], skip_special_tokens=True)
  training_item["sts"] = res

  if float(res) > 2.5:
    pred_true += 1
    if training_item["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  else:
    pred_false += 1
    if training_item["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1
  
(true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg)

0.6575716234652115

In [118]:
# TODO Paralellize
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
entailment = 0
contra_neutral = 0
op = 0
# input_ids = tokenizer(input_sequences, return_tensors="pt", padding=True, truncation=True).input_ids
# outputs = model.generate(input_ids)
batch_size = 64
for idx, training_item in enumerate(test_unseen_question_data):
  # print(training_item)
  input_ids = tokenizer("mnli premise: " + training_item['ref'] + " hypothesis: " + training_item['response'], return_tensors="pt").input_ids
  outputs = model.generate(input_ids)
  res = tokenizer.decode(outputs[0], skip_special_tokens=True)
  training_item["mnli"] = res
  if res == "entailment":
    entailment += 1
    if training_item["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  if res != "entailment":
    contra_neutral += 1
    if training_item["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1

(true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg)

0.6125511596180082

In [119]:
test_unseen_data_pd = pd.DataFrame(test_unseen_question_data)
results_test_unseen = test_unseen_data_pd[["score", "mnli", "sts"]]

# Final Result: State-of-the-art 68.89% for zero-shot methods.
##### 71%, 75% state-of-the-art published in 2017, 2018 using handcrafted features and full training.

In [150]:
# Logical AND
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
pred_true = 0
pred_false = 0
mnli_AND_sts = []

for index, row in results_test_unseen.iterrows():

  if float(row["sts"]) > 3.0 and row["mnli"] == "entailment":
    mnli_AND_sts.append(1)
    pred_true += 1
    if row["score"] == "correct":
      true_pos+=1
    else:
      false_pos += 1
  else:
    mnli_AND_sts.append(0)
    pred_false += 1
    if row["score"] == "incorrect":
      true_neg+=1
    else:
      false_neg+=1

(true_pos + true_neg) / (true_pos+ false_pos+ true_neg+ false_neg)

0.6889495225102319

# Computing other metrics. (Not reported in the paper)

In [152]:
results_test_unseen["mnli_AND_sts"] = mnli_AND_sts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [154]:
results_test_unseen["score_number"] = results_test_unseen["score"].apply(lambda x: 1 if x=="correct" else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [156]:
results_test_unseen

Unnamed: 0,score,mnli,sts,mnli_AND_sts,score_number
0,incorrect,entailment,3.6,1,0
1,incorrect,entailment,0.0,0,0
2,incorrect,entailment,0.0,0,0
3,incorrect,entailment,0.0,0,0
4,incorrect,entailment,0.0,0,0
...,...,...,...,...,...
728,correct,entailment,4.0,1,1
729,correct,entailment,4.0,1,1
730,correct,entailment,2.4,0,1
731,incorrect,contradiction,0.0,0,0


In [157]:
from scipy.stats import pearsonr
 
 
# Apply the pearsonr()
corr, _ = pearsonr(results_test_unseen["mnli_AND_sts"], results_test_unseen["score_number"])
print('Pearsons correlation: %.3f' % corr)


Pearsons correlation: 0.351


In [158]:
from sklearn.metrics import mean_squared_error

mean_squared_error(results_test_unseen["score_number"], results_test_unseen["mnli_AND_sts"], squared=False)


0.5577189950949923

Used the training datast to find the threshold
stack models
use combination score

In [1]:
from sklearn.metrics import f1_score

f1_score(results_test_unseen["score_number"], results_test_unseen["mnli_AND_sts"])

NameError: ignored