## Step 0: Setup and Installation

In [1]:
!pip install git+https://github.com/allenai/scibert.git
!pip install transformers
!pip install datasets

Collecting git+https://github.com/allenai/scibert.git
  Cloning https://github.com/allenai/scibert.git to /tmp/pip-req-build-biqoie9x
  Running command git clone -q https://github.com/allenai/scibert.git /tmp/pip-req-build-biqoie9x
Collecting allennlp@ git+https://github.com/ibeltagy/allennlp@fp16_and_others
  Cloning https://github.com/ibeltagy/allennlp (to revision fp16_and_others) to /tmp/pip-install-f4kplmv7/allennlp_3c5a51998fa642488b6344d5c572825d
  Running command git clone -q https://github.com/ibeltagy/allennlp /tmp/pip-install-f4kplmv7/allennlp_3c5a51998fa642488b6344d5c572825d
  Running command git checkout -b fp16_and_others --track origin/fp16_and_others
  Switched to a new branch 'fp16_and_others'
  Branch 'fp16_and_others' set up to track remote branch 'fp16_and_others' from 'origin'.
Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Collecting overrides
  Downloading overrides-6.1.0-py3-none-any.whl (14 kB)
Collecting spacy<2.2,>=2.1.0
  Downlo

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 60.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 30.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.8 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for

In [2]:
# Clone the dataset repository from github
!git clone https://github.com/leocomelli/score-freetext-answer.git

Cloning into 'score-freetext-answer'...
remote: Enumerating objects: 511, done.[K
remote: Total 511 (delta 0), reused 0 (delta 0), pack-reused 511[K
Receiving objects: 100% (511/511), 478.34 KiB | 5.83 MiB/s, done.
Resolving deltas: 100% (263/263), done.


In [3]:
import zipfile
import os
import numpy as np
import xml.etree.ElementTree as ET
import glob
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer, BertModel, AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity

## Step 1: Dataset Setup and Loading

In [4]:
# Location of the semeval2013-task7 corpus inside the cloned repository.
corpus_root = '/content/score-freetext-answer/src/main/resources/corpus/semeval2013-task7'

# 2-way sciEntsBank splits used for the training and test responses.
training_data_directory = corpus_root + '/training/2way/sciEntsBank'
test_data_directory = corpus_root + '/test/2way/sciEntsBank/test-unseen-answers'

In [5]:
def parse_xml_file(xml_file_path):
  """Flatten one question XML file into a list of scored responses.

  The root's children are visited in document order. The text of a
  `questionText` child becomes the current question, and the text of a
  `referenceAnswer` grandchild becomes the current reference answer. Every
  other grandchild is treated as a response and must carry an `accuracy`
  attribute.

  Args:
    xml_file_path: Source handed straight to `ET.parse`.

  Returns:
    A list of dicts with the keys 'question', 'ref', 'response' and 'score',
    one per response, in document order.

  Raises:
    KeyError: If a grandchild other than `referenceAnswer` has no
      `accuracy` attribute.
  """
  current_question = ""
  current_reference = ""
  records = []

  root = ET.parse(xml_file_path).getroot()
  for section in root:
    if section.tag == 'questionText':
      current_question = section.text

    for node in section:
      if node.tag == 'referenceAnswer':
        # Remember the reference; responses seen afterwards are paired with it.
        current_reference = node.text
        continue

      records.append(dict(
          question=current_question,
          ref=current_reference,
          response=node.text,
          score=node.attrib['accuracy'],
      ))

  return records

In [6]:
# Each file in a corpus directory is counted as one question; its responses are
# flattened into a single list per split.
training_files = glob.glob(training_data_directory + '/*')
test_files = glob.glob(test_data_directory + '/*')

training_data_raw = [row for path in training_files for row in parse_xml_file(path)]
test_data_raw = [row for path in test_files for row in parse_xml_file(path)]

num_training_questions = len(training_files)
num_test_questions = len(test_files)

print("Number of Training Questions:", num_training_questions)
print("Number of Training Responses:", len(training_data_raw))

print("Number of Test Questions:", num_test_questions)
print("Number of Test Responses:", len(test_data_raw))

Number of Training Questions: 135
Number of Training Responses: 4969
Number of Test Questions: 135
Number of Test Responses: 540


In [7]:
# Gather the reference answer, the student response and a binary label for every
# item, training items first and test items after them. The two texts are kept in
# separate, index-aligned arrays (they are not joined into one string).
maxl = []  # not read anywhere in the visible cells; kept so the name stays defined

scored_items = training_data_raw + test_data_raw

texts_reference_untokenized = np.asarray([item['ref'] for item in scored_items])
texts_response_untokenized = np.asarray([item['response'] for item in scored_items])

# Label 0 for 'incorrect'; every other score value maps to 1.
data_scores_numeric = np.asarray(
    [0 if item["score"] == 'incorrect' else 1 for item in scored_items]
)

In [8]:
# Quick look at the aligned arrays: both lengths, then the first reference,
# response and label, one value per line.
print(
    len(texts_reference_untokenized),
    len(texts_response_untokenized),
    texts_reference_untokenized[0],
    texts_response_untokenized[0],
    data_scores_numeric[0],
    sep='\n',
)

5509
5509
Both finger patterns are loops.
They are loop.
1


## Step 2: SciBERT

In [9]:
# Checkpoint name shared by the encoder and its tokenizer.
scibert_checkpoint = "allenai/scibert_scivocab_uncased"

# Plain BertModel encoder plus a tokenizer configured to lower-case its input.
model = BertModel.from_pretrained(scibert_checkpoint)
tokenizer = BertTokenizer.from_pretrained(scibert_checkpoint, do_lower_case=True)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

In [10]:
# Split the reference texts into 25 chunks so they can be embedded one chunk at a time.
reference_chunk_count = 25
texts_reference_untokenized_chunks = np.array_split(
    texts_reference_untokenized, reference_chunk_count
)

In [None]:
# Mean-pooled SciBERT embedding for every reference text, computed one chunk at a
# time. The per-chunk results are collected in a list and concatenated once at
# the end instead of re-copying the growing array with np.append on every chunk.
reference_pooled_chunks = []

for chunk in texts_reference_untokenized_chunks:
  if len(chunk) == 0:
    # np.array_split yields empty chunks when there are fewer texts than chunks.
    continue

  # Tokenize the whole chunk in one call. Every row is truncated/padded to
  # exactly 256 tokens, so the result is already a stacked (n, 256) tensor.
  encoded = tokenizer(chunk.tolist(), return_tensors='pt', truncation=True, padding='max_length', max_length=256)
  input_ids = encoded['input_ids']
  attention_mask = encoded['attention_mask']

  with torch.no_grad():
    hidden_states = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

    # Masked mean over the token axis: padding positions are zeroed out and the
    # sum is divided by the number of real tokens (clamped to avoid 0-division).
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    summed = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = (summed / token_counts).numpy()

  reference_pooled_chunks.append(mean_pooled)

# The empty float64 seed keeps the result two-dimensional (and float64) even
# when no chunk produced any rows.
reference_embeddings = np.concatenate(
    [np.empty((0, model.config.hidden_size))] + reference_pooled_chunks, axis=0
)

In [None]:
# Sanity check: rows were appended chunk by chunk along axis 0, so one row per
# reference text is expected here.
reference_embeddings.shape

In [None]:
# Split the response texts into 25 chunks so they can be embedded one chunk at a time.
response_chunk_count = 25
texts_response_untokenized_chunks = np.array_split(
    texts_response_untokenized, response_chunk_count
)

In [None]:
# Mean-pooled SciBERT embedding for every response text, computed one chunk at a
# time. The per-chunk results are collected in a list and concatenated once at
# the end instead of re-copying the growing array with np.append on every chunk.
response_pooled_chunks = []

for chunk in texts_response_untokenized_chunks:
  if len(chunk) == 0:
    # np.array_split yields empty chunks when there are fewer texts than chunks.
    continue

  # Tokenize the whole chunk in one call. Every row is truncated/padded to
  # exactly 256 tokens, so the result is already a stacked (n, 256) tensor.
  encoded = tokenizer(chunk.tolist(), return_tensors='pt', truncation=True, padding='max_length', max_length=256)
  input_ids = encoded['input_ids']
  attention_mask = encoded['attention_mask']

  with torch.no_grad():
    hidden_states = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

    # Masked mean over the token axis: padding positions are zeroed out and the
    # sum is divided by the number of real tokens (clamped to avoid 0-division).
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    summed = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = (summed / token_counts).numpy()

  response_pooled_chunks.append(mean_pooled)

# The empty float64 seed keeps the result two-dimensional (and float64) even
# when no chunk produced any rows.
response_embeddings = np.concatenate(
    [np.empty((0, model.config.hidden_size))] + response_pooled_chunks, axis=0
)

In [None]:
# Sanity check: rows were appended chunk by chunk along axis 0, so one row per
# response text is expected here.
response_embeddings.shape

## Step 3: Cosine Similarity

In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import util

In [None]:
# Compute the cosine similarity between the reference and response embeddings.
# The result is indexed as cosine_scores[i][j]; the scoring cell further down
# only reads the matching pairs on the diagonal (cosine_scores[i][i]).
cosine_scores = util.cos_sim(reference_embeddings, response_embeddings)

In [None]:
# Inspect the dimensions of the similarity result before indexing into it.
cosine_scores.shape

In [None]:
# Display the similarity values themselves.
cosine_scores

In [None]:
# Cosine-similarity cut-off: a response whose similarity to its own reference
# answer is at least this value is predicted correct (1), otherwise incorrect (0).
# Change it here to try other thresholds.
SIMILARITY_THRESHOLD = 0.65

# Only the diagonal is used: entry [i][i] pairs reference i with response i.
result_score = [
    1 if cosine_scores[i][i] >= SIMILARITY_THRESHOLD else 0
    for i in range(len(data_scores_numeric))
]

# Accuracy: the fraction of predictions that match the gold labels.
correct_predict = sum(
    1
    for predicted, expected in zip(result_score, data_scores_numeric)
    if predicted == expected
)

print(correct_predict / len(result_score))

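## Cosine Similarity Results

Accuracy of the thresholded cosine similarity, computed over all 5509 reference/response pairs (training and test responses combined):

- Using 0.65 threshold: accuracy 0.411

- Using 0.70 threshold: accuracy 0.421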