# Project Dataset Loading

In [None]:
import zipfile
import os
import numpy as np
import xml.etree.ElementTree as ET
import glob
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# Clone the GitHub repository that holds the dataset JSON files read further down
# (dataset/train.json and the three dataset/test-unseen-*.json files).
# Re-running this cell after a successful clone makes git report that the
# destination directory already exists.
!git clone https://github.com/CodyRichter/Automatic-Short-Answer-Grading

Cloning into 'Automatic-Short-Answer-Grading'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 33 (delta 12), reused 23 (delta 8), pack-reused 0[K
Unpacking objects: 100% (33/33), done.


In [None]:
# NOTE(review): these two paths point into a `score-freetext-answer` checkout, which is
# not the repository cloned by this notebook, and no cell shown here reads them.
# Presumably leftovers from an earlier XML-based SemEval-2013 Task 7 loader — confirm
# before removing.
training_data_directory = '/content/score-freetext-answer/src/main/resources/corpus/semeval2013-task7/training/2way/sciEntsBank'
test_data_directory = '/content/score-freetext-answer/src/main/resources/corpus/semeval2013-task7/test/2way/sciEntsBank/test-unseen-questions'

In [None]:
import json

# Folder holding the dataset JSON files inside the cloned repository.
DATASET_DIR = '/content/Automatic-Short-Answer-Grading/dataset'


def _load_json(file_name):
  """Parse the JSON file `file_name` located in DATASET_DIR and return its content."""
  # JSON text is UTF-8; be explicit instead of relying on the platform default encoding.
  with open(f'{DATASET_DIR}/{file_name}', 'r', encoding='utf-8') as json_file:
    return json.load(json_file)


# One training split and three test splits (unseen answers / questions / domains).
training_data = _load_json('train.json')
test_unseen_answer_data = _load_json('test-unseen-answers.json')
test_unseen_question_data = _load_json('test-unseen-questions.json')
test_unseen_domain_data = _load_json('test-unseen-domains.json')

# Report how many responses each split contains.
print('Number of Training Data Responses', len(training_data))
print('Number of Test Data (New Answer) Responses', len(test_unseen_answer_data))
print('Number of Test Data (New Question) Responses', len(test_unseen_question_data))
print('Number of Test Data (New Domain) Responses', len(test_unseen_domain_data))

Number of Training Data Responses 16265
Number of Test Data (New Answer) Responses 540
Number of Test Data (New Question) Responses 733
Number of Test Data (New Domain) Responses 4562


## BERT sentence Embedding
---
Documentation: https://www.sbert.net/

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 8.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 44.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 51.7 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 33.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-transformers checkpoint used to embed every text below.
MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# Build one input text per sample by joining the reference answer and the student
# response with a single space; the label is the sample's "score" field.
train_data_texts = [sample["ref"] + " " + sample["response"] for sample in training_data]
train_data_scores = [sample["score"] for sample in training_data]

# Peek at the first four training inputs and labels.
print(train_data_texts[:4])
print(train_data_scores[:4])

# Same construction for the "unseen questions" test split.
test_data_texts = [sample["ref"] + " " + sample["response"] for sample in test_unseen_question_data]
test_data_scores = [sample["score"] for sample in test_unseen_question_data]

# Peek at the first four test inputs and labels.
print(test_data_texts[:4])
print(test_data_scores[:4])

['The water splashed because the fork was vibrating. Vibrations make sounds. Hitting the fork and dipping it into the water.', 'The water splashed because the fork was vibrating. Vibrations make sounds. Strike the fork and plunge it into the water.', 'The water splashed because the fork was vibrating. Vibrations make sounds. Hit the fork and dip it in the water.', 'The water splashed because the fork was vibrating. Vibrations make sounds. Hit the fork and immerse it in water.']
['incorrect', 'incorrect', 'incorrect', 'incorrect']
['Earth materials are worn away and moved during erosion. Earth material gets eroded and carried away.', 'Earth materials are worn away and moved during erosion. They are eroded by water, wind, and or ice.', 'Earth materials are worn away and moved during erosion. They form into other solids water, lava, wind.', 'Earth materials are worn away and moved during erosion. They just move around.']
['incorrect', 'incorrect', 'incorrect', 'incorrect']


In [None]:
# Smoke test: encode two test texts (indices 1 and 2) by calling model.encode().
# `embeddings` is not read by any later cell shown in this notebook.
embeddings = model.encode(test_data_texts[1:3])

# Uncomment to print each sentence next to its embedding:
# for sentence, embedding in zip(test_data_texts[1:3], embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")

In [None]:
# Embed the complete training and test corpora with the sentence-transformer model.
train_embeddings = model.encode(train_data_texts)
test_embeddings = model.encode(test_data_texts)

# Show the shape of each embedding matrix, one per line.
print(train_embeddings.shape, test_embeddings.shape, sep="\n")

(16265, 384)
(733, 384)


## Classification
---

In [None]:
# import KNN and use n = 5
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# Fit a 5-nearest-neighbour classifier on the training embeddings; the final
# expression is what the notebook displays as the score on the test embeddings.
knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_data_scores)
knn.score(test_embeddings, test_data_scores)

0.5457025920873124

In [None]:
# print(knn.predict(test_embeddings))

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# A random forest is built with randomness, so fix the seed to make the reported
# score reproducible from one run to the next.
clf = RandomForestClassifier(max_depth=20, random_state=42)
clf.fit(train_embeddings, train_data_scores)
# Last expression: displayed by the notebook as the score on the test embeddings.
clf.score(test_embeddings, test_data_scores)


0.5648021828103683

In [None]:
# print(clf.predict(test_embeddings))

## Try different input format:
---

### Input: (Student Response)

In [None]:
# Test split used by the input-format experiments that follow: the "unseen questions" set.
test_data = test_unseen_question_data

In [None]:
# Input format: the student response on its own; the label is the "score" field.
train_data_texts = [sample["response"] for sample in training_data]
train_data_scores = [sample["score"] for sample in training_data]

test_data_texts = [sample["response"] for sample in test_data]
test_data_scores = [sample["score"] for sample in test_data]

# Embed both splits with the sentence-transformer model.
train_embeddings = model.encode(train_data_texts)
test_embeddings = model.encode(test_data_texts)

# 5-nearest-neighbour classifier; the final expression is displayed as the test score.
knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_data_scores)
knn.score(test_embeddings, test_data_scores)

0.5361527967257844

### Input: (Question, Student Response)

In [None]:
# Input format: question and student response joined by the literal text "<SEP>".
# NOTE(review): "<SEP>" is concatenated as plain text; presumably it is not the
# tokenizer's own separator token and is split into ordinary word pieces — confirm.
train_data_texts = [sample["question"] + "<SEP>" + sample["response"] for sample in training_data]
train_data_scores = [sample["score"] for sample in training_data]

test_data_texts = [sample["question"] + "<SEP>" + sample["response"] for sample in test_data]
test_data_scores = [sample["score"] for sample in test_data]

# Embed both splits with the sentence-transformer model.
train_embeddings = model.encode(train_data_texts)
test_embeddings = model.encode(test_data_texts)

# 5-nearest-neighbour classifier; the final expression is displayed as the test score.
knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_data_scores)
knn.score(test_embeddings, test_data_scores)

0.5006821282401092

### Input: (Reference Answer, Student Response)

In [None]:
# Input format: reference answer and student response joined by the literal text "<SEP>".
# NOTE(review): "<SEP>" is concatenated as plain text; presumably it is not the
# tokenizer's own separator token and is split into ordinary word pieces — confirm.
train_data_texts = [sample["ref"] + "<SEP>" + sample["response"] for sample in training_data]
train_data_scores = [sample["score"] for sample in training_data]

test_data_texts = [sample["ref"] + "<SEP>" + sample["response"] for sample in test_data]
test_data_scores = [sample["score"] for sample in test_data]

# Embed both splits with the sentence-transformer model.
train_embeddings = model.encode(train_data_texts)
test_embeddings = model.encode(test_data_texts)

# 5-nearest-neighbour classifier; the final expression is displayed as the test score.
knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_data_scores)
knn.score(test_embeddings, test_data_scores)

0.582537517053206

### Input: (Question, Reference Answer, Student Response)

In [None]:
# Input format: question, reference answer and student response joined by single spaces.
train_data_texts = [
  sample["question"] + " " + sample["ref"] + " " + sample["response"]
  for sample in training_data
]
train_data_scores = [sample["score"] for sample in training_data]

test_data_texts = [
  sample["question"] + " " + sample["ref"] + " " + sample["response"]
  for sample in test_data
]
test_data_scores = [sample["score"] for sample in test_data]

# Embed both splits with the sentence-transformer model.
train_embeddings = model.encode(train_data_texts)
test_embeddings = model.encode(test_data_texts)

# 5-nearest-neighbour classifier; the final expression is displayed as the test score.
knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_data_scores)
knn.score(test_embeddings, test_data_scores)

0.538881309686221

### Conclusion:
We found that the "reference answer + student answer" input (joined with `<SEP>`) performed best, with a KNN accuracy of 0.5825 on the unseen-questions test set (see the output of that experiment above).