# Project Dataset Loading

In [6]:
import zipfile
import os
import numpy as np

In [7]:
# This is a direct-download version of the Kaggle dataset found at: https://www.kaggle.com/competitions/asap-sas/data
!wget 'https://drive.google.com/uc?export=download&id=1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2' -O dataset.zip

# Extract the dataset files to the "dataset" folder
with zipfile.ZipFile('dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

--2022-04-15 01:32:42--  https://drive.google.com/uc?export=download&id=1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2
Resolving drive.google.com (drive.google.com)... 173.194.218.100, 173.194.218.102, 173.194.218.113, ...
Connecting to drive.google.com (drive.google.com)|173.194.218.100|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0o-24-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/kd00onljsfmd2jnkh6khe5npje35sopn/1649986350000/09634012588375902450/*/1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2?e=download [following]
--2022-04-15 01:32:51--  https://doc-0o-24-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/kd00onljsfmd2jnkh6khe5npje35sopn/1649986350000/09634012588375902450/*/1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2?e=download
Resolving doc-0o-24-docs.googleusercontent.com (doc-0o-24-docs.googleusercontent.com)... 142.250.98.132, 2607:f8b0:400c:c1a::84
Connecting to doc-0o-24-docs.googleusercontent.com (doc-0o-24-

In [8]:
training_data_file_name = 'dataset/train.tsv'
test_data_texts_file_name = 'dataset/public_leaderboard_rel_2.tsv'
test_data_scores_file_name = 'dataset/public_leaderboard_solution.csv'

In [9]:
training_data_tsv = np.genfromtxt(fname=training_data_file_name, delimiter="\t", skip_header=1, dtype='str')

training_data_texts = training_data_tsv[:,4]
training_data_essay_set = training_data_tsv[:,1].astype('int64')
training_data_scores = np.minimum(training_data_tsv[:,2].astype('int64'), training_data_tsv[:,3].astype('int64'))

print('Training Data Texts', training_data_texts.shape)
print('Training Data Scores', training_data_scores.shape)

test_data_texts_file = np.genfromtxt(fname=test_data_texts_file_name, delimiter="\t", skip_header=1, dtype='str')
test_data_scores_file = np.genfromtxt(fname=test_data_scores_file_name, delimiter=",", skip_header=1)

test_data_texts = test_data_texts_file[:,2]
test_data_essay_set = test_data_scores_file[:,1]
test_data_scores = test_data_scores_file[:,3]

print('Test Data Texts', test_data_texts.shape)
print('Test Data Scores', test_data_scores.shape)


train_matrix = np.column_stack((training_data_texts, training_data_essay_set.astype('str'), training_data_scores.astype('str')))
test_matrix = np.column_stack((test_data_texts, test_data_essay_set.astype('str'), test_data_scores.astype('str')))

try:
    os.remove('train.tsv')
except OSError:
    pass

try:
    os.remove('test.tsv')
except OSError:
    pass

np.savetxt('train.tsv', train_matrix, delimiter='\t', fmt='%s', header='text\tessay_set\tscore')
np.savetxt('test.tsv', test_matrix, delimiter='\t', fmt='%s', header='text\tessay_set\tscore')

Training Data Texts (17207,)
Training Data Scores (17207,)
Test Data Texts (5224,)
Test Data Scores (5224,)


In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

class ShortAnswerGradingDataset(Dataset):
    def __init__(self, texts, essay_sets, scores):
        self.scores = scores
        self.essay_sets = essay_sets
        self.texts = texts
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
            score = self.scores[idx]
            essay_set = self.scores[idx]
            text = self.texts[idx]
            sample = {"text": text, "score": score, "essay_set": essay_set}
            return sample

In [15]:
training_dataset = ShortAnswerGradingDataset(training_data_texts, training_data_essay_set, training_data_scores)
test_dataset = ShortAnswerGradingDataset(test_data_texts, test_data_essay_set, test_data_scores)



## Batching and Loading Data to Model

Use this iterator to load in the train and test datasets to the model of choice.

In [19]:
for test_item in test_dataset:
  print(test_item)
  break

{'text': 'The procedures I think they should have included inorder for me to replicate the experiment would be how different samples did they used for each? What tool did they use to determine the mass.', 'score': 1.0, 'essay_set': 1.0}
