# Project Dataset Loading

In [None]:
import zipfile
import os
import numpy as np

In [None]:
# This is a direct-download version of the Kaggle dataset found at: https://www.kaggle.com/competitions/asap-sas/data
!wget 'https://drive.google.com/uc?export=download&id=1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2' -O dataset.zip

# Extract the dataset files to the "dataset" folder
with zipfile.ZipFile('dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

--2022-04-13 00:23:44--  https://drive.google.com/uc?export=download&id=1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2
Resolving drive.google.com (drive.google.com)... 108.177.97.101, 108.177.97.100, 108.177.97.138, ...
Connecting to drive.google.com (drive.google.com)|108.177.97.101|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0o-24-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/badcj9hdlhpphqt4rtv8qq2b0jnptb23/1649809425000/09634012588375902450/*/1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2?e=download [following]
--2022-04-13 00:23:53--  https://doc-0o-24-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/badcj9hdlhpphqt4rtv8qq2b0jnptb23/1649809425000/09634012588375902450/*/1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2?e=download
Resolving doc-0o-24-docs.googleusercontent.com (doc-0o-24-docs.googleusercontent.com)... 108.177.125.132, 2404:6800:4008:c01::84
Connecting to doc-0o-24-docs.googleusercontent.com (doc-0o-24-doc

In [None]:
training_data_file_name = 'dataset/train.tsv'
test_data_texts_file_name = 'dataset/public_leaderboard_rel_2.tsv'
test_data_scores_file_name = 'dataset/public_leaderboard_solution.csv'

In [None]:
training_data_tsv = np.genfromtxt(fname=training_data_file_name, delimiter="\t", skip_header=1, dtype='str')

training_data_texts = training_data_tsv[:,4]
training_data_essay_set = training_data_tsv[:,1].astype('int64')
training_data_scores = np.minimum(training_data_tsv[:,2].astype('int64'), training_data_tsv[:,3].astype('int64'))

print('Training Data Texts', training_data_texts.shape)
print('Training Data Scores', training_data_scores.shape)

test_data_texts_file = np.genfromtxt(fname=test_data_texts_file_name, delimiter="\t", skip_header=1, dtype='str')
test_data_scores_file = np.genfromtxt(fname=test_data_scores_file_name, delimiter=",", skip_header=1)

test_data_texts = test_data_texts_file[:,2]
test_data_essay_set = test_data_scores_file[:,1]
test_data_scores = test_data_scores_file[:,3]

print('Test Data Texts', test_data_texts.shape)
print('Test Data Scores', test_data_scores.shape)


train_matrix = np.column_stack((training_data_texts, training_data_essay_set.astype('str'), training_data_scores.astype('str')))
test_matrix = np.column_stack((test_data_texts, test_data_essay_set.astype('str'), test_data_scores.astype('str')))

try:
    os.remove('train.tsv')
except OSError:
    pass

try:
    os.remove('test.tsv')
except OSError:
    pass

np.savetxt('train.tsv', train_matrix, delimiter='\t', fmt='%s', header='text\tessay_set\tscore')
np.savetxt('test.tsv', test_matrix, delimiter='\t', fmt='%s', header='text\tessay_set\tscore')

Training Data Texts (17207,)
Training Data Scores (17207,)
Test Data Texts (5224,)
Test Data Scores (5224,)


In [None]:
from torchtext.legacy import data
from torchtext.legacy import datasets

TEXT = data.Field()
ESSAY_SET = data.Field()
SCORE = data.Field()

train, test = data.TabularDataset.splits(
    path='./', train='train.tsv', test='test.tsv', format='tsv',
    fields=[('text', TEXT),('essay_set', TEXT), ('score', SCORE)], 
)

In [None]:
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_sizes=(16, 256),
    sort_key=lambda x: len(x.text), device=0)

TEXT.build_vocab(train)
SCORE.build_vocab(train)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


## Batching and Loading Data to Model

Use this iterator to load in the train and test datasets to the model of choice.

In [None]:
# Iterate through the training set batch by batch
for train_batch in train_iter:
  print(train_batch)

# Iterate through the test set batch by batch
for test_batch in test_iter:
  print(test_batch)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

[torchtext.legacy.data.batch.Batch of size 16]
	[.text]:[torch.LongTensor of size 105x16]
	[.essay_set]:[torch.LongTensor of size 1x16]
	[.score]:[torch.LongTensor of size 1x16]

[torchtext.legacy.data.batch.Batch of size 16]
	[.text]:[torch.LongTensor of size 75x16]
	[.essay_set]:[torch.LongTensor of size 1x16]
	[.score]:[torch.LongTensor of size 1x16]

[torchtext.legacy.data.batch.Batch of size 16]
	[.text]:[torch.LongTensor of size 103x16]
	[.essay_set]:[torch.LongTensor of size 1x16]
	[.score]:[torch.LongTensor of size 1x16]

[torchtext.legacy.data.batch.Batch of size 16]
	[.text]:[torch.LongTensor of size 52x16]
	[.essay_set]:[torch.LongTensor of size 1x16]
	[.score]:[torch.LongTensor of size 1x16]

[torchtext.legacy.data.batch.Batch of size 16]
	[.text]:[torch.LongTensor of size 81x16]
	[.essay_set]:[torch.LongTensor of size 1x16]
	[.score]:[torch.LongTensor of size 1x16]

[torchtext.legacy.data.batch.Batch of size

In [None]:
for test_batch in test_iter:
  print(test_batch.score)
  break

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
