In [1]:
import zipfile
import numpy as np
import pandas as pd
import csv
import os

# Load Dataset

In [2]:
# This is a direct-download version of the Kaggle dataset found at: https://www.kaggle.com/competitions/asap-sas/data
!wget 'https://drive.google.com/uc?export=download&id=1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2' -O dataset.zip

# Extract the dataset files to the "dataset" folder
with zipfile.ZipFile('dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

--2022-04-15 13:00:37--  https://drive.google.com/uc?export=download&id=1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2
Resolving drive.google.com (drive.google.com)... 74.125.203.139, 74.125.203.138, 74.125.203.100, ...
Connecting to drive.google.com (drive.google.com)|74.125.203.139|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0o-24-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6ejs79lgpi1hu74gb8693ie2prfhi5e6/1650027600000/09634012588375902450/*/1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2?e=download [following]
--2022-04-15 13:00:46--  https://doc-0o-24-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6ejs79lgpi1hu74gb8693ie2prfhi5e6/1650027600000/09634012588375902450/*/1evWxj4M33SfSaw4nCXA8m71v9w8Zf1k2?e=download
Resolving doc-0o-24-docs.googleusercontent.com (doc-0o-24-docs.googleusercontent.com)... 142.250.157.132, 2404:6800:4008:c13::84
Connecting to doc-0o-24-docs.googleusercontent.com (doc-0o-24-doc

In [3]:
training_data_file_name = 'dataset/train.tsv'
test_data_texts_file_name = 'dataset/public_leaderboard_rel_2.tsv'
test_data_scores_file_name = 'dataset/public_leaderboard_solution.csv'

In [4]:
training_data_tsv = np.genfromtxt(fname=training_data_file_name, delimiter="\t", skip_header=1, dtype='str')

training_data_texts = training_data_tsv[:,4]
training_data_essay_set = training_data_tsv[:,1].astype('int64')
training_data_scores = np.minimum(training_data_tsv[:,2].astype('int64'), training_data_tsv[:,3].astype('int64'))

print('Training Data Texts', training_data_texts.shape)
print('Training Data Scores', training_data_scores.shape)

test_data_texts_file = np.genfromtxt(fname=test_data_texts_file_name, delimiter="\t", skip_header=1, dtype='str')
test_data_scores_file = np.genfromtxt(fname=test_data_scores_file_name, delimiter=",", skip_header=1)

test_data_texts = test_data_texts_file[:,2]
test_data_essay_set = test_data_scores_file[:,1].astype('int')
test_data_scores = test_data_scores_file[:,3].astype('int')

print('Test Data Texts', test_data_texts.shape)
print('Test Data Scores', test_data_scores.shape)


train_matrix = np.column_stack((training_data_texts, training_data_essay_set.astype('str'), training_data_scores.astype('str')))
test_matrix = np.column_stack((test_data_texts, test_data_essay_set.astype('str'), test_data_scores.astype('str')))

try:
    os.remove('train.tsv')
except OSError:
    pass

try:
    os.remove('test.tsv')
except OSError:
    pass

np.savetxt('train.tsv', train_matrix, delimiter='\t', fmt='%s', header='text\tessay_set\tscore')
np.savetxt('test.tsv', test_matrix, delimiter='\t', fmt='%s', header='text\tessay_set\tscore')

Training Data Texts (17207,)
Training Data Scores (17207,)
Test Data Texts (5224,)
Test Data Scores (5224,)


# dataset without pytorch

In [16]:
trains = pd.read_csv('train.tsv', sep= '\t')
tests = pd.read_csv('test.tsv', sep= '\t')

trains.head()

print(np.unique(trains['score']))
print(np.unique(trains['essay_set']))
print(trains.head())

print(np.unique(tests['score']))
print(np.unique(tests['essay_set']))
print(tests.head())

[0 1 2 3]
[ 1  2  3  4  5  6  7  8  9 10]
                                              # text  essay_set  score
0  Some additional information that we would need...          1      1
1  After reading the expirement, I realized that ...          1      1
2  What you need is more trials, a control set up...          1      1
3  The student should list what rock is better an...          1      0
4  For the students to be able to make a replicat...          1      2
[0 1 2 3]
[ 1  2  3  4  5  6  7  8  9 10]
                                              # text  essay_set  score
0  The procedures I think they should have includ...          1      1
1  In order to replicate this experiment, you wou...          1      1
2  In order to replicate their experiment, you wo...          1      3
3  Pleace a simple of one material into one conta...          1      0
4  Determin the mass of four different samples ma...          1      0


# Split Data

In [39]:
from torchtext.legacy import data
from torchtext.legacy import datasets

TEXT = data.Field()
ESSAY_SET = data.Field(sequential=False,use_vocab=False)
SCORE = data.Field(sequential=False,use_vocab=False)


train, test = data.TabularDataset.splits(
    path='./', train='train.tsv', test='test.tsv', format='tsv', skip_header = True,
    fields=[('text', TEXT),('essay_set', ESSAY_SET), ('score', SCORE)], 
)

# HERE to download and use GloVe embeddings
# TEXT.build_vocab(train, vectors= 'glove.6B.300d')
# TEXT.build_vocab(train, vectors= 'glove.6B.100d')
TEXT.build_vocab(train, vectors= 'glove.42B.300d')

vocab = TEXT.vocab

In [40]:
from typing import Text
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_sizes=(16, 256),
    sort_key=lambda x: len(x.text), device=0)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


# Training using GloVe

In [25]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim

In [27]:
class Model(nn.Module):
    def __init__(self, embedding):
        super().__init__()
        self.embedding = embedding
        self.linear = nn.Linear(300,4)
        
    def forward(self, x):
        features = self.embedding[x]
        features = torch.mean(features,1)
        features = F.relu(features)
        features = self.linear(features)

        return features

In [31]:
model = Model(vocab.vectors)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

epochs = 10

In [29]:
def get_accuracy(dataset):

    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():

        for data in dataset:
            predicted_label = model(data.text.T)
            loss = loss_function(predicted_label, data.score)
            total_acc += (predicted_label.argmax(1) == data.score).sum().item()
            total_count += data.score.size(0)

    return total_acc/total_count

In [32]:
for epoch in range(epochs):

  losses = list()
    
  for train_batch in train_iter:

    optimizer.zero_grad()

    prediction = model(train_batch.text.T)

    loss = loss_function(prediction, train_batch.score)

    loss.backward()
    optimizer.step()

    losses.append(loss.item())


  print('train loss on epoch {} : {:.3f}'.format(epoch, np.mean(losses)))
  train_acc = get_accuracy(train_iter)

  print('training accuracy: {}'.format(train_acc))

train loss on epoch 0 : 1.012
training accuracy: 0.5663675562288778
train loss on epoch 1 : 0.944
training accuracy: 0.5839063046265004
train loss on epoch 2 : 0.922
training accuracy: 0.5836149632909917
train loss on epoch 3 : 0.914
training accuracy: 0.5955016897797459
train loss on epoch 4 : 0.904
training accuracy: 0.5798275259293788
train loss on epoch 5 : 0.899
training accuracy: 0.5957347628481529
train loss on epoch 6 : 0.891
training accuracy: 0.5998135415452744
train loss on epoch 7 : 0.890
training accuracy: 0.5996970050110709
train loss on epoch 8 : 0.886
training accuracy: 0.6014450530241231
train loss on epoch 9 : 0.883
training accuracy: 0.5995804684768675


In [33]:
# get test accuracy using test set
test_acc = get_accuracy(test_iter)
print('training accuracy: {}'.format(test_acc))

training accuracy: 0.5388189738625363


# Glove 6B 100d

- training accuracy 57.2%
- test accuracy 48.7%

# Glove 6B 300d

- training accuracy 59.3%
- test accuracy 52.7%

# Glove 42B 300d

- training accuracy 59.9%
- test accuracy 53.8%