In [1]:
import pandas as pd
from collections import Counter

# Set GPU flag to false if running on CPU
GPU = True

# Change the column to use either essay1/2 or title 
column = 'project_essay_1'
batch_size = 32

## TorchText

In [2]:
import torch
from torchtext import data
from torchtext.vocab import GloVe

TEXT = data.Field(include_lengths=True,batch_first=True)
ID = data.Field(sequential=False)
LABEL = data.Field(sequential=False)


def sort_key(ex):
    return len(getattr(ex,column))

train, val, test = data.TabularDataset.splits(
        path='.', train='train.csv',
        validation='val.csv', test='dev.csv', format='csv', skip_header=True,
        fields=[('id', ID), ('project_title', None),('project_resource_summary', None), 
                ('project_essay_1', TEXT), ('project_essay_2', None), ('project_is_approved', LABEL)])

#vocab is shared across all the text fields
#CAUTION: GloVe will download all embeddings locally (862 MB).  If not interested, remove "vectors"
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
ID.build_vocab(train)
LABEL.build_vocab(train)

#change device to 0 for GPU
train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test), sort_key= sort_key, repeat=False, sort_within_batch=True, 
        batch_size=(batch_size), device=-1 if GPU else 0)

## Model

- Naive LSTM/BiLSTM classifier with packed sequences for variable length inputs

In [8]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable
import torch.optim as optim

class LSTMClassifier(nn.Module):

    def __init__(self, vocab, embedding_dim, hidden_dim, vocab_size, label_size, batch_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight = nn.Parameter(vocab.vectors)        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.hidden2label = nn.Linear(2*hidden_dim, label_size)
        
    def last_timestep(self, unpacked, lengths):
        # Index of the last output for each sequence.
        idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0),
                                               unpacked.size(2)).unsqueeze(1)
        return unpacked.gather(1, idx).squeeze()

    def init_hidden(self,batch_size):
        if(GPU):
            h0 = Variable(torch.zeros(2, batch_size, self.hidden_dim).cuda())
            c0 = Variable(torch.zeros(2, batch_size, self.hidden_dim).cuda())
        
        else:
            h0 = Variable(torch.zeros(2, batch_size, self.hidden_dim))
            c0 = Variable(torch.zeros(2, batch_size, self.hidden_dim))
            
        return (h0, c0)

    def forward(self, sentence, lengths):
        embeds = self.word_embeddings(sentence)
        packed_emb = nn.utils.rnn.pack_padded_sequence(embeds, list(lengths.data), batch_first=True)
        lstm_out, self.hidden = self.lstm(packed_emb, self.hidden)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        last_outputs = self.last_timestep(lstm_out, lengths)
        y  = self.hidden2label(last_outputs)
        return y

from sklearn.metrics import accuracy_score, f1_score
def validate(net,iter_obj):
    loss_function = nn.CrossEntropyLoss()
    net.eval()
    iter_obj.init_epoch()
    pred = []
    actual = []
    dev_loss = 0.
#     with torch.no_grad():
    for batch in iter_obj:
        input,label,length = getattr(batch,column)[0], batch.project_is_approved-1, Variable(getattr(batch,column)[1])
        if(GPU): input, label, length = input.cuda(), label.cuda(),length.cuda()
        net.hidden = net.init_hidden(input.shape[0])
        scores = net(input,length)
        dev_loss = loss_function(scores,label)
        pred.extend(scores.cpu().data.numpy().argmax(axis=1))
        actual.extend(label.data.cpu().numpy().tolist())
    return  round(dev_loss.data[0],4), round(accuracy_score(actual,pred),4)

### Train a Naive BiLSTM model
- Input: project_essay

In [12]:
def train(NUM_EPOCHS):
    model = LSTMClassifier(vocab = TEXT.vocab, embedding_dim=300, vocab_size=len(TEXT.vocab), 
                           hidden_dim=50, batch_size=batch_size, label_size=2)
    
    if(GPU): model.cuda()

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)#, momentum=0.9)

    n_correct = 0.
    n_total = 0.
    for i in range(NUM_EPOCHS):
        train_iter.init_epoch()
        for batch in train_iter:
            model.train()
            optimizer.zero_grad()       
            input,label,length = getattr(batch,column)[0], batch.project_is_approved-1, Variable(getattr(batch,column)[1])
            if(GPU): input,label,length = input.cuda(), label.cuda(), length.cuda()
            model.hidden = model.init_hidden(input.shape[0])

            scores = model(input,length)
            loss = loss_function(scores, label)
            loss.backward()
            optimizer.step()

            n_correct += (torch.max(scores, 1)[1].view(label.size()) == label).sum().data[0]
            n_total += batch.batch_size
            train_acc = 100. * n_correct/n_total

        if(NUM_EPOCHS<=10) or ((i+1)%10==0):
            val_loss, val_acc = validate(model,val_iter)
            print(f"Epoch {i+1} Train loss: {loss.data[0]} Val loss: {val_loss} Train acc: {train_acc} Val acc: {val_acc}")

    return model

In [10]:
# Batch size 32 optimizer SGD(0.001,0.9) epochs 10
model = train(10)

Epoch 1 Train loss: 0.6996755599975586 Val loss: 0.6922 Train acc: 49.85666666666667 Val acc: 0.5029
Epoch 2 Train loss: 0.6939828395843506 Val loss: 0.6931 Train acc: 49.986666666666665 Val acc: 0.5013
Epoch 3 Train loss: 0.6948064565658569 Val loss: 0.6862 Train acc: 49.96888888888889 Val acc: 0.497
Epoch 4 Train loss: 0.7016761898994446 Val loss: 0.6978 Train acc: 50.04083333333333 Val acc: 0.4998
Epoch 5 Train loss: 0.6900875568389893 Val loss: 0.6904 Train acc: 50.048 Val acc: 0.5023
Epoch 6 Train loss: 0.7001388072967529 Val loss: 0.6929 Train acc: 50.08166666666666 Val acc: 0.4936
Epoch 7 Train loss: 0.7006786465644836 Val loss: 0.687 Train acc: 50.142380952380954 Val acc: 0.501
Epoch 8 Train loss: 0.700408935546875 Val loss: 0.6877 Train acc: 50.18833333333333 Val acc: 0.5028
Epoch 9 Train loss: 0.6882122755050659 Val loss: 0.6968 Train acc: 50.251481481481484 Val acc: 0.4998
Epoch 10 Train loss: 0.6905537843704224 Val loss: 0.692 Train acc: 50.273 Val acc: 0.4962


In [11]:
# Batch size 32 optimizer SGD(0.001,0.9) epochs 100
model = train(100)

Epoch 10 Train loss: 0.6957561373710632 Val loss: 0.6922 Train acc: 50.553333333333335 Val acc: 0.5079
Epoch 20 Train loss: 0.6872419714927673 Val loss: 0.6953 Train acc: 50.94683333333333 Val acc: 0.5057
Epoch 30 Train loss: 0.6961731314659119 Val loss: 0.697 Train acc: 51.30144444444444 Val acc: 0.4984
Epoch 40 Train loss: 0.677617609500885 Val loss: 0.6963 Train acc: 51.60808333333333 Val acc: 0.4986
Epoch 50 Train loss: 0.6645770072937012 Val loss: 0.68 Train acc: 51.90273333333333 Val acc: 0.5067
Epoch 60 Train loss: 0.6916297078132629 Val loss: 0.6859 Train acc: 52.204277777777776 Val acc: 0.5065
Epoch 70 Train loss: 0.7122184038162231 Val loss: 0.6969 Train acc: 52.51447619047619 Val acc: 0.5049
Epoch 80 Train loss: 0.7024260759353638 Val loss: 0.7004 Train acc: 52.815416666666664 Val acc: 0.5082
Epoch 90 Train loss: 0.7158741354942322 Val loss: 0.6743 Train acc: 53.13485185185185 Val acc: 0.5095
Epoch 100 Train loss: 0.7155278921127319 Val loss: 0.6793 Train acc: 53.48526666666

In [13]:
# Batch size 32 optimizer Adam(0.001) epochs 10
model = train(10)

Epoch 1 Train loss: 0.6948099136352539 Val loss: 0.6877 Train acc: 50.31 Val acc: 0.5063
Epoch 2 Train loss: 0.7146060466766357 Val loss: 0.6552 Train acc: 54.31666666666667 Val acc: 0.5054
Epoch 3 Train loss: 0.6167232990264893 Val loss: 0.7901 Train acc: 59.47888888888889 Val acc: 0.5202
Epoch 4 Train loss: 0.4552345871925354 Val loss: 1.3341 Train acc: 65.095 Val acc: 0.5249
Epoch 5 Train loss: 0.17953452467918396 Val loss: 1.9786 Train acc: 69.84533333333333 Val acc: 0.5252
Epoch 6 Train loss: 0.1045316532254219 Val loss: 2.3741 Train acc: 73.59611111111111 Val acc: 0.5243
Epoch 7 Train loss: 0.13866591453552246 Val loss: 2.2783 Train acc: 76.5 Val acc: 0.5189
Epoch 8 Train loss: 0.05565723404288292 Val loss: 2.6443 Train acc: 78.78416666666666 Val acc: 0.5203
Epoch 9 Train loss: 0.030734602361917496 Val loss: 2.7011 Train acc: 80.63074074074073 Val acc: 0.5192
Epoch 10 Train loss: 0.01786181703209877 Val loss: 3.1807 Train acc: 82.14033333333333 Val acc: 0.5213


In [14]:
# Batch size 32 optimizer Adam(0.001) epochs 100
model = train(100)

Epoch 10 Train loss: 0.01835058256983757 Val loss: 4.5241 Train acc: 82.16066666666667 Val acc: 0.5196
Epoch 20 Train loss: 8.636713027954102e-05 Val loss: 5.4904 Train acc: 89.26416666666667 Val acc: 0.5108
Epoch 30 Train loss: 0.05354123190045357 Val loss: 4.9487 Train acc: 91.78988888888888 Val acc: 0.5223
Epoch 40 Train loss: 0.0003166794776916504 Val loss: 3.8772 Train acc: 93.09508333333333 Val acc: 0.5161
Epoch 50 Train loss: 0.09413549304008484 Val loss: 3.2109 Train acc: 93.91226666666667 Val acc: 0.5187
Epoch 60 Train loss: 0.043175749480724335 Val loss: 3.5885 Train acc: 94.46811111111111 Val acc: 0.5179
Epoch 70 Train loss: 0.0006021857261657715 Val loss: 4.0057 Train acc: 94.8734761904762 Val acc: 0.5197
Epoch 80 Train loss: 0.04303445667028427 Val loss: 3.6391 Train acc: 95.18029166666666 Val acc: 0.5153
Epoch 90 Train loss: 0.04535502940416336 Val loss: 3.4293 Train acc: 95.42103703703704 Val acc: 0.5231
Epoch 100 Train loss: 4.476308822631836e-05 Val loss: 4.2937 Train 