In [78]:
import pandas as pd, numpy as np
from collections import Counter

# Set GPU flag to false if running on CPU
GPU = True

# Change the column to use either essay1/2 or title 
column = 'project_essay_1'
batch_size = 32

## TorchText

In [79]:
import torch
from torchtext import data
from torchtext.vocab import GloVe

TEXT = data.Field(include_lengths=True,batch_first=True)
ID = data.Field(sequential=False)
LABEL = data.Field(sequential=False)


def sort_key(ex):
    return len(getattr(ex,column))

train, val, test = data.TabularDataset.splits(
        path='.', train='train.csv',
        validation='val.csv', test='dev.csv', format='csv', skip_header=True,
        fields=[('id', ID), ('project_title', None),('project_resource_summary', None), 
                ('project_essay_1', TEXT), ('project_essay_2', None), ('project_is_approved', LABEL)])

#vocab is shared across all the text fields
#CAUTION: GloVe will download all embeddings locally (862 MB).  If not interested, remove "vectors"
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
ID.build_vocab(train)
LABEL.build_vocab(train)

#change device to 0 for GPU
train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test), sort_key= sort_key, repeat=False, sort_within_batch=True,
        batch_size=(batch_size), device=-1 if GPU else 0)

## Model

- Naive LSTM/BiLSTM classifier with packed sequences for variable length inputs

In [85]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable
import torch.optim as optim

class LSTMClassifier(nn.Module):

    def __init__(self, vocab, embedding_dim, hidden_dim, vocab_size, label_size, batch_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight = nn.Parameter(vocab.vectors)        
        self.lstm = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2label = nn.Linear(hidden_dim, label_size)

    def init_hidden(self,batch_size):
        if(GPU):
            h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda())
#             c0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda())
        
        else:
            h0 = Variable(torch.zeros(2, batch_size, self.hidden_dim))
            c0 = Variable(torch.zeros(2, batch_size, self.hidden_dim))
            
        return h0
#         return (h0, c0)

    def forward(self, sentence, lengths):
        embeds = self.word_embeddings(sentence)
        packed_emb = nn.utils.rnn.pack_padded_sequence(embeds, list(lengths.data), batch_first=True)
        lstm_out, self.hidden = self.lstm(packed_emb, self.hidden)
        y = self.hidden2label(self.hidden[0].squeeze(0))
        return y

from sklearn.metrics import accuracy_score, f1_score
def validate(net,iter_obj):
    loss_function = nn.CrossEntropyLoss()
    net.eval()
    iter_obj.init_epoch()
    pred = []
    actual = []
    dev_loss = []
    for batch in iter_obj:
        input,label,length = getattr(batch,column)[0], batch.project_is_approved-1, Variable(getattr(batch,column)[1])
        if(GPU): input, label, length = input.cuda(), label.cuda(),length.cuda()
        net.hidden = net.init_hidden(input.shape[0])
        scores = net(input,length)
        dev_loss.append(loss_function(scores,label).data[0])
        pred.extend(scores.cpu().data.numpy().argmax(axis=1))
        actual.extend(label.data.cpu().numpy().tolist())
    return  round(np.mean(dev_loss),4), round(accuracy_score(actual,pred),4)

In [86]:
def train_net(NUM_EPOCHS):
    model = LSTMClassifier(vocab = TEXT.vocab, embedding_dim=300, vocab_size=len(TEXT.vocab), 
                           hidden_dim=50, batch_size=batch_size, label_size=2)
    
    if(GPU): model.cuda()

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)#, momentum=0.9)

    n_correct = 0.
    n_total = 0.
    for i in range(NUM_EPOCHS):
        train_loss = []
        train_iter.init_epoch()
        for batch in train_iter:
            model.train()
            optimizer.zero_grad()       
            input,label,length = getattr(batch,column)[0], batch.project_is_approved-1, Variable(getattr(batch,column)[1])
            if(GPU): input,label,length = input.cuda(), label.cuda(), length.cuda()
            model.hidden = model.init_hidden(input.shape[0])

            scores = model(input,length)
            loss = loss_function(scores, label)
            loss.backward()
            optimizer.step()

            n_correct += (torch.max(scores, 1)[1].view(label.size()) == label).sum().data[0]
            n_total += batch.batch_size
            train_acc = 100. * n_correct/n_total
            train_loss.append(loss.data[0])

        if(NUM_EPOCHS<=10) or ((i+1)%10==0):
            val_loss, val_acc = validate(model,val_iter)
            print(f"Epoch {i+1} Train loss: {round(np.mean(train_loss),4)} Val loss: {val_loss} Train acc: {train_acc} Val acc: {val_acc}")

    return model

In [87]:
# Batch size 32 optimizer Adam(0.001) epochs 10
model = train_net(10)

Epoch 1 Train loss: 0.6944 Val loss: 0.6943 Train acc: 50.31666666666667 Val acc: 0.5023
Epoch 2 Train loss: 0.6685 Val loss: 0.7102 Train acc: 54.655 Val acc: 0.5119
Epoch 3 Train loss: 0.5406 Val loss: 0.8053 Train acc: 60.58555555555556 Val acc: 0.5241
Epoch 4 Train loss: 0.3411 Val loss: 1.0044 Train acc: 66.69916666666667 Val acc: 0.5186
Epoch 5 Train loss: 0.2073 Val loss: 1.2339 Train acc: 71.738 Val acc: 0.522
Epoch 6 Train loss: 0.1457 Val loss: 1.3914 Train acc: 75.48833333333333 Val acc: 0.5183
Epoch 7 Train loss: 0.1139 Val loss: 1.5017 Train acc: 78.2995238095238 Val acc: 0.5192
Epoch 8 Train loss: 0.0929 Val loss: 1.6781 Train acc: 80.44958333333334 Val acc: 0.5172
Epoch 9 Train loss: 0.0773 Val loss: 2.0785 Train acc: 82.15333333333334 Val acc: 0.5155
Epoch 10 Train loss: 0.0673 Val loss: 2.0625 Train acc: 83.53966666666666 Val acc: 0.5173


In [88]:
# Batch size 32 optimizer Adam(0.001) epochs 100
model = train_net(100)

Epoch 10 Train loss: 0.0637 Val loss: 2.2588 Train acc: 83.633 Val acc: 0.515
Epoch 20 Train loss: 0.0478 Val loss: 2.5291 Train acc: 90.09083333333334 Val acc: 0.5138
Epoch 30 Train loss: 0.0434 Val loss: 2.6984 Train acc: 92.37011111111111 Val acc: 0.5112
Epoch 40 Train loss: 0.0425 Val loss: 2.3447 Train acc: 93.54775 Val acc: 0.5207
Epoch 50 Train loss: 0.0421 Val loss: 2.3905 Train acc: 94.28046666666667 Val acc: 0.5162
Epoch 60 Train loss: 0.0404 Val loss: 2.4795 Train acc: 94.77722222222222 Val acc: 0.5178
Epoch 70 Train loss: 0.0392 Val loss: 2.572 Train acc: 95.1437619047619 Val acc: 0.5173
Epoch 80 Train loss: 0.0413 Val loss: 2.4635 Train acc: 95.414375 Val acc: 0.5094
Epoch 90 Train loss: 0.0371 Val loss: 2.7861 Train acc: 95.63466666666666 Val acc: 0.5177
Epoch 100 Train loss: 0.04 Val loss: 2.3652 Train acc: 95.8118 Val acc: 0.5102
