In [29]:
import pandas as pd
from collections import Counter

# Set GPU flag to false if running on CPU
GPU = True

# Change the column to use either essay1/2, title or resource summary
column = 'project_essay_1'

## TorchText

In [31]:
import torch
from torchtext import data
from torchtext.vocab import GloVe

TEXT = data.Field(include_lengths=True)
ID = data.Field(sequential=False)
LABEL = data.Field(sequential=False)

def sort_key(ex):
    return len(vars(ex)[column])

train, val, test = data.TabularDataset.splits(
        path='.', train='train.csv',
        validation='val.csv', test='dev.csv', format='csv', skip_header=True,
        fields=[('id', ID), ('project_title', TEXT),('project_resource_summary', None), 
                ('project_essay_1', TEXT), ('project_essay_2', TEXT), ('project_is_approved', LABEL)])

#vocab is shared across all the text fields
#CAUTION: GloVe will download all embeddings locally (862 MB).  If not interested, remove "vectors"
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
ID.build_vocab(train)
LABEL.build_vocab(train)

#change device to 0 for GPU
train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), sort_key= sort_key, repeat=False,
        batch_size=(64), device=-1 if GPU else 0)

## Model

- Naive LSTM/BiLSTM classifier

In [32]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable
import torch.optim as optim

class LSTMClassifier(nn.Module):

    def __init__(self, vocab, embedding_dim, hidden_dim, vocab_size, label_size, batch_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight = nn.Parameter(vocab.vectors)        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,bidirectional=True)
        self.hidden2label = nn.Linear(2*hidden_dim, label_size)

    def init_hidden(self,batch_size):
        if(GPU):
            h0 = Variable(torch.zeros(2, batch_size, self.hidden_dim).cuda())
            c0 = Variable(torch.zeros(2, batch_size, self.hidden_dim).cuda())
        
        else:
            h0 = Variable(torch.zeros(2, batch_size, self.hidden_dim))
            c0 = Variable(torch.zeros(2, batch_size, self.hidden_dim))
            
        return (h0, c0)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        y  = self.hidden2label(lstm_out[-1])
        return y

from sklearn.metrics import accuracy_score, f1_score
def test_result(net,iter_obj):
    pred = []
    actual = []
    total_loss = 0.
    total = 0.
    for batch in iter_obj:
        input,label = vars(batch)[column][0], batch.project_is_approved-1
        if(GPU): input, label = input.cuda(), label.cuda()
        net.hidden = net.init_hidden(input.shape[1])
        scores = net(input)
        loss = loss_function(scores,label)
        total_loss += loss.data[0]
        total += len(batch)
        pred.extend(scores.cpu().data.numpy().argmax(axis=1))
        actual.extend(label.data.cpu().numpy().tolist())
    return  round(total_loss/total,4), round(accuracy_score(actual,pred),4)

### Train a Naive BiLSTM model
- Input: project_title

In [33]:
# %%time
import time
NUM_EPOCHS = 10
model = LSTMClassifier(vocab = TEXT.vocab, embedding_dim=300, vocab_size=len(TEXT.vocab), 
                       hidden_dim=50, batch_size=64, label_size=2)



if(GPU): model.cuda()

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

loss_l = []
for i in range(NUM_EPOCHS):
    start = time.time()
    for batch in train_iter:
        model.zero_grad()       
        input,label = vars(batch)[column][0], batch.project_is_approved-1
        if(GPU): input,label = input.cuda(),label.cuda()
        model.hidden = model.init_hidden(input.shape[1])

        
        scores = model(input)
        loss = loss_function(scores, label)
        loss.backward()
        optimizer.step()
    loss_l.append(loss.cpu().data.numpy())
    train_loss, train_acc = test_result(model,train_iter)
    val_loss, val_acc = test_result(model,val_iter)
    print(f"Epoch {i+1} Train loss: {train_loss} Val loss: {val_loss} Train acc: {train_acc} Val acc: {val_acc}")
#     if((i+1)%10==0):
#     print(f"End of {i+1} epoch(s)")
#     print(f"Train accuracy: {test_result(model,train_iter)}, Validation accuracy: {test_result(model,val_iter)}")

Epoch 1 Train loss: 0.010834628168741861 Val loss: 0.010856181871891022 Train acc: 0.5004 Val acc: 0.4982
Epoch 2 Train loss: 0.010817983076969783 Val loss: 0.010942785966396331 Train acc: 0.5061 Val acc: 0.5313
Epoch 3 Train loss: 0.010731731692949931 Val loss: 0.011484366422891617 Train acc: 0.5114 Val acc: 0.5037
Epoch 4 Train loss: 0.010595243789752325 Val loss: 0.014691692617535591 Train acc: 0.5171 Val acc: 0.5027
Epoch 5 Train loss: 0.010465758776664735 Val loss: 0.016222913306951522 Train acc: 0.5227 Val acc: 0.5048
Epoch 6 Train loss: 0.010424254333972932 Val loss: 0.016707074457406997 Train acc: 0.5233 Val acc: 0.5006
Epoch 7 Train loss: 0.010350942740837733 Val loss: 0.016917417538166047 Train acc: 0.5257 Val acc: 0.4989
Epoch 8 Train loss: 0.010320433169603347 Val loss: 0.016894089859724046 Train acc: 0.5279 Val acc: 0.5016
Epoch 9 Train loss: 0.010240207606554032 Val loss: 0.019885657703876496 Train acc: 0.5298 Val acc: 0.5142
Epoch 10 Train loss: 0.010229211755593617 Val 

In [34]:
# %%time
import time
NUM_EPOCHS = 100
model = LSTMClassifier(vocab = TEXT.vocab, embedding_dim=300, vocab_size=len(TEXT.vocab), 
                       hidden_dim=50, batch_size=64, label_size=2)



if(GPU): model.cuda()

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

loss_l = []
for i in range(NUM_EPOCHS):
    start = time.time()
    for batch in train_iter:
        model.zero_grad()       
        input,label = vars(batch)[column][0], batch.project_is_approved-1
        if(GPU): input,label = input.cuda(),label.cuda()
        model.hidden = model.init_hidden(input.shape[1])

        
        scores = model(input)
        loss = loss_function(scores, label)
        loss.backward()
        optimizer.step()
    loss_l.append(loss.cpu().data.numpy())
#     train_loss, train_acc = test_result(model,train_iter)
#     val_loss, val_acc = test_result(model,val_iter)
#     print(f"Epoch {i+1} Train loss: {train_loss} Val loss: {val_loss} Train acc: {train_acc} Val acc: {val_acc}")
    if((i+1)%10==0):
        train_loss, train_acc = test_result(model,train_iter)
        val_loss, val_acc = test_result(model,val_iter)

        print(f"End of {i+1} epoch(s)")
        print(f"Train loss: {train_loss} Val loss: {val_loss} Train acc: {train_acc} Val acc: {val_acc}")

End of 10 epoch(s)
Train loss: 0.009447158187627792 Val loss: 0.021745308941602707 Train acc: 0.5742 Val acc: 0.5219
End of 20 epoch(s)
Train loss: 0.005528450036048889 Val loss: 0.028323950338363647 Train acc: 0.778 Val acc: 0.5269
End of 30 epoch(s)
Train loss: 0.004875019164383412 Val loss: 0.029892107379436494 Train acc: 0.7932 Val acc: 0.5284
End of 40 epoch(s)
Train loss: 0.004605301079154015 Val loss: 0.03133709924221039 Train acc: 0.8007 Val acc: 0.5312
End of 50 epoch(s)
Train loss: 0.004518949463963509 Val loss: 0.030084283912181854 Train acc: 0.8026 Val acc: 0.5284
End of 60 epoch(s)
Train loss: 0.0044067384536067645 Val loss: 0.03323603180646896 Train acc: 0.8041 Val acc: 0.5308
End of 70 epoch(s)
Train loss: 0.004350695995986462 Val loss: 0.031539781284332276 Train acc: 0.8052 Val acc: 0.5288
End of 80 epoch(s)
Train loss: 0.004292706365386645 Val loss: 0.03157402411699295 Train acc: 0.8049 Val acc: 0.5318
End of 90 epoch(s)
Train loss: 0.0042093737542629244 Val loss: 0.03