In [58]:
import pandas as pd, numpy as np
from collections import Counter

# Set GPU flag to false if running on CPU
GPU = True
batch_size = 32
column1 = 'project_essay_1'
column2 = 'project_essay_2'

In [43]:
import torch
from torchtext import data
from torchtext.vocab import GloVe

TEXT = data.Field(include_lengths=True)
ID = data.Field(sequential=False)
LABEL = data.Field(sequential=False)


def sort_key(ex):
    return len(getattr(ex,column1))

train, val, test = data.TabularDataset.splits(
        path='.', train='train.csv',
        validation='val.csv', test='dev.csv', format='csv', skip_header=True,
        fields=[('id', ID), ('project_title', None),('project_resource_summary', None), 
                ('project_essay_1', TEXT), ('project_essay_2', TEXT), ('project_is_approved', LABEL)])

#vocab is shared across all the text fields
#CAUTION: GloVe will download all embeddings locally (862 MB).  If not interested, remove "vectors"
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
ID.build_vocab(train)
LABEL.build_vocab(train)

#change device to 0 for GPU
train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test), repeat=False, sort=False,#sort_key= sort_key, sort_within_batch=True,
        batch_size=(batch_size), device=-1 if GPU else 0)

In [60]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable
import torch.optim as optim
    
class LSTMClassifier(nn.Module):

    def __init__(self, vocab, embedding_dim, hidden_dim, vocab_size, label_size, batch_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight = nn.Parameter(vocab.vectors)        
        self.lstm = nn.GRU(embedding_dim, hidden_dim)
        self.fc = nn.Linear(2*hidden_dim,2)

    def init_hidden(self,batch_size):
        if(GPU):
            h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim).cuda())
#             c0 = Variable(torch.zeros(2, batch_size, self.hidden_dim).cuda())
        
        else:
            h0 = Variable(torch.zeros(2, batch_size, self.hidden_dim))
#             c0 = Variable(torch.zeros(2, batch_size, self.hidden_dim))
            
        return h0
#         return (h0, c0)

    def forward(self, text1,text2,):
        
        embeds1 = self.word_embeddings(text1)
        embeds2 = self.word_embeddings(text2)
        
        self.hidden = self.init_hidden(text1.shape[1])
        lstm_out, self.hidden = self.lstm(embeds1, self.hidden)
        feat1 = self.hidden[0].squeeze(0)
        
        self.hidden = self.init_hidden(text1.shape[1])
        lstm_out, self.hidden = self.lstm(embeds2, self.hidden)
        feat2 = self.hidden[0].squeeze(0)

        feat = torch.cat([feat1,feat2],dim=1)
        y = self.fc(feat)
        return y

In [61]:
from sklearn.metrics import accuracy_score, f1_score
def validate(net,iter_obj):
    loss_function = nn.CrossEntropyLoss()
    net.eval()
    iter_obj.init_epoch()
    pred = []
    actual = []
    dev_loss = []
    with torch.no_grad():
        for batch in iter_obj:
            input1,input2,label = getattr(batch,column1)[0], getattr(batch,column2)[0], batch.project_is_approved-1
            if(GPU): input1, label, input2 = input1.cuda(), label.cuda(), input2.cuda()
            scores = net(input1,input2)
            dev_loss.append(loss_function(scores,label).item())
            pred.extend(scores.cpu().data.numpy().argmax(axis=1))
            actual.extend(label.data.cpu().numpy().tolist())
    return  round(np.mean(dev_loss),4), round(accuracy_score(actual,pred),4)

In [62]:
def train_net(NUM_EPOCHS):
    model = LSTMClassifier(vocab = TEXT.vocab, embedding_dim=300, vocab_size=len(TEXT.vocab), 
                           hidden_dim=50, batch_size=batch_size, label_size=2)
    
    if(GPU): model.cuda()

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)

    n_correct = 0.
    n_total = 0.
    for i in range(NUM_EPOCHS):
        train_loss = []
        train_iter.init_epoch()
        for batch in train_iter:
            model.train()
            optimizer.zero_grad()       
            input1,input2,label = getattr(batch,column1)[0], getattr(batch,column2)[0], batch.project_is_approved-1
            if(GPU): input1, label, input2 = input1.cuda(), label.cuda(), input2.cuda()
            
            scores = net(input1,input2)
            loss = loss_function(scores, label)
            loss.backward()
            optimizer.step()

            n_correct += (torch.max(scores, 1)[1].view(label.size()) == label).sum().item()
            n_total += batch.batch_size
            train_acc = 100. * n_correct/n_total
            train_loss.append(loss.item())

        if(NUM_EPOCHS<=10) or ((i+1)%10==0):
            val_loss, val_acc = validate(model,val_iter)
            print(f"Epoch {i+1} Train loss: {round(np.mean(train_loss),4)} Val loss: {val_loss} Train acc: {train_acc} Val acc: {val_acc}")

    return model

In [23]:
%%time
## Combining essay 1 and essay 2
model = train_net(10)

  return Variable(arr, volatile=not train)
  return Variable(arr, volatile=not train), lengths


Epoch 1 Train loss: 0.6934 Val loss: 0.6939 Train acc: 50.42 Val acc: 0.5
Epoch 2 Train loss: 0.6907 Val loss: 0.6947 Train acc: 50.625 Val acc: 0.4997
Epoch 3 Train loss: 0.6775 Val loss: 0.7133 Train acc: 51.233333333333334 Val acc: 0.5003
Epoch 4 Train loss: 0.6474 Val loss: 0.7267 Train acc: 52.62083333333333 Val acc: 0.5283
Epoch 5 Train loss: 0.5014 Val loss: 0.7485 Train acc: 57.06133333333333 Val acc: 0.6177
Epoch 6 Train loss: 0.2102 Val loss: 1.0785 Train acc: 62.81944444444444 Val acc: 0.5855
Epoch 7 Train loss: 0.0475 Val loss: 1.6903 Train acc: 67.91571428571429 Val acc: 0.5803
Epoch 8 Train loss: 0.0121 Val loss: 2.2447 Train acc: 71.88625 Val acc: 0.5768
Epoch 9 Train loss: 0.006 Val loss: 2.3537 Train acc: 74.99444444444444 Val acc: 0.5799
Epoch 10 Train loss: 0.0035 Val loss: 2.381 Train acc: 77.48433333333334 Val acc: 0.5821
CPU times: user 4min 51s, sys: 1min 12s, total: 6min 3s
Wall time: 6min 3s


In [24]:
%%time
## Combining essay 1 and essay 2
model = train_net(100)

  return Variable(arr, volatile=not train)
  return Variable(arr, volatile=not train), lengths


Epoch 10 Train loss: 0.0026 Val loss: 2.1051 Train acc: 83.30666666666667 Val acc: 0.6128
Epoch 20 Train loss: 0.0007 Val loss: 2.8417 Train acc: 91.635 Val acc: 0.614
Epoch 30 Train loss: 0.0005 Val loss: 3.1582 Train acc: 94.41266666666667 Val acc: 0.6136
Epoch 40 Train loss: 0.0004 Val loss: 3.0658 Train acc: 95.80291666666666 Val acc: 0.6098
Epoch 50 Train loss: 0.0003 Val loss: 3.2577 Train acc: 96.63713333333334 Val acc: 0.6107
Epoch 60 Train loss: 0.0003 Val loss: 3.2977 Train acc: 97.19338888888889 Val acc: 0.6109
Epoch 70 Train loss: 0.0003 Val loss: 3.3285 Train acc: 97.5907619047619 Val acc: 0.611
Epoch 80 Train loss: 0.0003 Val loss: 3.4714 Train acc: 97.88904166666667 Val acc: 0.6121
Epoch 90 Train loss: 0.0003 Val loss: 3.42 Train acc: 98.1207037037037 Val acc: 0.6121
Epoch 100 Train loss: 0.0003 Val loss: 3.4455 Train acc: 98.3062 Val acc: 0.6105
CPU times: user 48min 52s, sys: 12min 47s, total: 1h 1min 40s
Wall time: 1h 1min 42s
