In [1]:
import pandas as pd, numpy as np
from collections import Counter
import torch
from torchtext import data
from torchtext.vocab import GloVe
from torchtext.data.field import Field
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable
import torch.optim as optim

# Set GPU flag to false if running on CPU
GPU = True
batch_size = 32
column1 = 'project_essay_1'
column2 = 'project_essay_2'

In [2]:
TEXT = data.Field(include_lengths=True, batch_first = True)
ID = data.Field(sequential=False)
LABEL = data.Field(sequential=False)


def sort_key(ex):
    return len(getattr(ex,column1))

train, val, test = data.TabularDataset.splits(
        path='.', train='train.csv',
        validation='val.csv', test='dev.csv', format='csv', skip_header=True,
        fields=[('id', ID), ('project_title', TEXT),('project_resource_summary', TEXT), 
                ('project_essay_1', TEXT), ('project_essay_2', TEXT), ('project_is_approved', LABEL)])

#vocab is shared across all the text fields
#CAUTION: GloVe will download all embeddings locally (862 MB).  If not interested, remove "vectors"
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
ID.build_vocab(train)
LABEL.build_vocab(train)

#change device to 0 for GPU
train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test), repeat=False, sort=False,#sort_key= sort_key, sort_within_batch=True,
        batch_size=(batch_size), device=-1)#-1 if GPU else 0)

In [3]:
from typing import List, Optional, Dict
class DAN(nn.Module):
    def __init__(self, embedding_dim, 
                 h1_dim, text_field, answer_size):
        super(DAN, self).__init__()
        
        #initialize the vocab from the text field (passed in from train_iter) and pad
        text_vocab = text_field.vocab
        self.text_vocab_size = len(text_vocab)
        text_pad_idx = text_vocab.stoi[text_field.pad_token]

        
        #run the vocab through Glove Embeddings
        self.text_embeddings = nn.Embedding(self.text_vocab_size, embedding_dim, padding_idx=text_pad_idx)
        self.text_field = text_field
        
        #set the unknown items to the mean embedding and make them cuda()
        mean_emb = text_vocab.vectors.mean(0)
        text_vocab.vectors[text_vocab.stoi[text_field.unk_token]] = mean_emb
        self.text_embeddings.weight.data = text_vocab.vectors.cuda()
        
        #freeze the embeddings
        #self.text_embeddings.weight.requires_grad = False 
        
        #layers used in the network
        self.large_dropout = nn.Dropout(p=0.265)
        self.small_dropout = nn.Dropout(p=0.15)
        self.nonlinear = nn.Sigmoid()
        self.hidden = nn.Linear(embedding_dim, h1_dim)
        self.batch_norm = nn.BatchNorm1d(h1_dim)
        
        #the classifier converts the hidden dimensions into the answers.  
        #It takes batch norm and dropout as well.
        self.classifier = nn.Sequential(
            nn.Linear(h1_dim, answer_size),
            nn.BatchNorm1d(answer_size),
            nn.Dropout(.15)
        )

    def _pool(self, embed, lengths, batch_size):
        return embed.sum(1) / lengths.view(batch_size, -1)
    
    def forward(self, input_: Dict[str, Variable], lengths: Dict): 
        for key in lengths:
            if not isinstance(lengths[key], Variable):
                lengths[key] = Variable(lengths[key].float(), volatile=not self.training)
        
        #if the text exists, run it through embeddings, pool, dropout, and then run it through a hidden layer
        if self.text_field is not None:
            text_input = input_['text']
            embed = self.text_embeddings(text_input)
            #print(embed.sum(0))
            averaged = self._pool(embed, lengths['text'].float(), text_input.size()[0])
            averaged_dropped = self.small_dropout(averaged)
            hidden_layer = self.hidden(averaged_dropped)
            batchnormed_dropped = self.large_dropout(self.batch_norm(hidden_layer))
            nonlinear = self.nonlinear(batchnormed_dropped)
            return self.classifier(nonlinear )

In [4]:
from sklearn.metrics import accuracy_score, f1_score
def validate(net,iter_obj, field):
    loss_function = nn.CrossEntropyLoss()
    net.eval()
    net.zero_grad()
    iter_obj.init_epoch()
    pred = []
    actual = []
    dev_loss = []
    for batch in iter_obj:
        #input1,input2,label = getattr(batch,column1)[0], getattr(batch,column2)[0], batch.project_is_approved-1
        #if(GPU): input1, label, input2 = input1.cuda(), label.cuda(), input2.cuda()
        text, lengths = getattr(batch,field)
        text = text.cuda()
        lengths = lengths.cuda()
        input_dict = {}
        lengths_dict = {}
        input_dict['text'] = text
        lengths_dict['text'] = lengths
            
        scores = net(input_dict,lengths_dict)
        dev_loss.append(float(loss_function(scores,label)))
        pred.extend(scores.cpu().data.numpy().argmax(axis=1))
        actual.extend(label.data.cpu().numpy().tolist())
    return  round(np.mean(dev_loss),4), round(accuracy_score(actual,pred),4)

In [5]:
fields: Dict[str, Field] = train_iter.dataset.fields
ANSWER_SIZE = 2
EMBEDDING_DIM = 300
HIDDEN_DIM = 1000 

def train_net(NUM_EPOCHS, field):
    model =  DAN(EMBEDDING_DIM, HIDDEN_DIM, fields[field], ANSWER_SIZE)
    #LSTMClassifier(vocab = TEXT.vocab, embedding_dim=300, vocab_size=len(TEXT.vocab), 
    #                       hidden_dim=50, batch_size=batch_size, label_size=2)
     
    model = model.cuda()

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0005)

    n_correct = 0.
    n_total = 0.
    for i in range(NUM_EPOCHS):
        train_loss = []
        train_iter.init_epoch()
        for batch in train_iter:
            input_dict = {}
            lengths_dict = {}
            if hasattr(batch, 'project_essay_1'):
                text, lengths = getattr(batch,field)
                text = text.cuda()
                lengths = lengths.cuda()
                input_dict['text'] = text
                lengths_dict['text'] = lengths
            
            model.train()
            model.zero_grad()       
            #input1,input2,label = getattr(batch,column1)[0], getattr(batch,column2)[0], batch.project_is_approved-1
            #if(GPU): input1, label, input2 = input1.cuda(), label.cuda(), input2.cuda()
            
            scores = model(input_dict, lengths_dict)
            #scores = model(input1,input2)
            loss = loss_function(scores, label)
            loss.backward()
            optimizer.step()
            n_correct += int((torch.max(scores, 1)[1].view(label.size()) == label).sum())
            n_total += batch.batch_size
            train_acc = 100. * n_correct/n_total
            train_loss.append(loss.data[0])

        if(NUM_EPOCHS<=10) or ((i+1)%10==0):
            val_loss, val_acc = validate(model,val_iter, field)
            print(f"Epoch {i+1} Train loss: {round(np.mean(train_loss),4)} Val loss: {val_loss} Train acc: {train_acc} Val acc: {val_acc}")
    print ("TEST ACC:", validate(model, test_iter, field))
    return model

In [7]:
%%time
## Combining essay 1 and essay 2
#model = train_net(5, 'project_essay_1')
#model = train_net(5, 'project_essay_2')
model = train_net(5, 'project_resource_summary')
model = train_net(5, 'project_title')


  # Remove the CWD from sys.path while we load stuff.


Epoch 1 Train loss: 0.6925 Val loss: 0.6544 Train acc: 58.29333333333334 Val acc: 0.6181
Epoch 2 Train loss: 0.6496 Val loss: 0.6542 Train acc: 60.435 Val acc: 0.6263
Epoch 3 Train loss: 0.6139 Val loss: 0.6628 Train acc: 62.681111111111115 Val acc: 0.6194
Epoch 4 Train loss: 0.5822 Val loss: 0.6889 Train acc: 64.605 Val acc: 0.601
Epoch 5 Train loss: 0.5513 Val loss: 0.7063 Train acc: 66.148 Val acc: 0.6002
TEST ACC: (0.7027, 0.6059)
Epoch 1 Train loss: 0.7053 Val loss: 0.6811 Train acc: 54.233333333333334 Val acc: 0.5702
Epoch 2 Train loss: 0.671 Val loss: 0.6837 Train acc: 56.565 Val acc: 0.5701
Epoch 3 Train loss: 0.6393 Val loss: 0.6938 Train acc: 58.992222222222225 Val acc: 0.5758
Epoch 4 Train loss: 0.6041 Val loss: 0.728 Train acc: 61.094166666666666 Val acc: 0.5675
Epoch 5 Train loss: 0.576 Val loss: 0.7344 Train acc: 62.962666666666664 Val acc: 0.5601
TEST ACC: (0.7281, 0.5626)
CPU times: user 1min 38s, sys: 46.7 s, total: 2min 25s
Wall time: 2min 25s


In [None]:
print ("TEST ACC:", validate(model, test_iter, 'project_essay_1'))

In [None]:
%%time
## Combining essay 1 and essay 2
model = train_net(100)