In [1]:
import torch
import torch.nn.functional as F
import torchtext
import time
import os
import pandas as pd

In [2]:
# Declare bi-directional LSTM for sentiment analyzer
class Network(torch.nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_size, num_outputs, num_layers=2):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, emb_dim)
        # Bi-directional LSTM
        self.rnn = torch.nn.LSTM(emb_dim,
                                 hidden_size,
                                 num_layers=num_layers,
                                 dropout=0.3,
                                 bidirectional=True)
        # doubling size of hidden layers because of the bi-directional LSTM
        self.fc = torch.nn.Linear(hidden_size * 2, num_outputs)
        self.softmax = torch.nn.LogSoftmax(dim=-1)
        
    def forward(self, inputs):
        embs = self.embedding(inputs)
        output, _ = self.rnn(embs)
        output = self.fc(output[-1])
        return self.softmax(output)

In [3]:
# Load data from tsv format file
def LoadTSV(file_path, columns, skip_header=True):
    return torchtext.data.TabularDataset(file_path, 'TSV', columns, skip_header=skip_header)

In [4]:
# Data type for label(=target)
LABEL = torchtext.data.Field(sequential=False, use_vocab=False, dtype=torch.long)
# Data type for phrases
TEXT = torchtext.data.Field(fix_length=50, use_vocab=True, lower=True)
# Phrase ID
ID = torchtext.data.Field(sequential=False, use_vocab=False, dtype=torch.long)

train_columns = [
    ('PhraseId', None),
    ('SentenceId', None),
    ('Phrase', TEXT),
    ('Sentiment', LABEL)
]

test_columns = [
    ('PhraseId', ID),
    ('SentenceId', None),
    ('Phrase', TEXT)
]

train = LoadTSV('./dataset/train.tsv/train.tsv', train_columns)
test = LoadTSV('./dataset/test.tsv/test.tsv', test_columns)
# Build vocab from phrases and use Glove vector for transfer learning
TEXT.build_vocab(train, vectors=torchtext.vocab.GloVe(name='6B', dim=300), max_size=50000)
# Build vocab from labels
LABEL.build_vocab(train)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

train_iter, test_iter = torchtext.data.BucketIterator.splits((train, test), sort=False, batch_size=128, device=device)
train_iter.repeat = False
test_iter.repeat = False

In [5]:
vocab_size = len(TEXT.vocab)
emb_dim = 300
hidden_size = 300
# 0, 1, 2, 3, 4 (Sentiment score)
num_outputs = 5
num_layers = 2

model = Network(vocab_size, emb_dim, hidden_size, num_outputs, num_layers=num_layers)
# Use Glove pretrained vector on the embedding layer
model.embedding.weight.data = TEXT.vocab.vectors
model.embedding.weight.require_grad = False

if torch.cuda.is_available():
    model = model.cuda()

epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

test_loss, test_accuracy = [], []

def training(model, dataset, optimizer, scheduler, epochs=10):
    model.train()
    training_loss, train_accuracy = [], []
    dataset_size = len(dataset.dataset)
    for epoch in range(epochs):
        epoch_begin = time.time()
        epoch_loss = 0.0
        epoch_corrects = 0
        print(f'------------- Epoch {epoch + 1} -------------')
        for batch in dataset:
            text, labels = batch.Phrase, batch.Sentiment
            if torch.cuda.is_available():
                text, labels = text.cuda(), labels.cuda()
            optimizer.zero_grad()
            output = model(text)
            loss = F.nll_loss(output, labels, reduction='sum')
            _, preds = torch.max(output, dim=1)
            epoch_loss += loss.data.detach().item()
            epoch_corrects += preds.eq(labels.data.view_as(preds)).sum()
            
            loss.backward()
            optimizer.step()
        print(f'Loss / Accuracy : {epoch_loss / dataset_size :.4f} / {100. * epoch_corrects / dataset_size :.4f}% === {time.time() - epoch_begin}')
        scheduler.step()

In [6]:
training(model, train_iter, optimizer, scheduler, epochs=epochs)

------------- Epoch 1 -------------
Loss / Accuracy : 1.0852 / 56.9249% === 44.60798907279968
------------- Epoch 2 -------------
Loss / Accuracy : 0.7735 / 68.1296% === 44.22965931892395
------------- Epoch 3 -------------
Loss / Accuracy : 0.6965 / 71.0246% === 44.11410093307495
------------- Epoch 4 -------------
Loss / Accuracy : 0.6493 / 72.9668% === 44.167043685913086
------------- Epoch 5 -------------
Loss / Accuracy : 0.6141 / 74.3733% === 44.156005859375
------------- Epoch 6 -------------
Loss / Accuracy : 0.5353 / 78.0283% === 44.176945209503174
------------- Epoch 7 -------------
Loss / Accuracy : 0.5195 / 78.5653% === 44.22783422470093
------------- Epoch 8 -------------
Loss / Accuracy : 0.5088 / 79.0337% === 44.584755659103394
------------- Epoch 9 -------------
Loss / Accuracy : 0.4986 / 79.3611% === 44.619701623916626
------------- Epoch 10 -------------
Loss / Accuracy : 0.4891 / 79.7687% === 44.17894101142883


In [7]:
torch.save(model.state_dict(), 'final_model.pth')

In [29]:
def testing(model, dataset):
    # evaluation mode
    model.eval()
    result = []
    for batch in dataset:
        text, ids = batch.Phrase, batch.PhraseId
        output = model(text)
        _, preds = torch.max(output, dim=1)
        tmp = torch.cat((ids.view(-1, 1), preds.view(-1, 1)), 1).cpu().detach().numpy()
        for e in tmp:
            result.append(e)
    return result

result = testing(model, test_iter)
submission = pd.DataFrame(result, columns=['PhraseId', 'Sentiment'])
submission.to_csv('submission.csv', index=False)