In [1]:
import torchtext
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import glob

In [2]:
'''
batch_first = True 는, n grams? input vector 를 열로 만들것인지, 행으로 만들것인지를 결정하는 옵션
fix_length = 해당 파일에서 최대 몇 단어까지 들고올 것 인지에 대한 설정값, 200 이하면 <pad>를 채워서 강제로 200 길이에 맞춤
'''
TEXT = torchtext.data.Field(lower=True, batch_first=False, fix_length=200)
LABEL = torchtext.data.Field(sequential=False, )

In [3]:
'''
이렇게 splits로 들고오면, pos/neg 밖에 구분하지 못함. score 값은 적용이 안됨 현재는 1 : neg, 2 : pos
'''
train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL)

In [4]:
'''
TEXT의 vocab를 보면 실제로는 10000을 넘어가는데, 그 이유는 추가적으로 필요한 <unk>, <pad> ... 가 있기 때문이다.
'''
TEXT.build_vocab(train, vectors=torchtext.vocab.GloVe(name='6B', dim=300), max_size=10000, min_freq=10)
'''
3개 있음, neg / pos / unk
'''
LABEL.build_vocab(train)

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, test_iter = torchtext.data.BucketIterator.splits((train, test), batch_size=32, device=device, shuffle=False)

train_iter.repeat = False
test_iter.repeat = False

In [10]:
class IMDBRnn(torch.nn.Module):
    '''
    n_vocab : # vocab
    hidden_size : # features in the hidden state h
    n_cat : # outputs
    bs : batch
    nl : # layers
    '''
    def __init__(self, n_vocab, hidden_size, n_cat, bs=1, nl=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.bs = bs
        self.nl = nl
        self.e = torch.nn.Embedding(n_vocab, hidden_size)
        self.rnn = torch.nn.LSTM(hidden_size, hidden_size, nl)
        self.fc2 = torch.nn.Linear(hidden_size, n_cat)
        self.softmax = torch.nn.LogSoftmax(dim=-1)
        
    
    def forward(self, inp):
        bs = inp.size()[1]
        if bs != self.bs:
            self.bs = bs
        e_out = self.e(inp)
        # For every batch step, h0 and c0 are should be initialized with zero
        h0 = c0 = e_out.data.new(*(self.nl, self.bs, self.hidden_size)).zero_()
        rnn_o, _ = self.rnn(e_out, (h0, c0))
        # Real output of the network
        rnn_o = rnn_o[-1]
        fc = F.dropout(self.fc2(rnn_o), p=0.5)
        return self.softmax(fc)
    

def fit(epoch, model, optimizer, data_loader, phase='training', volatile=False):
    if phase == 'training':
        model.train()
    if phase == 'validation':
        model.eval()
        volatile = True
    
    running_loss = 0.0
    running_correct = 0
    
    for batch_idx, batch in enumerate(data_loader):
        text, target = batch.text, batch.label
        if torch.cuda.is_available():
            text, target = text.cuda(), target.cuda()
            
        if phase == 'training':
            optimizer.zero_grad()
            
        output = model(text)
        loss = F.nll_loss(output, target)
        
        running_loss += F.nll_loss(output, target, reduction='sum').data.item()
        preds = output.data.max(dim=1, keepdim=True)[1]
        running_correct += preds.eq(target.data.view_as(preds)).cpu().sum()
        
        if phase == 'training':
            loss.backward()
            optimizer.step()
            
    loss = running_loss / len(data_loader.dataset)
    accuracy = 100. * running_correct / len(data_loader.dataset)
    
    
    print(f'{phase} loss is {loss:5.2f} and {phase} accuracy is {running_correct} / {len(data_loader.dataset)} {accuracy:10.4f}')
    return loss, accuracy

In [18]:
n_vocab = len(TEXT.vocab.stoi)
# n_hidden = 100
n_hidden = 300

model = IMDBRnn(n_vocab, n_hidden, 3, bs=32)

model.e.weight.data = TEXT.vocab.vectors
model.e.weight.requires_grad = False

model = model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

train_losses, train_accuracy = [], []
val_losses, val_accuracy = [], []

for epoch in range(1, 15):
    epoch_loss, epoch_accuracy = fit(epoch, model, optimizer, train_iter, phase='training')
    val_epoch_loss, val_epoch_accuracy = fit(epoch, model, optimizer, test_iter, phase='validation')
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)
    scheduler.step()

training loss is  0.88 and training accuracy is 10875 / 25000    43.5000
validation loss is  0.88 and validation accuracy is 10918 / 25000    43.6720
training loss is  0.88 and training accuracy is 10956 / 25000    43.8240
validation loss is  0.87 and validation accuracy is 10979 / 25000    43.9160
training loss is  0.88 and training accuracy is 10890 / 25000    43.5600
validation loss is  0.88 and validation accuracy is 10813 / 25000    43.2520
training loss is  0.87 and training accuracy is 10965 / 25000    43.8600
validation loss is  0.88 and validation accuracy is 10857 / 25000    43.4280
training loss is  0.87 and training accuracy is 11161 / 25000    44.6440
validation loss is  0.85 and validation accuracy is 11343 / 25000    45.3720
training loss is  0.77 and training accuracy is 13214 / 25000    52.8560
validation loss is  0.76 and validation accuracy is 13266 / 25000    53.0640
training loss is  0.71 and training accuracy is 13903 / 25000    55.6120
validation loss is  0.70 an