In [68]:
import torchtext
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import glob

In [58]:
'''
batch_first = True 는, n grams? input vector 를 열로 만들것인지, 행으로 만들것인지를 결정하는 옵션
fix_length = 
'''
TEXT = torchtext.data.Field(lower=True, batch_first=False, fix_length=200)
LABEL = torchtext.data.Field(sequential=False, )

In [59]:
train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL)

In [60]:
TEXT.build_vocab(train, vectors=torchtext.vocab.GloVe(name='6B', dim=300), max_size=10000, min_freq=10)
LABEL.build_vocab(train)

In [66]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, test_iter = torchtext.data.BucketIterator.splits((test, train), batch_size=32, device=device, shuffle=False)

train_iter.repeat = False
test_iter.repeat = False

In [17]:
class IMDBRnn(torch.nn.Module):
    def __init__(self, n_vocab, hidden_size, n_cat, bs=1, nl=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.bs = bs
        self.nl = nl
        self.e = torch.nn.Embedding(n_vocab, hidden_size)
        self.rnn = torch.nn.LSTM(hidden_size, hidden_size, nl)
        self.fc2 = torch.nn.Linear(hidden_size, n_cat)
        self.softmax = torch.nn.LogSoftmax(dim=-1)
        
    
    def forward(self, inp):
        bs = inp.size()[1]
        if bs != self.bs:
            self.bs = bs
        e_out = self.e(inp)
        h0 = c0 = Variable(e_out.data.new(*(self.nl, self.bs, self.hidden_size)).zero_())
        rnn_o, _ = self.rnn(e_out, (h0, c0))
        rnn_o = rnn_o[-1]
        fc = F.dropout(self.fc2(rnn_o), p=0.8)
        return self.softmax(fc)
    
def fit(epoch, model, optimizer, data_loader, phase='training', volatile=False):
    if phase == 'training':
        model.train()
    if phase == 'validation':
        model.eval()
        volatile = True
    
    running_loss = 0.0
    running_correct = 0
    
    for batch_idx, batch in enumerate(data_loader):
        text, target = batch.text, batch.label
        if torch.cuda.is_available():
            text, target = text.cuda(), target.cuda()
            
        if phase == 'training':
            optimizer.zero_grad()
            
        output = model(text)
        loss = F.nll_loss(output, target)
        running_loss += F.nll_loss(output, target, size_average=False).data.item()
        preds = output.data.max(dim=1, keepdim=True)[1]
        running_correct += preds.eq(target.data.view_as(preds)).cpu().sum()
        
        if phase == 'training':
            loss.backward()
            optimizer.step()
            return
    loss = running_loss / len(data_loader.dataset)
    accuracy = 100. * running_correct / len(data_loader.dataset)
    
    
    print(f'{phase} loss is {loss:5.2f} and {phase} accuracy is {running_correct} / {len(data_loader.dataset)} {accuracy:10.4f}')
    return loss, accuracy

In [19]:
n_vocab = len(TEXT.vocab.stoi)
n_hidden = 30

model = IMDBRnn(n_vocab, n_hidden, 3, bs=32)
model = model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train_losses, train_accuracy = [], []
val_losses, val_accuracy = [], []

for epoch in range(1, 5):
    epoch_loss, epoch_accuracy = fit(epoch, model, optimizer, train_iter, phase='training')
    val_epoch_loss, val_epoch_accuracy = fit(epoch, model, optimizer, test_iter, phase='validation')
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)

training loss is  1.00 and training accuracy is 6139 / 25000    24.5560
validation loss is  1.00 and validation accuracy is 6189 / 25000    24.7560
training loss is  1.00 and training accuracy is 6170 / 25000    24.6800
validation loss is  1.00 and validation accuracy is 6062 / 25000    24.2480
training loss is  1.00 and training accuracy is 6135 / 25000    24.5400
validation loss is  1.01 and validation accuracy is 6041 / 25000    24.1640
training loss is  1.00 and training accuracy is 6005 / 25000    24.0200
validation loss is  1.00 and validation accuracy is 6054 / 25000    24.2160


In [67]:
for batch in train_iter:
    text, label = batch
    tmp = 0
    print(text[0].shape)
    tmp_text = text[0].transpose(0, 1)
    for row in tmp_text:
        print(' '.join([TEXT.vocab.itos[idx] for idx in row]))
        tmp += 1
        if tmp == 5:
            break
    tmp = 0
    print(text[1])
#     for label in text[1]:
#         print(label)
#         tmp += 1
#         if tmp == 5:
#             break
#         break
    break

torch.Size([200, 32])
i went and saw this movie last night after being <unk> to by a few friends of mine. i'll admit that i was reluctant to see it because from what i knew of <unk> <unk> he was only able to do comedy. i was wrong. <unk> played the character of jake <unk> very well, and kevin <unk> played ben <unk> with such <unk> the sign of a good movie is that it can toy with our emotions. this one did exactly that. the entire theater (which was sold <unk> was overcome by laughter during the first half of the movie, and were moved to tears during the second half. while <unk> the theater i not only saw many women in <unk> but many full grown men as well, trying desperately not to let anyone see them <unk> this movie was great, and i suggest that you go see it before you <unk> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <

In [26]:
train

<torchtext.datasets.imdb.IMDB at 0x1c303df6fc8>

In [86]:
target_string = "saw this movie last night"

for file in glob.glob('.data/imdb/aclImdb/train/unsup/*.txt'):
    content = open(file, 'r', encoding='UTF8').read().lower()
    if not content.find(target_string) == -1:
        print(file)

.data/imdb/aclImdb/train/unsup\13200_0.txt
.data/imdb/aclImdb/train/unsup\18360_0.txt
.data/imdb/aclImdb/train/unsup\18376_0.txt
.data/imdb/aclImdb/train/unsup\31179_0.txt
.data/imdb/aclImdb/train/unsup\32841_0.txt
.data/imdb/aclImdb/train/unsup\36245_0.txt
.data/imdb/aclImdb/train/unsup\37874_0.txt
.data/imdb/aclImdb/train/unsup\8085_0.txt
.data/imdb/aclImdb/train/unsup\9326_0.txt
.data/imdb/aclImdb/train/unsup\9383_0.txt
.data/imdb/aclImdb/train/unsup\9385_0.txt
