In [51]:
import torchtext
import torch
import torch.nn.functional as F

In [52]:
TEXT = torchtext.data.Field(lower=True, batch_first=False, fix_length=200)
LABEL = torchtext.data.Field(sequential=False)

In [53]:
train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL)

In [54]:
print('train.fields', train.fields)
print(vars(train[0]))

train.fields {'text': <torchtext.data.field.Field object at 0x0000018D451A4348>, 'label': <torchtext.data.field.Field object at 0x0000018D451A4088>}
{'text': ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"teachers".', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"teachers".', 'the', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'i', 'i

In [55]:
TEXT.build_vocab(train, vectors=torchtext.vocab.GloVe(name='6B', dim=300), max_size=10000, min_freq=10)
LABEL.build_vocab(train)

In [21]:
print(TEXT.vocab.freqs)
# print(TEXT.vocab.vectors)
# print(TEXT.vocab.stoi)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [56]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, test_iter = torchtext.data.BucketIterator.splits((test, train), batch_size=32, device=device, shuffle=True)

train_iter.repeat = False
test_iter.repeat = False

batch = next(iter(train_iter))
batch.text
print(batch.text.shape)

torch.Size([200, 32])


In [48]:
class IMDBRnn(torch.nn.Module):
    def __init__(self, n_vocab, hidden_size, n_cat, bs=1, nl=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.bs = bs
        self.nl = nl
        self.e = torch.nn.Embedding(n_vocab, hidden_size)
        self.rnn = torch.nn.LSTM(hidden_size, hidden_size, nl)
        self.fc2 = torch.nn.Linear(hidden_size, n_cat)
        self.softmax = torch.nn.LogSoftmax(dim=-1)
        
    
    def forward(self, inp):
        bs = inp.size()[1]
        if bs != self.bs:
            self.bs = bs
        e_out = self.e(inp)
        h0 = c0 = e_out.data.new(*(self.nl, self.bs, self.hidden_size)).zero_()
        rnn_o, _ = self.rnn(e_out, (h0, c0))
        rnn_o = rnn_o[-1]
        fc = F.dropout(self.fc2(rnn_o), p=0.8)
        return self.softmax(fc)
    
def fit(epoch, model, optimizer, data_loader, phase='training', volatile=False):
    if phase == 'training':
        model.train()
    if phase == 'validation':
        model.eval()
        volatile = True
    
    running_loss = 0.0
    running_correct = 0
    
    for batch_idx, batch in enumerate(data_loader):
        text, target = batch.text, batch.label
        if torch.cuda.is_available():
            text, target = text.cuda(), target.cuda()
            
        if phase == 'training':
            optimizer.zero_grad()
            
        output = model(text)
        loss = F.nll_loss(output, target)
        running_loss += F.nll_loss(output, target, size_average=False).data.item()
        preds = output.data.max(dim=1, keepdim=True)[1]
        running_correct += preds.eq(target.data.view_as(preds)).cpu().sum()
        
        if phase == 'training':
            loss.backward()
            optimizer.step()
            
    loss = running_loss/len(data_loader.dataset)
    accuracy = 100. * running_correct / len(data_loader.dataset)
    
    print(f'{phase} loss is {loss:5.2f} and {phase} accuracy is {running_correct} / {len(data_loader.dataset)} {accuracy:10.4f}')
    return loss, accuracy

In [49]:
model = EmbNet(len(TEXT.vocab.stoi), 300, 12000)

model.embedding.weight.data = TEXT.vocab.vectors

model.embedding.weight.requires_grad = False
optimizer = torch.optim.SGD([ param for param in model.parameters() if param.requires_grad == True ], lr=0.001)

if torch.cuda.is_available():
    model.cuda()
    
train_losses, train_accuracy = [], []
val_losses, val_accuracy = [], []

for epoch in range(1, 10):
    epoch_loss, epoch_accuracy = fit(epoch, model, optimizer, train_iter, phase='training')
    val_epoch_loss, val_epoch_accuracy = fit(epoch, model, optimizer, test_iter, phase='validation')
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)

training loss is  0.74 and training accuracy is 13049 / 25000    52.1960
validation loss is  0.70 and validation accuracy is 13780 / 25000    55.1200
training loss is  0.69 and training accuracy is 14283 / 25000    57.1320
validation loss is  0.68 and validation accuracy is 14558 / 25000    58.2320
training loss is  0.67 and training accuracy is 15052 / 25000    60.2080
validation loss is  0.67 and validation accuracy is 14990 / 25000    59.9600
training loss is  0.66 and training accuracy is 15522 / 25000    62.0880
validation loss is  0.67 and validation accuracy is 15204 / 25000    60.8160
training loss is  0.65 and training accuracy is 15783 / 25000    63.1320
validation loss is  0.66 and validation accuracy is 15469 / 25000    61.8760
training loss is  0.65 and training accuracy is 16022 / 25000    64.0880
validation loss is  0.66 and validation accuracy is 15606 / 25000    62.4240
training loss is  0.64 and training accuracy is 16285 / 25000    65.1400
validation loss is  0.65 an

In [None]:
rnn = torch.nn.RNN(input_size,)