In [21]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from torchtext import datasets
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim

In [22]:
!pwd

/content


In [23]:
# Only the training split is needed at this point: it is the corpus the
# vocabulary counts are built from.
train_dataset = datasets.IMDB(split='train')

In [24]:
# Tokenizer shared by the vocabulary count and by text_transform.
tokenizer = get_tokenizer('basic_english')

In [25]:
# Token frequency table over every review in the training split; the label
# of each (label, text) pair is ignored here.
counter = Counter(
    token
    for _, review in train_dataset
    for token in tokenizer(review)
)

In [26]:
# Keep only tokens seen at least 10 times in the training split.
vocabulary = vocab(counter, min_freq=10)

# Reserve dedicated ids for padding and for unknown tokens. Without them,
# index 0 belongs to a real word, so every out-of-vocabulary token (default
# index 0) and every padded position (pad_sequence pads with 0 by default)
# would share that word's embedding.
vocabulary.insert_token('<pad>', 0)
vocabulary.insert_token('<unk>', 1)
vocabulary.set_default_index(vocabulary['<unk>'])

In [27]:
def text_transform(x):
    """Tokenize the raw text ``x`` and return the list of vocabulary ids.

    Tokens missing from ``vocabulary`` resolve to its default index.
    """
    return [vocabulary[token] for token in tokenizer(x)]


def label_transform(x):
    """Map an IMDB sentiment label to 1 (positive) or 0 (negative).

    Both label encodings are accepted: the strings ``'pos'`` / ``'neg'`` and
    the integer labels (2 = positive, 1 = negative) that newer torchtext
    releases document for this dataset. Checking only for ``'pos'`` would
    silently turn every integer-labelled example into class 0.
    """
    return 1 if x in ('pos', 2) else 0

In [28]:
def preprocessing(batch):
    """Collate a list of ``(label, text)`` samples into a pair of tensors.

    Each label goes through ``label_transform`` and each text through
    ``text_transform``. The resulting id sequences are padded to a common
    length with ``pad_sequence`` (default arguments).

    Returns:
        ``(data, target)``: the padded id tensor and a 1-D tensor of labels.
    """
    # Encode every sample in a single pass, label first and then text,
    # keeping the same per-sample call order as a plain loop.
    encoded = [
        (label_transform(raw_label), torch.tensor(text_transform(raw_text)))
        for raw_label, raw_text in batch
    ]
    sequences = [ids for _, ids in encoded]
    targets = [label for label, _ in encoded]
    return pad_sequence(sequences), torch.tensor(targets)

In [29]:
# Load both IMDB splits; ``train_dataset`` is rebound here.
train_dataset, test_dataset = datasets.IMDB(split=('train', 'test'))

# Each split is materialised as a list before being handed to DataLoader.
# Only the training batches are shuffled.
train_loader = DataLoader(
    list(train_dataset),
    batch_size=8,
    shuffle=True,
    collate_fn=preprocessing,
)
test_loader = DataLoader(
    list(test_dataset),
    batch_size=8,
    shuffle=False,
    collate_fn=preprocessing,
)

In [30]:
class LSTM(nn.Module):
    """Embedding -> single-layer LSTM -> linear -> sigmoid binary classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding (default 16).
        hidden_dim: size of the LSTM hidden state (default 16).

    The defaults reproduce the original fixed 16/16 configuration, so
    ``LSTM(vocab_size)`` builds the same architecture as before.
    """

    def __init__(self, vocab_size, embed_dim=16, hidden_dim=16):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.cell = nn.LSTM(embed_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, X):
        """Return one probability per sequence.

        ``X`` holds integer token ids. The LSTM is built with its default
        sequence-first layout, which matches what ``pad_sequence`` produces
        with default arguments.
        """
        out = self.embed(X)
        # Only the final hidden state is used; per-step outputs and the cell
        # state are discarded.
        _, (hidden_state, _) = self.cell(out)
        # Take the last layer's state explicitly and flatten it to
        # (-1, hidden_dim) using the LSTM's own size rather than a literal 16.
        last_hidden = hidden_state[-1].reshape(-1, self.cell.hidden_size)
        return self.sigmoid(self.fc(last_hidden))

In [31]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# One embedding row per vocabulary entry.
vocab_size = len(vocabulary)
model = LSTM(vocab_size)
model = model.to(device)

# The model's forward pass already ends in a sigmoid, so plain BCELoss is
# applied to its output.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [32]:
def train(model, criterion, optimizer, loader, threshold=0.5):
    """Run one optimisation pass over ``loader``.

    Args:
        model: module mapping a batch of token ids to probabilities.
        criterion: loss applied to ``(model output, float targets)``.
        optimizer: optimizer stepping ``model``'s parameters.
        loader: sized iterable of ``(X_batch, y_batch)`` pairs.
        threshold: probability at or above which a prediction counts as
            class 1 (default 0.5).

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (``len(loader)``).
    """
    # Follow the model's own device instead of a notebook-level global, so
    # the function does not depend on which cells have been run. A model
    # without parameters leaves the batches where they are.
    anchor = next(model.parameters(), None)

    epoch_loss = 0.0
    epoch_acc = 0.0
    model.train()
    for X_batch, y_batch in loader:
        if anchor is not None:
            X_batch = X_batch.to(anchor.device)
            y_batch = y_batch.to(anchor.device)
        # Targets become a float column so they line up with the (N, 1)
        # model output.
        y_batch = y_batch.float().view(-1, 1)

        optimizer.zero_grad()
        hypothesis = model(X_batch)
        loss = criterion(hypothesis, y_batch)
        loss.backward()
        optimizer.step()

        acc = ((hypothesis >= threshold) == y_batch).float().mean()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(loader), epoch_acc / len(loader)

In [33]:
def evaluate(model, criterion, loader, threshold=0.5):
    """Score ``model`` on ``loader`` without updating it.

    Args:
        model: module mapping a batch of token ids to probabilities.
        criterion: loss applied to ``(model output, float targets)``.
        loader: sized iterable of ``(X_batch, y_batch)`` pairs.
        threshold: probability at or above which a prediction counts as
            class 1. The default is 0.5, the same cut-off the training loop
            uses, so train and test accuracy are comparable. The previous
            0.4 cut-off can be restored with ``threshold=0.4``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (``len(loader)``).
    """
    # Follow the model's own device instead of a notebook-level global, so
    # the function does not depend on which cells have been run. A model
    # without parameters leaves the batches where they are.
    anchor = next(model.parameters(), None)

    epoch_loss = 0.0
    epoch_acc = 0.0
    model.eval()

    with torch.no_grad():
        for X_batch, y_batch in loader:
            if anchor is not None:
                X_batch = X_batch.to(anchor.device)
                y_batch = y_batch.to(anchor.device)
            # Targets become a float column so they line up with the (N, 1)
            # model output.
            y_batch = y_batch.float().view(-1, 1)

            hypothesis = model(X_batch)
            loss = criterion(hypothesis, y_batch)
            acc = ((hypothesis >= threshold) == y_batch).float().mean()
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(loader), epoch_acc / len(loader)

In [None]:
n_epochs = 25
for epoch in range(1, n_epochs + 1):
    # One optimisation pass over the training batches...
    loss, acc = train(model, criterion, optimizer, train_loader)
    # ...then a scoring pass over the test batches.
    test_loss, test_acc = evaluate(model, criterion, test_loader)
    print(
        'epoch: {}, loss: {:2.3f}, acc: {:2.2f}, test_loss:{:2.3f},test_acc:{:2.3f}'.format(
            epoch, loss, acc, test_loss, test_acc
        )
    )