In [16]:
import torch
import torchtext
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import time

In [17]:
start = time.time()
TEXT = torchtext.legacy.data.Field(lower=True, fix_length=200, batch_first=False)
LABEL = torchtext.legacy.data.Field(sequential=False)

In [18]:
from torchtext.legacy import datasets
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [19]:
print(vars(train_data.examples[0]))

{'text': ['when', 'i', 'first', 'went', 'to', 'watch', 'the', 'shining', 'i', 'was', 'expecting', 'a', 'decent', 'film', 'from', 'what', 'i', 'had', 'heard', 'about', 'it', 'and', 'i', 'liked', 'a', 'lot', 'of', 'stanley', "kubrick's", 'other', 'work', 'but', 'when', 'i', 'started', 'to', 'watch', 'it', 'it', 'was', 'so', 'much', 'better', 'than', 'i', 'thought', 'it', 'would', 'be.at', 'times', 'i', 'seriously', 'felt', 'ridiculously', 'uneasy', 'and', 'i', "couldn't", 'take', 'my', 'eyes', 'of', 'the', 'screen', 'still', "there's", 'something', 'very', 'disturbing', 'about', 'everything', 'in', 'the', 'film.', 'now', 'some', 'people', "don't", 'like', "kubrick's", 'version', 'of', 'the', 'shining', 'since', 'it', "doesn't", 'entirely', 'follow', 'stephen', "king's", 'book', 'but', 'in', 'my', 'opinion', 'both', "kubrick's", 'version,the', 'mini-series', 'and', 'the', 'book', 'are', 'all', 'great.jack', 'nicholson', 'gives', 'an', 'awesome', 'performance.if', 'you', 'are', 'looking', 

In [20]:
import string
for example in train_data.examples:
  text = [x.lower() for x in vars(example)['text']]
  text = [x.replace('<br', '') for x in text]
  text = [''.join(c for c in s if c not in string.punctuation) for s in text]
  text = [s for s in text if s]
  vars(example)['text'] = text

In [21]:
import random
train_data, valid_data = train_data.split(random_state=random.seed(0), split_ratio=0.8)

In [22]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000


In [23]:
TEXT.build_vocab(train_data, max_size=10000, min_freq=10, vectors=None)
LABEL.build_vocab(train_data)

print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

Unique tokens in TEXT vocabulary: 10002
Unique tokens in LABEL vocabulary: 3


In [24]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embedding_dim = 100
hidden_size = 300

train_iterator, valid_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)

In [25]:
class RNNCell_Encoder(nn.Module):
  def __init__(self, input_dim, hidden_size):
    super().__init__()
    self.rnn = nn.RNNCell(input_dim, hidden_size)

  def forward(self, inputs):
    bz = inputs.shape[1]
    ht = torch.zeros((bz, hidden_size)).to(device)
    for word in inputs:
      ht = self.rnn(word, ht)
    return ht

class Net(nn.Module):
  def __init__(self):
    super().__init__()
    self.em = nn.Embedding(len(TEXT.vocab.stoi), embedding_dim)
    self.rnn = RNNCell_Encoder(embedding_dim, hidden_size)
    self.fc1 = nn.Linear(hidden_size, 256)
    self.fc2 = nn.Linear(256, 3)

  def forward(self, x):
    x = self.em(x)
    x = self.rnn(x)
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    return x

In [26]:
model = Net()
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [27]:
def training(epoch, model, trainloader, validloader):
  correct = 0
  total = 0
  running_loss = 0

  model.train()

  for b in trainloader:
    x, y = b.text, b.label
    x, y = x.to(device), y.to(device)
    y_pred = model(x)
    loss = criterion(y_pred, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    with torch.no_grad():
      y_pred = torch.argmax(y_pred, dim=1)
      correct += (y_pred == y).sum().item()
      total += y.size(0)
      running_loss += loss.item()

  epoch_loss = running_loss / len(trainloader.dataset)
  epoch_acc = correct / total

  valid_correct = 0
  valid_total = 0
  valid_running_loss = 0

  model.eval()
  with torch.no_grad():
    for b in validloader:
      x, y = b.text, b.label
      x, y = x.to(device), y.to(device)
      y_pred = model(x)
      loss = criterion(y_pred, y)
      y_pred = torch.argmax(y_pred, dim=1)
      valid_correct += (y_pred == y).sum().item()
      valid_total += y.size(0)
      valid_running_loss += loss.item()

  epoch_valid_loss = valid_running_loss / len(validloader.dataset)
  epoch_valid_acc = valid_correct / valid_total

  print('epoch: ', epoch)
  print('loss: ', round(epoch_loss, 3))
  print('accuracy: ', round(epoch_acc, 3))
  print('valid_loss: ', round(epoch_valid_loss, 3))
  print('valid_accuracy: ', round(epoch_valid_acc, 3))

  return epoch_loss, epoch_acc, epoch_valid_loss, epoch_valid_acc

In [28]:
epochs = 40
train_loss = []
train_acc = []
valid_loss = []
valid_acc = []

for epoch in range(epochs):
  epoch_loss, epoch_acc, epoch_valid_loss, epoch_valid_acc = training(epoch, model, train_iterator, valid_iterator)
  train_loss.append(epoch_loss)
  train_acc.append(epoch_acc)
  valid_loss.append(epoch_valid_loss)
  valid_acc.append(epoch_valid_acc)

end = time.time()
print(f'Training done in {(end-start) / 60 :.0f}m {(end-start) % 60 :.0f}s')

epoch:  0
loss:  0.011
accuracy:  0.499
valid_loss:  0.011
valid_accuracy:  0.507
epoch:  1
loss:  0.011
accuracy:  0.503
valid_loss:  0.011
valid_accuracy:  0.5
epoch:  2
loss:  0.011
accuracy:  0.512
valid_loss:  0.011
valid_accuracy:  0.494
epoch:  3
loss:  0.011
accuracy:  0.52
valid_loss:  0.011
valid_accuracy:  0.495
epoch:  4
loss:  0.011
accuracy:  0.523
valid_loss:  0.011
valid_accuracy:  0.507
epoch:  5
loss:  0.011
accuracy:  0.533
valid_loss:  0.011
valid_accuracy:  0.5
epoch:  6
loss:  0.011
accuracy:  0.538
valid_loss:  0.011
valid_accuracy:  0.52
epoch:  7
loss:  0.011
accuracy:  0.543
valid_loss:  0.011
valid_accuracy:  0.504
epoch:  8
loss:  0.011
accuracy:  0.556
valid_loss:  0.011
valid_accuracy:  0.519
epoch:  9
loss:  0.01
accuracy:  0.56
valid_loss:  0.011
valid_accuracy:  0.512
epoch:  10
loss:  0.01
accuracy:  0.566
valid_loss:  0.011
valid_accuracy:  0.506
epoch:  11
loss:  0.01
accuracy:  0.57
valid_loss:  0.011
valid_accuracy:  0.502
epoch:  12
loss:  0.01
ac

In [29]:
def evaluate(epoch, model, testloader):
  test_correct = 0
  test_total = 0
  test_running_loss = 0

  model.eval()
  with torch.no_grad():
    for b in testloader:
      x, y = b.text, b.label
      x, y = x.to(device), y.to(device)
      y_pred = model(x)
      loss = criterion(y_pred, y)
      y_pred = torch.argmax(y_pred, dim=1)
      test_correct += (y_pred == y).sum().item()
      test_total += y.size(0)
      test_running_loss += loss.item()
  
  epoch_test_loss = test_running_loss / len(testloader.dataset)
  epoch_test_acc = test_correct / test_total

  print('-----------------------------------------')
  print('epoch: ', epoch)
  print('loss: ', round(epoch_test_loss, 3))
  print('accuracy: ', round(epoch_test_acc, 3))

  return epoch_test_loss, epoch_test_acc

In [30]:
epochs = 15
test_loss = []
test_acc = []
start = time.time()

for epoch in range(1, epochs+1):
  epoch_test_loss, epoch_test_acc = evaluate(epoch, model, test_iterator)
  test_loss.append(epoch_test_loss)
  test_acc.append(epoch_test_acc)

end = time.time()

print(f'Training done in {(end-start) / 60 :.0f}m {(end-start) % 60 :.0f}s')

-----------------------------------------
epoch:  1
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  2
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  3
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  4
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  5
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  6
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  7
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  8
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  9
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  10
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  11
loss:  0.021
accuracy:  0.517
-----------------------------------------
epoch:  12
loss:  0.021
accuracy:  0.517
-------------

In [1]:
import torch
import torchtext
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import time


In [5]:
start = time.time()
TEXT = torchtext.legacy.data.Field(sequential=True, batch_first=True, lower=True)
LABEL = torchtext.legacy.data.Field(sequential=False, batch_first=True)

from torchtext.legacy import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(split_ratio=0.8)

TEXT.build_vocab(train_data, max_size=10000, min_freq=10, vectors=None)
LABEL.build_vocab(train_data)

BATCH_SIZE = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

downloading aclImdb_v1.tar.gz


100%|██████████| 84.1M/84.1M [00:02<00:00, 30.4MB/s]


In [6]:
train_iterator, valid_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)

In [8]:
vocab_size = len(TEXT.vocab)
n_classes = 2 # Pos, Neg

In [19]:
class BasicRNN(nn.Module):
  def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
    super().__init__()
    self.n_layers = n_layers
    self.embed = nn.Embedding(n_vocab, embed_dim)
    self.hidden_dim = hidden_dim
    self.dropout = nn.Dropout(dropout_p)
    self.rnn = nn.RNN(embed_dim, self.hidden_dim, num_layers=self.n_layers, batch_first=True)
    self.out = nn.Linear(self.hidden_dim, n_classes)

  def forward(self, x):
    x = self.embed(x)
    h_0 = self._init_state(batch_size=x.size(0))
    x, _ = self.rnn(x, h_0)
    h_t = x[:, -1, :]
    self.dropout(h_t)
    logit = torch.sigmoid(self.out(h_t))
    return logit

  def _init_state(self, batch_size=1):
    weight = next(self.parameters()).data
    return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [20]:
model = BasicRNN(n_layers=1, hidden_dim=256, n_vocab=vocab_size, embed_dim=128, n_classes=n_classes, dropout_p=0.5)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [30]:
def train(model, optimizer, train_iter):
  model.train()
  for b, batch in enumerate(train_iter):
    x, y = batch.text.to(device), batch.label.to(device)
    y.data.sub_(1)
    optimizer.zero_grad()

    logit = model(x)
    loss = F.cross_entropy(logit, y)
    loss.backward()
    optimizer.step()

    if b % 50 == 0:
      print(f'Train Epoch: {b} [{b * len(x)}/{len(train_iter.dataset)} ({(b * len(x))/(len(train_iter.dataset)) * 100 :.0f})%]\tLoss: {loss.item():.6f}')

In [31]:
def evaluate(model, val_iter):
  model.eval()
  corrects, total, total_loss = 0, 0, 0

  for batch in val_iter:
    x, y = batch.text.to(device), batch.label.to(device)
    y.data.sub_(1)
    logit = model(x)
    loss = F.cross_entropy(logit, y, reduction="sum")
    total += y.size(0)
    total_loss += loss.item()
    corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()

  avg_loss = total_loss / len(val_iter.dataset)
  avg_accuracy = corrects / total
  return avg_loss, avg_accuracy

In [32]:
BATCH_SIZE = 100
LR = 0.001
EPOCHS = 10

for e in range(1, EPOCHS + 1):
  train(model, optimizer, train_iterator)
  val_loss, val_accuracy = evaluate(model, valid_iterator)
  print(f'[EPOCH: {e}], Validation Loss: {val_loss:.2f} | Validation Accuracy: {val_accuracy:.2f}%')

Train Epoch: 0 [0/20000 (0)%]	Loss: 0.686803
Train Epoch: 50 [5000/20000 (25)%]	Loss: 0.692514
Train Epoch: 100 [10000/20000 (50)%]	Loss: 0.696747
Train Epoch: 150 [15000/20000 (75)%]	Loss: 0.694003
[EPOCH: 1], Validation Loss: 0.69 | Validation Accuracy: 0.51%
Train Epoch: 0 [0/20000 (0)%]	Loss: 0.692554
Train Epoch: 50 [5000/20000 (25)%]	Loss: 0.694496
Train Epoch: 100 [10000/20000 (50)%]	Loss: 0.691629
Train Epoch: 150 [15000/20000 (75)%]	Loss: 0.690907
[EPOCH: 2], Validation Loss: 0.69 | Validation Accuracy: 0.50%
Train Epoch: 0 [0/20000 (0)%]	Loss: 0.693846
Train Epoch: 50 [5000/20000 (25)%]	Loss: 0.692319
Train Epoch: 100 [10000/20000 (50)%]	Loss: 0.691828
Train Epoch: 150 [15000/20000 (75)%]	Loss: 0.693744
[EPOCH: 3], Validation Loss: 0.69 | Validation Accuracy: 0.50%
Train Epoch: 0 [0/20000 (0)%]	Loss: 0.693795
Train Epoch: 50 [5000/20000 (25)%]	Loss: 0.691414
Train Epoch: 100 [10000/20000 (50)%]	Loss: 0.692201
Train Epoch: 150 [15000/20000 (75)%]	Loss: 0.692041
[EPOCH: 4], Val