<a href="https://colab.research.google.com/github/AnXiaoNuan/geektime_learn_NLP/blob/master/pytorch_torchtext_IMDB_BiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# initialize environment

In [1]:
import nltk
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torchtext.legacy import data, datasets


import random

In [2]:
import time

In [3]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# prepare data

tokenize, build vocabulary, covert text into word index.

Field defines how to process text, here is the most common parameters:

sequential – Whether the datatype represents sequential data. If False, no tokenization is applied. Default: True.

use_vocab – Whether to use a Vocab object. If False, the data in this field should already be numerical. Default: True.

preprocessing – The Pipeline that will be applied to examples using this field after tokenizing but before numericalizing. Many Datasets replace this attribute with a custom preprocessor. Default: None.

batch_first – Whether to produce tensors with the batch dimension first. Default: False.





In [4]:
nltk.download('punkt')
tokenizer = word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
TEXT = data.Field(tokenize=tokenizer, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL, root='/home')

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 23.2MB/s]


In [6]:
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [45]:
train_data.shape

<generator object Dataset.__getattr__ at 0x7f3e9d089dd0>

In [7]:
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size= MAX_VOCAB_SIZE, vectors="glove.6B.300d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data, valid_data, test_data)

.vector_cache/glove.6B.zip: 862MB [02:46, 5.18MB/s]                           
100%|█████████▉| 399848/400000 [00:35<00:00, 11279.15it/s]

# build iterator

In [8]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_sizes = (BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
    sort_within_batch = True,
    device = device)

cuda


# Define Model

In [9]:

class BiLSTM(nn.Module):

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
    super().__init__()
    
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
    
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
    
    self.fc = nn.Linear(hidden_dim * 2, output_dim)
    
    self.dropout = nn.Dropout(dropout)
        
  def forward(self, text, text_lengths):
      
    embedded = self.embedding(text)
    
    #packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu())
    
    packed_output, (hidden, cell) = self.rnn(packed_embedded)
    
    output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output) 
    
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)) # 双向LSTM
            
    return self.fc(hidden)


# initialize model

In [15]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = BiLSTM(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)
model = model.to(device)
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)


# initialize optimizer

In [29]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [30]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    #print(rounded_preds)
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

# train

In [43]:
def train(model, iterator, optimizer, criterion, epoch, validate_after_n_batch):

  best_valid_loss = float('inf')

  n_batch_train_loss = 0
  n_batch_train_acc = 0
  model.train()

  total_batch = 0

  start_time = time.time()

  for batch in iterator:
    
    #print(batch)
    total_batch += 1

    optimizer.zero_grad()

    text, text_lengths = batch.text

    #print(text.shape)

    text = text.to(device)

    logits = model(text, text_lengths).squeeze(1)

    #label = batch.label.type(torch.long)

    loss = criterion(logits, batch.label)

    acc = binary_accuracy(logits, batch.label)

    loss.backward()

    optimizer.step()

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    n_batch_train_loss += loss.item()
    n_batch_train_acc += acc.item()

    avg_batch_train_loss = n_batch_train_loss / total_batch
    avg_batch_train_acc = n_batch_train_acc / total_batch

    if total_batch % validate_after_n_batch == 0:
      # validation
      avg_batch_valid_loss, avg_batch_valid_acc = evaluate(model, val_iter, criterion)

      if avg_batch_valid_loss < best_valid_loss:
          best_valid_loss = avg_batch_valid_loss
          torch.save(model.state_dict(), 'model.pt')
      print(f'Epoch: {epoch+1:02} | Total Batch: {total_batch:06} | Training Time for latest {validate_after_n_batch:03} batches: {epoch_mins}m {epoch_secs}s' )
      print(f'\tTrain Loss: {avg_batch_train_loss:.3f} | Train Acc: {avg_batch_train_acc*100:.2f}%')
      print(f'\t Val. Loss: {avg_batch_valid_loss:.3f} |  Val. Acc: {avg_batch_valid_acc*100:.2f}%')
      start_time = time.time()
      model.train()
        
  #return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [41]:
def evaluate(model, iterator, criterion):
    
    total_loss = 0
    total_acc = 0
    
    model.eval()

    #print(len(iterator))
    
    with torch.no_grad():
    
        for batch in iterator:
          
          text, text_lengths = batch.text

          text = text.to(device)

          #label = batch.label.type(torch.long)
          
          logits = model(text, text_lengths).squeeze(1)
          
          loss = criterion(logits, batch.label)
          #print('loss:', loss)
          
          acc = binary_accuracy(logits, batch.label)
          #print('acc:', acc)

          total_loss += loss.item()
          total_acc += acc.item()
        
    return total_loss / len(iterator), total_acc / len(iterator)

In [33]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [44]:

N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    validate_after_n_batch = 50
    train(model, train_iter, optimizer, criterion, epoch, validate_after_n_batch)        

Epoch: 01 | Total Batch: 000050 | Training Time for latest 050 batches: 0m 7s
	Train Loss: 0.676 | Train Acc: 56.47%
	 Val. Loss: 0.640 |  Val. Acc: 63.91%
Epoch: 01 | Total Batch: 000100 | Training Time for latest 050 batches: 0m 7s
	Train Loss: 0.638 | Train Acc: 62.03%
	 Val. Loss: 0.534 |  Val. Acc: 73.39%
Epoch: 01 | Total Batch: 000150 | Training Time for latest 050 batches: 0m 7s
	Train Loss: 0.626 | Train Acc: 64.34%
	 Val. Loss: 0.617 |  Val. Acc: 69.81%
Epoch: 01 | Total Batch: 000200 | Training Time for latest 050 batches: 0m 7s
	Train Loss: 0.628 | Train Acc: 64.72%
	 Val. Loss: 0.678 |  Val. Acc: 54.17%
Epoch: 01 | Total Batch: 000250 | Training Time for latest 050 batches: 0m 7s
	Train Loss: 0.633 | Train Acc: 63.88%
	 Val. Loss: 0.630 |  Val. Acc: 64.15%
Epoch: 02 | Total Batch: 000050 | Training Time for latest 050 batches: 0m 7s
	Train Loss: 0.577 | Train Acc: 69.03%
	 Val. Loss: 0.525 |  Val. Acc: 74.64%
Epoch: 02 | Total Batch: 000100 | Training Time for latest 050 b

In [None]:
model.load_state_dict(torch.load('model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')