<a href="https://colab.research.google.com/github/AnXiaoNuan/geektime_learn_NLP/blob/master/pytorch_textcnn_torchtext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import nltk
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torchtext.legacy import data, datasets


import random

In [23]:
import time

In [24]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# prepare data

tokenize, build vocabulary, covert text into word index.

Field defines how to process text, here is the most common parameters:

sequential – Whether the datatype represents sequential data. If False, no tokenization is applied. Default: True.

use_vocab – Whether to use a Vocab object. If False, the data in this field should already be numerical. Default: True.

preprocessing – The Pipeline that will be applied to examples using this field after tokenizing but before numericalizing. Many Datasets replace this attribute with a custom preprocessor. Default: None.

batch_first – Whether to produce tensors with the batch dimension first. Default: False.





In [25]:
nltk.download('punkt')
tokenizer = word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
TEXT = data.Field(tokenize=tokenizer, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL, root='/home')

In [27]:
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [28]:
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size= MAX_VOCAB_SIZE, vectors="glove.6B.300d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data, valid_data, test_data)

# build iterator

In [29]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_sizes = (BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
    sort_within_batch = True,
    device = device)

cuda


# Define Model

In [30]:

class textCNNMulti(nn.Module):

    def __init__(self, args):
        super().__init__()
        dim = args['dim']
        n_class = args['n_class']
        embeddings=args['embedding_matrix']
        kernels = [3, 4, 5]
        kernel_number = [150, 150, 150]
        self.static_embed = nn.Embedding.from_pretrained(embeddings)
        self.non_static_embed = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.convs = nn.ModuleList([nn.Conv2d(2, number, (size, dim), padding=(size-1,0)) for (size, number) in zip(kernels, kernel_number)])
        self.dropout = nn.Dropout()
        self.out = nn.Linear(sum(kernel_number), n_class)

    def forward(self, x):
        #print('x original shape is ', x.shape) # shape: [137, 64]
        non_static_input = self.non_static_embed(x) 
        static_input = self.static_embed(x) # shape: [sequence_length, batch_size, embedding_dim]
        x = torch.stack([non_static_input, static_input], dim=1) # shape: [137, 2, 64, 300]
        x = x.permute(2, 1, 0, 3) # shape: [64, 2, 137, 300]
        #print('x after being stacked shape is ',x.shape)
        conv_pool_x = []
        for conv in self.convs:
            relu_x = nn.functional.relu(conv(x)) # shape: [64, 150, 139, 1]
            #print('relu_x before squeezing', relu_x.shape)
            relu_x = relu_x.squeeze(3) # [64, 150, 139]
            #print('relu_x after squeezing', relu_x.shape)
            pool_x = nn.functional.max_pool1d(relu_x, relu_x.size(2)) # [64, 150]
            #print('pool_x before squeezing', pool_x.shape)
            pool_x = pool_x.squeeze(2) # [64, 150]
            #print('pool_x after squeezing', pool_x.shape)
            conv_pool_x.append(pool_x)

        #print('len(conv_pool_x):', len(conv_pool_x))
        #print('conv_pool_x[0].shape:', conv_pool_x[0].shape) # [64, 150]
        conv_pool_x = torch.cat(conv_pool_x, 1) # [64, 450]
        #print('conv_pool_x:', conv_pool_x.shape)
        conv_pool_x = self.dropout(conv_pool_x) # [64, 450]
        #print('conv_pool_x after dropout:', conv_pool_x.shape)
        conv_pool_x = self.out(conv_pool_x)
        return conv_pool_x


In [31]:
a = torch.tensor([[1,2,3], [4, 5, 6]])
b = torch.tensor([[7, 8, 9], [11, 12, 13]])
c = torch.stack([a, b], dim=1)
c

tensor([[[ 1,  2,  3],
         [ 7,  8,  9]],

        [[ 4,  5,  6],
         [11, 12, 13]]])

In [32]:
c.shape

torch.Size([2, 2, 3])

# initialize model

In [33]:
args={}
args['vocb_size']=len(TEXT.vocab)
args['dim']=300
args['n_class']=len(LABEL.vocab)
args['embedding_matrix']=TEXT.vocab.vectors
args['lr']=0.001
args['momentum']=0.8
args['epochs']=180
args['log_interval']=100
args['test_interval']=500
args['save_dir']='./'
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = textCNNMulti(args)
model = model.to(device)
model.static_embed.weight.data[UNK_IDX] = torch.zeros(args['dim'])
model.static_embed.weight.data[PAD_IDX] = torch.zeros(args['dim'])

In [34]:
model.non_static_embed.weight.data[UNK_IDX] = torch.zeros(args['dim'])
model.non_static_embed.weight.data[PAD_IDX] = torch.zeros(args['dim'])

# initialize optimizer

In [35]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=args['lr'])
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [36]:
def binary_accuracy(logits, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    softmax = nn.Softmax(dim=1)
    probs = softmax(logits)
    _, y_pred_tags = torch.max(probs, dim = 1)
    correct = (y_pred_tags == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

# train

In [37]:
def train(model, iterator, optimizer, criterion, epoch, validate_after_n_batch):
    # epoch_loss = 0
    # epoch_acc = 0

    best_valid_loss = float('inf')

    n_batch_train_loss = 0
    n_batch_train_acc = 0

    model.train()

    total_batch = 0

    start_time = time.time()

    for batch in iterator:

        #print(batch)
        total_batch += 1

        optimizer.zero_grad()

        text, tex_lengths = batch.text

        #print(text.shape)

        text = text.cuda()

        logits = model(text).squeeze(1)

        label = batch.label.type(torch.long)

        loss = criterion(logits, label)

        acc = binary_accuracy(logits, label)

        loss.backward()

        optimizer.step()

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        n_batch_train_loss += loss.item()
        n_batch_train_acc += acc.item()

        avg_batch_train_loss = n_batch_train_loss / total_batch
        avg_batch_train_acc = n_batch_train_acc / total_batch

        if total_batch % validate_after_n_batch == 0:
            # validation
            avg_batch_valid_loss, avg_batch_valid_acc = evaluate(model, val_iter, criterion)

            if avg_batch_valid_loss < best_valid_loss:
                best_valid_loss = avg_batch_valid_loss
                torch.save(model.state_dict(), 'model.pt')
            print(f'Epoch: {epoch+1:02} | Total Batch: {total_batch:06} | Training Time for latest {validate_after_n_batch:03} batches: {epoch_mins}m {epoch_secs}s' )
            print(f'\tTrain Loss: {avg_batch_train_loss:.3f} | Train Acc: {avg_batch_train_acc*100:.2f}%')
            print(f'\t Val. Loss: {avg_batch_valid_loss:.3f} |  Val. Acc: {avg_batch_valid_acc*100:.2f}%')
            start_time = time.time()
            model.train()
        
        
  #return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [38]:
def evaluate(model, iterator, criterion):
    
    total_loss = 0
    total_acc = 0
    
    model.eval()

    #print(len(iterator))
    
    with torch.no_grad():
    
        for batch in iterator:
          
            text, text_lengths = batch.text

            text = text.cuda()

            label = batch.label.type(torch.long)

            logits = model(text).squeeze(1)

            loss = criterion(logits, label)
            #print('loss:', loss)

            acc = binary_accuracy(logits, batch.label)
            #print('acc:', acc)

            total_loss += loss.item()
            total_acc += acc.item()
        
    return total_loss / len(iterator), total_acc / len(iterator)

In [39]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [40]:

N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    validate_after_n_batch = 50
    train(model, train_iter, optimizer, criterion, epoch, validate_after_n_batch)        

Epoch: 01 | Total Batch: 000050 | Training Time for latest 050 batches: 0m 7s
	Train Loss: 0.778 | Train Acc: 58.13%
	 Val. Loss: 0.630 |  Val. Acc: 64.19%
Epoch: 01 | Total Batch: 000100 | Training Time for latest 050 batches: 0m 4s
	Train Loss: 0.736 | Train Acc: 59.94%
	 Val. Loss: 0.587 |  Val. Acc: 68.47%
Epoch: 01 | Total Batch: 000150 | Training Time for latest 050 batches: 0m 5s
	Train Loss: 0.709 | Train Acc: 61.64%
	 Val. Loss: 0.547 |  Val. Acc: 72.74%
Epoch: 01 | Total Batch: 000200 | Training Time for latest 050 batches: 0m 6s
	Train Loss: 0.678 | Train Acc: 63.77%
	 Val. Loss: 0.490 |  Val. Acc: 76.96%
Epoch: 01 | Total Batch: 000250 | Training Time for latest 050 batches: 0m 6s
	Train Loss: 0.649 | Train Acc: 65.70%
	 Val. Loss: 0.458 |  Val. Acc: 78.43%
Epoch: 02 | Total Batch: 000050 | Training Time for latest 050 batches: 0m 5s
	Train Loss: 0.417 | Train Acc: 80.72%
	 Val. Loss: 0.400 |  Val. Acc: 81.71%
Epoch: 02 | Total Batch: 000100 | Training Time for latest 050 b

In [None]:
model.load_state_dict(torch.load('model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')