References:

https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

https://pytorch.org/tutorials/beginner/transformer_tutorial.html

https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/5%20-%20Multi-class%20Sentiment%20Analysis.ipynb

https://github.com/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb

https://towardsdatascience.com/deep-learning-for-nlp-with-pytorch-and-torchtext-4f92d69052f

https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [1]:
import torch
from torch.utils.data import dataset
from torch import nn, Tensor
import spacy

nlp = spacy.load("en_core_web_sm")

use_gpu = torch.cuda.is_available()
if use_gpu:
  print("CUDA available.\nPytorch set with GPU")
  DEVICE = torch.device("cuda")
  result = spacy.require_gpu()
  print("Spacy set with GPU." if result else None)
else:
  print("CUDA not available. CPU processing")
  DEVICE = torch.device("cpu")
DEVICE



CUDA available.
Pytorch set with GPU
Spacy set with GPU.


device(type='cuda')

In [2]:
import pandas as pd
from torch.utils.data import Dataset

In [79]:
class ReviewsDataset(Dataset):
    def __init__(self, reviews_file):
        self.df = pd.read_csv(reviews_file)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        review = self.df.iloc[idx, 0]
        starts = self.df.iloc[idx, 1]
        return starts,review 

    def review(self, idx):
        return self.df.iloc[idx, 0]

    def stars(self, idx):
        return self.df.iloc[idx, 1]

In [80]:
label_type = 'final_label'

In [81]:
train_iter = ReviewsDataset(f'../dataset/{label_type}/train.csv')
val_iter = ReviewsDataset(f'../dataset/{label_type}/train.csv')
test_iter = ReviewsDataset(f'../dataset/{label_type}/test.csv')

In [82]:
SEED = 1234

import random
import numpy as np

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cuddeterministic = True

In [84]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for stars, review in data_iter:
        yield tokenizer(review)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [85]:
vocab(['here', 'is', 'an', 'example'])

[286, 12, 39, 266]

In [86]:
rows_iter = iter(train_iter)


In [87]:
next(rows_iter)

(1, 'Four Stars. thought provoking')

In [88]:
tokens_iter = iter(yield_tokens(train_iter))

In [89]:
next(tokens_iter)

['four', 'stars', '.', 'thought', 'provoking']

### Vocabulary size:

In [90]:
len(vocab)

8159

In [91]:
# vocab_spacy = {}

# for index,row in enumerate(iter(train_iter)):
#     doc = nlp(row[0])
#     sentences = doc.sents

#     for sent in sentences:
#         tokens = nlp(sent.text)

#     for tkn in tokens:
#         if tkn.text in vocab_spacy.keys():
#             vocab_spacy[tkn.text] += 1
#         else:
#             vocab_spacy[tkn.text] = 1
# len(vocab_spacy)

vocab_spacy: 4653 (takes 5 min to process with GPU enabled)

In [92]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: x

In [93]:
text_pipeline('here is an example')

[286, 12, 39, 266]

In [94]:
text_pipeline('This one is a weird one for me to write')

[15, 48, 12, 8, 1586, 48, 18, 58, 6, 791]

In [95]:
text_pipeline("<pad>")

[1]

In [96]:
PAD_IDX = vocab(['<pad>'])[0]
PAD_IDX

1

In [153]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [154]:
# Number of batches for training
train_batch_size = 64

# Number of batches for validation. Use a larger value than training.
# It helps speed up the validation process.
valid_batch_size = 100

In [155]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return text_list.to(DEVICE), label_list.to(DEVICE), offsets.to(DEVICE)

train_dataloader = DataLoader(train_iter, batch_size=train_batch_size, shuffle=False, collate_fn=collate_batch)

print(next(iter(train_dataloader)))

(tensor([204,  70,   2,  ..., 580, 180,   2]), tensor([1, 0, 1, 1, 2, 0, 1, 2, 2, 1, 0, 2, 2, 1, 1, 1, 2, 0, 2, 1, 2, 0, 2, 2,
        1, 0, 2, 2, 1, 2, 2, 0, 1, 2, 1, 0, 2, 2, 0, 1, 0, 2, 2, 2, 2, 1, 1, 2,
        2, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 0, 2, 2, 0, 1]), tensor([   0,    5,   32,  108,  127,  135,  264,  350,  360,  438,  688,  719,
         747,  796,  886,  917,  995, 1086, 1149, 1185, 1209, 1231, 1235, 1294,
        1417, 1452, 1751, 1782, 1837, 1878, 1900, 1950, 2093, 2165, 2256, 2455,
        2522, 2581, 2589, 2620, 2870, 2924, 2938, 2974, 3033, 3063, 3138, 3160,
        3239, 3245, 3289, 3355, 3367, 3409, 3936, 4001, 4011, 4040, 4179, 4202,
        4353, 4368, 4401, 4424]))


In [156]:
val_dataloader = DataLoader(val_iter, batch_size=valid_batch_size, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_iter, batch_size=valid_batch_size, shuffle=False, collate_fn=collate_batch)

In [105]:
print(next(iter(train_dataloader)))

(tensor([204,  70,   2,  ..., 580, 180,   2]), tensor([1, 0, 1, 1, 2, 0, 1, 2, 2, 1, 0, 2, 2, 1, 1, 1, 2, 0, 2, 1, 2, 0, 2, 2,
        1, 0, 2, 2, 1, 2, 2, 0, 1, 2, 1, 0, 2, 2, 0, 1, 0, 2, 2, 2, 2, 1, 1, 2,
        2, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 0, 2, 2, 0, 1]), tensor([   0,    5,   32,  108,  127,  135,  264,  350,  360,  438,  688,  719,
         747,  796,  886,  917,  995, 1086, 1149, 1185, 1209, 1231, 1235, 1294,
        1417, 1452, 1751, 1782, 1837, 1878, 1900, 1950, 2093, 2165, 2256, 2455,
        2522, 2581, 2589, 2620, 2870, 2924, 2938, 2974, 3033, 3063, 3138, 3160,
        3239, 3245, 3289, 3355, 3367, 3409, 3936, 4001, 4011, 4040, 4179, 4202,
        4353, 4368, 4401, 4424]))


In [106]:
tt_iter = iter(train_dataloader)
sample = next(tt_iter)
print(len(sample[0]))
print(len(sample[1]))
print(len(sample[2]))

4495
64
64


In [133]:
import torch.nn.functional as F
from torch.nn import Embedding, ModuleList, Conv2d, Module, Linear, Dropout

class CNN(Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = ModuleList([Conv2d(in_channels = 1, 
                                        out_channels = n_filters, 
                                        kernel_size = (fs, embedding_dim)) 
                            for fs in filter_sizes
                            ])
        
        self.fc = Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = Dropout(dropout)
        
    # def forward(self, text):#, offsets):
    def forward(self, text, offsets):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text, offsets)
        # embedded = self.embedding(text)#, offsets)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

FastText Embeddings

In [135]:
# from torchtext.vocab import FastText
# embedding = FastText('simple')

CharNGram Embeddings

In [136]:
# from torchtext.vocab import CharNGram
# embedding_charngram = CharNGram()

GloVe Embeddings

In [137]:
# from torchtext.vocab import GloVe
# embedding_glove = GloVe(name='6B', dim=100)

In [138]:
# EMBEDDING_LAYER = torch.nn.Embedding.from_pretrained(embedding.vectors,freeze=False)

In [139]:
# EMBEDDING_LAYER.embedding_dim

In [140]:
# myvocab = vocab(EMBEDDING_LAYER.stoi)

In [141]:
# EMBEDDING_LAYER.stoi['<unk>']

In [142]:
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 3
DROPOUT = 0.5

In [143]:
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
# model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [144]:
if use_gpu:
    print("Trying to use GPU")
    import torch.backends.cudnn as cudnn
    torch.cuda.init()
    cudnn.benchmark = True
    model.cuda()
model

Trying to use GPU


CNN(
  (embedding): Embedding(8159, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (fc): Linear(in_features=300, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [145]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 937,103 trainable parameters


In [146]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()
# criterion = nn.CrossEntropyLoss(    )

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [147]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [148]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [149]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [150]:
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        # predicted_label = model(text)#, offsets)
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            # predicted_label = model(text)#, offsets)
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [152]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train], generator = torch.Generator('cuda'))

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

TypeError: forward() takes 2 positional arguments but 3 were given