<a href="https://colab.research.google.com/github/Andrian0s/ML4NLP1-2023-Tutorial-Notebooks/blob/main/tutorial_notebooks/08_tutorial_lightning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import spacy
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.utils.data as data
# import torch.utils.data as data, torchvision as tv
import lightning as L

In [None]:
# Blank English pipeline, used further down purely to split sentences into tokens
nlp = spacy.blank("en")

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 7 GPU(s) available.
Device name: NVIDIA GeForce GTX TITAN X


In [None]:
# import the dataset (txt file) line by line
def load_text(path):
    """
    Read a text file and return its lines as cleaned strings.

    The file is opened in binary mode and every line is decoded with the
    default codec, silently dropping undecodable bytes; each line is then
    lower-cased and stripped of surrounding whitespace.

    Args:
        path (str): Location of the file to read.

    Returns:
        List[str]: One cleaned string per line, in file order.
    """
    with open(path, 'rb') as handle:
        return [raw.decode(errors='ignore').lower().strip() for raw in handle]

In [None]:
# Read the two polarity files
neg_text = load_text("movie_review_data/data/rt-polarity.neg")
pos_text = load_text("movie_review_data/data/rt-polarity.pos")
# Single corpus: all negative reviews first, then all positive reviews
texts = [*neg_text, *pos_text]
# Labels mirror that ordering: 0 for each negative review, 1 for each positive one
labels = np.array([0] * len(neg_text) + [1] * len(pos_text))

In [None]:
def tokenize(texts):
  """
  Tokenize every text with the module-level `nlp` pipeline and build a vocabulary.

  Ids 0 and 1 are reserved for '<pad>' and '<unk>'; every other token string
  receives the next free id in order of first appearance.

  Args:
    texts: Iterable of strings to tokenize.

  Returns:
    tuple: (tokenized_texts, word2idx, max_len) where `tokenized_texts` holds
    the object returned by `nlp` for each text, `word2idx` maps token string
    to id, and `max_len` is the largest token count over all texts.
  """
  # Special tokens occupy the first two ids
  word2idx = {'<pad>': 0, '<unk>': 1}
  tokenized_texts = []
  max_len = 0

  for sent in texts:
    doc = nlp(sent)
    tokenized_texts.append(doc)

    for token in doc:
      # Key on the token's string form (token objects and strings are different
      # things). The current vocabulary size is always the next unused id.
      word2idx.setdefault(token.text, len(word2idx))

    # Track the longest sentence seen so far
    if len(doc) > max_len:
      max_len = len(doc)

  return tokenized_texts, word2idx, max_len

In [None]:
def encode(tokenized_texts, word2idx, max_len):
    """
    Convert tokenized sentences into a fixed-width matrix of token ids.

    Every sentence is truncated to `max_len` tokens and then right-padded with
    the '<pad>' id, so all rows have exactly `max_len` entries. Tokens that
    are missing from `word2idx` are mapped to the '<unk>' id instead of
    producing `None` entries.

    Args:
        tokenized_texts: Iterable of token sequences. Each token is looked up
            by its `str()` form.
        word2idx (dict): Vocabulary mapping token string -> integer id. Must
            contain the '<pad>' and '<unk>' entries.
        max_len (int): Number of ids in every output row.

    Returns:
        np.ndarray: int64 array of shape (number of sentences, max_len).

    Raises:
        KeyError: If '<pad>' or '<unk>' is missing from `word2idx`.
    """
    pad_id = word2idx['<pad>']
    unk_id = word2idx['<unk>']

    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Truncate first so over-long sentences cannot create ragged rows
        tokens = list(tokenized_sent)[:max_len]

        # Encode tokens to ids, falling back to <unk> for unseen tokens
        ids = [word2idx.get(str(token), unk_id) for token in tokens]

        # Pad sentences to max_len
        ids.extend([pad_id] * (max_len - len(ids)))
        input_ids.append(ids)

    # Fixed dtype and an explicit 2-D shape, also when there are no sentences
    return np.array(input_ids, dtype=np.int64).reshape(len(input_ids), max_len)

In [None]:
# Build the vocabulary from the corpus, then turn every sentence into a padded row of ids
tokenized_texts, word2idx, max_len = tokenize(texts=texts)
input_ids = encode(
    tokenized_texts=tokenized_texts,
    word2idx=word2idx,
    max_len=max_len,
)

In [None]:
# Convert data type to torch.Tensor.
# torch.as_tensor accepts NumPy arrays as well as tensors, so this cell can be
# re-run safely even though it rebinds `labels` from an array to a tensor.
train_inputs = torch.as_tensor(input_ids)
labels = torch.as_tensor(labels)

In [None]:
import torch.utils.data as data
# Wrap inputs and labels in one dataset and split it 70% / 20% / 10%
# into train / validation / test subsets
dataset = TensorDataset(train_inputs, labels)
all_data = dataset  # same data under the older name, without building it twice
total_len = len(dataset)
train_size = int(0.7 * total_len)
val_size = int(0.2 * total_len)
# The test split takes whatever is left so the three sizes always sum to total_len
test_size = total_len - train_size - val_size

# This cell runs before the global seed is set, so the split gets its own
# seeded generator; otherwise the subsets would differ on every kernel restart.
train_data, val_data, test_data = data.random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
# Specify batch_size
batch_size = 8
# Training batches are drawn in random order
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Evaluation loaders keep the dataset order. They use the same batch size
# instead of DataLoader's default of 1, which would run one forward pass per sample.
val_dataloader = DataLoader(val_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [None]:
# Label of every example in the training split, in split order
train_classes = [sample[1] for sample in train_data]

In [None]:
class CNN(nn.Module):
    """Sentence classifier: embedding -> parallel Conv1d + ReLU + max pooling -> dropout -> linear."""

    def __init__(self,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """
        The constructor for CNN class.
        Args:
            vocab_size (int, optional): Number of rows in the embedding table.
                When omitted, the size of the module-level `word2idx`
                vocabulary is looked up at construction time.
            embed_dim (int): Dimension of word vectors. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100]
            num_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.5
        Raises:
            ValueError: If `filter_sizes` and `num_filters` differ in length.
        """

        super(CNN, self).__init__()

        # Resolve the vocabulary size when the model is built, not when the
        # class is defined, so a rebuilt vocabulary is picked up correctly.
        if vocab_size is None:
            vocab_size = len(word2idx)

        # Each filter size needs exactly one filter count
        if len(filter_sizes) != len(num_filters):
            raise ValueError(
                "filter_sizes and num_filters must have the same length, got "
                f"{len(filter_sizes)} and {len(num_filters)}"
            )

        # Embedding layer (index 0 is the padding token)
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=self.embed_dim,
                                      padding_idx=0,
                                      max_norm=5.0)
        # Conv Network: one Conv1d per (filter size, filter count) pair
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=n_filters,
                      kernel_size=size)
            for size, n_filters in zip(filter_sizes, num_filters)
        ])
        # Fully-connected layer and Dropout. A plain Python int is passed as
        # the input width rather than a NumPy scalar.
        self.fc = nn.Linear(int(sum(num_filters)), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                num_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (batch_size, max_len, embed_dim)
        x_embed = self.embedding(input_ids)

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling over the whole sequence. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
                       for x_conv in x_conv_list]

        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)

        # Compute logits. Output shape: (b, num_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

In [None]:
# Hyperparameters handed to the Lightning module; vocab_size follows the built vocabulary
conf = dict(
    vocab_size=len(word2idx),
    embed_dim=300,
    filter_sizes=[3, 4, 5],
    num_filters=[200, 200, 200],
    num_classes=2,
    dropout=0.5,
)

In [None]:
import torch.optim
from torchmetrics import Accuracy

class CNNLit(L.LightningModule):
    """Lightning wrapper that trains and evaluates the `CNN` text classifier."""

    def __init__(self, conf, *args, **kwargs):
        """
        Args:
            conf (dict): Hyperparameters. Must provide the keys 'vocab_size',
                'embed_dim', 'filter_sizes', 'num_filters', 'num_classes'
                and 'dropout'.
            *args, **kwargs: Accepted for call compatibility; not used.
        """
        super().__init__()

        # Expose every entry of `conf` through self.hparams
        self.save_hyperparameters(conf)

        # Create model
        self.model = CNN(
            vocab_size=self.hparams['vocab_size'],
            embed_dim=self.hparams['embed_dim'],
            filter_sizes=self.hparams['filter_sizes'],
            num_filters=self.hparams['num_filters'],
            num_classes=self.hparams['num_classes'],
            dropout=self.hparams['dropout']
        )

        # Create loss module
        self.loss_module = nn.CrossEntropyLoss()

        # Metrics are created once and registered on the module instead of
        # being rebuilt on every step. They follow `num_classes` rather than
        # assuming a binary task.
        self.val_acc = Accuracy(task='multiclass',
                                num_classes=self.hparams['num_classes'])
        self.test_acc = Accuracy(task='multiclass',
                                 num_classes=self.hparams['num_classes'])

    def forward(self, x):
        """Return class logits for a batch of token ids."""
        # in lightning, forward defines the prediction/inference actions
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        # "batch" is the output of the training data loader.
        input_ids, labels = batch
        logits = self(input_ids)
        loss = self.loss_module(logits, labels)
        self.log("train_loss", loss)
        return loss  # Return tensor to call ".backward" on

    def validation_step(self, batch, batch_idx):
        """Log loss and accuracy for one validation batch."""
        input_ids, labels = batch
        logits = self(input_ids)
        loss = self.loss_module(logits, labels)
        # Predicted class = index of the largest logit
        self.val_acc(logits.argmax(dim=1), labels)
        self.log('val_acc', self.val_acc)
        self.log("val_loss", loss)

    def test_step(self, batch, batch_idx):
        """Log accuracy for one test batch."""
        input_ids, labels = batch
        logits = self(input_ids)
        self.test_acc(logits.argmax(dim=1), labels)
        self.log("test_acc", self.test_acc)

    def configure_optimizers(self):
        """Use Adam over all parameters with a fixed learning rate of 1e-3."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [None]:
L.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the default CUDA device rather than a fixed index such as "cuda:3",
# which does not exist on machines with fewer than four GPUs.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Global seed set to 42


In [None]:
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

cnn = CNNLit(conf)
trainer = L.Trainer(
        # We run on a single GPU (if possible)
        accelerator="auto",
        devices=1,
        # Upper bound on the number of epochs; early stopping may end training sooner
        max_epochs=10,
        # Stop once the validation loss no longer decreases
        callbacks=[EarlyStopping(monitor="val_loss", mode="min")],
        # `auto_lr_find` is intentionally not passed: it only took effect through
        # trainer.tune(), which is never called here, and the Lightning 2.x
        # Trainer no longer accepts the argument.
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train on the training loader, validating on the validation loader
trainer.fit(
    model=cnn,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6]

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | CNN              | 6.3 M 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
6.3 M     Trainable params
0         Non-trainable params
6.3 M     Total params
25.097    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
    # Run the test loop on both held-out splits; metrics are logged as "test_acc" in each result
    val_result = trainer.test(model=cnn, dataloaders=val_dataloader, verbose=False)
    test_result = trainer.test(model=cnn, dataloaders=test_dataloader, verbose=False)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6]




Testing: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6]


Testing: 0it [00:00, ?it/s]

In [None]:
    # Both entries read "test_acc" because both splits were evaluated with the test loop
    result = {
        "test": test_result[0]["test_acc"],
        "val": val_result[0]["test_acc"],
    }
    print(result)

{'test': 0.7375820279121399, 'val': 0.73968106508255}
