<a href="https://colab.research.google.com/github/Andrian0s/ML4NLP1-2023-Tutorial-Notebooks/blob/main/tutorial_notebooks/07_tutorial_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy

In [None]:
import os
import re
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import spacy
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
import torch.utils.data as data, torchvision as tv
import lightning as L

In [None]:
# Blank English spaCy pipeline; it is used below only to split text into tokens.
nlp = spacy.blank("en")

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

if not has_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 7 GPU(s) available.
Device name: NVIDIA GeForce GTX TITAN X


In [None]:
# download the dataset archive with wget into the local "data/" directory
# (if the dataset is on github, try git clone instead).
!wget -P "data/" https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz
# unpack the archive inside "data/"
!tar xvzf 'data/rt-polaritydata.tar.gz' -C 'data/'
# move the negative and positive review files out of the extracted
# "rt-polaritydata/" folder, directly into "data/"
!mv data/rt-polaritydata/rt-polarity.neg data/
!mv data/rt-polaritydata/rt-polarity.pos data/

In [None]:
# import the dataset (txt file) line by line
def load_text(path):
    """Read a text file and return its lines, lower-cased and stripped.

    The file is opened in binary mode and every line is decoded with
    `errors='ignore'`, so bytes that cannot be decoded are dropped instead of
    raising an error.

    Args:
        path (str): Path of the file to read.

    Returns:
        List[str]: One cleaned string per line of the file, in file order.
    """
    with open(path, 'rb') as raw_file:
        return [
            raw_line.decode(errors='ignore').lower().strip()
            for raw_line in raw_file
        ]

In [None]:
# Read the reviews from `data/`, which is where the download cell above moves
# `rt-polarity.neg` and `rt-polarity.pos` after unpacking the archive.
neg_text = load_text("data/rt-polarity.neg")
pos_text = load_text("data/rt-polarity.pos")

In [None]:
# One corpus: all negative reviews first, followed by all positive reviews.
texts = [*neg_text, *pos_text]

In [None]:
# Peek at the first five reviews.
texts[:5]

['simplistic , silly and tedious .',
 "it's so laddish and juvenile , only teenage boys could possibly find it funny .",
 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .',
 '[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation .',
 'a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification .']

In [None]:
# `texts` holds the negative reviews first and the positive ones after them,
# so the labels follow the same layout: 0 = negative, 1 = positive.
neg_labels = [0] * len(neg_text)
pos_labels = [1] * len(pos_text)
labels = np.array(neg_labels + pos_labels)

In [None]:
def tokenize(texts):
  """
  Tokenize every sentence and build the vocabulary of the corpus.

  Each sentence is passed through the module-level `nlp` tokenizer. Ids 0 and
  1 are reserved for `<pad>` and `<unk>`; every other token gets the next free
  id in order of first appearance.

  Args:
      texts: Iterable of sentences (strings).

  Returns:
      tokenized_texts (list): One `nlp(sent)` result per sentence.
      word2idx (dict): Mapping from token text to integer id.
      max_len (int): Length, in tokens, of the longest sentence
          (0 for an empty corpus).
  """
  # Reserved entries come first so that they receive ids 0 and 1.
  word2idx = {'<pad>': 0, '<unk>': 1}

  tokenized_texts = [nlp(sent) for sent in texts]

  for doc in tokenized_texts:
    for token in doc:
      # The vocabulary is keyed by the token's string (`token.text`), not by
      # the token object itself. Because ids are handed out consecutively,
      # the next free id always equals the current vocabulary size.
      word2idx.setdefault(token.text, len(word2idx))

  max_len = max((len(doc) for doc in tokenized_texts), default=0)

  return tokenized_texts, word2idx, max_len

In [None]:
def encode(tokenized_texts, word2idx, max_len):
    """Convert tokenized sentences into a fixed-width matrix of token ids.

    Each sentence is mapped token by token through `word2idx`, truncated to
    `max_len` tokens if it is longer, and right-padded with the `<pad>` id if
    it is shorter. Tokens that are missing from `word2idx` are mapped to the
    `<unk>` id instead of `None`.

    Args:
        tokenized_texts: Iterable of tokenized sentences. Every token is
            converted with `str(token)` before the vocabulary lookup.
        word2idx (dict): Mapping from token string to integer id.
        max_len (int): Number of columns of the returned array.

    Returns:
        np.ndarray: int64 array of shape (number of sentences, max_len).

    Raises:
        KeyError: If padding or an out-of-vocabulary token has to be encoded
            but `word2idx` has no `<pad>` / `<unk>` entry.
    """
    pad_id = word2idx.get('<pad>')
    unk_id = word2idx.get('<unk>')

    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Truncate first so that every row ends up with exactly `max_len` ids.
        tokens = list(tokenized_sent)[:max_len]

        # Unknown tokens fall back to the `<unk>` id.
        ids = [word2idx.get(str(token), unk_id) for token in tokens]

        # Right-pad short sentences up to `max_len`.
        ids.extend([pad_id] * (max_len - len(ids)))

        # A `None` here means a special token was needed but is not in the
        # vocabulary; fail loudly instead of producing an object array.
        if None in ids:
            raise KeyError(
                "word2idx must contain '<pad>' and '<unk>' to encode padding "
                "and out-of-vocabulary tokens")
        input_ids.append(ids)

    # Fix dtype and shape explicitly so the result is a 2-D integer matrix even
    # when there are no sentences at all.
    return np.array(input_ids, dtype=np.int64).reshape(len(input_ids), max_len)

In [None]:
# Build the vocabulary from the whole corpus, then turn every sentence into a
# row of `max_len` token ids.
tokenized_texts, word2idx, max_len = tokenize(texts)
input_ids = encode(tokenized_texts=tokenized_texts,
                   word2idx=word2idx,
                   max_len=max_len)

In [None]:
# Convert data type to torch.Tensor.
# `torch.as_tensor` accepts both NumPy arrays and tensors, so this cell can be
# re-run safely even though it rebinds `labels` from an array to a tensor
# (`torch.from_numpy` would raise a TypeError on the second run).
train_inputs = torch.as_tensor(input_ids)
# Class indices are stored as int64 (long), the dtype the loss function used
# later in this notebook expects for its targets.
labels = torch.as_tensor(labels, dtype=torch.long)

In [None]:
# First three raw reviews, for comparison with their encoded rows below.
texts[:3]

['simplistic , silly and tedious .',
 "it's so laddish and juvenile , only teenage boys could possibly find it funny .",
 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .']

In [None]:
# Encoded (padded) ids of the first three reviews.
train_inputs[:3]

tensor([[ 2,  3,  4,  5,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 8,  9, 10, 11,  5, 12,  3, 13, 14, 15, 16, 17, 18,  8, 19,  7,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0],
        [20,  5, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 23,
         24, 36, 37,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [None]:
import torch.utils.data as data

# Wrap inputs and labels so that each sample is an (input_ids, label) pair.
# `all_data` and `dataset` refer to the same object instead of two identical
# copies built from the same tensors.
all_data = dataset = TensorDataset(train_inputs, labels)

# 70% / 20% / remainder split into train / validation / test. The test split
# takes whatever is left so the three sizes always add up to `total_len`.
total_len = len(dataset)
train_size = int(0.7 * total_len)
val_size = int(0.2 * total_len)
test_size = total_len - train_size - val_size

# Seed the split so the same samples land in each subset on every run.
split_generator = torch.Generator().manual_seed(42)
train_data, val_data, test_data = data.random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator)

In [None]:
# Specify batch_size (shared by all three loaders)
batch_size = 8

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation and test data are read in their stored order. Passing
# `batch_size` explicitly keeps them from falling back to DataLoader's default
# of one sample per batch.
val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=batch_size)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=batch_size)

In [None]:
class CNN(nn.Module):
    """1-D convolutional sentence classifier.

    Token ids are embedded, passed through several parallel `nn.Conv1d`
    layers with different kernel sizes, max-pooled over the sequence axis,
    concatenated, and fed through dropout and a linear layer that produces
    the class logits.
    """

    def __init__(self,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """
        The constructor for CNN class.
        Args:
            vocab_size (int): Number of rows of the embedding table. When
                omitted, `len(word2idx)` is used, looked up at the moment the
                model is instantiated.
            embed_dim (int): Dimension of word vectors. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100]
            num_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.5
        Raises:
            ValueError: If `filter_sizes` and `num_filters` differ in length.
        """

        super().__init__()

        if len(filter_sizes) != len(num_filters):
            raise ValueError(
                "filter_sizes and num_filters must have the same length, got "
                f"{len(filter_sizes)} and {len(num_filters)}")

        # Resolve the default here rather than in the signature: a signature
        # default is evaluated once, when the class is defined, and would go
        # stale if the vocabulary were rebuilt afterwards.
        if vocab_size is None:
            vocab_size = len(word2idx)

        # Embedding layer
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=self.embed_dim,
                                      padding_idx=0,
                                      max_norm=5.0)
        # Conv Network: one Conv1d per (filter size, filter count) pair
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=n_out,
                      kernel_size=k_size)
            for k_size, n_out in zip(filter_sizes, num_filters)
        ])
        # Fully-connected layer and Dropout.
        # Built-in `sum` keeps `in_features` a plain Python int.
        self.fc = nn.Linear(sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                num_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (batch_size, max_len, embed_dim)
        x_embed = self.embedding(input_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling over the whole sequence axis. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
                       for x_conv in x_conv_list]

        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)

        # Compute logits. Output shape: (b, num_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

In [None]:
import torch.optim as optim

# Build the CNN classifier and move its parameters to `device` (GPU/CPU).
model = CNN(
    embed_dim=300,
    filter_sizes=[3, 4, 5],
    num_filters=[100, 100, 100],
    num_classes=2,
    dropout=0.5,
).to(device)

# Adadelta optimizer over all model parameters.
optimizer = optim.Adadelta(params=model.parameters(), lr=0.01, rho=0.95)

In [None]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

# Number of passes over the training data
num_epochs = 1

# Start training loop
print("Start training...\n")
print(f"{'Epoch':^7} | {'Train Loss':^12}")
print("-"*60)

for epoch_i in range(num_epochs):
  total_loss = 0
  # Put the model into the training mode
  model.train()
  for step, batch in enumerate(train_dataloader):

    # Load batch to GPU
    b_input_ids, b_labels = tuple(t.to(device) for t in batch)

    # Zero out any previously calculated gradients
    model.zero_grad()

    # Perform a forward pass. This will return logits.
    logits = model(b_input_ids)

    # Compute loss and accumulate the loss values
    loss = loss_fn(logits, b_labels)

    total_loss += loss.item()

    # Perform a backward pass to calculate gradients
    loss.backward()

    # Update parameters
    optimizer.step()

  # Calculate the average loss over the entire training data. This runs once
  # per epoch, after the batch loop, so `avg_train_loss` is always defined
  # before it is printed.
  avg_train_loss = total_loss / len(train_dataloader)
  print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f}")

Start training...

 Epoch  |  Train Loss 
------------------------------------------------------------
torch.Size([8, 300])
   1    |   0.693357  
