In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Get to the folder we are at
FOLDERNAME = 'Colab\ Notebooks/NLPpractice'
%cd drive/MyDrive/$FOLDERNAME/

/content/drive/MyDrive/Colab Notebooks/NLPpractice


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd

In [4]:
# Seed for same output
torch.manual_seed(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [7]:
raw_data = pd.read_csv('IMDBDataset.csv')

In [8]:
# Get data & labels
reviews = raw_data.review
labels = raw_data.sentiment

In [9]:
# One hot encoding
# Replace 'positive' with 1; 'negative' with 0
labels.replace({'positive': 1, 'negative':0}, inplace=True)

In [10]:
labels

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [11]:
patterns = ['<br />', '--', '.', ',', '!', '?', ')', '(', ';', ':', '*', '~', '_', "'", '"']
replacements = [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '']

In [12]:
# Get rid of synbols that will not be used
def preprocessing(reviews, patterns, replacements):
  lst = []
  for i in range(len(reviews)):
    review = reviews[i].lower()
    for pattern, replacement in zip(patterns, replacements):
      review = review.replace(pattern, replacement)
    lst.append(review)
  return lst

In [13]:
reviews = preprocessing(reviews, patterns, replacements)

In [14]:
num_train = 35000
num_val = 15000
longest_num_tokens = 700

In [15]:
def indexing_tokens():
  indices = {'<SOS>':0, '<EOS>': 1, '<PAD>': 2, '<UNK>': 3}
  counter = 4
  for i in range(num_train):
    tokens = reviews[i].split()
    for token in tokens:
      if token not in indices:
        indices[token] = counter
        counter += 1
  return indices


In [16]:
def get_data(indices, longest_line_tokens, mode='train'):
    data = []
    Y = []
    if mode == 'train':
      for i in range(num_train):
        one_train_data = []
        y, tokens = labels[i], reviews[i].split()
        for token in tokens:
          one_train_data.append(indices[token])
          if len(one_train_data) == longest_line_tokens:
            break
        while len(one_train_data) < longest_line_tokens:
          one_train_data.append(indices['<PAD>'])
        one_train_data.insert(indices['<SOS>'], 0)
        one_train_data.append(indices['<EOS>'])
        data.append(one_train_data)
        Y.append(y)
    else:
      for i in range(num_train, num_train+num_val):
        one_val_data = []
        y, tokens = labels[i], reviews[i].split()
        for token in tokens:
          if token not in indices:
            one_val_data.append(indices['<UNK>'])
          else:
            one_val_data.append(indices[token])
          if len(one_val_data) == longest_line_tokens:
            break
        while len(one_val_data) < longest_line_tokens:
          one_val_data.append(indices['<PAD>'])
        one_val_data.insert(indices['<SOS>'], 0)
        one_val_data.append(indices['<EOS>'])
        data.append(one_val_data)
        Y.append(y)
    return data, Y

In [17]:
# Loading training data and validation data
indices = indexing_tokens()
training_data, training_labels = get_data(indices, longest_num_tokens)
val_data, val_labels = get_data(indices, longest_num_tokens, mode='val')  # split data

In [21]:
print(training_data[:10])
print('Number of training:', len(training_data))
print('Number of validation:', len(val_data))
print('Length of corpus:', len(indices))    # total amount of tokens

[[0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 6, 32, 33, 11, 34, 31, 35, 16, 36, 37, 38, 39, 40, 41, 5, 42, 43, 44, 45, 23, 46, 6, 47, 48, 49, 31, 25, 26, 50, 51, 52, 53, 6, 54, 55, 56, 57, 25, 52, 58, 59, 60, 30, 61, 62, 63, 64, 56, 42, 37, 26, 65, 45, 6, 66, 67, 5, 6, 47, 68, 26, 69, 16, 24, 11, 26, 6, 70, 71, 62, 6, 72, 73, 74, 75, 76, 68, 77, 78, 79, 80, 81, 82, 83, 84, 5, 6, 85, 86, 87, 6, 88, 89, 90, 91, 39, 92, 93, 94, 95, 26, 50, 96, 79, 6, 97, 98, 81, 26, 99, 62, 100, 101, 102, 103, 104, 105, 106, 107, 39, 108, 94, 109, 110, 111, 112, 113, 39, 114, 115, 22, 116, 117, 118, 119, 120, 121, 6, 122, 123, 5, 6, 52, 26, 124, 62, 6, 125, 11, 68, 126, 86, 7, 127, 128, 129, 130, 131, 132, 133, 53, 134, 135, 130, 136, 130, 137, 16, 138, 139, 140, 6, 32, 17, 119, 141, 142, 34, 31, 24, 94, 143, 68, 36, 144, 119, 145, 121, 119, 36, 146, 53, 68, 147, 24, 119, 148, 108, 119, 149, 51, 150, 53, 16, 39, 151, 152, 62, 6, 96, 153, 5,

In [26]:
# Create tensors of train & val
train_tensor = torch.tensor(training_data)
train_labels_tensor = torch.tensor(training_labels)
val_tensor = torch.tensor(val_data)
val_labels_tensor = torch.tensor(val_labels)

In [27]:
print('Train Tensor:', train_tensor.shape)
print('Val Tensor:', val_tensor.shape)
print('Train labels Tensor:', train_labels_tensor.shape)
print('Val labels Tensor:', val_labels_tensor.shape)

Train Tensor: torch.Size([35000, 702])
Val Tensor: torch.Size([15000, 702])
Train labels Tensor: torch.Size([35000])
Val labels Tensor: torch.Size([15000])


In [28]:
vocab_size = 122545
embedding_dim = 300
hidden_dim = 256
sequence_len = longest_num_tokens + 2
output_dim = 2
print_every = 400
batch_size = 32
qkv_dim = 200  # size of weight matrix
heads = 10  # times of parallel proccessing (how many slices of whole data)

In [30]:
class InputEncoding(nn.Module):
  def __init__(self,sequence_len, vocab_size, embedding_dim):
    super().__init__()
    self.word_emb = nn.Embedding(vocab_size, embedding_dim)
    self.positional_emb = nn.Embedding(sequence_len, embedding_dim)

  def forward(self, x):
    N, sequence_len = x.shape
    emb1 = self.word_emb(x)
    # N x sequence_len x embedding_dim

    position = torch.arange(0, sequence_len)  # [0, 1, 2 ..., 701]
    positions = position.expand(N, sequence_len).to(device)  # N * [0, 1, 2 ... 701], 2D-matrix
    emb2 = self.positional_emb(positions)
    # N x sequence_len x embedding_dim

    return emb1+emb2  # same dimension

In [32]:
class FeedForwardLayer(nn.Module):
  def __init__(self, emb_size, d_out):
    super().__init__()
    self.linear1 = nn.Linear(emb_size, d_out)
    self.linear2 = nn.Linear(d_out, emb_size)

  def forward(self, x):
    return self.linear2(nn.functional.dropout(nn.functional.relu(self.linear1(x))))  # emb_size

In [34]:
class MultiHeadSelfAttention(nn.Module):
  def __init__(self, embedding_dim, qkv_dim, heads):
    super().__init__()
    self.heads = heads
    assert qkv_dim%heads == 0, "Q, K, V dimension is not divisable by heads"

    self.n_features_in_head = qkv_dim // heads

    # dimensions become smaller in every head
    self.to_q = nn.Linear(embedding_dim, qkv_dim)  # fully connected
    self.to_k = nn.Linear(embedding_dim, qkv_dim)
    self.to_v = nn.Linear(embedding_dim, qkv_dim)

    # after training, need to go back to the original dimension
    self.to_out = nn.Linear(qkv_dim, embedding_dim)  # same as the input dimension

  def forward(self, q, k, v):
    # input sizes, q k v are emb1+emb2
    N, sequence_len, embedding_dim = q.shape

    # change dimension
    query, key, value = self.to_q(q), self.to_k(k), self.to_v(v)

    # multi heads
    query = query.reshape(N, sequence_len, self.heads, self.n_features_in_head)
    key = key.reshape(N, sequence_len, self.heads, self.n_features_in_head)
    value = value.reshape(N, sequence_len, self.heads, self.n_features_in_head)

    # compute cosine similarity (inner product of multi-dimensional matrix) as paper
    # inner product of query and key(NHQK) and QK have the same simencion
    similarity = torch.einsum('NQHF, NKHF->NHQK', [query, key])  # NQHF -> batch_size, q, head, feature
    scale = embedding_dim ** 0.5
    out = torch.softmax(similarity/scale, dim=3)         # to probability，get the highest value at dim=3
    out = torch.einsum('NHQK, NKHF->NQHF', [out, value])    # out is a similarity matrix, back to original dimension
    out = out.reshape(N, sequence_len, self.heads*self.n_features_in_head)
    out = self.to_out(out)
    return out

In [36]:
class ResidualBlock(nn.Module):
  def __init__(self):
    super().__init__()
    self.norm = nn.LayerNorm(embedding_dim)
  def forward(self, x, sub_layer):
    x = sub_layer(x)
    x = self.norm(x)
    return x + nn.functional.dropout(x)

In [39]:
class Encoder(nn.Module):
  def __init__(self, attention: MultiHeadSelfAttention, feed_forward: FeedForwardLayer, N, seq_len, vocab_size, emb_size):
    super().__init__()
    self.encoding = InputEncoding(sequence_len, vocab_size, embedding_dim)
    self.attention = attention
    self.feed_forward = feed_forward
    self.residual = ResidualBlock()
    self.norm = nn.LayerNorm(embedding_dim)
    self.out = nn.Linear(embedding_dim*sequence_len, 2)
  def forward(self, x):
    x = self.encoding(x)
    x = self.residual(x, lambda x: self.attention(x, x, x))  # out of multi-head
    x = self.residual(x, self.feed_forward)
    x = self.norm(x)
    x = torch.flatten(x, 1)
    return self.out(x)

In [40]:
model = Encoder(MultiHeadSelfAttention(embedding_dim, qkv_dim, heads), FeedForwardLayer(embedding_dim, embedding_dim), batch_size, sequence_len, vocab_size, embedding_dim)
model = model.cuda()

In [41]:
mini_trains = DataLoader(train_tensor, batch_size=batch_size)
mini_train_labels = DataLoader(training_labels, batch_size=batch_size)

mini_vals = DataLoader(val_tensor, batch_size=batch_size)
mini_val_labels = DataLoader(val_labels, batch_size=batch_size)

In [42]:
iterator = iter(mini_trains)
print(next(iterator).shape)

iterator = iter(mini_train_labels)
print(next(iterator).shape)

torch.Size([32, 702])
torch.Size([32])


In [43]:
# Training Procedure
def train(num_epoch, model, mini_trains, mini_train_labels, mini_vals, mini_val_labels, device, loss_function, optimizer):
  for epoch in range(num_epoch):
    num_iters = 0
    for x, y in zip(mini_trains, mini_train_labels):
      model.train()
      x = x.to(device)
      y = y.to(device)
      scores = model(x)
      loss = loss_function(scores, y)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      if num_iters % print_every == 0:
        evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device)
      num_iters += 1

In [44]:
# Evaluate Procedure
def evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device):
  model.eval()
  with torch.no_grad():
    acc_count = 0
    for x, y in zip(mini_vals, mini_val_labels):
      x=x.to(device)
      y=y.to(device)
      scores=model(x)
      predictions=scores.max(1)[1]
      acc = predictions.eq(y).sum().item()
      acc_count += acc
    print(f'Epoch[{epoch+1}] Acc: {acc_count/len(val_data)}')

In [45]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [46]:
# Start training
train(4, model, mini_trains, mini_train_labels, mini_vals, mini_val_labels, device, loss_function, optimizer)

Epoch[1] Acc: 0.5006666666666667
Epoch[1] Acc: 0.49846666666666667
Epoch[1] Acc: 0.5039333333333333
Epoch[2] Acc: 0.5106
Epoch[2] Acc: 0.5556
Epoch[2] Acc: 0.6764
Epoch[3] Acc: 0.7509333333333333
Epoch[3] Acc: 0.7642
Epoch[3] Acc: 0.8034666666666667
Epoch[4] Acc: 0.8416666666666667
Epoch[4] Acc: 0.8534
Epoch[4] Acc: 0.8689333333333333
