<a href="https://colab.research.google.com/github/Dhanush123555/ML_Escapades/blob/main/TFNS_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import torch
from datasets import load_dataset

ds = load_dataset("zeroshot/twitter-financial-news-sentiment")

In [22]:
import re

class Tokenizer():
  """Word-level tokenizer that maps whitespace-separated words to integer ids.

  Id 0 is reserved for padding and is never assigned to a word. Words receive
  consecutive ids starting at 1, in order of first appearance during `fit`.
  """

  def __init__(self):
    self.table = {}    # word -> id
    self.reverse = {}  # id -> word

  def getVocab(self):
    """Return the vocabulary size, counting the reserved padding id 0."""
    return len(self.table) + 1

  def fit(self, ds):
    """Add every unseen word found in `ds` to the vocabulary.

    `ds` must expose `num_rows` and integer indexing that yields a mapping
    with a "text" entry. Calling `fit` more than once extends the existing
    vocabulary instead of re-using ids that are already taken.
    """
    # Continue after the highest id already handed out (0 is reserved for
    # padding), so that a repeated fit never collides with earlier words.
    next_id = max(self.table.values(), default=0) + 1
    for i in range(ds.num_rows):
      text = self.cleanText(str(ds[i]["text"]))
      for word in text.strip().split():
        if word not in self.table:
          self.table[word] = next_id
          self.reverse[next_id] = word
          next_id += 1

  def cleanText(self, text):
    """Lower-case `text`, drop URLs and turn some punctuation into spaces."""
    text = text.lower()
    text = re.sub(r"http\S*", " ", text)   # remove anything starting with "http"
    text = re.sub(r'\.$', ' . ', text)     # isolate a trailing full stop
    text = re.sub(r'\. ', ' . ', text)     # isolate full stops followed by a space
    text = re.sub(r'[:()]', ' ', text)     # colons and parentheses become spaces
    return text

  def single_tokenize(self, element):
    """Convert `element["text"]` into a list of known word ids.

    Words that are not in the vocabulary are silently dropped.
    Returns a dict with the single key "input_ids".
    """
    # str() mirrors what fit() does, so both accept the same inputs.
    text = self.cleanText(str(element["text"]))
    ids = [self.table[word] for word in text.strip().split() if word in self.table]
    return {"input_ids" : ids}

  def tokenize(self, ds):
    """Apply `single_tokenize` to every row of `ds` through `ds.map`."""
    return ds.map(self.single_tokenize, batched = False)

  def single_decode(self, element):
    """Convert `element["input_ids"]` back into a list of words.

    Padding ids (0) are skipped because they carry no word. Returns a dict
    with the single key "output". Raises KeyError for any other unknown id.
    """
    words = []
    for token_id in element["input_ids"]:
      token_id = int(token_id)  # accept plain ints as well as 0-d tensors
      if token_id == 0:
        continue
      words.append(self.reverse[token_id])
    return {"output" : words}

  def decode(self, ds):
    """Apply `single_decode` to every row of `ds` through `ds.map`."""
    return ds.map(self.single_decode, batched = False)


In [23]:
def filter_data(data, labels):
  """Drop empty sequences together with their labels.

  Returns a tuple `(kept_data, kept_labels, lengths)` where `lengths` is an
  int64 tensor holding the length of every kept sequence, in input order.
  """
  # Positions of the sequences that contain at least one element.
  kept = [idx for idx, seq in enumerate(data) if len(seq) > 0]

  kept_data = [data[idx] for idx in kept]
  kept_labels = [labels[idx] for idx in kept]
  kept_lengths = [len(data[idx]) for idx in kept]

  return kept_data, kept_labels, torch.tensor(kept_lengths, dtype = torch.int64)

In [24]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class BatchedDataset(Dataset):
  """Dataset over padded sequences, their labels and their original lengths.

  All sequences are padded once, up front, to the length of the longest one
  (`pad_sequence` with `batch_first = True`), so every item has the same shape.
  """

  def __init__(self, data, labels, lengths):
    padded = pad_sequence(data, batch_first = True)
    label_tensor = torch.tensor(labels, dtype = torch.long)

    self.data, self.labels, self.lengths = padded, label_tensor, lengths

  def __len__(self):
    # One label per sample, so the label count is the dataset size.
    return len(self.labels)

  def __getitem__(self, idx):
    sequence = self.data[idx]
    label = self.labels[idx]
    length = self.lengths[idx]
    return sequence, label, length

In [25]:
# The vocabulary is built from the training split only.
tokenizer = Tokenizer()
tokenizer.fit(ds["train"])

# --- training split: tokenize -> torch format -> drop empty rows -> loader ---
train_ds = tokenizer.tokenize(ds["train"])
train_ds.set_format(type="torch", columns=["input_ids","label"])
train_labels = train_ds["label"][:]
train_data = train_ds["input_ids"][:]
train_data, train_labels, train_lengths = filter_data(train_data, train_labels)
final_train_ds = BatchedDataset(train_data, train_labels, train_lengths)
train_dl = DataLoader(final_train_ds, batch_size = 64, shuffle = True)

# --- validation split (used as the test set): same pipeline ---
test_ds = tokenizer.tokenize(ds["validation"])
test_ds.set_format(type="torch", columns=["input_ids","label"])
test_labels = test_ds["label"][:]
test_data = test_ds["input_ids"][:]
test_data, test_labels, test_lengths = filter_data(test_data, test_labels)
final_test_ds = BatchedDataset(test_data, test_labels, test_lengths)
test_dl = DataLoader(final_test_ds, batch_size = 64, shuffle = True)

In [85]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

class SentimentRNN(nn.Module):
  """Bidirectional GRU classifier over padded token-id sequences.

  Args:
    vocab_size: number of rows in the embedding table.
    embed_dim: size of each word embedding (default 32).
    hidden_dim: GRU hidden size per direction (default 64).
    num_classes: number of output logits (default 3).
    dropout: dropout probability applied before the linear layer (default 0.2).
  """

  def __init__(self, vocab_size, embed_dim = 32, hidden_dim = 64, num_classes = 3, dropout = 0.2):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.RNN = nn.GRU(embed_dim, hidden_dim, bidirectional = True, batch_first=True)
    # Forward and backward final states are concatenated, hence 2 * hidden_dim.
    self.fc1 = nn.Linear(2 * hidden_dim, num_classes)
    self.Dropout = nn.Dropout(dropout)

  def forward(self, x, lens):
    """Return class logits of shape (batch, num_classes).

    Args:
      x: (batch, seq) tensor of token ids, padded along the sequence axis.
      lens: true length of every sequence in `x` (tensor or list).
    """
    embedded = self.embedding(x)
    # pack_padded_sequence expects a lengths tensor to live on the CPU.
    lengths = lens.cpu() if torch.is_tensor(lens) else lens
    packed = pack_padded_sequence(embedded, lengths, batch_first = True, enforce_sorted = False)
    # No explicit h0: the GRU then starts from zeros created with the input's
    # own device and dtype, instead of a hard-coded CPU/float32 tensor.
    _, hidden = self.RNN(packed)
    # hidden[0] is the forward direction's final state, hidden[1] the backward one.
    features = torch.cat((hidden[0], hidden[1]), dim = 1)
    features = self.Dropout(features)
    outputs = self.fc1(features)

    return outputs

In [86]:
# Build the classifier; the embedding table is sized by the tokenizer's
# vocabulary count (which includes the reserved padding id 0).
model = SentimentRNN(tokenizer.getVocab())
# Printing an nn.Module lists its registered sub-modules.
print(model)

SentimentRNN(
  (embedding): Embedding(21492, 32)
  (RNN): GRU(32, 64, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=128, out_features=3, bias=True)
  (Dropout): Dropout(p=0.2, inplace=False)
)


In [17]:
# Bare expression: the notebook displays its value, which is a generator
# object rather than the parameters themselves.
model.parameters()

<generator object Module.parameters at 0x783bfffa7140>

In [56]:
# Let's calculate Train accuracy

def calculate_train_accuracy():
  """Return the fraction of training samples the global `model` gets right.

  Switches `model` to eval mode, iterates once over `train_dl` and divides
  the number of correct predictions by the number of training samples.
  """
  model.eval()

  n_correct = 0
  with torch.no_grad():
    for inputs, targets, seq_lens in train_dl:
      logits = model(inputs, seq_lens)
      predictions = logits.argmax(dim = 1)  # highest logit = predicted class
      n_correct += torch.eq(predictions, targets).sum()

  n_samples = train_lengths.size(0)
  return n_correct / n_samples


# The train accuracy is 0.9968, which suggests the model is overfitting the training data

In [35]:
# Now let's calculate test accuracy

def calculate_test_accuracy():
  """Return the fraction of test samples the global `model` gets right.

  Switches `model` to eval mode, iterates once over `test_dl` and divides
  the number of correct predictions by the number of test samples.
  """
  model.eval()

  n_correct = 0
  with torch.no_grad():
    for inputs, targets, seq_lens in test_dl:
      logits = model(inputs, seq_lens)
      predictions = logits.argmax(dim = 1)  # highest logit = predicted class
      n_correct += torch.eq(predictions, targets).sum()

  n_samples = test_lengths.size(0)
  return n_correct / n_samples

In [87]:
import torch.optim as optim

# Adam over all model parameters; cross-entropy on the raw logits.
optimizer = optim.Adam(model.parameters(), lr = 0.001)
criterion = nn.CrossEntropyLoss()

epochs = 10
for epoch in range(epochs):
  # The accuracy helpers called below put the model in eval mode, so training
  # mode has to be re-enabled at the start of every epoch.
  model.train()
  epoch_loss = 0  # sum (not mean) of the per-batch losses in this epoch
  for batch_x, batch_y, lens in train_dl:
    optimizer.zero_grad()  # clear gradients left over from the previous step
    output = model(batch_x, lens)
    loss = criterion(output, batch_y)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()

  # Report the summed loss plus train/test accuracy after every epoch.
  print(f"Epoch {epoch}: Loss = {epoch_loss}    Train_accuracy: {calculate_train_accuracy()}    Test Accuracy: {calculate_test_accuracy()}")

Epoch 0: Loss = 131.3566222190857    Train_accuracy: 0.6776391863822937    Test Accuracy: 0.6803863644599915
Epoch 1: Loss = 113.10533368587494    Train_accuracy: 0.7283782362937927    Test Accuracy: 0.7097858190536499
Epoch 2: Loss = 98.00137269496918    Train_accuracy: 0.773770809173584    Test Accuracy: 0.7265854477882385
Epoch 3: Loss = 83.08433765172958    Train_accuracy: 0.8259775638580322    Test Accuracy: 0.7236455082893372
Epoch 4: Loss = 69.54334226250648    Train_accuracy: 0.8612014055252075    Test Accuracy: 0.7417051792144775
Epoch 5: Loss = 55.831190502271056    Train_accuracy: 0.8921270370483398    Test Accuracy: 0.7572448253631592
Epoch 6: Loss = 43.73532725870609    Train_accuracy: 0.9321731925010681    Test Accuracy: 0.7555648684501648
Epoch 7: Loss = 32.09863857552409    Train_accuracy: 0.9548170566558838    Test Accuracy: 0.7564048767089844
Epoch 8: Loss = 22.70220957696438    Train_accuracy: 0.9659293293952942    Test Accuracy: 0.7538849115371704
Epoch 9: Loss = 15

In [None]:
# The test accuracy is 73.54%, which is great for a vanilla RNN, although our model does overfit on the training data
"""
Model:
    SentimentRNN(
    (embedding): Embedding(21492, 100)
    (RNN): RNN(100, 32, batch_first=True, dropout=0.2, bidirectional=True)
    (fc1): Linear(in_features=64, out_features=3, bias=True)
  )

Accuracy: Train: 99.95%, Test: 73.54%

Model:
  SentimentRNN(
    (embedding): Embedding(21492, 64)
    (RNN): RNN(64, 32, batch_first=True, dropout=0.2, bidirectional=True)
    (fc1): Linear(in_features=64, out_features=3, bias=True)
  )

Accuracy: Train: 99.97% Test: 71.7%

Model:
    SentimentRNN(
    (embedding): Embedding(21492, 64)
    (RNN): RNN(64, 16, batch_first=True, dropout=0.2, bidirectional=True)
    (fc1): Linear(in_features=32, out_features=3, bias=True)
  )

Accuracy: Train: 94% Test: 68%

Model:
    SentimentRNN(
    (embedding): Embedding(21492, 32)
    (RNN): RNN(32, 32, batch_first=True, dropout=0.2, bidirectional=True)
    (fc1): Linear(in_features=64, out_features=3, bias=True)
  )

Accuracy: Train: 93.9% Test: 71.3%


"""

In [None]:
"""
  LSTM:
    Model:
      SentimentRNN(
      (embedding): Embedding(21492, 32)
      (RNN): LSTM(32, 32, batch_first=True, dropout=0.2, bidirectional=True)
      (fc1): Linear(in_features=64, out_features=3, bias=True)
      )

    Accuracy: Train: 96.7% Test: 76.7%

    Model:
      SentimentRNN(
      (embedding): Embedding(21492, 32)
      (RNN): LSTM(32, 64, batch_first=True, dropout=0.2, bidirectional=True)
      (fc1): Linear(in_features=128, out_features=3, bias=True)
      )

    Accuracy: Train: 98.66% Test: 76.4%

    Model:
      SentimentRNN(
      (embedding): Embedding(21492, 16)
      (RNN): LSTM(16, 32, batch_first=True, dropout=0.2, bidirectional=True)
      (fc1): Linear(in_features=64, out_features=3, bias=True)
      )

    Accuracy: Train: 90.82% Test: 73.16%
"""

In [None]:
"""
  GRU:
    Model:
      SentimentRNN(
      (embedding): Embedding(21492, 16)
      (RNN): GRU(16, 32, batch_first=True, dropout=0.2, bidirectional=True)
      (fc1): Linear(in_features=64, out_features=3, bias=True)
      )

    Accuracy: Train: 89.38% Test: 75.17%

    Model:
      SentimentRNN(
      (embedding): Embedding(21492, 32)
      (RNN): GRU(32, 64, batch_first=True, dropout=0.2, bidirectional=True)
      (fc1): Linear(in_features=128, out_features=3, bias=True)
      )

    Accuracy: Train: 98.9% Test: 76.98%

    Model:
      SentimentRNN(
      (embedding): Embedding(21492, 32)
      (RNN): GRU(32, 64, batch_first=True, bidirectional=True)
      (Dropout): Dropout(p=0.2, inplace=False)
      (fc1): Linear(in_features=128, out_features=3, bias=True)
      )

    Accuracy: Train: 98.34% Test: 73.24%

"""