**Task:**

Create a DL training pipeline to train an LSTM/GRU (either one) and one Transformer
network to identify the sentiment of a sentence. You are free to choose any open-source
dataset (such as IMDB reviews, Amazon product ratings, or any other). Report
your accuracy. Create an inference pipeline that accepts user text and provides
a score for each sentiment.

# **2**. Transformer model approach (encoder trained from scratch)

In [None]:
# module installation
!pip install 'portalocker>=2.8.2' # for data_iter_downloading

Collecting portalocker>=2.8.2
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader

import torchtext
from torchtext.datasets import IMDB
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

import warnings
warnings.filterwarnings('ignore')

print(torch.__version__)
print(torchtext.__version__)

2.2.1+cu121
0.17.1+cpu


In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Uncomment to force CPU execution (e.g. while debugging):
# device = torch.device('cpu')
device

device(type='cuda')

In [None]:
# Load the IMDB train and test splits; each one is iterated as (label, review) pairs below.
train_data_iter, test_data_iter = IMDB(split=('train', 'test'))

In [None]:
import pandas as pd
def get_dataframe(iterator=train_data_iter):
    """Collect an iterable of ``(label, review)`` pairs into a DataFrame.

    Args:
        iterator: Iterable yielding two-element ``(label, text)`` items.
            Defaults to the training iterator bound when this function is defined.

    Returns:
        A ``pd.DataFrame`` with the labels in a ``'sentiment'`` column and the
        texts in a ``'review'`` column, in iteration order.
    """
    pairs = list(iterator)
    return pd.DataFrame({
        'sentiment': [label for label, _ in pairs],
        'review': [line for _, line in pairs],
    })


In [None]:
# Materialise both splits as DataFrames and print their shapes as a sanity check.
df_train = get_dataframe(train_data_iter)
print(df_train.shape)
df_test = get_dataframe(test_data_iter)
print(df_test.shape)

(25000, 2)
(25000, 2)


In [None]:
# Check the class balance of each split.
print(df_train['sentiment'].value_counts())
print(df_test['sentiment'].value_counts())

sentiment
1    12500
2    12500
Name: count, dtype: int64
sentiment
1    12500
2    12500
Name: count, dtype: int64


In [None]:

# Shuffle the training rows once; the fixed random_state keeps the order
# identical across kernel restarts so runs are reproducible.
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
import re
def clean_text(text):
    """Normalise a raw review into lowercase text with minimal punctuation.

    Steps, in order: drop short HTML tags, replace every character that is not
    an ASCII letter, space, period, comma or apostrophe with a space, collapse
    whitespace runs, trim surrounding spaces/periods, and lowercase.

    Args:
        text: Raw review string.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # Drop short HTML tags such as <br />. The 10-character cap leaves longer
    # angle-bracketed spans alone in case a reviewer wrote real text in brackets.
    text = re.sub(r"<[^>]{1,10}>", " ", text)

    # Replace everything except letters, spaces, periods, commas and apostrophes.
    text = re.sub(r"[^A-Za-z .,']", " ", text)

    # Collapse runs of whitespace into a single space.
    text = re.sub(r"\s+", " ", text)

    # Strip spaces and periods in one pass: stripping them separately left a
    # dangling space behind whenever a space preceded the final period.
    return text.strip(" .").lower()

In [None]:
# Normalise the review text of both splits with clean_text.
df_train['review'] = df_train['review'].apply(clean_text)
df_test['review'] = df_test['review'].apply(clean_text)

# Shift the labels from {1, 2} to {0, 1} so they can be used as class indices.
# NOTE(review): this cell is not idempotent — re-running it subtracts 1 again.
df_train['sentiment'] = df_train['sentiment'] - 1
df_test['sentiment'] = df_test['sentiment'] -1

In [None]:
# Re-pack the cleaned frames as in-memory lists of (sentiment, review) pairs.
train_data_iter = list(zip(df_train['sentiment'], df_train['review']))
test_data_iter = list(zip(df_test['sentiment'], df_test['review']))


In [None]:
# Build the vocabulary from the training reviews only.
from collections import Counter, OrderedDict

from torchtext.data.utils import get_tokenizer
# Imported under an alias so the `vocab` object created below does not shadow
# the factory function it was built with.
from torchtext.vocab import vocab as build_vocab

tokenizer = get_tokenizer('basic_english')
VOCAB_SIZE = 20000

# Count token frequencies over the training split.
counter = Counter()
for _, text in train_data_iter:
    tokens = tokenizer(text)
    counter.update(tokens)

# most_common() yields (token, count) pairs in descending-count order; the
# OrderedDict preserves that order, so frequent tokens get the lowest indices.
most_common = OrderedDict(counter.most_common(VOCAB_SIZE))

vocab = build_vocab(most_common, specials=["<unk>", "<pad>"])
# Resolve out-of-vocabulary lookups to <unk> instead of raising.
vocab.set_default_index(vocab["<unk>"])


In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
class BuildingReviewsDataset(Dataset):
    """Map-style dataset that turns review rows into fixed-length token-id tensors.

    Each item is a ``(token_ids, label)`` pair. The review is tokenized,
    truncated to ``max_length`` tokens, right-padded with ``'<pad>'`` and
    converted to vocabulary indices; tokens missing from the vocabulary are
    mapped to the index of ``'<unk>'``.

    Args:
        dataframe: Frame with a ``'review'`` and a ``'sentiment'`` column.
        tokenizer: Callable mapping a string to a list of tokens.
        vocab: Mapping supporting ``token in vocab`` and ``vocab[token]``.
        max_length: Number of token ids in every returned tensor.
    """

    PAD_TOKEN = '<pad>'
    UNK_TOKEN = '<unk>'

    def __init__(self, dataframe, tokenizer, vocab, max_length):
        self.data, self.tokenizer = dataframe, tokenizer
        self.vocab, self.max_length = vocab, max_length

    def __len__(self):
        return len(self.data)

    def _token_id(self, token):
        # The '<unk>' index is looked up lazily, only for unknown tokens.
        if token in self.vocab:
            return self.vocab[token]
        return self.vocab[self.UNK_TOKEN]

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        kept = self.tokenizer(row['review'])[:self.max_length]
        n_missing = self.max_length - len(kept)
        padded = kept + [self.PAD_TOKEN] * n_missing
        token_ids = torch.tensor(list(map(self._token_id, padded)))
        return token_ids, row['sentiment']

In [None]:
# Batching and sequence-length settings shared by both splits.
batch_size = 32
max_length = 250

train_dataset = BuildingReviewsDataset(df_train, tokenizer, vocab, max_length)
test_dataset = BuildingReviewsDataset(df_test, tokenizer, vocab, max_length)

# Shuffle only the training batches; keep the evaluation order fixed.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
class SentimentModelvTransformer(nn.Module):
    """Transformer-encoder classifier producing two sentiment logits per sequence.

    Pipeline: token embedding -> stacked ``nn.TransformerEncoderLayer`` ->
    mean over the sequence dimension -> linear layer with 2 outputs.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / encoder width.
        nhead: Number of attention heads per encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
    """

    def __init__(self, vocab_size, d_model=128, nhead=2, num_encoder_layers=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # batch_first=True is required because forward() feeds (batch, seq, d_model)
        # tensors. With the default (seq-first) layout the encoder would treat the
        # batch axis as the sequence and attend across different reviews.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
            num_layers=num_encoder_layers)
        self.fc = nn.Linear(d_model, 2)

    def forward(self, x):
        """Return logits of shape ``(batch, 2)`` for token ids ``x`` of shape ``(batch, seq)``."""
        x = self.embedding(x)            # (batch, seq, d_model)
        x = self.transformer_encoder(x)  # (batch, seq, d_model)
        # Average over the sequence axis. No padding mask is applied here, so
        # padded positions contribute to both the attention and this mean.
        x = x.mean(dim=1)
        x = self.fc(x)
        return x

In [None]:
def count_trainable_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Throwaway instance used only to report the model size.
# NOTE(review): this `model` is a separate object from `TransformerModel`, the
# instance the optimizer is built on — do not use `model` for training or evaluation.
model = SentimentModelvTransformer(len(vocab))

num_trainable_params = count_trainable_parameters(model)
print(f"Number of trainable parameters: {num_trainable_params}")

Number of trainable parameters: 4339586


In [None]:
# Instantiate the classifier sized to the vocabulary and move it to the selected device.
TransformerModel = SentimentModelvTransformer(len(vocab)).to(device)

In [None]:
# Training setup: number of passes over the training set, loss function and optimizer.
EPOCHS = 3
loss_fn = nn.CrossEntropyLoss().to(device)
# The optimizer updates the parameters of `TransformerModel`.
optimizer = torch.optim.Adam(TransformerModel.parameters(), lr=0.0001)

In [None]:
# Train and evaluate `TransformerModel`, recording per-epoch metrics.
torch.manual_seed(42)
epochs_list = []
training_loss_list = []
test_loss_list = []
test_acc_list = []


for epoch in range(EPOCHS):
    total_train_loss = 0
    train_acc = 0
    total_test_loss = 0
    test_acc = 0

    # ---- training pass ----
    TransformerModel.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        inputs, labels = batch
        # Batches come off the DataLoader on the CPU; move them next to the model.
        inputs = inputs.to(device)
        labels = labels.to(device=device, dtype=torch.long)

        # Forward through the model whose parameters the optimizer holds.
        # Calling a different instance here would leave TransformerModel untrained.
        outputs = TransformerModel(inputs)

        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        train_acc += (predicted == labels).sum().item()

    # Loss is averaged per batch, accuracy per sample.
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_accuracy = train_acc / len(df_train)

    # ---- evaluation pass ----
    TransformerModel.eval()
    with torch.inference_mode():
        for batch in test_dataloader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device=device, dtype=torch.long)

            outputs = TransformerModel(inputs)
            loss = loss_fn(outputs, labels)
            total_test_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            test_acc += (predicted == labels).sum().item()

    avg_test_loss = total_test_loss / len(test_dataloader)
    test_accuracy = test_acc / len(df_test)

    epochs_list.append(epoch)
    training_loss_list.append(avg_train_loss)
    test_loss_list.append(avg_test_loss)
    test_acc_list.append(test_accuracy)

    print(f"Epoch {epoch+1}, Avg Training Loss: {avg_train_loss:.3f}, Training Accuracy: {train_accuracy: .2f}, Avg Test Loss: {avg_test_loss:.3f}, Test Accuracy: {test_accuracy:.2f}")



In [None]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 5))

# # Plotting Loss
# plt.subplot(1, 2, 1)
# plt.plot(epochs_list, [tensor.detach().cpu().numpy() for tensor in training_loss_list] , label='Training Loss', marker='o')
# plt.plot(epochs_list, [tensor.detach().cpu().numpy() for tensor in test_loss_list], label='Test Loss', marker='x')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.title('Training and Test Loss')
# plt.legend()

# # Plotting Accuracy
# plt.subplot(1, 2, 2)
# plt.plot(epochs_list, test_acc_list, label='Test Accuracy', marker='o', color='green')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.title('Test Accuracy')
# plt.legend()

# plt.tight_layout()
# plt.show()


In [None]:
# inference pipeline

def preprocess_text(input_text, vocab, tokenizer, max_length):
    """Convert a raw string into a fixed-length tensor of vocabulary indices.

    The text is tokenized, truncated to ``max_length`` tokens, right-padded with
    ``'<pad>'`` and mapped through ``vocab``; tokens missing from the vocabulary
    map to the index of ``'<unk>'``. No text cleaning is done here, so callers
    should pass text normalised the same way as the training data.

    Args:
        input_text: The string to encode.
        vocab: Mapping supporting ``token in vocab`` and ``vocab[token]``.
        tokenizer: Callable mapping a string to a list of tokens.
        max_length: Number of token ids in the returned tensor.

    Returns:
        A 1-D ``torch.long`` tensor with ``max_length`` elements.
    """
    # Tokenize the argument itself (not a same-named global left over from another cell).
    tokens = tokenizer(input_text)[:max_length]
    padded_tokens = tokens + ['<pad>'] * (max_length - len(tokens))
    token_ids = [vocab[token] if token in vocab else vocab['<unk>'] for token in padded_tokens]
    # Explicit dtype keeps the result integer-typed even when the list is empty.
    tokenized_text = torch.tensor(token_ids, dtype=torch.long)
    return tokenized_text

def predict_sentiment(text, model, vocab, tokenizer, max_length):
    """Run ``model`` on a single piece of text and return its raw output.

    Args:
        text: The string to classify.
        model: A ``torch.nn.Module`` taking a ``(1, max_length)`` tensor of token ids.
        vocab: Vocabulary passed through to ``preprocess_text``.
        tokenizer: Tokenizer passed through to ``preprocess_text``.
        max_length: Sequence length passed through to ``preprocess_text``.

    Returns:
        Whatever ``model`` returns for the single-item batch (unnormalised
        logits for the classifier defined in this notebook).
    """
    # Switch the model that was passed in — not a global one — to eval mode.
    model.eval()
    # Run on whichever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    with torch.inference_mode():
        tokenized_text = preprocess_text(text, vocab, tokenizer, max_length)
        # Add the batch dimension expected by the model.
        input_tensor = tokenized_text.unsqueeze(0).to(target_device)
        prediction = model(input_tensor)
        return prediction

input_text = "worst movie ever"
# Normalise the input exactly like the training reviews before tokenizing.
sentiment = predict_sentiment(clean_text(input_text), TransformerModel, vocab, tokenizer, max_length)

# Turn the raw logits into one probability per class (index 0 = negative, 1 = positive).
scores = torch.softmax(sentiment, dim=-1).squeeze(0)
print(f"Scores: negative={scores[0].item():.3f}, positive={scores[1].item():.3f}")

sentiment_in_words = "positive" if sentiment.argmax() == 1 else "negative"
print("Sentiment:", sentiment_in_words)

Sentiment: positive
