<a href="https://colab.research.google.com/github/Apoak/Deep-Learning-Projects/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tokenizers transformers torchmetrics

In [None]:
import os
import random

import numpy as np
import pandas as pd
# The sklearn submodules used below are imported explicitly; a bare
# "import sklearn" is not guaranteed to load them on every scikit-learn version.
import sklearn
import sklearn.feature_extraction.text
import sklearn.model_selection
import sklearn.neural_network
import torch
import torchmetrics
import tqdm
from torch import nn
from torch.utils.data import TensorDataset
from torch.utils.data import Dataset, DataLoader

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Fetch the IMDB reviews CSV only when it is not already present in the working
# directory, so re-running the notebook does not download it again.
if not os.path.exists('IMDB-Dataset.csv'):
  !wget -O IMDB-Dataset.csv -q "https://www.dropbox.com/scl/fi/0c7zc2adk1mgwgut5w80w/IMDB-Dataset.csv?rlkey=1drfg4zw36mhu32ndy2ihnygw&dl=1"

In [None]:
# Read the downloaded reviews into a DataFrame and preview the first rows.
csv_path = 'IMDB-Dataset.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Strip the HTML line-break markup from every review. The markup is replaced
# with a space (not an empty string) so the words on either side of a break are
# not glued into a single token; regex=False treats the pattern as a literal.
text = list(df['review'].str.replace('<br />', ' ', regex=False))
# Binary targets: 0 for a negative review, 1 for a positive one.
labels = np.array(df['sentiment'].map({'negative':0,'positive':1}))

In [None]:
# Sanity check: the frame and the label vector should have the same row count.
for shape in (df.shape, labels.shape):
    print(shape)

(50000, 2)
(50000,)


In [None]:
from transformers import AutoTokenizer

# Only the tokenizer of the cased BERT base checkpoint is loaded here; the
# pretrained model weights themselves are not used.
TOKENIZER_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

Example of how to tokenize text:


In [None]:
# Use the first ten characters of the first review as a tiny tokenization example.
first_review = text[0]
seq = first_review[:10]
seq

In [None]:
# Build the demo string under a new name so that re-running this cell does not
# keep appending "[0]" to seq (the cell is now idempotent).
seq_marked = seq + "[0]"
print(seq_marked)
# Sub-word pieces as strings ...
tokens = tokenizer.tokenize(seq_marked)
print(tokens)
# ... and the integer ids produced by calling the tokenizer directly.
token_ids = tokenizer(seq_marked)['input_ids']
token_ids


In [None]:
# Decode the ids back to text after appending three extra 0 ids (the index this
# notebook uses for padding) to see how they are rendered.
padded_ids = token_ids + [0] * 3
tokenizer.decode(padded_ids)

**Bag of words model.**

 Create a 90/10 train/test split of the data. Create TF-IDF
weighted histograms (using TfidfVectorizer) using the top 1000 words and train
an MLP model (MLPClassifier) to classify them. Compute the train and test
accuracy of the model (using the .score() function).

In [None]:
# 90/10 train/test split. random_state is fixed so the split (and therefore the
# reported accuracies) is the same on every run.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    text, labels, test_size=0.1, random_state=42)

# TF-IDF features restricted to the 1000 highest-ranked vocabulary terms. The
# vectorizer is fitted on the training reviews only and then applied unchanged
# to the test reviews, so no test-set statistics leak into the features.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(max_features=1000)
x_train_V = vectorizer.fit_transform(x_train)
x_test_V = vectorizer.transform(x_test)

In [None]:
# Fit a default-configured MLP on the TF-IDF features; fit() returns the
# estimator itself, so the fitted model is bound to mlp and then displayed.
mlp = sklearn.neural_network.MLPClassifier().fit(x_train_V, y_train)
mlp

In [None]:
# Report mean accuracy on the held-out reviews first, then on the training reviews.
for split_name, features, targets in (("Test: ", x_test_V, y_test),
                                      ("Train: ", x_train_V, y_train)):
    print(split_name, mlp.score(features, targets))

**RNN model:**

Train a GRU to process sequences of BPE tokens and output a binary
sentiment prediction. (Don't forget to set the batch_first flag if needed!)
Use an Embedding layer to map the BPE tokens to embedding vectors for input
to the GRU.
If the text is too long, take a random sub-sequence; if the text is too short, pad it
using token index 0.

In [None]:
# Split the raw reviews 90/10 before any tokenization. random_state is fixed so
# the same reviews land in the train and test sets on every run.
train_texts, test_texts, train_labels, test_labels = sklearn.model_selection.train_test_split(
    text, labels, test_size=0.1, random_state=42)
# Fixed sequence length handed to ReviewDataset as its max_length.
max_seq_length = 100

class ReviewDataset(Dataset):
    """Map-style dataset that yields one fixed-length, embedded review per index.

    Each item is an ``(embedded, label)`` pair. The review is tokenized, cropped
    to a random window of ``max_length`` token ids when it is longer than that,
    right-padded with the pad id when it is shorter, and finally passed through
    ``embedding``.

    Args:
        texts: Sequence of raw review strings.
        labels: Sequence of targets aligned with ``texts``.
        tokenizer: Callable returning a mapping with an ``'input_ids'`` entry
            (a Hugging Face tokenizer in this notebook).
        max_length: Number of token ids every item is cropped/padded to.
        embedding: Callable mapping a 1-D tensor of token ids to vectors
            (an ``nn.Embedding`` in this notebook).

    NOTE(review): the embedding lookup runs here under ``torch.no_grad()``, so
    the embedding layer receives no gradient from whatever consumes these
    items -- confirm that a fixed, untrained embedding is what is intended.
    """

    def __init__(self, texts, labels, tokenizer, max_length, embedding):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.embedding = embedding
        # Pad with the tokenizer's own pad id when it exposes one, else index 0.
        pad_id = getattr(tokenizer, 'pad_token_id', None)
        self.pad_token_id = 0 if pad_id is None else pad_id

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(embedded, label)`` for the review at position ``idx``.

        The DataLoader calls this with the index of each sample it needs.
        """
        text = self.texts[idx]

        # Tokenize the whole review first. Cropping has to happen on token ids:
        # slicing the raw string would measure max_length in characters and
        # leave only a short snippet of text followed by mostly padding.
        # Special tokens are not added, so the window holds text tokens only.
        encoded = self.tokenizer(text, add_special_tokens=False, truncation=False)
        token_ids = list(encoded['input_ids'])

        overflow = len(token_ids) - self.max_length
        if overflow > 0:
            # Too long: keep a random contiguous window of max_length tokens.
            start_idx = random.randint(0, overflow)
            token_ids = token_ids[start_idx:start_idx + self.max_length]
        elif overflow < 0:
            # Too short (or empty): right-pad up to max_length.
            token_ids = token_ids + [self.pad_token_id] * (-overflow)

        input_ids = torch.tensor(token_ids, dtype=torch.long)

        # No autograd graph is built while samples are being produced.
        with torch.no_grad():
            embedded = self.embedding(input_ids)

        return embedded, self.labels[idx]

# One embedding table shared by both datasets: a row per tokenizer vocabulary
# entry, 100 dimensions per token.
embedding_layer = nn.Embedding(len(tokenizer), 100)

# Wrap each split in a ReviewDataset that uses the same tokenizer and length.
train_dataset, test_dataset = (
    ReviewDataset(split_texts, split_labels, tokenizer, max_seq_length, embedding_layer)
    for split_texts, split_labels in ((train_texts, train_labels),
                                      (test_texts, test_labels))
)

# Batches of 64; only the training data is reshuffled every epoch.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

**Tokenized sequence:**

In [None]:
from torch.nn.utils.rnn import pad_sequence
import random

**GRU INITIALIZATION:**

In [None]:
class GRU(nn.Module):
    """GRU sequence classifier with a residual connection from the input.

    The prediction is computed from the GRU output at the last time step plus
    the input vector at that same step (linearly projected to ``hidden_size``
    when the two widths differ), followed by dropout and a linear head.

    Args:
        input_size: Width of each input vector.
        hidden_size: Width of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        output_size: Number of values produced per sequence.
        dropout_rate: Dropout probability, used both between GRU layers and
            before the linear head.

    NOTE(review): only the last time step is read. If the inputs are
    right-padded, that position may hold padding rather than text -- confirm
    this is acceptable for the data being fed in.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate)

        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)
        # Maps the input to hidden_size so it can be added to the GRU output.
        self.input_projection = nn.Linear(input_size, hidden_size)

    def forward(self, x):
        """Score a batch of sequences.

        Args:
            x: Tensor laid out as (batch, seq_len, input_size); the GRU is
                built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, output_size) holding raw scores; no sigmoid
            is applied here.
        """
        # With no explicit initial state nn.GRU starts from zeros on the same
        # device and dtype as x, so the module does not rely on a global device.
        out, _ = self.gru(x)

        last_out = out[:, -1, :]
        last_in = x[:, -1, :]

        # Residual connection. Only the last step is used downstream, so the
        # projection is applied to that step alone instead of the whole sequence.
        if last_in.size(-1) != self.hidden_size:
            last_in = self.input_projection(last_in)

        features = self.dropout(last_out + last_in)
        return self.fc(features)



**Hyperparameter tuning:**

In [None]:
# Model configuration: 100-dimensional inputs, a 3-layer GRU with 128 hidden
# units, a single output value per review, and 30% dropout.
gru_config = dict(
    input_size=100,
    hidden_size=128,
    num_layers=3,
    output_size=1,
    dropout_rate=0.3,
)

# Build the model and place it on the selected device in one step.
gru = GRU(**gru_config).to(device)

In [None]:
# Training schedule and step size.
epochs = 20
lr = 3e-4

# The loss takes raw scores: BCEWithLogitsLoss applies the sigmoid internally.
loss_fn = nn.BCEWithLogitsLoss()

# Adam over the GRU model's parameters.
opt = torch.optim.Adam(params=gru.parameters(), lr=lr)

**GRU Training:**

In [None]:
# Train gru
def train_gru(gru, train_loader, epochs, optimizer, criterion):
  """Train ``gru`` for ``epochs`` passes over ``train_loader``.

  Args:
    gru: Model called as ``gru(x_batch)``; its output is flattened to 1-D.
    train_loader: Iterable of ``(x_batch, y_batch)`` tensor pairs.
    epochs: Number of passes over ``train_loader``.
    optimizer: Optimizer that updates ``gru``'s parameters.
    criterion: Loss called as ``criterion(predictions, targets.float())``.

  Returns:
    List with one entry per epoch: the mean of that epoch's batch losses
    (``nan`` for an epoch in which the loader produced no batches).
  """
  # Run on whatever device the model lives on instead of a global setting.
  first_param = next(gru.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')

  epoch_losses = []
  for epoch in range(epochs):
    gru.train()
    loss_sum = 0.0
    batch_count = 0
    for x_batch, y_batch in train_loader:
      x_batch = x_batch.to(model_device)
      y_batch = y_batch.to(model_device)

      optimizer.zero_grad()
      # Flatten (batch, 1) scores to (batch,) to match the target shape.
      y_pred = gru(x_batch).view(-1)

      loss = criterion(y_pred, y_batch.float())
      loss.backward()
      # Cap the global gradient norm at 1 before stepping.
      torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm=1)
      optimizer.step()

      loss_sum += loss.item()
      batch_count += 1

    # Report the epoch average rather than the last batch's loss alone; this
    # also keeps the report well-defined when the loader is empty.
    mean_loss = loss_sum / batch_count if batch_count else float('nan')
    epoch_losses.append(mean_loss)
    print(f"Epoch: {epoch}, Loss: {mean_loss}")

  return epoch_losses

In [None]:
# Run the training loop with the model, data and settings defined above.
train_gru(
    gru=gru,
    train_loader=train_loader,
    epochs=epochs,
    optimizer=opt,
    criterion=loss_fn,
)

**GRU Evaluation:**

In [None]:
def test_gru(gru, test_loader):
  """Measure the binary accuracy of ``gru`` over every batch in ``test_loader``.

  Args:
    gru: Model called as ``gru(x_batch)`` and returning raw scores.
    test_loader: Iterable of ``(x_batch, y_batch)`` tensor pairs.

  Returns:
    The accuracy accumulated over all batches, as a Python float. The same
    value is also printed.
  """
  # Evaluate on whatever device the model lives on instead of a global setting.
  first_param = next(gru.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')

  gru.eval()
  metric = torchmetrics.classification.BinaryAccuracy().to(model_device)
  with torch.no_grad():
    for x_batch, y_batch in test_loader:
      x_batch = x_batch.to(model_device)
      y_batch = y_batch.to(model_device)
      # Turn raw scores into probabilities. view(-1) keeps the result 1-D even
      # for a batch of one, where squeeze() would collapse it to a 0-d tensor.
      y_prob = torch.sigmoid(gru(x_batch)).view(-1)
      # Accumulate across batches; the final figure is computed once below.
      metric.update(y_prob, y_batch)

    acc = metric.compute().item()
    print(f"Final Accuracy: {acc}")

  return acc

In [None]:
# Accuracy on the held-out test reviews.
test_gru(gru=gru, test_loader=test_loader)

**Training-set evaluation (sanity check, not a held-out validation set):**

In [None]:
# Same evaluation routine, run on the training reviews for comparison.
test_gru(gru=gru, test_loader=train_loader)