In [1]:
from google.colab import drive
drive.mount('/content/drive') #mounting drive so that restarting runtime doesnt clear all files

Mounted at /content/drive


In [2]:
import datasets
from torch.utils.data import DataLoader, Dataset #necessary imports
import torch.nn as nn

In [3]:
dataset_train=datasets.load_from_disk('/content/drive/MyDrive/NLP_CSOC_IG/dataset_train')
dataset_test=datasets.load_from_disk('/content/drive/MyDrive/NLP_CSOC_IG/dataset_test')

In [4]:
print(dataset_train.column_names)
print(dataset_train[0])

['rating', 'text_review', 'input_ids', 'token_type_ids', 'attention_mask']
{'rating': 2, 'text_review': 'Earcuff Product arived so quickly and I love it I have cancer and wear scarves so the cuff is perfect Feel naked without earrings but this is great May have to get another one for the other earExcellent seller', 'input_ids': [101, 4540, 10841, 4246, 4031, 10488, 7178, 2061, 2855, 1998, 1045, 2293, 2009, 1045, 2031, 4456, 1998, 4929, 11228, 6961, 2061, 1996, 26450, 2003, 3819, 2514, 6248, 2302, 27212, 2021, 2023, 2003, 2307, 2089, 2031, 2000, 2131, 2178, 2028, 2005, 1996, 2060, 4540, 10288, 29109, 16136, 14939, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
import torch #Custom Dataset class
class ReviewDataset(Dataset):

    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        item = self.dataset[index]
        return {
            'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long),
            'rating': torch.tensor(max(0,item['rating']-1), dtype=torch.long)
        }

In [6]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from transformers import BertModel

# Load BERT once and freeze
bert = BertModel.from_pretrained('bert-base-uncased')
for param in bert.parameters():
    param.requires_grad = False
bert.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

### LSTM Architecture

In [7]:
##Using LSTM

class LSTMWithFrozenBert(nn.Module):
    def __init__(self, bert_model, hidden_dim, output_dim):
        super().__init__()
        self.bert=bert_model

        self.LSTM=nn.LSTM(self.bert.config.hidden_size,hidden_dim,batch_first=True)
        self.fc=nn.Linear(hidden_dim,output_dim)

    def forward(self,input_ids,attention_mask):
      with torch.no_grad():
        bert_output=self.bert(input_ids=input_ids,attention_mask=attention_mask)
        embeddings=bert_output.last_hidden_state
      out,_=self.LSTM(embeddings)
      out = (out[:, -1, :]) # Take the last timestep
      out = self.fc(out)

      return out

In [None]:
epochs=5
lr_rate=0.003
# instance of LSTM class

model=LSTMWithFrozenBert(bert,128,1)

criterion=nn.BCEWithLogitsLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=lr_rate) #parameters set for training the rn

In [20]:
from tqdm import tqdm #showing the progress bar

model.to(device) #efficiently use GPU if available

batch_dataset_train=DataLoader(ReviewDataset(dataset_train), batch_size=256, shuffle=True,num_workers=2,pin_memory=True) #enabling multiprocessing and pin memory for gpu

for i in (range(epochs)):

  model.train()

  running_loss = 0.0 #average loss over an epoch

  for batch in tqdm(batch_dataset_train):
        # Move inputs and labels to device
        input_ids = batch['input_ids'].to(device)

        labels = batch['rating'].float().to(device)

        attention_mask=batch['attention_mask'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids,attention_mask)  # forward pass

        # outputs shape: (batch_size, 1), labels shape: (batch_size)
        outputs = outputs.squeeze()  # remove extra dims if needed

        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

  avg_loss = running_loss / len(batch_dataset_train)
  print(f"Epoch [{i+1}/{epochs}], Loss: {avg_loss:.4f}")

100%|██████████| 106/106 [13:49<00:00,  7.82s/it]


Epoch [1/5], Loss: 0.4299


100%|██████████| 106/106 [13:48<00:00,  7.82s/it]


Epoch [2/5], Loss: 0.4134


100%|██████████| 106/106 [13:48<00:00,  7.81s/it]


Epoch [3/5], Loss: 0.3741


100%|██████████| 106/106 [13:47<00:00,  7.81s/it]


Epoch [4/5], Loss: 0.3471


100%|██████████| 106/106 [13:47<00:00,  7.80s/it]

Epoch [5/5], Loss: 0.3462





In [21]:
model.eval()

batch_dataset_test=DataLoader(ReviewDataset(dataset_test), batch_size=256, shuffle=False,num_workers=2,pin_memory=True)

running_loss=0.0

preds=[]

labels_fin=[]

for batch in tqdm(batch_dataset_test):
        # Move inputs and labels to device
        input_ids = batch['input_ids'].to(device)

        labels = batch['rating'].float().to(device)

        attention_mask=batch['attention_mask'].float().to(device)

        with torch.no_grad():
           outputs = model(input_ids,attention_mask)  # forward pass

          # outputs shape: (batch_size, 1), labels shape: (batch_size)
           outputs = outputs.squeeze()  # remove extra dims if needed

           preds.extend((torch.sigmoid(outputs)>=0.5).cpu().numpy())

           labels_fin.extend(labels.cpu().numpy())

           loss = criterion(outputs, labels)

           running_loss += loss.item()

avg_loss = running_loss / len(batch_dataset_test)
print(f"Loss: {avg_loss:.4f}")


100%|██████████| 12/12 [01:29<00:00,  7.43s/it]

Loss: 0.3163





In [None]:
from sklearn.metrics import accuracy_score,f1_score

print(f"Accuracy on test: {accuracy_score(labels_fin, preds):0.4f}") #accuracy on test data

print(f"F1-score on test: {f1_score(labels_fin, preds):0.4f}") #f1 score on test data

Accuracy on test: 0.8660
F1-score on test: 0.8542


In [23]:
print(model) # a summary of the model

LSTMWithFrozenBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

### RNN Architecture

In [24]:
class RNNWithFrozenBert(nn.Module):
    def __init__(self, bert_model, hidden_dim, output_dim):
        super().__init__()
        self.bert = bert_model  # Use externally passed BERT

        self.rnn = nn.RNN(self.bert.config.hidden_size, hidden_dim, batch_first=True, nonlinearity='tanh')
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = bert_output.last_hidden_state  # shape: (batch, seq_len, 768)

        out, _ = self.rnn(embeddings)
        out = out[:, -1, :]  # last timestep
        out = self.fc(out)
        return out

In [None]:
epochs=5
lr_rate=0.003
# instance of RNN class

model=RNNWithFrozenBert(bert,128,1)

criterion=nn.BCEWithLogitsLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=lr_rate) #parameters set for training the rn

In [28]:
model.to(device) #efficiently use GPU if available

batch_dataset_train=DataLoader(ReviewDataset(dataset_train), batch_size=256, shuffle=True,num_workers=2,pin_memory=True) #enabling multiprocessing and pin memory for gpu

for i in (range(epochs)):

  model.train()

  running_loss = 0.0 #average loss over an epoch

  for batch in tqdm(batch_dataset_train):
        # Move inputs and labels to device
        input_ids = batch['input_ids'].to(device)

        labels = batch['rating'].float().to(device)

        attention_mask=batch['attention_mask'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids,attention_mask)  # forward pass

        # outputs shape: (batch_size, 1), labels shape: (batch_size)
        outputs = outputs.squeeze()  # remove extra dims if needed

        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

  avg_loss = running_loss / len(batch_dataset_train)
  print(f"Epoch [{i+1}/{epochs}], Loss: {avg_loss:.4f}")

100%|██████████| 106/106 [13:37<00:00,  7.71s/it]


Epoch [1/5], Loss: 0.4564


100%|██████████| 106/106 [13:38<00:00,  7.72s/it]


Epoch [2/5], Loss: 0.3803


100%|██████████| 106/106 [13:37<00:00,  7.72s/it]


Epoch [3/5], Loss: 0.3837


100%|██████████| 106/106 [13:38<00:00,  7.72s/it]


Epoch [4/5], Loss: 0.3725


100%|██████████| 106/106 [13:38<00:00,  7.72s/it]

Epoch [5/5], Loss: 0.3635





In [29]:
model.eval()

batch_dataset_test=DataLoader(ReviewDataset(dataset_test), batch_size=256, shuffle=False,num_workers=2,pin_memory=True)

running_loss=0.0

preds=[]

labels_fin=[]

for batch in tqdm(batch_dataset_test):
        # Move inputs and labels to device
        input_ids = batch['input_ids'].to(device)

        labels = batch['rating'].float().to(device)

        attention_mask=batch['attention_mask'].to(device)

        with torch.no_grad():
           outputs = model(input_ids,attention_mask)  # forward pass

          # outputs shape: (batch_size, 1), labels shape: (batch_size)
           outputs = outputs.squeeze()  # remove extra dims if needed

           preds.extend((torch.sigmoid(outputs)>=0.5).cpu().numpy())

           labels_fin.extend(labels.cpu().numpy())

           loss = criterion(outputs, labels)

           running_loss += loss.item()

avg_loss = running_loss / len(batch_dataset_test)
print(f"Loss: {avg_loss:.4f}")


100%|██████████| 12/12 [01:29<00:00,  7.43s/it]

Loss: 0.3498





In [None]:
from sklearn.metrics import accuracy_score, f1_score

print(f"Accuracy on test: {accuracy_score(labels_fin, preds):0.4f}") #accuracy on test data

print(f"F1-score on test: {f1_score(labels_fin, preds):0.4f}") #f1 score on test data

Accuracy on test: 0.8510
F1-score on test: 0.8480


In [None]:
print(model) #model summary

RNNWithFrozenBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element