# Attention LSTM with BERT Embeddings (Demo)


## Setup

### Mount to Drive

In [2]:
# Colab-only step: mount Google Drive at /content/drive. The cells below
# change into a project folder on that mount and use paths relative to it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Work from the project folder so the relative paths used later in this
# notebook (./data, ./models, ./out) resolve against it.
%cd /content/drive/MyDrive/Colab\ Notebooks/group_project

/content/drive/MyDrive/Colab Notebooks/group_project


### Necessary Package Installation

In [None]:
!pip install accelerate

### Necessary Package Import

In [27]:
from pathlib import Path
from typing import Dict

import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizer, BertTokenizer, BertModel

## Dataset

### Definition

In [28]:
# premise,hypothesis,label
class NILDataset(Dataset):
    """Premise/hypothesis pair dataset backed by a CSV file.

    The CSV must provide ``premise`` and ``hypothesis`` columns and, unless
    ``is_testing`` is set, a ``label`` column. Empty cells are replaced with
    empty strings when the file is loaded.

    Args:
        root: Path of the CSV file to read.
        tokenizer: Callable tokenizer used to encode each sentence pair.
        max_length: Length every encoded pair is padded or truncated to.
        is_testing: When ``True`` the ``label`` column is never read and the
            returned items carry no ``"label"`` entry.
    """

    def __init__(self, root: str, *, tokenizer: PreTrainedTokenizer, max_length: int, is_testing: bool = False) -> None:
        self._df = pd.read_csv(root).fillna("")
        self._tokenizer = tokenizer
        self._max_length = max_length
        self._is_test = is_testing

    def __len__(self) -> int:
        """Return the number of rows read from the CSV."""
        return len(self._df)

    def __getitem__(self, index) -> Dict:
        """Encode the sentence pair stored at position ``index``.

        Args:
            index: Row position; negative values count from the end.

        Returns:
            A dict with ``"input_ids"`` and ``"attention_mask"`` tensors (the
            leading batch dimension added by the tokenizer is removed) and,
            for labelled data, a float32 scalar tensor under ``"label"``.
        """
        # Positional lookup: samplers address rows by position, and .iloc also
        # accepts negative indices, which the label-based .loc rejects.
        row = self._df.iloc[index]

        # Calling the tokenizer directly replaces the deprecated encode_plus();
        # the keyword arguments are unchanged.
        encoding = self._tokenizer(
            row["premise"], row["hypothesis"],
            add_special_tokens=True,
            max_length=self._max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        target = {
            "input_ids": encoding['input_ids'].squeeze(0),  # Remove the batch dimension
            "attention_mask": encoding['attention_mask'].squeeze(0)  # Remove the batch dimension
        }

        if not self._is_test:
            target["label"] = torch.tensor(row["label"], dtype=torch.float32)

        return target

### Initialisation

In [7]:
# Sequence budget and checkpoint name shared by the tokenizer and the dataset.
max_length = 256
bert_model = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_model)

# Unlabelled split: is_testing=True makes the dataset skip the "label" column.
test_dataset = NILDataset(
    "./data/NLI/test.csv",
    tokenizer=tokenizer,
    max_length=max_length,
    is_testing=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Model

### Architecture

In [8]:
class AttentionLayer(nn.Module):
    """Softmax-weighted pooling of a sequence of hidden states.

    A single linear unit scores every time step; the scores are normalised
    with a softmax over the sequence axis and used to form a weighted sum of
    the hidden states.

    Args:
        hidden_dim: Size of the last dimension of the hidden states.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_output, attention_mask=None):
        """Pool ``lstm_output`` over its sequence dimension.

        Args:
            lstm_output: Tensor of shape (batch_size, seq_len, hidden_dim).
            attention_mask: Optional (batch_size, seq_len) tensor whose
                non-zero entries mark the positions allowed to receive
                weight (e.g. non-padding tokens). When omitted every
                position takes part, as before.

        Returns:
            Context vectors of shape (batch_size, hidden_dim).
        """
        attention_scores = self.attention(lstm_output).squeeze(2)  # (batch_size, seq_len)

        if attention_mask is not None:
            keep = attention_mask.to(device=attention_scores.device, dtype=torch.bool)
            # The most negative finite value (not -inf) drives masked weights
            # to zero while keeping a fully masked row free of NaNs.
            fill_value = torch.finfo(attention_scores.dtype).min
            attention_scores = attention_scores.masked_fill(~keep, fill_value)

        attention_weights = torch.softmax(attention_scores, dim=1).unsqueeze(2)  # (batch_size, seq_len, 1)
        weighted_output = lstm_output * attention_weights  # (batch_size, seq_len, hidden_dim)

        return weighted_output.sum(1)  # (batch_size, hidden_dim)

class NLIModelDL(nn.Module):
    """BERT encoder feeding an LSTM and a two-way linear classifier.

    BERT runs without gradient tracking, so only the layers stacked on top of
    it receive gradients from the loss computed here.

    NOTE(review): ``_attention`` is constructed but ``forward`` never calls
    it; the classifier reads the LSTM output at the final time step instead.
    Presumably any saved checkpoint was trained with this exact forward pass,
    so confirm that before wiring the attention layer in.

    Args:
        bert_model: Name or path handed to ``BertModel.from_pretrained``.
        lstm_hidden_dim: Hidden size of the LSTM placed on top of BERT.
    """

    def __init__(self, *, bert_model:str, lstm_hidden_dim: int):
        super().__init__()
        self._bert = BertModel.from_pretrained(bert_model)
        self._lstm = nn.LSTM(
            self._bert.config.hidden_size,
            lstm_hidden_dim,
            batch_first=True,
        )
        self._attention = AttentionLayer(lstm_hidden_dim)
        self._fc = nn.Linear(lstm_hidden_dim, 2)

    def forward(self, input_ids, attention_mask, labels = None):
        """Classify a batch of encoded sentence pairs.

        Args:
            input_ids: Token ids; moved to the device BERT lives on.
            attention_mask: Mask passed to BERT; moved to the same device.
            labels: Optional class indices; cast to ``long`` for the loss.

        Returns:
            The logits tensor when ``labels`` is ``None``; otherwise a dict
            holding the cross-entropy ``'loss'`` and the ``'logits'``.
        """
        bert_device = self._bert.device
        input_ids = input_ids.to(bert_device)
        attention_mask = attention_mask.to(bert_device)

        # BERT acts as a fixed feature extractor: no gradients flow into it.
        with torch.no_grad():
            bert_output = self._bert(input_ids, attention_mask=attention_mask)

        sequence_states, _ = self._lstm(bert_output.last_hidden_state)
        final_step = sequence_states[:, -1, :]  # LSTM output at the last position
        logits = self._fc(final_step)

        if labels is None:
            return logits

        targets = labels.to(bert_device).long()
        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, 2), targets.view(-1))
        return {'loss': loss, 'logits': logits}

### Initialisation

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The checkpoint stores the whole pickled module (not a state_dict), so the
# model classes defined above must exist before this cell runs.
#   * map_location remaps tensors saved on a GPU, so loading also works on the
#     CPU fallback selected above.
#   * weights_only=False is needed on recent PyTorch releases, whose default
#     refuses to unpickle arbitrary objects such as a full nn.Module.
# WARNING: unpickling can execute arbitrary code; only load checkpoints you trust.
model = torch.load(
    "./models/attention_lstm__with_bert_embeddings.pth",
    map_location=device,
    weights_only=False,
)
model = model.to(device)

## Testing

### Setup

In [16]:
# Collected class predictions, one entry per test row.
predictions = []

# Set the model to evaluation mode; eval() returns the module itself, so the
# existing `model` name keeps pointing at the same object.
model.eval()

# No shuffle argument is passed, so batches follow the loader's default order.
data_loader = DataLoader(
    test_dataset,
    batch_size=16,
)

### Experiment

In [15]:
import warnings
# Blanket filter: hides every warning raised from this point on.
# NOTE(review): the warning that motivated this is not visible here; consider
# narrowing the filter to a specific category or module.
warnings.filterwarnings("ignore")

In [25]:
# Start from an empty list so that re-running this cell never appends a second
# copy of the predictions to results left over from an earlier run.
predictions = []

with torch.no_grad():  # inference only: no autograd bookkeeping needed
    for item in data_loader:
        logits = model(item['input_ids'].to(device), item['attention_mask'].to(device))
        batch_predictions = logits.argmax(dim=-1)  # This will be a tensor of shape [batch_size]
        predictions.extend(batch_predictions.tolist())  # Convert tensor to list and extend the main list

In [26]:
# Every test row must have exactly one prediction; a mismatch means the
# inference cell was skipped or its results were accumulated more than once.
assert len(predictions) == len(test_dataset), (
    f"expected {len(test_dataset)} predictions, got {len(predictions)}"
)

# Create a DataFrame for predictions and save to CSV
predictions_df = pd.DataFrame(predictions, columns=['prediction'])

output_path = Path("./out/Group_61_B.csv")  # Change the path as needed
# to_csv does not create missing folders, so make sure the target one exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
predictions_df.to_csv(output_path, index=False)