#  ELECTRA with RoBERTa Embeddings

# Necessary Package Installation

In [None]:
!pip install datasets
!pip install accelerate -U
import torch
import pandas as pd
from transformers import ElectraTokenizer, ElectraForSequenceClassification, RobertaModel, RobertaTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch import nn
from torch.nn import CrossEntropyLoss

## Custom Neural Network Model: ElectraWithRobertaEmbeddings

This part of the code defines a custom neural network model, `ElectraWithRobertaEmbeddings`, designed to integrate embeddings from both ELECTRA and RoBERTa models for the task of Natural Language Inference (NLI). This model harnesses the complementary strengths of two distinct transformer architectures to enhance performance on the NLI task.

### Class Definition: `ElectraWithRobertaEmbeddings`
- **Initialization (`__init__` method)**:
  - Initializes the model architecture by setting up the `electra_model` and `roberta_model`.
  - Defines an additional linear layer that projects RoBERTa's 768-dimensional embeddings down to 256 dimensions before they are concatenated with ELECTRA's embeddings.
  - A classifier layer is also defined that will output the final logits from the combined embeddings.

- **Forward Pass (`forward` method)**:
  - **ELECTRA Forward Pass**: Processes the input text to extract embeddings specific to the [CLS] token, representing the aggregate sequence information.
  - **RoBERTa Embedding Preparation**: Adjusts the RoBERTa embeddings through a linear layer to reduce their dimensionality.
  - **Feature Combination**: Concatenates ELECTRA's [CLS] hidden state with the projected RoBERTa embeddings.
  - **Classification**: Passes the combined embeddings through a linear classifier to generate logits.
  - **Loss Computation**: Computes the cross-entropy loss if labels are provided, making it suitable for both training and evaluation phases.

### Function: `tokenize_and_extract_embeddings`
- Prepares the inputs for the model by:
  - **Tokenization**: Tokenizes the input text (premise and hypothesis) using both ELECTRA and RoBERTa tokenizers, ensuring that the text is appropriately preprocessed to align with the model's requirements.
  - **RoBERTa Embedding Generation**: With RoBERTa in evaluation mode, processes the tokenized inputs to generate embeddings, specifically extracting the embedding from the [CLS] token which summarizes the entire sequence.
  - **Input Preparation**: Formats the tokenized inputs and embeddings correctly, ensuring they are ready for model consumption, including adjusting dimensions and ensuring data is moved to the appropriate compute device (CPU or GPU).

This setup is crucial for leveraging the combined capabilities of ELECTRA and RoBERTa models in a unified framework to effectively tackle the nuances and complexities of natural language inference.


In [None]:
# Define custom model incorporating RoBERTa embeddings

class ElectraWithRobertaEmbeddings(nn.Module):
    """Classifier over ELECTRA's first-token state plus projected RoBERTa features.

    The RoBERTa features are not computed here: they are passed to
    ``forward`` as ``roberta_embeddings``, projected by a linear layer,
    concatenated with the first-position vector of ELECTRA's last hidden
    state, and fed to a linear classifier.

    Args:
        electra_model: Callable accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning an object with a ``hidden_states``
            sequence (i.e. a model loaded with ``output_hidden_states=True``).
        roberta_model: Stored on the instance as ``self.roberta``; it is not
            called by ``forward``.
        electra_hidden_size: Width of the ELECTRA hidden state.
        roberta_hidden_size: Width of the incoming RoBERTa embeddings.
        roberta_projection_size: Width the RoBERTa embeddings are projected to.
        num_labels: Number of output classes.
    """

    def __init__(self, electra_model, roberta_model, *, electra_hidden_size=768,
                 roberta_hidden_size=768, roberta_projection_size=256, num_labels=2):
        super().__init__()
        self.electra = electra_model
        # Kept for backward compatibility: forward() never calls it, but as a
        # registered submodule its weights are part of this module's state dict.
        self.roberta = roberta_model
        # Projects the RoBERTa embeddings down before concatenation.
        self.embedding_layer = nn.Linear(roberta_hidden_size, roberta_projection_size)
        # Input width is the ELECTRA vector plus the projected RoBERTa vector.
        self.classifier = nn.Linear(electra_hidden_size + roberta_projection_size, num_labels)

    def forward(self, input_ids, attention_mask, roberta_embeddings, labels=None):
        """Compute logits, and the cross-entropy loss when ``labels`` is given.

        Args:
            input_ids: ELECTRA token ids; its device is used for the RoBERTa features.
            attention_mask: ELECTRA attention mask.
            roberta_embeddings: Precomputed RoBERTa features, one row per example
                (a tensor, NumPy array, or nested list).
            labels: Optional class indices.

        Returns:
            ``logits`` when ``labels`` is None, otherwise the tuple ``(loss, logits)``.
        """
        # Use the last hidden layer rather than the classification logits,
        # and keep only the first position of each sequence.
        electra_outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        electra_hidden_state = electra_outputs.hidden_states[-1][:, 0, :]

        # as_tensor avoids the copy (and the "copy construct from a tensor"
        # UserWarning) that torch.tensor() triggers when the input is already
        # a tensor, while still accepting arrays and lists.
        roberta_inputs = torch.as_tensor(
            roberta_embeddings, dtype=torch.float32, device=input_ids.device
        )
        roberta_features = self.embedding_layer(roberta_inputs)

        # Combine ELECTRA and RoBERTa features along the feature axis.
        combined_features = torch.cat((electra_hidden_state, roberta_features), dim=1)
        logits = self.classifier(combined_features)

        if labels is None:
            return logits

        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))
        return (loss, logits)

# Adjust the tokenize_and_extract_embeddings function to correctly prepare outputs
def tokenize_and_extract_embeddings(example):
    """Tokenize premise/hypothesis pairs and attach precomputed RoBERTa features.

    Works both on a single example (``example['premise']`` is a string) and on
    a batch as passed by ``Dataset.map(..., batched=True)`` (the columns are
    lists). Relies on the module-level ``electra_tokenizer``,
    ``roberta_tokenizer``, ``roberta_base_model`` and ``device``.

    Args:
        example: Mapping with ``'premise'`` and ``'hypothesis'`` entries.

    Returns:
        A dict with the ELECTRA tokenizer outputs plus ``'roberta_embeddings'``,
        a NumPy array holding the first-position vector of RoBERTa's last
        hidden state. For a batch every value keeps its leading batch
        dimension; for a single example that dimension is removed.
    """
    premise = example['premise']
    hypothesis = example['hypothesis']
    # A plain string means one example; batched map calls pass lists instead.
    is_single_example = isinstance(premise, str)

    # Tokenization for ELECTRA
    inputs = electra_tokenizer(premise, hypothesis, truncation=True, padding='max_length', max_length=128, return_tensors="pt")

    # Tokenization for RoBERTa
    roberta_inputs = roberta_tokenizer(premise, hypothesis, truncation=True, padding='max_length', max_length=128, return_tensors="pt")
    roberta_inputs = {k: v.to(device) for k, v in roberta_inputs.items()}

    # Generate RoBERTa embeddings without tracking gradients
    with torch.no_grad():
        roberta_outputs = roberta_base_model(**roberta_inputs)
    embeddings = roberta_outputs.last_hidden_state[:, 0]  # First-position (<s>/[CLS]) representation

    if is_single_example:
        # return_tensors="pt" adds a batch dimension of size 1; drop exactly
        # that dimension from the tokens and from the embedding.
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        embeddings = embeddings.squeeze(0)
    else:
        # Keep the batch dimension. An unconditional squeeze() would collapse
        # it for a batch that happens to contain a single example, leaving the
        # token columns and the embedding column with mismatched lengths.
        inputs = dict(inputs.items())

    inputs['roberta_embeddings'] = embeddings.cpu().numpy()
    return inputs





## Load Tokenizers and Models
The following code initializes the tokenizers and models required for the NLI task using ELECTRA and RoBERTa:

- `electra_tokenizer`: Loaded from the pretrained 'google/electra-base-discriminator'. This tokenizer is designed to work with the ELECTRA model for tokenizing the text.
- `roberta_tokenizer`: Loaded from the pretrained 'roberta-base'. This tokenizer is used for preparing inputs specifically for the RoBERTa model.
- `electra_base_model`: An instance of `ElectraForSequenceClassification` loaded from 'google/electra-base-discriminator' with two output labels and `output_hidden_states=True`, so that the hidden states are returned alongside the logits.
- `roberta_base_model`: Loaded from 'roberta-base' to be used for generating embeddings from the text.

### Moving Models to GPU
To leverage the computational efficiency of GPUs:

- The `device` variable determines if a GPU is available and sets the device accordingly.
- Both models are moved to the specified device (`cuda` if GPU is available, otherwise `cpu`).
- `roberta_base_model` is set to evaluation mode, which disables dropout so that the extracted embeddings are deterministic. No gradients are computed for it during embedding extraction, because that step runs under `torch.no_grad()`.


In [None]:
# Hugging Face Hub checkpoints used throughout the notebook
ELECTRA_CHECKPOINT = 'google/electra-base-discriminator'
ROBERTA_CHECKPOINT = 'roberta-base'

# Tokenizers: one per architecture, since each model has its own vocabulary
electra_tokenizer = ElectraTokenizer.from_pretrained(ELECTRA_CHECKPOINT)
roberta_tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)

# ELECTRA is loaded with a two-label classification head and is asked to
# return its hidden states; RoBERTa is loaded as a bare encoder
electra_base_model = ElectraForSequenceClassification.from_pretrained(
    ELECTRA_CHECKPOINT,
    num_labels=2,
    output_hidden_states=True,
)
roberta_base_model = RobertaModel.from_pretrained(ROBERTA_CHECKPOINT)

# Run on the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
electra_base_model.to(device)
roberta_base_model.to(device)

# Evaluation mode for RoBERTa; as the last expression of the cell this also
# displays the module
roberta_base_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

## Evaluation Metrics Function: compute_metrics

This section of the code defines the function `compute_metrics`, which is used to evaluate the performance of the model during training and validation. This function calculates several key metrics that are essential for assessing the effectiveness of the model in the Natural Language Inference (NLI) task.

### Function: `compute_metrics`
- **Purpose**: To calculate and return the model's performance metrics based on its predictions compared to the actual labels.
- **Parameters**:
  - `pred`: A structure containing the model's predictions and the actual labels.
- **Process**:
  - Extracts `label_ids` from the `pred` object, which represent the true labels of the data.
  - Computes the model's predicted labels using `argmax` on the prediction logits, which identifies the most likely class for each example.
  - Calculates the following metrics using scikit-learn's `precision_recall_fscore_support` and `accuracy_score` functions:
    - **Accuracy**: The proportion of correctly predicted labels to the total number of cases.
    - **Precision**: The ratio of correctly predicted positive observations to the total predicted positives. It is a measure of a classifier's exactness.
    - **Recall (Sensitivity)**: The ratio of correctly predicted positive observations to all observations that are actually positive. It is a measure of a classifier's completeness.
    - **F1 Score**: The harmonic mean of Precision and Recall. This score takes both false positives and false negatives into account.
- **Returns**:
  - A dictionary containing the calculated metrics: accuracy, F1 score, precision, and recall.

These metrics are crucial for understanding the model's performance across various dimensions of accuracy and are particularly important for balancing considerations between precision and recall, especially in datasets that may have imbalanced classes.


In [None]:
# Metric callback handed to the Trainer for evaluation
def compute_metrics(pred):
    """Return accuracy, F1, precision and recall for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (scores whose last axis indexes the classes).

    Returns:
        A dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three use binary averaging.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score on the last axis.
    y_pred = pred.predictions.argmax(-1)

    # Result order is (precision, recall, f1, support); support is unused.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    accuracy = accuracy_score(y_true, y_pred)

    return {
        'accuracy': accuracy,
        'f1': prf_scores[2],
        'precision': prf_scores[0],
        'recall': prf_scores[1],
    }


## Loading and Preprocessing Datasets

This section of the code is dedicated to loading the training and evaluation datasets and preparing them for use with the model. This involves reading data from CSV files, converting them into a format suitable for the model, and applying preprocessing steps.

### Loading Data
- **Training Data (`train_df`)**:
  - Loaded from a CSV file located at `/content/drive/MyDrive/NLU/training_data/NLI/train.csv`.
  - This dataset is used to train the model.
  - **The path can be changed and set to any necessary path to be used for training**
- **Validation Data (`val_df`)**:
  - Loaded from a CSV file located at `/content/drive/MyDrive/NLU/training_data/NLI/dev.csv`.
  - This dataset is used to evaluate the model's performance on unseen data during the training process.
  - **The path can be changed and set to any necessary path to be used for evaluating**
  
### Conversion to Datasets
- **Training Dataset (`train_dataset`)**:
  - Converts the `train_df` DataFrame into a `Dataset` object, which is a more efficient format for handling data within the Hugging Face `datasets` library. This format facilitates easier application of transformations and batching operations.
- **Validation Dataset (`val_dataset`)**:
  - Similarly, converts the `val_df` DataFrame into a `Dataset` object for the same reasons as the training dataset.


### Data Preprocessing
- **Tokenization and Embedding Extraction**:
  - Applies the `tokenize_and_extract_embeddings` function to both the training and validation datasets using the `map` function of the `Dataset` class.
  - This operation is performed in batches (`batched=True`), which optimizes processing time and resource utilization by processing multiple samples in each operation instead of one-by-one.
  - The function handles tokenization for both ELECTRA and RoBERTa models and extracts necessary embeddings which are critical for the model input.

This preparation phase is crucial as it ensures the data is in the correct format and is enriched with necessary embeddings, thereby making it ready for effective training and validation of the NLI model.


In [None]:
import warnings

# CSV locations; change these to wherever the training and validation splits live
TRAIN_CSV_PATH = '/content/drive/MyDrive/NLU/training_data/NLI/train.csv'
DEV_CSV_PATH = '/content/drive/MyDrive/NLU/training_data/NLI/dev.csv'

# Read both splits: train.csv for training the model, dev.csv for evaluating it
train_df = pd.read_csv(TRAIN_CSV_PATH)
val_df = pd.read_csv(DEV_CSV_PATH)

# Wrap the DataFrames as Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# No category or message is given, so this silences every Python warning
# from here on, not just one specific warning
warnings.filterwarnings("ignore")

# Tokenize each split and attach the RoBERTa embeddings, one batch at a time
train_dataset = train_dataset.map(tokenize_and_extract_embeddings, batched=True)
val_dataset = val_dataset.map(tokenize_and_extract_embeddings, batched=True)

## Configuring and Initializing Model Training

This segment of the code configures the training parameters and initializes the training process using the Hugging Face Transformers' `Trainer` API. The `Trainer` simplifies the process of fine-tuning pre-trained models on custom datasets.

### Model Initialization
- **ElectraWithRobertaEmbeddings Model**:
  - Initializes the custom model combining ELECTRA and RoBERTa embeddings, taking pre-trained models as parameters.

### Trainer Initialization
- **`Trainer` Setup**:
  - `model`: The custom model to be trained.
  - `args`: The training arguments.
  - `train_dataset`: The training dataset prepared earlier.
  - `eval_dataset`: The evaluation dataset prepared earlier.
  - `compute_metrics`: Function to compute evaluation metrics during training.

This setup defines the critical parameters and constructs required for effectively training the NLI model. The `Trainer` class from Hugging Face automates many aspects of training neural networks, allowing for a streamlined and efficient training process.

**`Output directory and logging directory can be modified as necessary`**


In [None]:
# Hyperparameters and bookkeeping for the Trainer.
# The output and logging directories can be changed as needed.
training_args = TrainingArguments(
    # Where results and logs are written
    output_dir='/content/drive/MyDrive/NLU/results',  # Change as necessary when running
    logging_dir='/content/drive/MyDrive/NLU/logs',  # Change as necessary when running
    # Schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Optimisation
    learning_rate=3e-5,
    lr_scheduler_type='linear',
    warmup_steps=500,
    weight_decay=0.07,
    max_grad_norm=1.0,
    # Logging, evaluation and checkpointing cadence
    logging_strategy="epoch",
    evaluation_strategy="no",
    save_strategy="no",
    # NOTE(review): with evaluation and saving both set to "no", there appears
    # to be no checkpoint for load_best_model_at_end to restore - confirm intent.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Combined model built from the two pre-trained networks loaded earlier
model = ElectraWithRobertaEmbeddings(electra_base_model, roberta_base_model)

# Trainer wiring: model, arguments, both dataset splits and the metric callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Train the model on the training split (train.csv)
trainer.train()

# Evaluate the model on the validation split (dev.csv); as the last
# expression of the cell, the returned metrics dict is displayed
trainer.evaluate()

Step,Training Loss
3368,0.0861
6736,0.0501
10104,0.0206


{'eval_loss': 0.8540595769882202,
 'eval_accuracy': 0.8965414873088912,
 'eval_f1': 0.8996977982443517,
 'eval_precision': 0.9006050129645635,
 'eval_recall': 0.8987924094307073,
 'eval_runtime': 25.2507,
 'eval_samples_per_second': 266.804,
 'eval_steps_per_second': 16.712,
 'epoch': 3.0}

In [None]:
# Save the model's weights (its state dict, not the whole module object).
# The destination path can be modified if necessary.
torch.save(model.state_dict(), '/content/drive/MyDrive/NLU/Model/electra_roberta_nli_model.pth')
