# ELECTRA with RoBERTa Embeddings


# Installing and Importing necessary packages

In [None]:
!pip install datasets
!pip install accelerate -U
import torch
import pandas as pd
from transformers import ElectraTokenizer, RobertaTokenizer, ElectraForSequenceClassification, RobertaModel
from datasets import Dataset
from torch.utils.data import DataLoader

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/542.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.7

## Model Definition and Data Preparation

In this section, we define the neural network model and prepare the functions necessary for data tokenization and embedding extraction. This setup is crucial for loading the trained model and making predictions on new data.

### Custom Model: ElectraWithRobertaEmbeddings

This class combines embeddings from the ELECTRA and RoBERTa models. Here's an overview of its components:

- **Initialization**:
  - Integrates the `electra_model` and `roberta_model`.
  - Sets up a linear layer that projects RoBERTa's 768-dimensional embedding down to 256 dimensions before it is combined with the ELECTRA features.
  - Configures a final classifier layer to output predictions based on combined features.

- **Forward Pass**:
  - Receives token IDs and attention masks for input management.
  - Concatenates ELECTRA's final-layer [CLS] hidden state with the projected RoBERTa embedding.
  - Outputs logits which are the raw, unnormalized scores that the model associates with each class.

### Tokenization and Embedding Extraction Function

This function prepares the input data by:

- **Tokenizing**: Processes text using tokenizers for both ELECTRA and RoBERTa.
- **Embedding Extraction**: Runs RoBERTa without gradient tracking and keeps the first-token embedding (`<s>`, RoBERTa's counterpart of [CLS]) of each input as a summary of the sequence.

### Collate Function

The `collate_fn` function standardizes how batches of data are combined. This is necessary for efficient batch processing during prediction:

- Converts the per-example `input_ids`, `attention_mask`, and `roberta_embeddings` values into batched tensors that the model can process.


In [None]:
# Define your model class as before
class ElectraWithRobertaEmbeddings(torch.nn.Module):
    """Two-class classifier over ELECTRA [CLS] features joined with projected RoBERTa embeddings.

    Args:
        electra_model: Module called as ``electra_model(input_ids=..., attention_mask=...)``
            whose output exposes ``hidden_states``; the last entry must have a
            feature size of 768 to match the classifier input.
        roberta_model: RoBERTa module. It is stored on the instance but is not
            called in ``forward``; the RoBERTa embeddings are passed in
            pre-computed.
    """

    def __init__(self, electra_model, roberta_model):
        super().__init__()
        self.electra = electra_model
        # Kept as a registered submodule even though forward() never calls it:
        # removing it would change the state_dict keys this class saves/loads.
        self.roberta = roberta_model
        self.embedding_layer = torch.nn.Linear(768, 256)  # Adjust based on Roberta's output size
        self.classifier = torch.nn.Linear(768 + 256, 2)  # Adjusted for combined input size

    def forward(self, input_ids, attention_mask, roberta_embeddings):
        """Return the raw class logits for a batch.

        Args:
            input_ids: ELECTRA token ids; also determines the device the
                RoBERTa embeddings are moved to.
            attention_mask: Attention mask passed to ELECTRA alongside ``input_ids``.
            roberta_embeddings: Pre-computed RoBERTa embeddings with a trailing
                dimension of 768, given as a tensor, NumPy array or nested list.

        Returns:
            Tensor of logits with one row per example and two columns.
        """
        # Forward pass for ELECTRA, using the last hidden states instead of its logits
        electra_outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the final layer is the [CLS] token representation
        electra_hidden_state = electra_outputs.hidden_states[-1][:, 0, :]

        # torch.tensor() on an existing tensor emits a copy-construct UserWarning and
        # always copies. as_tensor() accepts tensors, arrays and lists without that
        # warning; detach() keeps the embeddings out of the autograd graph, as before.
        roberta_inputs = torch.as_tensor(
            roberta_embeddings, dtype=torch.float32, device=input_ids.device
        ).detach()
        roberta_features = self.embedding_layer(roberta_inputs)

        # Combine ELECTRA and RoBERTa features along the feature axis
        combined_features = torch.cat((electra_hidden_state, roberta_features), dim=1)
        logits = self.classifier(combined_features)
        return logits


# Function to tokenize and extract embeddings
def tokenize_and_extract_embeddings(example):
    """Tokenize premise/hypothesis pairs for ELECTRA and attach RoBERTa first-token embeddings.

    Works both with a single example (``example['premise']`` is a string) and
    with a batch as delivered by ``Dataset.map(..., batched=True)``
    (``example['premise']`` is a list of strings).

    Relies on the module-level ``electra_tokenizer``, ``roberta_tokenizer``,
    ``roberta_base_model`` and ``device``.

    Args:
        example: Mapping with ``'premise'`` and ``'hypothesis'`` entries.

    Returns:
        dict with the ELECTRA tokenizer outputs plus ``'roberta_embeddings'``
        (a NumPy array). For a batch every value keeps its leading batch axis;
        for a single example that axis is removed from every value.
    """
    # A bare string means this was called on one example rather than a batch.
    is_single_example = isinstance(example['premise'], str)

    # Tokenization for ELECTRA
    inputs = electra_tokenizer(example['premise'], example['hypothesis'], truncation=True, padding='max_length', max_length=128, return_tensors="pt")
    # Tokenization for RoBERTa
    roberta_inputs = roberta_tokenizer(example['premise'], example['hypothesis'], truncation=True, padding='max_length', max_length=128, return_tensors="pt")
    roberta_inputs = {k: v.to(device) for k, v in roberta_inputs.items()}

    # Generate RoBERTa embeddings without tracking gradients
    with torch.no_grad():
        roberta_outputs = roberta_base_model(**roberta_inputs)
    embeddings = roberta_outputs.last_hidden_state[:, 0]  # Take first-token ([CLS]-style) representation

    if is_single_example:
        # Drop only the batch axis added by return_tensors, and do so for the
        # embeddings too so every returned value describes one example.
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        embeddings = embeddings.squeeze(0)
    else:
        # Keep the batch axis untouched: an unconditional squeeze() would also
        # collapse it when a batch happens to contain exactly one example,
        # leaving columns of mismatched length.
        inputs = {k: v for k, v in inputs.items()}

    inputs['roberta_embeddings'] = embeddings.cpu().numpy()

    return inputs



def collate_fn(batch):
    """Assemble a list of per-example dicts into one dict of batched tensors.

    Args:
        batch: Sequence of mappings, each holding ``'input_ids'``,
            ``'attention_mask'`` and ``'roberta_embeddings'``.

    Returns:
        dict with the same three keys, each mapped to a tensor whose first
        axis indexes the examples in ``batch``.
    """
    def column(key):
        # Gather one field across every example, preserving batch order.
        return [row[key] for row in batch]

    return {
        'input_ids': torch.tensor(column('input_ids')),
        'attention_mask': torch.tensor(column('attention_mask')),
        # Each embedding is converted on its own and then stacked on a new leading axis.
        'roberta_embeddings': torch.stack(
            [torch.tensor(vector) for vector in column('roberta_embeddings')]
        ),
    }


## Loading the Trained Model

This section of the notebook handles the loading of the previously trained `ElectraWithRobertaEmbeddings` model and sets it up for inference. The model is prepared to process new data and make predictions.

### Setup Device for Model
- **Device Configuration**: Establishes whether to use GPU acceleration (if available) or default to CPU. This step optimizes computation depending on the hardware capabilities.

### Load Pre-trained Components
- **Model Components**:
  - `electra_base_model`: Loads the pre-trained ELECTRA model configured to output hidden states, which are necessary for our custom model to function correctly.
  - `roberta_base_model`: Loads the pre-trained RoBERTa model, used here primarily for extracting embeddings.
  - `electra_tokenizer` and `roberta_tokenizer`: Load the tokenizers for ELECTRA and RoBERTa respectively, which are essential for text preprocessing to match the input format expected by the models.

### Initialize and Load the Custom Model
- **Custom Model Initialization**:
  - Instantiates the `ElectraWithRobertaEmbeddings` model with the loaded ELECTRA and RoBERTa models as parameters.
- **Load Model State**:
  - The state of the previously trained model is loaded from a specified path. This state includes weights and biases tuned during the training process. **The path can be modified if necessary.**
  
### Prepare Model for Inference
- **Device Assignment**: Moves the model to the configured device (GPU or CPU) to ensure that all computations are performed on the correct hardware.
- **Evaluation Mode**: Sets the model to evaluation mode, which disables training-specific behaviors like dropout, ensuring consistent predictions.


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizers and models
# output_hidden_states=True is required: the custom model reads ELECTRA's hidden states, not its logits.
electra_base_model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=2, output_hidden_states=True)
roberta_base_model = RobertaModel.from_pretrained('roberta-base')
electra_tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

model = ElectraWithRobertaEmbeddings(electra_base_model, roberta_base_model)

# Load the saved model state
# Modify the model path if necessary, the Google Drive shared link is in README file and also here:
# https://drive.google.com/file/d/10x8CU3p_ENpIhDjxnfKInBlJN3KqbBFl/view?usp=share_link
# map_location remaps the stored tensors onto the selected device, so a checkpoint
# saved on a GPU can still be loaded when only the CPU is available.
# NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load('/content/drive/MyDrive/NLU/Model/electra_roberta_nli_model.pth', map_location=device))

model.to(device)
model.eval()  # Set the model to evaluation mode

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

ElectraWithRobertaEmbeddings(
  (electra): ElectraForSequenceClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0-11): 12 x ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense)

## Function to Predict and Output Results

This section outlines a function designed to load a dataset, use a trained model to make predictions, and then save these predictions to a specified file.

### Function: `predict_and_output`
- **Parameters**:
  - `dataset_Path`: Path to the dataset file where the test data is stored.
  - `output_Path`: Path where the predictions will be saved as a CSV file.

### Steps Involved:
- **Load Dataset**:
  - Reads the test dataset from a CSV file into a pandas DataFrame.
  - Converts the DataFrame into a `Dataset` object to facilitate transformations and batching.

- **Data Preparation**:
  - Applies the `tokenize_and_extract_embeddings` function to the dataset using batch processing, which prepares the data for model input by extracting necessary features and embeddings.

- **Prediction Loop**:
  - Initializes a DataLoader for efficient batch processing of test data.
  - Iterates over batches of data, computing logits for each batch without updating model parameters (`torch.no_grad()` context).
  - Extracts the predicted class for each instance in the batch by finding the argmax of the logits.

- **Output Predictions**:
  - Optionally, prints the list of predictions.
  - Compiles predictions into a DataFrame and writes it to a CSV file at the specified output path.

This function encapsulates the entire process from data loading to prediction output, ensuring predictions are easily accessible for further analysis or reporting.


In [None]:
def predict_and_output(dataset_Path, output_Path):
    """Predict a class for every row of a CSV dataset and save the results as CSV.

    Relies on the module-level ``model``, ``device``,
    ``tokenize_and_extract_embeddings`` and ``collate_fn``.

    Args:
        dataset_Path: Path of the CSV file to read the examples from.
        output_Path: Path of the CSV file to write; it receives a single
            ``prediction`` column and no index column.
    """
    # Read the CSV and turn it into model-ready features in batched mode.
    frame = pd.read_csv(dataset_Path)
    encoded = Dataset.from_pandas(frame).map(tokenize_and_extract_embeddings, batched=True)
    loader = DataLoader(encoded, batch_size=16, collate_fn=collate_fn)

    predictions = []

    # Inference only, so gradient tracking is switched off for the whole loop.
    with torch.no_grad():
        for batch in loader:
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['roberta_embeddings'].to(device),
            )
            # argmax over the class axis yields one predicted label per example.
            predictions.extend(logits.argmax(dim=-1).tolist())

    # Write the collected labels as a single-column CSV.
    pd.DataFrame(predictions, columns=['prediction']).to_csv(output_Path, index=False)

In [None]:
# Input CSV holding the test examples, and the CSV that will receive the predictions.
test_dataset_path = '/content/drive/MyDrive/NLU/test_data/NLI/test.csv'
predictions_output_path = '/content/drive/MyDrive/NLU/Group_61_C.csv'

# Run the full prediction pipeline and write the output file.
predict_and_output(dataset_Path=test_dataset_path, output_Path=predictions_output_path)

Map:   0%|          | 0/3302 [00:00<?, ? examples/s]

  roberta_features = self.embedding_layer(torch.tensor(roberta_embeddings, dtype=torch.float32).to(input_ids.device))
