-   This notebook loads the text responses generated by the given model and evaluates their pairwise semantic similarity using a pre-trained DeBERTa model.
-   It then writes the pairwise semantic similarity predictions to a CSV file.
-   The evaluation checks whether the different generated texts are semantically distinct and calculates their syntactic similarity using the ROUGE metric.

----------------IMPORTANT NOTE----------------------
-   This code requires at least 40 GB of GPU RAM for responses from smaller LLMs such as Facebook's opt-125m or opt-350m.
-   For larger models such as opt-30b, this code requires at least 80 GB of GPU RAM.
-   This notebook was used to generate the similarities for opt-125m and opt-350m on Google Colab Pro.

In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m114.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.0 MB/s[0m eta [36m0:00:

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2.14.

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.0 responses-0.18.0


In [None]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=d93e1a3c73f89cfdc455789fa60806e3f9936ab8fb2e7542d6ff734335675688
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
pip install wandb

Collecting wandb
  Downloading wandb-0.15.8-py3-none-any.whl (2.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.1 MB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m1.7/2.1 MB[0m [31m25.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.32-py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.29.2-py2.py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.6/215.6 kB[0m [31m30.0 MB

In [None]:
import os
import random
import numpy as np
import torch

# One fixed seed shared by every random number generator seeded below,
# so that repeated runs produce the same random draws
seed_val = 10

# A CUDA-enabled GPU is required for this project
device = 'cuda'

# Seed Python's `random` module and NumPy
random.seed(seed_val)
np.random.seed(seed_val)

# NOTE: PYTHONHASHSEED is read when a Python interpreter starts, so this assignment
# affects processes launched afterwards, not the hashing of the already-running kernel
os.environ['PYTHONHASHSEED'] = str(seed_val)

# Seed PyTorch last: the call returns the default Generator, which the cell displays
torch.manual_seed(seed_val)

<torch._C.Generator at 0x7fc0af4c3210>

In [None]:
# Settings for this NLI similarity run; every other cell reads its configuration from here
params = dict(
    # Model related arguments
    model_version='opt-125m',
    experiment_id='run_1',
)

In [None]:
import wandb

# Experiment tracking: start the wandb run for this experiment id, or resume it if it
# already exists (resume='allow'), and attach the notebook parameters as the run config
wandb.init(
    project='nlg_uncertainty',
    id=params['experiment_id'],
    config=params,
    resume='allow',
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 24


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Name of the active wandb run; a later cell interpolates it into the Drive path
# of the generations pickle that is loaded for this run
run_version = wandb.run.name

# Bare expression so the notebook displays the value
run_version

'run_1'

In [None]:
# Short name of the generation model (taken from `params`); later cells use it to build
# the Hugging Face model id, the input pickle path and the output CSV file name
model_version = params['model_version']

# Bare expression so the notebook displays the value
model_version

'opt-125m'

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Tokenizer of the generation model. It is not referenced again in the cells visible here;
# loading it stores it in the Drive cache directory so that other notebooks can reuse it
generation_tokenizer = AutoTokenizer.from_pretrained(
    f"facebook/{model_version}",
    use_fast=False,
    cache_dir='/content/drive/MyDrive/NLP Proj/hf_cache_dir',
)

# Microsoft DeBERTa (MNLI checkpoint): tokenizer plus sequence classifier used to
# generate the pairwise similarity predictions; the classifier is moved to the GPU
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-large-mnli")
model = model.cuda()

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import pickle

# Prerequisites, in order: (1) the generation notebook, (2) the cleaner notebook.
# Together they are expected to produce the cleaned generations pickle for this run
# and model version, which is read back here.
# NOTE: pickle can execute arbitrary code on load - only open files you produced yourself.
generations_path = f'drive/MyDrive/NLP Proj/sequences/{run_version}/{model_version}_cleaned_generations.pkl'
with open(generations_path, 'rb') as infile:
    responses = pickle.load(infile)

In [None]:
# Sanity check: show the question, the generated answers and the id of the first sample only
for sample in responses:
    for field in ('question', 'generated_texts', 'id'):
        print(sample[field])
    break

Who was their target group?
[' men.', ' sports people.', ' sports.', ' sports.', ' men.']
['34fnn24dcm9txoko3yb4ydvtefry5p_7']


In [None]:
import evaluate
from datasets import load_metric

# Per-sample results, keyed by sample id (filled by the similarity loop)
result_dict = {}

# METEOR metric can be used for evaluating summarization tasks (useful for some types of datasets but not for ours)
# Loaded through `evaluate.load`, the same loader this notebook uses for ROUGE,
# instead of the deprecated `datasets.load_metric`
meteor = evaluate.load('meteor')

# One [answer_1, answer_2, prediction] row per compared answer pair (filled by the similarity loop)
deberta_predictions = []

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
from tqdm import tqdm

# Using Rouge to evaluate syntactic similarity for our datasets (coQA)
rouge_metric = evaluate.load('rouge')


def _predict_nli_label(text_pair):
    """Return the class index the DeBERTa classifier assigns to one joined input string.

    Args:
        text_pair: A single string of the form '<question> <answer A> [SEP] <answer B>'.

    Returns:
        The argmax over the model's logits as a plain Python int.

    The forward pass runs under ``torch.no_grad()``: only the predicted label is needed,
    so no autograd graph has to be built for the many pairwise forward passes.
    """
    encoded = tokenizer.encode(text_pair, padding=True)
    with torch.no_grad():
        logits = model(torch.tensor([encoded], device=model.device))['logits']
    return torch.argmax(logits, dim=1).item()


for sample in tqdm(responses):

    # Getting data from the sample
    # Please run the cleaner notebook before running this code
    question_text = sample['question']
    generated_texts = sample['cleaned_generated_texts'] if 'cleaned_generated_texts' in sample else sample['generated_texts']
    sample_id = sample['id'][0]

    # De-duplicate while keeping first-occurrence order. list(set(...)) iterates strings in an
    # order that can change between kernel sessions (string hashing is randomised unless
    # PYTHONHASHSEED is set before the interpreter starts), which made the pair order and the
    # resulting semantic set ids non-reproducible.
    unique_responses = list(dict.fromkeys(generated_texts))

    # Variables for semantic similarity analysis
    answer_pairs_1 = []
    answer_pairs_2 = []
    has_semantically_different_answers = False
    syntactic_similarities = {rouge_type: 0.0 for rouge_type in ['rouge1', 'rouge2', 'rougeL']}
    # Every unique answer starts in its own semantic set
    semantic_set_ids = {answer: index for index, answer in enumerate(unique_responses)}

    # Evaluate semantic similarity if there is more than 1 unique answer
    if len(unique_responses) > 1:

        for i in range(len(unique_responses)):
            for j in range(i + 1, len(unique_responses)):
                first_answer = unique_responses[i]
                second_answer = unique_responses[j]

                answer_pairs_1.append(first_answer)
                answer_pairs_2.append(second_answer)

                # Classify the pair in both directions
                forward_label = _predict_nli_label(question_text + ' ' + first_answer + ' [SEP] ' + second_answer)
                reverse_label = _predict_nli_label(question_text + ' ' + second_answer + ' [SEP] ' + first_answer)

                # Determine semantic similarity: the pair counts as different (0) as soon as either
                # direction predicts class 0 (presumably the contradiction class of the MNLI head -
                # verify against model.config.id2label); otherwise it counts as equivalent (1)
                deberta_prediction = 0 if 0 in (forward_label, reverse_label) else 1
                if deberta_prediction == 0:
                    has_semantically_different_answers = True
                else:
                    # Equivalent answers share the semantic set id of the earlier answer
                    semantic_set_ids[second_answer] = semantic_set_ids[first_answer]

                deberta_predictions.append([first_answer, second_answer, deberta_prediction])

        # Evaluate syntactic similarity over all compared pairs of this sample
        results = rouge_metric.compute(predictions=answer_pairs_1, references=answer_pairs_2)
        for rouge_type in syntactic_similarities.keys():
            syntactic_similarities[rouge_type] = results[rouge_type]

    # Store the results in the result dictionary
    result_dict[sample_id] = {
        'syntactic_similarities': syntactic_similarities,
        'has_semantically_different_answers': has_semantically_different_answers,
        'semantic_set_ids': [semantic_set_ids[x] for x in generated_texts]
    }

  prediction = model(torch.tensor(torch.tensor([encoded_input]), device='cuda'))['logits']
  reverse_prediction = model(torch.tensor(torch.tensor([encoded_reverse_input]), device='cuda'))['logits']
100%|██████████| 7184/7184 [1:55:04<00:00,  1.04it/s]


In [None]:
import csv


# Export every pairwise DeBERTa prediction collected so far to a per-model CSV file
predictions_csv = 'deberta_predictions_{}.csv'.format(model_version)
with open(predictions_csv, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # Header row first, followed by one [qa_1, qa_2, prediction] row per compared pair
    writer.writerows([['qa_1', 'qa_2', 'prediction'], *deberta_predictions])

In [None]:
import pathlib
# Make sure the Drive output folder for this run exists before results are written into it.
# The run identifier lives in `run_version`; `run_name` is never defined in this notebook
# and raised a NameError.
pathlib.Path(f'drive/MyDrive/NLP Proj/ss/{run_version}').mkdir(parents=True, exist_ok=True)

In [None]:
# Persist the per-sample similarity results for this run and model.
# Uses the notebook's own `run_version` and `model_version` variables; the previous
# `run_name` and `args.generation_model` are never defined here and raised a NameError.
with open(f'drive/MyDrive/NLP Proj/ss/{run_version}/{model_version}_generations_similarities.pkl', 'wb') as outfile:
    pickle.dump(result_dict, outfile)
