## SQuAD v2 0-shot QA using Phi-3-mini-128k-instruct with AMR

In [1]:
import transformers
import torch
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import sentence_transformers
import nltk
import datasets

In [2]:
squad = datasets.load_dataset("squad_v2")
squad_val = squad['validation'].to_pandas()
squad_val['answer_list'] = squad_val.answers.map(lambda x: x['text'])

Processing AMR file

In [None]:
squad_amrs = pd.read_csv("/projects/anra7539/projects/representation_efficacy/squad_val_amrs.csv")

squad_amrs.drop_duplicates(inplace = True)
squad_amrs.reset_index(drop = True, inplace = True)

squad_amrs['only_amr'] = squad_amrs.amr.map(lambda x: "\n".join(x.split("\n")[1:]))

In [None]:
squad_val = squad_val.merge(squad_amrs, on = ['context'], how = 'right').reset_index(drop = True)

### Inferencing pipeline

In [7]:
name = "microsoft/Phi-3-mini-128k-instruct"
device = "cuda"

model = transformers.AutoModelForCausalLM.from_pretrained(name,
                                                          load_in_8bit = True,
                                                          trust_remote_code = True,
                                             device_map = device,
                                             cache_dir='/scratch/alpine/anra7539')

tokenizer = transformers.AutoTokenizer.from_pretrained(name, truncation_side = "left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
def qna(question, context, amr, prompt):
    with torch.no_grad():
        input_text = f'''{prompt}\n\nContext:{context}\n\nAMR:{amr}\n\nQuestion:{question}\nAnswer:'''
        input_tokens = tokenizer(input_text, return_tensors = "pt", truncation = True, max_length = 2048).to(device)
    
        outputs = model.generate(**input_tokens, max_new_tokens = 30, pad_token_id = tokenizer.eos_token_id)
    
        answer = tokenizer.decode(outputs[0], skip_special_tokens = True).split("Answer:")[1].split("\n")[0].split(".")[0].strip()
    return answer

In [9]:
prompt = f'''Answer the given question based on the context.
If the question can't be answered based on the information in the context, return "unanswerable".
You will not return anything except the answer.
You may also use the provided linearized Abstract Meaning Representation (AMR) structure of the paragraph to aid in reasoning.'''

In [None]:
output_file = '/projects/anra7539/projects/representation_efficacy/squad_amr_answers_qphi3/predicted_answers.json'

if os.path.exists(output_file):
    with open(output_file, 'r') as f:
        try:
            existing_data = [json.loads(line) for line in f]
        except json.JSONDecodeError:
            existing_data = []
else:
    existing_data = []

In [None]:
processed_indices = {item['index'] for item in existing_data}

In [None]:
with open(output_file, 'a') as f:
    for i in tqdm(range(len(squad_val))):
        if i in processed_indices:
            continue 
        
        answer = qna(squad_val.question[i], squad_val.context[i], squad_val.only_amr[i], prompt)
        
        result = {
            "index": i,
            "context": squad_val.context[i],
            "question": squad_val.question[i],
            "answer_list": list(squad_val.answer_list[i]),
            "prediction": answer
        }

        f.write(json.dumps(result) + "\n")

## Results

In [None]:
output_file = '/projects/anra7539/projects/representation_efficacy/squad_raw_answers_qphi3/predicted_answers.json'

with open(output_file, 'r') as f:
    data = [json.loads(line) for line in f]

full_dataset = pd.DataFrame(data)

### Average F1-score

In [11]:
def f1_score_strings(str1, str2):
    tokens1 = set(str1.lower().split())
    tokens2 = set(str2.lower().split())
    
    true_positives = len(tokens1 & tokens2)  
    false_positives = len(tokens1 - tokens2)  
    false_negatives = len(tokens2 - tokens1)  
    
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0
    
    return f1

In [12]:
def max_score_extraction(tgt_string, ref_string, scoring_function):
    score = []
    if len(ref_string)>0:
        for s in ref_string:
            score.append(scoring_function(tgt_string, s))

        return max(score)
    else:
        return scoring_function(tgt_string, "unanswerable")

In [13]:
full_dataset.fillna('', inplace = True)

In [14]:
full_dataset['f1_scores'] = full_dataset.apply(lambda x: max_score_extraction(x['prediction'], x['answer_list'], f1_score_strings), axis = 1)

In [15]:
np.mean(full_dataset.f1_scores)

0.6157322038293058

## Cosine similarity

In [16]:
similarity_model = sentence_transformers.SentenceTransformer('all-MiniLM-L6-v2')


def sent_similarity(str1, str2):
    embedding1 = similarity_model.encode(str1.lower())
    embedding2 = similarity_model.encode(str2.lower())
    
    return sentence_transformers.util.cos_sim(embedding1, embedding2)

In [17]:
full_dataset['cosine_similarity'] = full_dataset.apply(lambda x: max_score_extraction(x['prediction'], x['answer_list'], sent_similarity), axis = 1)

In [18]:
np.mean(full_dataset.cosine_similarity)

0.6854046125137762

## ROUGE

In [19]:
from rouge_score import rouge_scorer

In [20]:
def compute_rouge_1(generated_text, reference_text):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text.lower(), generated_text.lower())
    return scores['rouge1'].fmeasure

def compute_rouge_2(generated_text, reference_text):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text.lower(), generated_text.lower())
    return scores['rouge2'].fmeasure

def compute_rouge_l(generated_text, reference_text):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text.lower(), generated_text.lower())
    return scores['rougeL'].fmeasure

In [21]:
full_dataset['rouge_1'] = full_dataset.apply(lambda x: max_score_extraction(x['prediction'], x['answer_list'], compute_rouge_1), axis = 1)
full_dataset['rouge_2'] = full_dataset.apply(lambda x: max_score_extraction(x['prediction'], x['answer_list'], compute_rouge_2), axis = 1)
full_dataset['rouge_L'] = full_dataset.apply(lambda x: max_score_extraction(x['prediction'], x['answer_list'], compute_rouge_l), axis = 1)

In [22]:
print(f"ROUGE-1 score = {np.mean(full_dataset.rouge_1)}")
print(f"ROUGE-2 score = {np.mean(full_dataset.rouge_2)}")
print(f"ROUGE-L score = {np.mean(full_dataset.rouge_L)}")

ROUGE-1 score = 0.6356323669653294
ROUGE-2 score = 0.5172571385455257
ROUGE-L score = 0.6343967849515131


## BLEU

In [23]:
from nltk.translate.bleu_score import sentence_bleu

In [24]:
def max_score_extraction_bleu(tgt_string, ref_string, sentence_bleu):
    score = []
    if len(ref_string)>0:
        for s in ref_string:
            score.append(sentence_bleu([nltk.word_tokenize(tgt_string.lower())], 
                                       nltk.word_tokenize(s.lower())))

        return max(score)
    else:
        return sentence_bleu([nltk.word_tokenize(tgt_string.lower())], 
                             nltk.word_tokenize("unanswerable"))

In [25]:
full_dataset['bleu_scores'] = full_dataset.apply(lambda x: max_score_extraction_bleu(x['prediction'], x['answer_list'], sentence_bleu), axis = 1)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [26]:
print(np.mean(full_dataset.bleu_scores))

0.07049791420245992
