In [2]:
import torch, wandb
import pandas as pd
from vllm import LLM, SamplingParams
import argparse
from src.utils import str2bool
from datasets import load_dataset, Dataset
from src.utils import exact_match_score, f1_score, text_has_answer
from tqdm.auto import tqdm



In [2]:
INST = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nDoc : {DOC}\n
Based on the above document, answer the following question. Please provide the answer as a single word or term, without forming a complete sentence:
Question : {QUESTION}
Answer:<|im_end|>\n<|im_start|>assistant\n"""
INST_ADV = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nDoc 0: {ORIGIN_DOC}
Doc 1: {ADV_DOC}\n
Based on the above documents, answer the following question. Please provide the answer as a single word or term, without forming a complete sentence:
Question : {QUESTION}
Answer:<|im_end|>\n<|im_start|>assistant\n"""

def make_original_prompt(dataset, q: str):
    docs, questions = dataset["context"], dataset[q]
    return [INST.format(DOC=doc, QUESTION=q) for doc, q in zip(docs, questions)]

def make_original_sent(dataset, q):
    docs, questions = dataset["answer_sent"], dataset[q]
    return [INST.format(DOC=doc, QUESTION=q) for doc, q in zip(docs, questions)]

def make_adv_prompt(dataset, q):
    docs, adv_docs, questions = dataset["context"], dataset["adversarial_passage"], dataset[q]
    return [INST_ADV.format(ORIGIN_DOC=doc, ADV_DOC=adv_doc, QUESTION=q) for doc, adv_doc, q in zip(
        docs, adv_docs, questions)]

def make_adv_gpt_prompt(dataset, q):
    docs, adv_docs, questions = dataset["context"], dataset["gpt_adv_passage"], dataset[q]
    return [INST_ADV.format(ORIGIN_DOC=doc, ADV_DOC=adv_doc, QUESTION=q) for doc, adv_doc, q in zip(
        docs, adv_docs, questions)]

def make_adv_gpt_sent(dataset, q):
    docs, adv_docs, questions = dataset["answer_sent"], dataset["adversary"], dataset[q]
    return [INST_ADV.format(ORIGIN_DOC=doc, ADV_DOC=adv_doc, QUESTION=q) for doc, adv_doc, q in zip(
        docs, adv_docs, questions)]
import random
def make_random_prompts(dataset, q):
    docs, questions = dataset["context"], dataset[q]
    result = []
    for doc, question in zip(docs, questions):
        random_doc = random.choice(docs)
        while random_doc == doc:
            random_doc = random.choice(docs)
        result.append(INST_ADV.format(ORIGIN_DOC=doc, ADV_DOC=random_doc, QUESTION=question))
    return result

def make_random_prompt(dataset, q):
    docs, questions = dataset["context"], dataset[q]
    result = []
    for doc, question in zip(docs, questions):
        random_doc = random.choice(docs)
        while random_doc == doc:
            random_doc = random.choice(docs)
        result.append(INST.format(DOC=random_doc, QUESTION=question))
    return result

def selelct_prompt_func(key: str, q_type: str):
    if key == "origin":
        return make_original_prompt
    elif key == "adv":
        return make_adv_prompt
    elif key == "adv-gpt":
        return make_adv_gpt_prompt

def select_sent_func(key: str):
    if key == "origin":
        return make_original_sent
    else:
        return make_adv_gpt_sent

from typing import List
def make_new_question(questions : List[str]):
    PROMPT = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nPlease paraphrase the following question. You should maintain original meaning and information.
    Question : {QUESTION}
    Paraphrased question:<|im_end|>\n<|im_start|>assistant\n"""
    prompts = [PROMPT.format(QUESTION=q) for q in questions]
    sampling_params = SamplingParams(max_tokens=50)
    paraphrased_questions = llm.generate(prompts, sampling_params, use_tqdm=True)
    paraphrased_questions = [o.outputs[0].text.strip() for o in paraphrased_questions]
    return paraphrased_questions
sampling_params = SamplingParams(max_tokens=20)

In [3]:
llm=LLM(model="Qwen/Qwen1.5-7B-Chat", tensor_parallel_size=2, seed=42, dtype="auto")

2024-02-12 22:19:01,516	INFO worker.py:1724 -- Started a local Ray instance.


INFO 02-12 22:19:02 llm_engine.py:72] Initializing an LLM engine with config: model='Qwen/Qwen1.5-7B-Chat', tokenizer='Qwen/Qwen1.5-7B-Chat', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=2, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, seed=42)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 02-12 22:19:13 weight_utils.py:164] Using model weights format ['*.safetensors']
[36m(RayWorkerVllm pid=1803757)[0m INFO 02-12 22:19:13 weight_utils.py:164] Using model weights format ['*.safetensors']
INFO 02-12 22:19:22 llm_engine.py:322] # GPU blocks: 15490, # CPU blocks: 1024
INFO 02-12 22:19:24 model_runner.py:632] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 02-12 22:19:24 model_runner.py:636] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
[36m(RayWorkerVllm pid=1803757)[0m INFO 02-12 22:19:24 model_runner.py:632] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the 

[36m(RayWorkerVllm pid=1803757)[0m INFO 02-12 22:19:31 custom_all_reduce.py:199] Registering 2275 cuda graph addresses
[36m(RayWorkerVllm pid=1803757)[0m INFO 02-12 22:19:31 model_runner.py:698] Graph capturing finished in 7 secs.


In [4]:
dataset = load_dataset("Atipico1/mrqa-adv-test-adv-gpt-passage", split="train")
dataset = dataset.add_column("new_question", make_new_question(dataset["question"]))

Processed prompts: 100%|██████████| 684/684 [00:07<00:00, 94.68it/s] 


In [5]:
import numpy as np

In [6]:
### Closed QA
answers = dataset["answer_in_context"]
result = []
def make_cqa_prompt(dataset, q_type):
    INST = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAnswer the following question. Please provide the answer as a single word or term, without forming a complete sentence:
Question : {QUESTION}
Answer:<|im_end|>\n<|im_start|>assistant\n"""
    questions = dataset[q_type]
    return [INST.format(QUESTION=q) for q in questions]
for q_type in ["question", "new_question"]:
    prompts = make_cqa_prompt(dataset, q_type)
    outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
    outputs = [o.outputs[0].text.strip() for o in outputs]
    ems = [exact_match_score(pred, label) for pred, label in zip(outputs, answers)]
    f1s = [f1_score(pred, label) for pred, label in zip(outputs, answers)]
    accs = [text_has_answer(label, pred) for label, pred in zip(answers, outputs)]
    metrics = ["closed"+"_"+q_type]
    for metric_name, values in zip(["EM", "F1", "Acc"], [ems, f1s, accs]):
        metrics.append(round(np.mean(values)*100, 3))
    result.append(metrics)
df = pd.DataFrame(data=result, columns=["prompt", "EM", "F1", "Acc"])
df

Processed prompts: 100%|██████████| 684/684 [00:04<00:00, 162.68it/s]
Processed prompts: 100%|██████████| 684/684 [00:04<00:00, 155.98it/s]


Unnamed: 0,prompt,EM,F1,Acc
0,closed_question,14.62,19.029,16.813
1,closed_new_question,10.38,14.452,12.865


In [7]:
# Only Random
answers = dataset["answer_in_context"]
result = []

with torch.no_grad():
    for q_type in ["question", "new_question"]:
        prompts = make_random_prompt(dataset, q_type)
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        outputs = [o.outputs[0].text.strip() for o in outputs]
        ems = [exact_match_score(pred, label) for pred, label in zip(outputs, answers)]
        f1s = [f1_score(pred, label) for pred, label in zip(outputs, answers)]
        accs = [text_has_answer(label, pred) for label, pred in zip(answers, outputs)]
        metrics = ["random"+"_"+q_type]
        for metric_name, values in zip(["EM", "F1", "Acc"], [ems, f1s, accs]):
            metrics.append(round(np.mean(values)*100, 3))
        result.append(metrics)
df = pd.DataFrame(data=result, columns=["prompt", "EM", "F1", "Acc"])
df

Processed prompts: 100%|██████████| 684/684 [00:10<00:00, 66.74it/s] 
Processed prompts: 100%|██████████| 684/684 [00:10<00:00, 66.60it/s] 


Unnamed: 0,prompt,EM,F1,Acc
0,random_question,9.211,11.949,10.234
1,random_new_question,6.871,8.713,7.749


In [8]:
# Random + Original
answers = dataset["answer_in_context"]
result = []
import numpy as np
with torch.no_grad():
    for q_type in ["question", "new_question"]:
        prompts = make_random_prompts(dataset, q_type)
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        outputs = [o.outputs[0].text.strip() for o in outputs]
        ems = [exact_match_score(pred, label) for pred, label in zip(outputs, answers)]
        f1s = [f1_score(pred, label) for pred, label in zip(outputs, answers)]
        accs = [text_has_answer(label, pred) for label, pred in zip(answers, outputs)]
        metrics = ["random"+"_"+q_type]
        for metric_name, values in zip(["EM", "F1", "Acc"], [ems, f1s, accs]):
            metrics.append(round(np.mean(values)*100, 3))
        result.append(metrics)
df = pd.DataFrame(data=result, columns=["prompt", "EM", "F1", "Acc"])

Processed prompts: 100%|██████████| 684/684 [00:14<00:00, 46.74it/s] 
Processed prompts: 100%|██████████| 684/684 [00:14<00:00, 45.83it/s] 


In [9]:
df

Unnamed: 0,prompt,EM,F1,Acc
0,random_question,78.655,85.902,81.579
1,random_new_question,65.936,73.775,69.298


In [10]:
import numpy as np
answers = dataset["answer_in_context"]
result = []
with torch.no_grad():
    for q_type in ["question", "new_question"]:
        for key in tqdm(["origin", "adv", "adv-gpt"]):
            prompt_func = selelct_prompt_func(key, q_type)
            prompts = prompt_func(dataset, q_type)
            outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
            outputs = [o.outputs[0].text.strip() for o in outputs]
            ems = [exact_match_score(pred, label) for pred, label in zip(outputs, answers)]
            f1s = [f1_score(pred, label) for pred, label in zip(outputs, answers)]
            accs = [text_has_answer(label, pred) for label, pred in zip(answers, outputs)]
            metrics = [key+"__"+q_type]
            for metric_name, values in zip(["EM", "F1", "Acc"], [ems, f1s, accs]):
                metrics.append(round(np.mean(values)*100, 3))
            result.append(metrics)
df = pd.DataFrame(data=result, columns=["prompt", "EM", "F1", "Acc"])
df

  0%|          | 0/3 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 684/684 [00:09<00:00, 69.92it/s] 
Processed prompts: 100%|██████████| 684/684 [00:13<00:00, 49.51it/s] 
Processed prompts: 100%|██████████| 684/684 [00:14<00:00, 48.55it/s] 


  0%|          | 0/3 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 684/684 [00:09<00:00, 68.87it/s] 
Processed prompts: 100%|██████████| 684/684 [00:14<00:00, 48.38it/s] 
Processed prompts: 100%|██████████| 684/684 [00:14<00:00, 47.92it/s] 


Unnamed: 0,prompt,EM,F1,Acc
0,origin__question,78.801,86.192,82.31
1,adv__question,82.749,90.165,85.234
2,adv-gpt__question,66.374,75.512,69.591
3,origin__new_question,65.205,72.468,69.444
4,adv__new_question,69.298,77.189,74.269
5,adv-gpt__new_question,54.094,63.484,58.041


In [11]:
# Sent Level
import numpy as np
answers = dataset["answer_in_context"]
result = []
with torch.no_grad(): 
    for q_type in ["question", "new_question"]:
        for key in tqdm(["origin", "adv"]):
            prompt_func = select_sent_func(key)
            prompts = prompt_func(dataset, q_type)
            outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
            outputs = [o.outputs[0].text.strip() for o in outputs]
            ems = [exact_match_score(pred, label) for pred, label in zip(outputs, answers)]
            f1s = [f1_score(pred, label) for pred, label in zip(outputs, answers)]
            accs = [text_has_answer(label, pred) for label, pred in zip(answers, outputs)]
            metrics = [key+"_"+q_type]
            for metric_name, values in zip(["EM", "F1", "Acc"], [ems, f1s, accs]):
                metrics.append(round(np.mean(values)*100, 3))
            result.append(metrics)
df = pd.DataFrame(data=result, columns=["prompt", "EM", "F1", "Acc"])
df

  0%|          | 0/2 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 684/684 [00:06<00:00, 109.73it/s]
Processed prompts: 100%|██████████| 684/684 [00:07<00:00, 88.10it/s] 


  0%|          | 0/2 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 684/684 [00:06<00:00, 104.62it/s]
Processed prompts: 100%|██████████| 684/684 [00:07<00:00, 87.41it/s] 


Unnamed: 0,prompt,EM,F1,Acc
0,origin_question,77.193,85.263,81.725
1,adv_question,55.409,64.334,58.626
2,origin_new_question,68.275,75.44,72.515
3,adv_new_question,48.538,57.287,51.901


In [15]:
key = "adv-gpt"
prompt_func = selelct_prompt_func(key)
prompts = prompt_func(dataset)
dataset = dataset.add_column("prompt", prompts)

dataset=dataset.remove_columns("pred")
prompts = make_original_prompt(dataset)

In [16]:
with torch.no_grad():
    outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
    outputs = [o.outputs[0].text.strip() for o in outputs]
dataset = dataset.add_column("pred", outputs)
print("Before :", len(dataset))
dataset = dataset.filter(lambda x: len(x["pred"]) > 0)
print("After :", len(dataset))

Processed prompts: 100%|██████████| 684/684 [00:11<00:00, 60.08it/s] 


Flattening the indices:   0%|          | 0/684 [00:00<?, ? examples/s]

Before : 684


Filter:   0%|          | 0/684 [00:00<?, ? examples/s]

After : 684


In [17]:
df = pd.DataFrame(dataset)
df["is_em"] = df.apply(lambda x: exact_match_score(x["pred"], x["answer_in_context"]), axis=1)
df["f1"] = df.apply(lambda x: f1_score(x["pred"], x["answer_in_context"]), axis=1)
df["is_accurate"] = df.apply(lambda x: text_has_answer(x["answer_in_context"], x["pred"]), axis=1)
df["answer_in_context"] = df["answer_in_context"].apply(lambda x: ", ".join(x) if len(x)>1 else x[0])
df = df[["question", "answer_in_context","prompt", "pred", "is_em", "f1", "is_accurate"]]

print("ACC:", round(df["is_accurate"].mean()*100, 2))
print("EM:", round(df["is_em"].mean()*100, 2))
print("F1:", round(df["f1"].mean()*100, 2))

ACC: 83.19
EM: 79.82
F1: 87.87


In [10]:
df = pd.DataFrame(dataset)
df["is_em"] = df.apply(lambda x: exact_match_score(x["pred"], x["answer_in_context"]), axis=1)
df["f1"] = df.apply(lambda x: f1_score(x["pred"], x["answer_in_context"]), axis=1)
df["is_accurate"] = df.apply(lambda x: text_has_answer(x["answer_in_context"], x["pred"]), axis=1)
df["answer_in_context"] = df["answer_in_context"].apply(lambda x: ", ".join(x) if len(x)>1 else x[0])
df = df[["question", "answer_in_context","prompt", "pred", "is_em", "f1", "is_accurate"]]

print("ACC:", round(df["is_accurate"].mean()*100, 2))
print("EM:", round(df["is_em"].mean()*100, 2))
print("F1:", round(df["f1"].mean()*100, 2))

ACC: 70.61
EM: 67.84
F1: 76.31


In [51]:
df

Unnamed: 0,question,answer_in_context,prompt,pred,is_em,f1,is_accurate
0,What year was the earliest Chopin recording cr...,1895,<|im_start|>system\nYou are a helpful assistan...,1895,1.0,1.0,True
1,Which footballer was killed in a car accident?,"Theyab Awana,",<|im_start|>system\nYou are a helpful assistan...,Theyab Awana,1.0,1.0,True
2,Florida State Road 535 ends at which Orange Co...,Lake Buena Vista,<|im_start|>system\nYou are a helpful assistan...,Lake Buena Vista,1.0,1.0,True
3,What is the seat of the county that includes t...,Troy,<|im_start|>system\nYou are a helpful assistan...,Troy,1.0,1.0,True
4,Spider is a 2007 Australian black comedy short...,Blue-Tongue Films,<|im_start|>system\nYou are a helpful assistan...,Blue-Tongue Films,1.0,1.0,True
...,...,...,...,...,...,...,...
679,According to the Australian Bureau of Statisti...,12400,<|im_start|>system\nYou are a helpful assistan...,12400,1.0,1.0,True
680,What is the lead single of the album being pro...,Sign of the Times,<|im_start|>system\nYou are a helpful assistan...,"""Sign of the Times""",1.0,1.0,True
681,When did she vote a second time against war?,1941,<|im_start|>system\nYou are a helpful assistan...,1941,1.0,1.0,True
682,"""Walk Right Now"" is a song written by a group ...",1964,<|im_start|>system\nYou are a helpful assistan...,1964,1.0,1.0,True


In [13]:
df = pd.DataFrame(dataset)
df["prompt"] = prompts
df["is_em"] = df.apply(lambda x: exact_match_score(x["pred"], x["answer_in_context"]), axis=1)
df["f1"] = df.apply(lambda x: f1_score(x["pred"], x["answer_in_context"]), axis=1)
df["is_accurate"] = df.apply(lambda x: text_has_answer(x["answer_in_context"], x["pred"]), axis=1)
df["answer_in_context"] = df["answer_in_context"].apply(lambda x: ", ".join(x) if len(x)>1 else x[0])
df = df[["question", "answer_in_context","prompt", "pred", "is_em", "f1", "is_accurate"]]
wandb.init(project="evaluate-LLM", name="adv")
wandb.log({"raw_data":df,
            "Acc": round(df["is_accurate"].mean()*100, 2),
            "EM": round(df["is_em"].mean()*100, 2),
            "F1": round(df["f1"].mean()*100, 2)})
wandb.finish()



VBox(children=(Label(value='0.981 MB of 1.957 MB uploaded\r'), FloatProgress(value=0.5013307933847657, max=1.0…

0,1
Acc,▁
EM,▁
F1,▁

0,1
Acc,83.48
EM,79.68
F1,86.67


In [31]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-72B-Chat")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [32]:
text

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nGive me a short introduction to large language model.<|im_end|>\n<|im_start|>assistant\n'

In [None]:
df = pd.DataFrame(load_dataset("Atipico1/mrqa-adv-test-adv-gpt-passage", split="train"))

In [6]:
df.columns

Index(['subset', 'qid', 'question', 'answers', 'masked_query', 'context',
       'answer_sent', 'answer_in_context', 'query_embedding', 'entity',
       'similar_entity', 'similar_entity_score', 'random_entity',
       'random_entity_score', 'rewritten_context', 'valid',
       'clear_answer_sent', 'vague_answer_sent', 'adversary', 'replace_count',
       'adversarial_passage', 'gpt_adv_passage'],
      dtype='object')

In [8]:
df[["question", "answers", "context", "answer_sent", "clear_answer_sent","vague_answer_sent","adversary", "adversarial_passage", "gpt_adv_passage"]].to_excel("adv-gpt.xlsx")