In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the project sources take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
# Run from the project root so that relative paths such as "src" resolve.
# The root defaults to the original hard-coded location but can be overridden
# with the LLM_HALLU_ROOT environment variable, which makes the notebook
# usable from other accounts/machines without editing this cell.
project_root = os.environ.get("LLM_HALLU_ROOT", "/home/yw699/codes/LLM-Hallu")
os.chdir(project_root)

# Make the project's modules importable. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry each time.
src_dir = os.path.abspath("src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Restrict this process to GPUs 6 and 7. This is assigned before torch is
# imported in a later cell; CUDA reads the variable when it initialises.
# NOTE(review): this unconditionally overrides any value already present in
# the environment (e.g. from a job scheduler) — confirm that is intended.
os.environ["CUDA_VISIBLE_DEVICES"] = "6,7"

In [3]:
import yaml
import math
from dataset import Dataset
from prompt_engineer import PromptGenerator
from models import HuggingfaceModel
from utils import *
from metrics import *
import logging
import wandb
from tqdm import tqdm
import gc
import torch
import numpy as np
import random

In [4]:
# Configure logging for the notebook. setup_logger is not defined in this
# file; presumably it is provided by the `from utils import *` star import —
# verify against the utils module.
setup_logger()

In [5]:
# Parse the experiment configuration from its YAML file.
with open("configs/experiment_config1.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Shortcuts to the config sub-sections used by later cells.
metrics_config = config["metrics"]
wandb_config = config["wandb"]

# Record of this run's settings and artefacts; seeded with the full config.
experiment_details = dict(config=config)

In [6]:
# Gather run identity from the environment.
user = os.environ['USER']  # raises KeyError when USER is not set
slurm_jobid = os.getenv('SLURM_JOB_ID', None)
scratch_dir = os.getenv('SCRATCH_DIR', '.')
entity = os.getenv('WANDB_SEM_UNC_ENTITY', None)

# Directory handed to wandb for its local files.
# Named `wandb_dir` (not `dir`) so the builtin dir() is not shadowed for the
# rest of the kernel session.
# Note: when WANDB_SEM_UNC_ENTITY is unset, `entity` is None and the last path
# component becomes the literal string "None".
wandb_dir = f"{scratch_dir}/{user}/{entity}"
# exist_ok=True replaces the exists()-then-makedirs() pair, which could fail
# if another process created the directory between the check and the call.
os.makedirs(wandb_dir, exist_ok=True)

# Project name comes from the config; debug runs go to a separate
# "<project>_debug" project.
project = wandb_config["project"]
if wandb_config["debug"]:
    project = f"{project}_debug"

# Free-text notes attached to the run, for tracing it back to its SLURM job.
experiment_lot = wandb_config['experiment_lot']
notes = f'slurm_id: {slurm_jobid}, experiment_lot: {experiment_lot}'

wandb.init(
    entity=entity,
    project=project,
    dir=wandb_dir,
    config=config,
    notes=notes,
)

logging.info('Finished wandb init.')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myupuwang2001[0m ([33myupuwang2001-duke-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


2024-11-19 12:14:33 INFO     Finished wandb init.


In [7]:
# Load the train and validation splits described by the experiment config.
dataset_loader = Dataset(config)
train_dataset, validation_dataset = dataset_loader.load_data()

# Log the dataset summary unless the loader returned a plain list.
if not isinstance(train_dataset, list):
    logging.info('Train dataset: %s', train_dataset)

# Partition the training examples by index into answerable / unanswerable.
answerable_indices, unanswerable_indices = split_dataset(train_dataset)

# When configured for answerable-only experiments, discard unanswerable
# examples from both splits.
if config["dataset"]['answerable_only']:
    unanswerable_indices = []
    val_answerable, val_unanswerable = split_dataset(validation_dataset)
    del val_unanswerable
    validation_dataset = [validation_dataset[idx] for idx in val_answerable]
    # Note: after this line train_dataset is a filtered list, so
    # answerable_indices still refers to positions in the unfiltered data.
    train_dataset = [train_dataset[idx] for idx in answerable_indices]


2024-11-19 12:14:35 INFO     Train dataset: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
})


In [8]:
# Build the few-shot prompt once; the same prompt is used in every sampling
# process.
promptgenerator = PromptGenerator(config, train_dataset)
few_shot_prompt, prompt_indices = promptgenerator.construct_fewshot_prompt_by_nums(2)

# Keep a record of the prompt and of what it was built from.
experiment_details.update({
    'prompt_indices': prompt_indices,
    'prompt': few_shot_prompt,
    'BRIEF': promptgenerator.BRIEF,
})
logging.info('Prompt is: %s', few_shot_prompt)

2024-11-19 12:14:56 INFO     Prompt is: Answer the following question as briefly as possible.
Context: According to Strabo, the Greco-Bactrians seem to have had contacts with China through the silk road trade routes (Strabo, XI.XI.I). Indian sources also maintain religious contact between Buddhist monks and the Greeks, and some Greco-Bactrians did convert to Buddhism. Demetrius, son and successor of Euthydemus, invaded north-western India in 180 BC, after the destruction of the Mauryan empire there; the Mauryans were probably allies of the Bactrians (and Seleucids). The exact justification for the invasion remains unclear, but by about 175 BC, the Greeks ruled over parts of north-western India. This period also marks the beginning of the obfuscation of Greco-Bactrian history. Demetrius possibly died about 180 BC; numismatic evidence suggest the existence of several other kings shortly thereafter. It is probable that at this point that the Greco-Bactrian kingdom split into several semi-

In [9]:
# Build the model wrapper from the experiment config. The recorded output of
# this cell shows checkpoint shards being loaded at this point.
huggingface_model = HuggingfaceModel(config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Metric looked up by the name 'squad'; passed to the evaluator below.
metric = get_metric('squad')

# Second prompt generator, backed by the validation split instead of train.
validation_promptgenerator = PromptGenerator(config, validation_dataset)

# Evaluator wired up with the model, both prompt generators, the metric and
# the shared experiment_details record.
p_true_evaluator = PTrueEvaluator(
    config,
    huggingface_model,
    promptgenerator,
    validation_promptgenerator,
    metric,
    experiment_details,
)

In [11]:
# Walk the configured metrics; only entries named "p_true" need setup here.
for metric_cfg in metrics_config:
    metric_name = metric_cfg.get("name")
    if metric_name != "p_true":
        continue

    logging.info('#' * 80)
    logging.info('Constructing few-shot prompt for p_true.')
    (
        p_true_few_shot_prompt,
        p_true_responses,
        len_p_true,
        p_true_indices,
    ) = p_true_evaluator.construct_few_shot_prompt_for_p_true(few_shot_prompt, 5, 3)

    # Report the returned len_p_true value to wandb (config and log).
    wandb.config.update({'p_true_num_fewshot': len_p_true}, allow_val_change=True)
    wandb.log({'len_p_true': len_p_true})

    # Store what was produced so it can be inspected after the run.
    experiment_details.update({
        'p_true_indices': p_true_indices,
        'p_true_responses': p_true_responses,
        'p_true_few_shot_prompt': p_true_few_shot_prompt,
    })

    logging.info('Finished constructing few-shot prompt for p_true.')
    logging.info('#' * 80)
    logging.info('p_true_few_shot_prompt: %s', p_true_few_shot_prompt)
    logging.info('#' * 80)

2024-11-19 12:15:07 INFO     ################################################################################
2024-11-19 12:15:07 INFO     Constructing few-shot prompt for p_true.
2024-11-19 12:15:07 INFO     P_TRUE >> Current Question: Answer the following question as briefly as possible.
Context: After receiving his J.D. from Boston College Law School, Kerry worked in Massachusetts as an Assistant District Attorney. He served as Lieutenant Governor of Massachusetts under Michael Dukakis from 1983 to 1985 and was elected to the U.S. Senate in 1984 and was sworn in the following January. On the Senate Foreign Relations Committee, he led a series of hearings from 1987 to 1989 which were a precursor to the Iran–Contra affair. Kerry was re-elected to additional terms in 1990, 1996, 2002 and 2008. In 2002, Kerry voted to authorize the President "to use force, if necessary, to disarm Saddam Hussein", but warned that the administration should exhaust its diplomatic avenues before launching w

In [12]:

# Run the evaluator over the data using the two prompts built in earlier cells.
# p_true_few_shot_prompt only exists if the metrics config contained an entry
# named "p_true"; otherwise this line raises NameError.
# NOTE(review): the meaning of the positional literals 1.0, 2 and 3 is not
# visible from this notebook — presumably sampling temperature and counts;
# confirm against PTrueEvaluator.all_evaluate.
p_true_evaluator.all_evaluate(few_shot_prompt,1.0,2,p_true_few_shot_prompt,3)

2024-11-19 12:15:18 INFO     xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
2024-11-19 12:15:18 INFO     Starting with dataset_split train.
2024-11-19 12:15:18 INFO     xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
2024-11-19 12:15:18 INFO     Unused items in train dataset: 86819
  0%|          | 0/3 [00:00<?, ?it/s]2024-11-19 12:15:18 INFO     Iteration 0:  ################################################################################
2024-11-19 12:15:18 INFO     Current input: Answer the following question as briefly as possible.
Context: Central Catalan is considered the standard pronunciation of the language. The descriptions below are mostly for this variety. For the differences in pronunciation of the different dialects, see the section pronunciation of dialects in this article.
Question: What are the descriptions for?
Answer:
2024-11-19 12:15:18 INFO     context:       Central Catalan is considered the standa

Question: Who was Kerry an Lt. Gov. for?
Brainstormed Answers: Michael Dukakis 
Michael Dukakis 
Massachusetts 
Michael Dukakis 
Michael Dukakis 
Michael Dukakis 
Possible answer: Michael Dukakis
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: What is the time of lowest temperature for evening type young adults?
Brainstormed Answers: 06:00 (6 a.m.) 
06:00 (6 a.m.) 
about 06:00 (6 a.m.) 
About 06:00 (6 a.m.) 
about 06:00 (6 a.m.) 
About 06:00 (6 a.m.) 
Possible answer: 06:00 (6 a.m.)
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: What sound formatting do DVDs use?
Brainstormed Answers: Dolby Digital 
Dolby Digital 
Dolby Digital (also called AC-3) and DTS 
Dolby Digital, Dolby Digital EX and DTS 
Dolby Digital 
Dolby Digital 
Possible answer: Dolby Digital
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: If one computer model turns out correct, by what year would there be a nearly complete loss of rai

2024-11-19 12:15:21 INFO     Iteration 1:  ################################################################################
2024-11-19 12:15:21 INFO     Current input: Answer the following question as briefly as possible.
Context: Other predecessors of the Reformed church included the pro-reform and Gallican Roman Catholics, such as Jacques Lefevre (c. 1455–1536). The Gallicans briefly achieved independence for the French church, on the principle that the religion of France could not be controlled by the Bishop of Rome, a foreign power. During the Protestant Reformation, Lefevre, a professor at the University of Paris, published his French translation of the New Testament in 1523, followed by the whole Bible in the French language in 1530. William Farel was a student of Lefevre who went on to become a leader of the Swiss Reformation, establishing a Protestant government in Geneva. Jean Cauvin (John Calvin), another student at the University of Paris, also converted to Protestantism. Lo

Question: Who was Kerry an Lt. Gov. for?
Brainstormed Answers: Michael Dukakis 
Michael Dukakis 
Massachusetts 
Michael Dukakis 
Michael Dukakis 
Michael Dukakis 
Possible answer: Michael Dukakis
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: What is the time of lowest temperature for evening type young adults?
Brainstormed Answers: 06:00 (6 a.m.) 
06:00 (6 a.m.) 
about 06:00 (6 a.m.) 
About 06:00 (6 a.m.) 
about 06:00 (6 a.m.) 
About 06:00 (6 a.m.) 
Possible answer: 06:00 (6 a.m.)
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: What sound formatting do DVDs use?
Brainstormed Answers: Dolby Digital 
Dolby Digital 
Dolby Digital (also called AC-3) and DTS 
Dolby Digital, Dolby Digital EX and DTS 
Dolby Digital 
Dolby Digital 
Possible answer: Dolby Digital
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: Who was one French pro-reform Roman Catholic of the 15th century? 
Brainstormed Answers: Jacques L

2024-11-19 12:15:23 INFO     Iteration 2:  ################################################################################
2024-11-19 12:15:23 INFO     Current input: Answer the following question as briefly as possible.
Context: In science, alumni include astronomers Carl Sagan, a prominent contributor to the scientific research of extraterrestrial life, and Edwin Hubble, known for "Hubble's Law", NASA astronaut John M. Grunsfeld, geneticist James Watson, best known as one of the co-discoverers of the structure of DNA, experimental physicist Luis Alvarez, popular environmentalist David Suzuki, balloonist Jeannette Piccard, biologists Ernest Everett Just and Lynn Margulis, computer scientist Richard Hamming, the creator of the Hamming Code, lithium-ion battery developer John B. Goodenough, mathematician and Fields Medal recipient Paul Joseph Cohen, and geochemist Clair Cameron Patterson, who developed the uranium-lead dating method into lead-lead dating. Nuclear physicist and research

Question: Who was Kerry an Lt. Gov. for?
Brainstormed Answers: Michael Dukakis 
Michael Dukakis 
Massachusetts 
Michael Dukakis 
Michael Dukakis 
Michael Dukakis 
Possible answer: Michael Dukakis
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: What is the time of lowest temperature for evening type young adults?
Brainstormed Answers: 06:00 (6 a.m.) 
06:00 (6 a.m.) 
about 06:00 (6 a.m.) 
About 06:00 (6 a.m.) 
about 06:00 (6 a.m.) 
About 06:00 (6 a.m.) 
Possible answer: 06:00 (6 a.m.)
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: What sound formatting do DVDs use?
Brainstormed Answers: Dolby Digital 
Dolby Digital 
Dolby Digital (also called AC-3) and DTS 
Dolby Digital, Dolby Digital EX and DTS 
Dolby Digital 
Dolby Digital 
Possible answer: Dolby Digital
Is the possible answer:
A) True
B) False
The possible answer is: A
Question: What NASA astronaut is also a university alumni member?
Brainstormed Answers: John M. Grunsfeld
Jo

In [13]:
# Dump the accumulated run record (config, prompt indices, prompts, and
# whatever the evaluator added) as plain text for inspection.
print(experiment_details)

{'config': {'wandb': {'debug': False, 'project': 'test', 'experiment_lot': 'MyExperiment'}, 'dataset': {'name': 'squad', 'seed': 42, 'answerable_only': True}, 'prompt': {'few-shot': False, 'shot_num': 3, 'brief_always': True, 'use_context': True, 'add_tag': True, 'prompt_template_path': '/home/yw699/codes/LLM-Hallu/data/prompt_templates/ask_templates/test2.txt'}, 'model': {'model_name': 'meta-llama/Llama-2-7b-hf', 'stop_sequences': 'default', 'max_new_tokens': 50}, 'sample': {'temperature': 1.0, 'sample_count': 5, 'sampling_method': 'simple_sample'}, 'metrics': [{'name': 'p_true', 'p_true_num_fewshot': 2}, {'name': 'accuracy'}, {'name': 'diversity'}], 'p_true': {'compute_p_true': True, 'get_training_set_generations': True, 'get_training_set_generations_most_likely_only': True, 'compute_accuracy_at_all_temps': True, 'p_true_hint': False}}, 'prompt_indices': [36010, 59002], 'prompt': 'Answer the following question as briefly as possible.\nContext: According to Strabo, the Greco-Bactrians