In [1]:
import json
import os
from glob import glob

from langchain.callbacks import get_openai_callback
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from tqdm.notebook import tqdm

from src.hotpotqa.evaluation import evaluate
from src.hotpotqa.oracle_retriever import get_summarized_doc, get_hotpot_doc_content
from src.hotpotqa.text_prompt import TextPrompt
from src.utils import get_current_time, print_chain_stats
from datetime import datetime

In [None]:
# --- Evaluation slice -------------------------------------------------------
# First dev-set index to evaluate; leave at 0 unless leading examples should be skipped.
start_idx = 0
# How many dev examples to evaluate (the author's note calls this the full dev set).
num_dev_examples = 500
# Exclusive upper bound of the evaluated slice.
end_idx = num_dev_examples + start_idx

# --- Prompting and persistence ----------------------------------------------
# In-context demonstrations per prompt (the author also noted 6 as an alternative).
num_demonstrations = 2
# Controls whether the run folder is created under outputs/.
save_results = True

# --- Model selection --------------------------------------------------------
# Other names that have been used here:
#   "gpt-3.5-turbo-16k-0613"
#   "gpt-3.5-turbo"
#   "mistralai/Mistral-7B-Instruct-v0.2"
#   "mistralai/Mixtral-8X7B-Instruct-v0.1"
llm_name = "gpt-4o"

In [None]:
# if you use vLLM
# Chat client pointed at a locally served OpenAI-compatible endpoint.
inference_server_url = "http://0.0.0.0:8000/v1"
llm = ChatOpenAI(
    # Use the configured model name instead of a hard-coded one: the output
    # directory is derived from llm_name, so a different literal here would
    # store results under the wrong model's folder. Set llm_name to the model
    # the server is actually serving before running this cell.
    model=llm_name,
    # Placeholder token; NOTE(review): presumably the local server does not
    # validate it - confirm against how the server was launched.
    openai_api_key="token-abc123",
    openai_api_base=inference_server_url,
    max_tokens=30,
    temperature=0,
)

  llm = ChatOpenAI(


In [None]:
# Azure OpenAI Service variant: fill in the four settings below before running.
# NOTE(review): as written, these assignments overwrite any values already
# exported in the environment with empty strings.
os.environ.update(
    {
        "OPENAI_API_TYPE": "",
        "OPENAI_API_VERSION": "",
        "OPENAI_API_BASE": "",
        "OPENAI_API_KEY": "",
    }
)

# The Azure deployment is addressed by the configured llm_name.
# NOTE(review): request_timeout and timeout look like two spellings of the
# same setting - confirm which value takes effect and drop the other.
llm = AzureChatOpenAI(
    deployment_name=llm_name,
    temperature=0.0,
    request_timeout=30,
    max_retries=3,
    timeout=180,
)

In [None]:
# if you use OpenAI API
# Read the key from the environment rather than pasting it into the notebook,
# so the secret is never saved with the file. Falls back to the previous
# empty-string placeholder when OPENAI_API_KEY is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
# NOTE(review): request_timeout and timeout look like two spellings of the
# same setting - confirm which value takes effect and drop the other.
llm = ChatOpenAI(
    api_key=openai_api_key,
    model=llm_name,
    temperature=0.0,
    request_timeout=30,
    max_retries=3,
    timeout=60 * 3,
)

  llm = ChatOpenAI(api_key=openai_api_key, model=llm_name, temperature=0.0, request_timeout=30, max_retries=3, timeout=60 * 3)


In [None]:
# Demonstration pool; bound to `train` because it is what TextPrompt receives below.
# Opened explicitly as UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open("data/HotpotQA/hotpot_random_eg_dev.json", encoding="utf-8") as f:
    train = json.load(f)

# Examples that the evaluation loop slices with start_idx/end_idx.
with open("data/HotpotQA/stratified_hotpotqa_500sample.json", encoding="utf-8") as f:
    dev = json.load(f)


In [None]:
## RUN THIS. DO NOT CHANGE
# Runs the TextPrompt pipeline over dev[start_idx:end_idx]. Every answer record
# is collected in `list_answers`; when save_results is True each record is also
# appended, one JSON object per line, to <output_path>/output.json.
current_time = datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")
output_path = os.path.join(
    "outputs/HotpotQA/ICL/TextPrompt/",
    llm_name,
    f"ICL_{num_demonstrations}",
    f"dev{start_idx}_{end_idx}",
    "w_txt_rationales",
)

# by default seed = 0. But it will be set to the number of runs if save_results = True
seed = 0
# make output path
if save_results:
    # creating the base folder
    os.makedirs(output_path, exist_ok=True)
    # creating the run folder: existing entries in the base folder determine
    # both the run index and the seed
    num_runs = len(glob(os.path.join(output_path, "*")))
    seed = num_runs
    output_path = os.path.join(output_path, f"run_{num_runs}", f"seed_{seed}", f"{current_time}")
    os.makedirs(output_path, exist_ok=True)
    print(f"Output path: {output_path}")

model = TextPrompt(
    llm,
    train,
    use_rationales=True,
    use_conditions=True,
    num_span_examples=num_demonstrations,
    num_yn_examples=num_demonstrations,
    seed=seed,
)

# Placeholder for token/cost statistics; nothing in this cell fills it.
openai_metadata = {
    "completion_tokens": [],
    "total_cost": [],
    "total_tokens": [],
    "prompt_tokens": [],
}

list_answers = []
pbar = tqdm(dev[start_idx:end_idx]) #start_idx:end_idx
for x in pbar:
    question = x["question"]
    summarized_doc = get_hotpot_doc_content(x)
    gold_ans = x["answer"]

    # NOTE(review): the gold answer is handed to the model call - confirm
    # TextPrompt does not use it when generating the prediction.
    answer = model(question, gold_ans, summarized_doc)

    current_answer = {
        "id": x["_id"],
        "question": question,
        "gold_answer": gold_ans,
        "generated_answer": answer
    }
    # Keep every record in memory so later cells can use the results;
    # previously this list was created but never filled.
    list_answers.append(current_answer)

    # pbar.set_description(
    #     f"Current total cost: {sum(openai_metadata['total_cost']):.2f}"
    # )
    # Only touch the disk when saving was requested: the run directory is
    # created solely under save_results, so an unconditional write would fail
    # (or write into an unintended folder) when the flag is off.
    if save_results:
        # Appending per example keeps partial results if the run is interrupted.
        with open(os.path.join(output_path, "output.json"), "a") as f:
            json.dump(current_answer, f)
            f.write('\n')

# evaluation
# results = evaluate(
#     {d["id"]: d["answers"] for d in list_answers},
#     dev[start_idx:end_idx],
# )

# if save_results:
    # store outputs
    # with open(os.path.join(output_path, "output.json"), "w") as f:
    #     json.dump(list_answers, f)
    
    # # store openai metadata
    # with open(os.path.join(output_path, "openai_metadata.json"), "w") as f:
    #     json.dump(openai_metadata, f, indent=4)
    # # store results
    # with open(os.path.join(output_path, "results.json"), "w") as f:
    #     json.dump(results, f, indent=4)
    # # store timestamp
    # with open(os.path.join(output_path, "timestamp.txt"), "w") as f:
    #     f.write(get_current_time())

# print("## OpenAI Metadata ##")
# print_chain_stats(openai_metadata)
# print("\n\n## Results ##")
# print(json.dumps(results, indent=4))

In [3]:
# Convert a saved prediction file whose JSON line(s) hold an "answer" mapping
# of example id -> answer into the list layout
# [{"_id": ..., "predicted_answer": ...}, ...] and write it for evaluation.
generated_output_file = "./outputs/HotpotQA/ICL/TextPrompt/mistralai/Mixtral-8X7B-Instruct-v0.1/ICL_2/dev0_7405/w_txt_rationales/run_0/seed_0/2025_07_18-02_11_51_PM/output_reformatted_500sample.json"
prediction_output_file = './evaluation/prediction_results/Textprompt_mx_output_reformatted_500sample.json'

# Parse every non-blank line. Blank lines (e.g. a trailing newline) are
# skipped instead of crashing json.loads.
with open(generated_output_file, 'r', encoding="utf-8") as infile:
    parsed_records = [
        json.loads(stripped)
        for stripped in (raw_line.strip() for raw_line in infile)
        if stripped
    ]

# Fail with a clear message instead of a NameError when nothing was read.
if not parsed_records:
    raise ValueError(f"No JSON records found in {generated_output_file}")

# Merge the "answer" mapping of every line rather than silently keeping only
# the last one; on duplicate ids the later line wins.
answers_by_id = {}
for data in parsed_records:
    answers_by_id.update(data['answer'])

# The loop variable is deliberately not called `id`, which would shadow the
# builtin for the rest of the kernel session.
total_predictions = [
    {"_id": example_id, "predicted_answer": predicted}
    for example_id, predicted in answers_by_id.items()
]

# Write the transformed data to the output file, creating its folder if needed.
os.makedirs(os.path.dirname(prediction_output_file), exist_ok=True)
with open(prediction_output_file, 'w', encoding="utf-8") as outfile:
    outfile.write(json.dumps(total_predictions) + '\n')