In [2]:
import json
import os
from glob import glob

from langchain.callbacks import get_openai_callback
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from tqdm.notebook import tqdm

from src.hotpotqa.code_prompt import CodePrompt, RationaleType
from src.hotpotqa.evaluation import evaluate
from src.hotpotqa.oracle_retriever import get_summarized_doc, get_hotpot_doc_content
from src.utils import get_current_time, print_chain_stats
from datetime import datetime

In [None]:
# DEFINE THESE VARIABLES before running the cells below.

# Model identifier. It is used as the deployment/model name by the Azure and
# OpenAI cells and as a folder name when the output path is built.
llm_name = "gpt-3.5-turbo"
# llm_name = "gpt-4o"
# llm_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Passed to CodePrompt as num_translation_examples / num_interpreter_examples
# and recorded in the output folder name ("transl_icl…_interp_icl…").
num_translation_examples = 4
num_interpreter_examples = 4

# Slice of the dev set that is evaluated: dev[start_idx:end_idx].
start_idx = 0  # only modify this to skip examples at the start of the dev set
num_dev_examples = 500  # number of examples to evaluate (full dev set)
end_idx = num_dev_examples + start_idx

# When True, the run cell creates a fresh run folder and prints its path.
save_results = True

In [3]:
# If you serve the model locally through vLLM (OpenAI-compatible endpoint).
# NOTE(review): the model name below is hard-coded (Mistral-7B-Instruct) and is
# independent of `llm_name`, which is what the output folder is named after --
# keep the two in sync when using this cell.
inference_server_url = "http://0.0.0.0:8000/v1"
llm = ChatOpenAI(
    openai_api_base=inference_server_url,
    # presumably a dummy token accepted by the local server -- confirm
    openai_api_key="token-abc123",
    model="mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0,
    # Hard cap on the number of generated tokens per call.
    max_tokens=30,
)

  llm = ChatOpenAI(


In [None]:
# if you use Azure OpenAI Service
# Fill in the four settings below before running this cell.
# NOTE: these assignments overwrite whatever is already set in the kernel's
# environment -- including with empty strings if they are left blank.
os.environ["OPENAI_API_TYPE"] = ""
os.environ["OPENAI_API_VERSION"] = ""
os.environ["OPENAI_API_BASE"] = ""
os.environ["OPENAI_API_KEY"] = ""

llm = AzureChatOpenAI(
    deployment_name=llm_name,
    temperature=0.0,
    max_retries=3,
    # One timeout only. In LangChain's OpenAI chat models `timeout` is the alias
    # of the `request_timeout` field, so the previous `request_timeout=30`
    # alongside `timeout=60 * 3` configured the same setting twice with
    # contradictory values. The 3-minute value is kept.
    timeout=60 * 3,
)

In [None]:
# if you use OpenAI API
# Read the key from the environment rather than pasting it into the notebook,
# so the secret is never saved (or committed) together with the cells.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
llm = ChatOpenAI(
    api_key=openai_api_key,
    model=llm_name,
    temperature=0.0,
    max_retries=3,
    # One timeout only. In LangChain's OpenAI chat models `timeout` is the alias
    # of the `request_timeout` field, so the previous `request_timeout=30`
    # alongside `timeout=60 * 3` configured the same setting twice with
    # contradictory values. The 3-minute value is kept.
    timeout=60 * 3,
)

In [None]:
# Load the HotpotQA example files. The encoding is pinned to UTF-8 so the JSON
# is decoded the same way on every platform instead of with the locale default.
# NOTE(review): `train` is read from a file named "..._eg_dev.json" -- confirm
# this is the intended source for the training examples.
with open("data/HotpotQA/hotpot_random_eg_dev.json", encoding="utf-8") as f:
    train = json.load(f)

# Examples that the evaluation loop iterates over (dev[start_idx:end_idx]).
with open("data/HotpotQA/stratified_hotpotqa_500sample.json", encoding="utf-8") as f:
    dev = json.load(f)



In [5]:
# loading demonstrations for each component of the chain of prompts
ICL_EXAMPLES_DIR = "data/HotpotQA/ICL"


def _load_icl_examples(relative_path):
    """Return the parsed JSON stored at `relative_path` under ICL_EXAMPLES_DIR.

    The file is decoded as UTF-8 explicitly so that the result does not depend
    on the platform's default locale encoding.
    """
    with open(os.path.join(ICL_EXAMPLES_DIR, relative_path), encoding="utf-8") as f:
        return json.load(f)


doc2code_examples = _load_icl_examples("doc2code/examples.json")
question2code_examples = _load_icl_examples("question2code/examples.json")
qacode2span_answer_examples = _load_icl_examples("QAcode2answer/span_examples.json")
qacode2yn_answer_examples = _load_icl_examples("QAcode2answer/yes_no_examples.json")

In [None]:
## RUN THIS. DO NOT CHANGE
# Timestamp used as the innermost folder name of this run
# (12-hour clock followed by an AM/PM marker).
current_time = datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")
# Base folder for this configuration: model name / ICL example counts / dev slice.
# NOTE(review): the last segment is "w_txt_rationales" although the model below
# is built with use_rationales=None -- confirm the label is still intended.
output_path = os.path.join(
    "outputs/HotpotQA/ICL/CodePrompt/",
    llm_name,
    f"transl_icl{num_translation_examples}_interp_icl{num_interpreter_examples}",
    f"dev{start_idx}_{end_idx}",
    "w_txt_rationales",
)

# by default seed = 0. But it will be set to the number of runs if save_results = True
seed = 0
# make output path
if save_results:
    # creating the base folder (must exist before its entries are counted)
    os.makedirs(output_path, exist_ok=True)
    # creating the run folder
    # The run index is the number of entries currently in the base folder; every
    # entry is counted, not only "run_*" folders.
    num_runs = len(glob(os.path.join(output_path, "*")))
    # The seed handed to CodePrompt is tied to the run index.
    seed = num_runs
    # From here on output_path points at <base>/run_N/seed_N/<timestamp>.
    output_path = os.path.join(output_path, f"run_{num_runs}", f"seed_{seed}", f"{current_time}" )
    os.makedirs(output_path, exist_ok=True)
    print(f"Output path: {output_path}")

# Build the CodePrompt model from the selected LLM and the four sets of
# in-context demonstrations loaded earlier in the notebook.
model = CodePrompt(
    llm,
    question2code_examples,
    doc2code_examples,
    qacode2yn_answer_examples,
    qacode2span_answer_examples,
    use_rationales= None, # was RationaleType.TEXT; disabled since we have no evidence in HotpotQA
    use_conditions= False, # was True
    num_translation_examples=num_translation_examples,
    num_interpreter_examples=num_interpreter_examples,
    seed=seed,
)
# One record per evaluated example, kept in memory so that the answers are
# still available when save_results is False.
list_answers = []
# Token/cost accumulators; nothing in the active code fills them (only the
# disabled get_openai_callback code path does).
openai_metadata = {
    "completion_tokens": [],
    "total_cost": [],
    "total_tokens": [],
    "prompt_tokens": [],
}
# One JSON object is appended per line, so partial results survive an
# interrupted run.
raw_output_file = os.path.join(output_path, "raw_output.json")
pbar = tqdm(dev[start_idx:end_idx])
for x in pbar:
    question = x["question"]
    summarized_doc = get_hotpot_doc_content(x)
    gold_ans = x["answer"]

    # NOTE(review): the gold answer is handed to the model together with the
    # question -- confirm how CodePrompt uses it.
    answer = model(question, gold_ans, summarized_doc)
    current_answer = {
        "id": x["_id"],
        "question": question,
        "gold_answer": gold_ans,
        "generated_answer": answer,
    }
    list_answers.append(current_answer)

    # Only touch the disk when saving was requested. With save_results = False
    # the run folder is never created, so writing here either failed or dropped
    # a stray file into the shared base folder, where it was then counted as a
    # run the next time the run index was computed.
    if save_results:
        with open(raw_output_file, "a") as f:
            json.dump(current_answer, f)
            f.write('\n')

    # summarized_doc = get_summarized_doc(x, url2doc)
    # with get_openai_callback() as cb:
    #     answer, inter_results = model(question, x["question"], summarized_doc)
    #     openai_metadata["completion_tokens"].append(cb.completion_tokens)
    #     openai_metadata["total_cost"].append(cb.total_cost)
    #     openai_metadata["total_tokens"].append(cb.total_tokens)
    #     openai_metadata["prompt_tokens"].append(cb.prompt_tokens)
    # list_answers.append(
    #     {"id": x["id"], "answers": answer, "inter_results": inter_results}
    # )
    # pbar.set_description(
    #     f"Current total cost: {sum(openai_metadata['total_cost']):.2f}"
    # )

# results = evaluate(
#     {d["id"]: d["answers"] for d in list_answers},
#     dev[start_idx : end_idx],
# )

# if save_results:
#     # store outputs
#     with open(os.path.join(output_path, "output.json"), "w") as f:
#         json.dump(list_answers, f)
#     # store openai metadata
#     with open(os.path.join(output_path, "openai_metadata.json"), "w") as f:
#         json.dump(openai_metadata, f, indent=4)
#     # store results
#     with open(os.path.join(output_path, "results.json"), "w") as f:
#         json.dump(results, f, indent=4)
#     # store timestamp
#     with open(os.path.join(output_path, "timestamp.txt"), "w") as f:
#         f.write(get_current_time())

# print("## OpenAI Metadata ##")
# print_chain_stats(openai_metadata)
# print("\n\n## Results ##")
# print(json.dumps(results, indent=4))
# print("\n\n## Timestamp ##")
# print(get_current_time())

In [None]:
# Convert a generated-output file into a flat list of
# {"_id": ..., "predicted_answer": ...} records and write it under
# ./evaluation/prediction_results/.
generated_output_file = "./outputs/HotpotQA/ICL/CodePrompt/mistralai/Mixtral-8X7B-Instruct-v0.1/transl_icl4_interp_icl4/dev0_7405/w_txt_rationales/run_0/seed_0/2025_07_18-03_58_08_PM/output_reformatted_500sample.json"
prediction_file = './evaluation/prediction_results/Codeprompt_mx_output_reformatted_500sample.json'

# Every non-empty line of the input is parsed as one JSON object whose "answer"
# entry maps example ids to predicted answers. Predictions from all lines are
# collected; previously only the last parsed line was converted, a blank line
# raised a JSONDecodeError and an empty file raised a NameError.
total_predictions = []
with open(generated_output_file, 'r', encoding="utf-8") as infile:
    for line in infile:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no record.
            continue
        data = json.loads(line)
        for example_id, answer in data['answer'].items():
            total_predictions.append({"_id": example_id, "predicted_answer": answer})

# Write the transformed data to the output file, creating its folder if needed.
os.makedirs(os.path.dirname(prediction_file), exist_ok=True)
with open(prediction_file, 'w') as outfile:
    outfile.write(json.dumps(total_predictions) + '\n')