# Causal Discovery Evaluation

## Iterative Inference

In [40]:
import pandas as pd
import time
from openai import OpenAI
from tqdm import tqdm
import os


def extract_final_response(response, prompt):
    """Return what remains of ``response`` after its first ``len(prompt)`` characters.

    The prompt is removed purely by length: there is no check that
    ``response`` really starts with ``prompt``.
    """
    prompt_length = len(prompt)
    return response[prompt_length:]


def match_prompt_column(model):
    """Return the name of the prompt column that belongs to ``model``.

    The model name is searched for known family markers in a fixed order
    ("Llama-2", then "Mixtral"/"Mistral", then "Llama-3"); the first family
    that matches wins. When nothing matches, a warning is printed and
    ``None`` is returned.
    """
    column_by_markers = (
        (("Llama-2",), "llama2_chat_initial_prompt"),
        (("Mixtral", "Mistral"), "mixtral_instruct_initial_prompt"),
        (("Llama-3",), "llama3_chat_initial_prompt"),
    )
    for markers, column in column_by_markers:
        if any(marker in model for marker in markers):
            return column
    print("The model name didn't match anything, please check!!!!")
    return None


def extract_pure_response(row):
    """Return the generated text of ``row`` with its prompt prefix cut off.

    ``row`` must provide ``"model"``, ``"generated_response"`` and the prompt
    column selected by ``match_prompt_column`` for that model.
    """
    prompt_column = match_prompt_column(row["model"])
    return extract_final_response(
        response=row["generated_response"],
        prompt=row[prompt_column],
    )


def process_string(input_string):
    """Split a bracketed, newline-separated string into a list of items.

    Leading/trailing ``[`` and ``]`` characters are removed, every backslash
    is dropped, the text is split on newlines, and single quotes are stripped
    from both ends of each piece.
    """
    without_brackets = input_string.strip("[]")
    without_backslashes = without_brackets.replace("\\", "")
    return [piece.strip("'") for piece in without_backslashes.split("\n")]


# Instruction text placed in front of the two answers that the judge model compares.
SYS_MSG = """Based on the provided answer (if any) and the generated answer, respond to the following question with either "Yes" or "No". Your choice should be based on your judgment and the following rules: Yes: If the meanings of the two answers hold the same meaning, choose "Yes". No: If the meanings of the two answers are different, choose "No"."""
# Folder and file name of the experiment-result CSV to evaluate.
foler_path = "Paper Experiment Results/New_filtered/causal_disvoery_improved"
file_name = "exp_result_Llama-2-7b-chat-hf_20240601003848_953107.csv"
# Output names are derived from the input name ("..._evaluated.csv" / "..._evaluated.jsonl").
output_file_name = file_name.replace(".csv", "_evaluated.csv")
output_file_name_jsonl = file_name.replace(".csv", "_evaluated.jsonl")
# Input and outputs all live in the same folder.
input_path = os.path.join(foler_path, file_name)
output_path = os.path.join(foler_path, output_file_name)
output_path_jsonl = os.path.join(foler_path, output_file_name_jsonl)
# Load the experiment results to be judged.
_data = pd.read_csv(input_path)
# Judge model, API client (key read from the OPENAI_API_KEY environment variable)
# and sampling temperature used for the evaluation requests.
eval_model = "gpt-3.5-turbo-0125"
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
temperature = 0
# New column holding each generated response with its prompt prefix cut off.
_data["pure_response"] = _data.apply(extract_pure_response, axis=1)

In [None]:
import re

# Upper bound on attempts per row so a permanent API failure (bad key,
# unknown model, ...) cannot hang the run forever.
MAX_API_ATTEMPTS = 10
RETRY_DELAY_SECONDS = 20

_LEADING_VERDICT = re.compile(r"(yes|no)\b", re.IGNORECASE)
_VERDICT_WORD = re.compile(r"\b(yes|no)\b", re.IGNORECASE)


def _parse_verdict(generation):
    """Map the judge's reply to ``True`` (yes), ``False`` (no) or ``None`` (unclear).

    A reply that starts with the whole word "yes"/"no" (any case) decides
    directly. Otherwise the reply is scanned for whole-word "yes"/"no"; it is
    accepted only if exactly one of the two occurs. Whole-word matching keeps
    replies such as "None" or "Not sure" from being counted as "No".
    """
    text = (generation or "").strip()
    leading = _LEADING_VERDICT.match(text)
    if leading:
        return leading.group(1).lower() == "yes"
    found = {word.lower() for word in _VERDICT_WORD.findall(text)}
    if len(found) == 1:
        return found == {"yes"}
    return None


def _ask_judge(messages):
    """Send ``messages`` to the judge model and return the reply text.

    Failed calls are retried after ``RETRY_DELAY_SECONDS``; after
    ``MAX_API_ATTEMPTS`` failures a ``RuntimeError`` chained to the last
    error is raised.
    """
    for attempt in range(1, MAX_API_ATTEMPTS + 1):
        try:
            completion = client.chat.completions.create(
                model=eval_model,
                messages=messages,
                temperature=temperature,
            )
            return completion.choices[0].message.content
        except Exception as e:  # any API/network failure is retried
            print("ERROR!")
            print(e)
            if attempt == MAX_API_ATTEMPTS:
                raise RuntimeError(
                    f"Judge request failed {MAX_API_ATTEMPTS} times in a row"
                ) from e
            print("Retry!")
            time.sleep(RETRY_DELAY_SECONDS)


# Number of rows already written by a previous run (0 when starting fresh).
last_idx = 0
if os.path.exists(output_path):
    print("Output path exists.")
    data_evaluated = pd.read_csv(output_path)
    last_idx = data_evaluated.shape[0]
else:
    print("Output path does not exist.")

test_times = 0
times = 0
# Resume by skipping as many evaluable rows as the output file already holds.
# This assumes the output was produced from the same input in the same order.
rows_to_skip = last_idx
pbar = tqdm(total=len(_data))
for index, entry in _data.iterrows():
    output = entry["pure_response"]
    if pd.isna(output):  # skip if result hasn't been generated
        pbar.update(1)
        continue
    if rows_to_skip > 0:  # evaluated in an earlier run
        rows_to_skip -= 1
        pbar.update(1)
        continue

    correct_answer = entry["CorrectAnswer"]
    # "\\A" emits a literal backslash before "Answer", which keeps the prompt
    # byte-identical to previously evaluated runs.
    # NOTE(review): this was probably meant to be a blank line ("\n\n") - confirm
    # before changing, since it alters what the judge sees.
    content = f'{SYS_MSG}\n\\Answer 1:\n"{correct_answer}"\n\\Answer 2:\n{output}\n'
    message = [{"role": "user", "content": content}]

    generation = _ask_judge(message)
    message.append({"role": "assistant", "content": generation})

    verdict = _parse_verdict(generation)
    if verdict is None:
        # Unclear reply: show the exchange, record None for this row and move
        # on to the next instance instead of aborting the whole run.
        for msg in message:
            print(msg["content"])
        print(f"NO YES or NO answer!{generation}")

    # save eval results as List[bool] (one entry per judged question)
    updated_row = entry.copy()
    updated_row["eval"] = [verdict]
    updated_row["messages_openai"] = message
    updated_dataframe = pd.DataFrame([updated_row])
    # Append row by row so an interrupted run loses nothing; the header is
    # written only when the file is first created.
    write_header = not os.path.exists(output_path)
    updated_dataframe.to_csv(
        output_path, index=False, mode="a", header=write_header
    )
    test_times += 1
    times += 1
    pbar.update(1)
pbar.close()

# Batch inference API

In [72]:
import pandas as pd
import time
from openai import OpenAI
from tqdm import tqdm
import os
import json


def extract_final_response(response, prompt):
    """Return what remains of ``response`` after its first ``len(prompt)`` characters.

    The prompt is removed purely by length: there is no check that
    ``response`` really starts with ``prompt``.
    """
    prompt_length = len(prompt)
    return response[prompt_length:]


def match_prompt_column(model):
    """Return the name of the prompt column that belongs to ``model``.

    The model name is searched for known family markers in a fixed order
    ("Llama-2", then "Mixtral"/"Mistral", then "Llama-3"); the first family
    that matches wins. When nothing matches, a warning is printed and
    ``None`` is returned.
    """
    column_by_markers = (
        (("Llama-2",), "llama2_chat_initial_prompt"),
        (("Mixtral", "Mistral"), "mixtral_instruct_initial_prompt"),
        (("Llama-3",), "llama3_chat_initial_prompt"),
    )
    for markers, column in column_by_markers:
        if any(marker in model for marker in markers):
            return column
    print("The model name didn't match anything, please check!!!!")
    return None


def extract_pure_response(row):
    """Return the generated text of ``row`` with its prompt prefix cut off.

    ``row`` must provide ``"model"``, ``"generated_response"`` and the prompt
    column selected by ``match_prompt_column`` for that model.
    """
    prompt_column = match_prompt_column(row["model"])
    return extract_final_response(
        response=row["generated_response"],
        prompt=row[prompt_column],
    )


def process_string(input_string):
    """Split a bracketed, newline-separated string into a list of items.

    Leading/trailing ``[`` and ``]`` characters are removed, every backslash
    is dropped, the text is split on newlines, and single quotes are stripped
    from both ends of each piece.
    """
    without_brackets = input_string.strip("[]")
    without_backslashes = without_brackets.replace("\\", "")
    return [piece.strip("'") for piece in without_backslashes.split("\n")]


# Instruction text placed in front of the two answers that the judge model compares.
SYS_MSG = """Based on the provided answer (if any) and the generated answer, respond to the following question with either "Yes" or "No". Your choice should be based on your judgment and the following rules: Yes: If the meanings of the two answers hold the same meaning, choose "Yes". No: If the meanings of the two answers are different, choose "No"."""
# Folder and file name of the experiment-result CSV to evaluate.
foler_path = "Paper Experiment Results/New_filtered/causal_disvoery_improved"
file_name = "exp_result_Llama-2-70b-chat-hf_20240601004017_953109.csv"
# Output names are derived from the input name ("..._evaluated.csv" / "..._evaluated.jsonl").
output_file_name = file_name.replace(".csv", "_evaluated.csv")
output_file_name_jsonl = file_name.replace(".csv", "_evaluated.jsonl")
# Input and outputs all live in the same folder.
input_path = os.path.join(foler_path, file_name)
output_path = os.path.join(foler_path, output_file_name)
output_path_jsonl = os.path.join(foler_path, output_file_name_jsonl)
# Load the experiment results to be judged.
_data = pd.read_csv(input_path)
# Judge model, API client (key read from the OPENAI_API_KEY environment variable)
# and sampling temperature used for the evaluation requests.
eval_model = "gpt-3.5-turbo-0125"
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
temperature = 0
# New column holding each generated response with its prompt prefix cut off.
_data["pure_response"] = _data.apply(extract_pure_response, axis=1)

In [73]:
import time

# Build one Batch-API request per evaluable row and write them as JSON Lines.
# NOTE(review): a 3,500-request file built this way was rejected with
# `token_limit_exceeded` (200,000 enqueued-token limit) - consider splitting
# the requests over several smaller files.
message = []
rep_time = 0  # running number of requests created so far
current_timestamp = int(time.time())
for index, entry in tqdm(_data.iterrows(), total=len(_data)):
    output = entry["pure_response"]
    if pd.isna(output):  # skip if result hasn't been generated
        continue
    correct_answer = entry["CorrectAnswer"]

    # "\\A" emits a literal backslash before "Answer", keeping the prompt
    # byte-identical to the one used by the iterative evaluation.
    # NOTE(review): probably meant to be a blank line ("\n\n") - confirm.
    content = f'{SYS_MSG}\n\\Answer 1:\n"{correct_answer}"\n\\Answer 2:\n{output}\n'
    message.append(
        {
            # Unique per request; numbered by request order, not by row index.
            "custom_id": f"batch_{current_timestamp}_{rep_time}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                # Same judge model and temperature as the iterative
                # evaluation, so both paths grade identically.
                "model": eval_model,
                "messages": [
                    {"role": "user", "content": content},
                ],
                "temperature": temperature,
                "max_tokens": 1000,
            },
        }
    )
    rep_time += 1

# One JSON object per line, as the batch input file format requires.
with open(output_path_jsonl, "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(request) + "\n" for request in message)

3500it [00:00, 27328.17it/s]


In [74]:
from openai import OpenAI

client = OpenAI()
# Upload the JSONL request file; the context manager closes the handle once
# the upload call returns instead of leaving it open for the whole session.
with open(output_path_jsonl, "rb") as batch_file:
    batch_input_file = client.files.create(file=batch_file, purpose="batch")
current_timestamp = int(time.time())
batch_input_file_id = batch_input_file.id
# Queue the uploaded file as a chat-completions batch. Kept as the last
# expression so the returned Batch object is shown as the cell output.
client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

Batch(id='batch_hh9EnRh7JyZubRxCFArldkH5', completion_window='24h', created_at=1717428562, endpoint='/v1/chat/completions', input_file_id='file-yV29Zvo3yFC2b9e0lbKIEIt3', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1717514962, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [75]:
from openai import OpenAI

# Client constructed without explicit arguments.
client = OpenAI()

# List the batches for this account; as the last expression of the cell its
# value (a page of Batch objects) is shown as the cell output.
client.batches.list()

SyncCursorPage[Batch](data=[Batch(id='batch_hh9EnRh7JyZubRxCFArldkH5', completion_window='24h', created_at=1717428562, endpoint='/v1/chat/completions', input_file_id='file-yV29Zvo3yFC2b9e0lbKIEIt3', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-3.5-turbo-0125 in organization org-EkHurOnEZ1eqBIvuxr1REXMk. Limit: 200,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list'), expired_at=None, expires_at=1717514962, failed_at=1717428563, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0)), Batch(id='batch_owUjZmL358gjEZJJLWMlIUDr', completion_window='24h', created_at=1717428392, endpoint='/v1/chat/completions', input_file_id='file-JxuZbfBVLZZigo4KGriOy0ov', ob

In [78]:
# Show only the plain list of Batch objects held in the page's `data` attribute.
client.batches.list().data

[Batch(id='batch_hh9EnRh7JyZubRxCFArldkH5', completion_window='24h', created_at=1717428562, endpoint='/v1/chat/completions', input_file_id='file-yV29Zvo3yFC2b9e0lbKIEIt3', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-3.5-turbo-0125 in organization org-EkHurOnEZ1eqBIvuxr1REXMk. Limit: 200,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list'), expired_at=None, expires_at=1717514962, failed_at=1717428563, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0)),
 Batch(id='batch_owUjZmL358gjEZJJLWMlIUDr', completion_window='24h', created_at=1717428392, endpoint='/v1/chat/completions', input_file_id='file-JxuZbfBVLZZigo4KGriOy0ov', object='batch', status='fail

In [79]:
# Try to cancel one specific batch by id. For this batch, which was already in
# status 'failed', the API answered with a 409 conflict
# ("Cannot cancel a batch with status 'failed'.").
client.batches.cancel("batch_owUjZmL358gjEZJJLWMlIUDr")

ConflictError: Error code: 409 - {'error': {'message': "Cannot cancel a batch with status 'failed'.", 'type': 'invalid_request_error', 'param': None, 'code': None}}