In [28]:
import os
import yaml
import pandas as pd
#import openai

system_input1 = ""
system_input2 = ""

# Paths. Assuming this notebook is in evals/evals

registry_path = os.path.join(os.getcwd(), "registry")
dataset_path = ("/workspaces/evals/dataset")
os.makedirs(dataset_path, exist_ok=True)
os.makedirs("/workspaces/evals/evallogs", exist_ok=True)

# Registry yaml

registry_yaml = {}
registry_yaml["battle"] = {
    "id": "battle.test.v1",
    "metrics": ["accuracy"]
}
registry_yaml["battle.test.v1"] = {
    "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
    "args": {
        "samples_jsonl": "battle/samples.jsonl",
        "eval_type": "cot_classify",
        "modelgraded_spec": "battle",
        "sample_kwargs":
            {"temperature": 0.0}  }
}


with open(os.path.join(registry_path, "evals", "battle.yaml"), "w") as f:
    yaml.dump(registry_yaml, f)


registry_yaml = {}
registry_yaml["match"] = {
    "id": "match.test.v1",
    "metrics": ["accuracy"]
}
registry_yaml["match.test.v1"] = {
    "class": "evals.elsuite.basic.sample_match:Match",
    "args": {
        "samples_jsonl": "match/samples.jsonl",
    }
}


with open(os.path.join(registry_path, "evals", "match.yaml"), "w") as f:
    yaml.dump(registry_yaml, f)


# Definitions

def battle(system_input1, user_input1, completion1, system_input2, user_input2, completion2):

    # Data

    input1 = ([{"role":"system","content":system_input1},{"role":"user","content":user_input1}])
    input2 = ([{"role":"system","content":system_input2},{"role":"user","content":user_input2}])

    dataset = [{"input1": input1, "completion1": completion1, "input2": input2, "completion2":completion2}]
   
    # Paths. Assuming this notebook is in evals/evals

    battle_path = os.path.join(os.getcwd(), "registry", "data", "battle")
    os.makedirs(battle_path, exist_ok=True)
    battle_path = os.path.join(battle_path, "samples.jsonl")

    df = pd.DataFrame(dataset)
    df.to_json(battle_path, orient="records", lines=True)
    
    """

    message_content = "You are comparing two responses to the following two instructions."
    
    message_content += "\n\n[Instruction 1]\nUser: "
    message_content += user_input1
    message_content += "\n\n[Response 1]\n"
    message_content += completion1

    message_content += "\n\n[Instruction 2]\nUser: "
    message_content += user_input2
    message_content += "\n\n[Response 2]\n"
    message_content += completion2 

    message_content += "\n\n\nIs the first response better than the second? You must provide one answer based on your subjective view.\n\nFirst, write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. Then print only a single choice from \"Yes\" or \"No\" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the answer by itself on a new line.\n\nReasoning:"

    openai.api_key = "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"
    data = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
        {"role": "user", "content": message_content},
        ]
    )
    """
    !oaieval gpt-3.5-turbo battle --record_path /workspaces/evals/evallogs/logs
    
    events = "/workspaces/evals/evallogs/logs"
    record_path = os.path.join("/workspaces/evals/evallogs/eval")
    with open(events, "r") as f:
        events_df = pd.read_json(f, lines=True)

    os.makedirs(record_path, exist_ok=True)
    events_df.to_json(os.path.join(record_path, "events"), lines=True, orient="records")
    events_df["data"].to_json(os.path.join(record_path, "data"), lines=True, orient="records")
    data_df = pd.read_json(os.path.join(record_path, "data"), lines=True)
    for row in data_df[0]:
        s = str(row)
        s = s.replace("[","",-1)
        s = s.replace("]","",-1)
        if s.startswith("{'choice':"):
            s = s.split("'")[3]
            choice = s
        if s.startswith("{'prompt': {'role': 'user', 'content': 'You are comparing two responses to the following two instructions."):
            #s = s.split("\\n\\nInstruction 1\\n",)[1]
            #instruction1 = s.split("\\n\\nResponse 1\\n")[0].replace("\\'","'").replace("\\n","\n")
            #s = s.split("\\nResponse 1\\n",)[1]
            #response1 = s.split("\\n\\nInstruction 2\\n")[0].replace("\\'","'").replace("\\n","\n")
            #s = s.split("\\n\\nInstruction 2\\n",)[1]
            #instruction2 = s.split("\\n\\nResponse 2\\n")[0].replace("\\'","'").replace("\\n","\n")
            #s = s.split("\\nResponse 2\\n",)[1]
            #response2 = s.split("\\n\\n\\nWhich of the following responses is better?")[0].replace("\\'","'").replace("\\n","\n")
            sampled = s.split("\\n\\nReasoning:\'}, \'sampled\': ")[1].replace("\\'","'").replace("\\n","\n")
    
    #content = data["choices"]["message"]["content"]
    #choice = content.split("\\n")[-1]
    data = [choice, user_input1, completion1, user_input2, completion2, sampled]
    
    return data
def test_match(test_dataset):
    data = []
    for t in range(0,len(test_dataset)):
        data.append({"input":[{"role":"system","content":""},{"role":"user","content":test_dataset[t][0]}], "ideal":test_dataset[t][2]})
    df = pd.DataFrame(data)

    # Paths. Assuming this notebook is in evals/evals

    test_match_path = os.path.join(os.getcwd(), "registry", "data", "match")
    os.makedirs(test_match_path, exist_ok=True)
    test_match_path = os.path.join(test_match_path, "samples.jsonl")

    df.to_json(test_match_path, orient="records", lines=True)
    
    !oaieval gpt-3.5-turbo match --record_path /workspaces/evals/evallogs/logs
    
    events = "/workspaces/evals/evallogs/logs"
    record_path = os.path.join("/workspaces/evals/evallogs/eval")
    with open(events, "r") as f:
        events_df = pd.read_json(f, lines=True)
    
    os.makedirs(record_path, exist_ok=True)
    events_df.to_json(os.path.join(record_path, "events"), lines=True, orient="records")
    events_df["data"].to_json(os.path.join(record_path, "data"), lines=True, orient="records")
    data_df = pd.read_json(os.path.join(record_path, "data"), lines=True)
    
    return data_df

def content_to_list(content, dataset):
    dataset_df = pd.DataFrame(dataset)
    return dataset[dataset_df[0].to_list().index(content)]

def append_response(content, response, dataset):
    dataset_df = pd.DataFrame(dataset)
    index = dataset_df[0].to_list().index(content)
    dataset[index][2].append(response)
    return len(dataset[index][2])

In [29]:
dataset_df = pd.read_json(os.path.join(dataset_path, "dataset"), lines=True)
test_dataset_df = pd.read_json(os.path.join(dataset_path, "test_dataset"), lines=True)
best_response_df = pd.read_json(os.path.join(dataset_path, "best_response"), lines=True)
battles_df = pd.read_json(os.path.join(dataset_path, "battles"), lines=True)
n_responses_df = pd.read_json(os.path.join(dataset_path, "n_responses"), lines=True)
test_match_dataset_df = pd.read_json(os.path.join(dataset_path, "test_match_dataset"), lines=True)

dataset = dataset_df.values.tolist()
test_dataset = test_dataset_df.values.tolist()
best_response = best_response_df.values.tolist()
battles = battles_df.values.tolist()
n_responses = n_responses_df.values.tolist()
test_match_dataset = test_match_dataset_df.values.tolist()

for i in range(4):

    test_match_data = test_match(test_dataset)
    test_match_dataset_df = pd.concat([test_match_dataset_df, test_match_data])

    same_best_response = True
    for i in range(3,len(test_match_data[0]),2):
        if test_match_data[0][i]["correct"] == False:
            content = test_match_data[0][i-1]["prompt"][1]["content"]
            content_list = content_to_list(content,dataset)
            if content_list[1] is not None:
                parent_content_list = content_to_list(content_list[1],dataset)
            sampled = test_match_data[0][i]["sampled"]
            best_response_input = content_to_list(best_response[-1][0],dataset)[1]
            data = battle(system_input1,content,sampled,system_input2,best_response_input,best_response[-1][0]) 
            dataset.append([sampled,content,[]])
            battles.append(data)
            n = append_response(content, sampled, dataset)
            n_responses.append([n])
            if data["Choice"] == "Yes":
                best_response_input = content
                best_response.append(sampled)
                test_dataset = [content_list]
                test_dataset_df = pd.DataFrame(test_dataset)
                same_best_response = False

    if same_best_response:
        for t in test_dataset:
            if t[1] is not None:
                if t[1] not in test_dataset_df[0].to_list():
                    test_dataset.append(content_to_list(t[1],dataset))
                    test_dataset_df = pd.DataFrame(test_dataset)
            l = content_to_list(t[0], dataset)
            for response in l[2]:
                if response not in test_dataset_df[0].to_list():
                    test_dataset.append(content_to_list(response,dataset))
                    test_dataset_df = pd.DataFrame(test_dataset)

    dataset_df = pd.DataFrame(dataset)
    test_dataset_df = pd.DataFrame(test_dataset)
    best_response_df = pd.DataFrame(best_response)
    battles_df = pd.DataFrame(battles)
    n_responses_df = pd.DataFrame(n_responses)

    dataset_df.to_json(os.path.join(dataset_path, "dataset"), lines=True, orient="records")
    test_dataset_df.to_json(os.path.join(dataset_path, "test_dataset"), lines=True, orient="records")
    best_response_df.to_json(os.path.join(dataset_path, "best_response"), lines=True, orient="records")
    battles_df.to_json(os.path.join(dataset_path, "battles"), lines=True, orient="records")
    n_responses_df.to_json(os.path.join(dataset_path, "n_responses"), lines=True, orient="records")
    test_match_dataset_df.to_json(os.path.join(dataset_path, "test_match_dataset"), lines=True, orient="records")
    

[2023-07-25 13:21:37,462] [registry.py:270] Loading registry from /workspaces/evals/evals/registry/evals
[2023-07-25 13:21:38,656] [registry.py:270] Loading registry from /home/codespace/.evals/evals
[2023-07-25 13:21:38,658] [oaieval.py:138] [1;35mRun started: 2307251321384BAL7NPQ[0m
[2023-07-25 13:21:38,660] [data.py:83] Fetching match/samples.jsonl
[2023-07-25 13:21:38,660] [eval.py:33] Evaluating 8 samples
[2023-07-25 13:21:38,673] [eval.py:139] Running in threaded mode with 10 threads!
100%|█████████████████████████████████████████████| 8/8 [00:02<00:00,  3.97it/s]
[2023-07-25 13:21:40,715] [record.py:341] Final report: {'accuracy': 0.875, 'boostrap_std': 0.12495774285733559}. Logged to /workspaces/evals/evallogs/logs
[2023-07-25 13:21:40,715] [oaieval.py:177] Final report:
[2023-07-25 13:21:40,715] [oaieval.py:179] accuracy: 0.875
[2023-07-25 13:21:40,715] [oaieval.py:179] boostrap_std: 0.12495774285733559
[2023-07-25 13:21:40,718] [record.py:330] Logged 16 rows of events to /w

TypeError: list indices must be integers or slices, not str

In [23]:
battles

[['Yes',
  '',
  "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else.",
  '',
  '',
  '"1. The first response acknowledges the user\'s request and provides a clear explanation of why it cannot fulfill the request.\n2. The second response is incomplete and does not provide any information or explanation.\n\nBased on this reasoning, the first response is better than the second.\n\nYes"}'],
 ['Yes',
  "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else.",
  "No problem! If you have any other questions or need assistance with something else, feel free to ask. I'm here to help!",
  '',
  "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to gener

In [26]:
data.

TypeError: 'dict_values' object is not subscriptable

In [20]:
content_to_list(best_response[-1][0],dataset)[1]

"I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else."

In [19]:
battles

[['Yes',
  '',
  "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else.",
  '',
  '',
  '"1. The first response acknowledges the user\'s request and provides a clear explanation of why it cannot fulfill the request.\n2. The second response is incomplete and does not provide any information or explanation.\n\nBased on this reasoning, the first response is better than the second.\n\nYes"}'],
 ['Yes',
  "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else.",
  "No problem! If you have any other questions or need assistance with something else, feel free to ask. I'm here to help!",
  '',
  "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to gener

In [12]:
dataset_df[0][2]

"No problem! If you have any other questions or need assistance with something else, feel free to ask. I'm here to help!"

In [17]:
dataset

[['',
  None,
  ["I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else."]],
 ["I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else.",
  '',
  ["No problem! If you have any other questions or need assistance with something else, feel free to ask. I'm here to help!"]],
 ["No problem! If you have any other questions or need assistance with something else, feel free to ask. I'm here to help!",
  "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else.",
  ['Thank you! I appreciate your willingness to help. I will definitely reach out if I have any further questions or need assistance.',
   'Thank you! I appreci

In [45]:
(len(test_match_data[0])-2)/2


4.0

In [32]:
l

['',
 None,
 ["I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate content on behalf of individuals.",
  "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else."]]

In [11]:
response

"I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else."

In [12]:
dataset_df[0].to_list()

['',
 "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else."]

In [16]:
best_response

['',
 "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else.",
 "No problem! If you have any other questions or need assistance with something else, feel free to ask. I'm here to help!"]

In [21]:
test_dataset

[["I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else.",
  '',
  ["No problem! If you have any other questions or need assistance with something else, feel free to ask. I'm here to help!"]],
 ['',
  None,
  ["I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else."]],
 ["No problem! If you have any other questions or need assistance with something else, feel free to ask. I'm here to help!",
  "I'm sorry, I cannot continue the text for you as I am an AI language model and I do not have access to personal information or the ability to generate text on behalf of someone else.",
  []]]

In [11]:
test_match_data

Unnamed: 0,0
0,
1,
2,"{'prompt': [{'role': 'system', 'content': ''},..."
3,"{'correct': False, 'expected': [], 'picked': N..."


In [11]:
for d in range(0,len(dataset_test)):
    test_match_data = []
    test_match_data.append(test_match(dataset_test[d][0],dataset_test[d][2]))
dataset_test.append(dataset(dataset[:][0].index(dataset_test[0][1])))

env: OPENAI_API_KEY=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


TypeError: test_match() takes 1 positional argument but 2 were given

In [None]:
dataset.append([])
for u in user_content:
    data = battle(system_input1,u,system_input2,best_response_input)
    if data["Choice"] == "Yes":
        best_response_input = data["Instruction1"]
        best_response.append(data["Response1"]) 
    dataset[u].append(data)
user_content.append(data["Response1"])
df = pd.DataFrame(dataset)
user_content_df = pd.DataFrame(user_content)
df.to_json(os.path.join(os.getcwd(), "battles.jsonl"), orient="records", lines=True)
user_content_df.to_json(os.path.join(os.getcwd(), "battles_user_content.jsonl"), orient="records", lines=True)

In [None]:
# Create one-shot prompt

choice = int(df["Choice"][0])-1
if choice == 1:
    input[0] = [
        {"role": "system", "content": user_input[choice], "name": "example_user"},
        {"role": "system", "content": df[df.columns[1+(2*choice)]][0], "name": "example_assistant"},
        {"role":"system","content":""},
        {"role":"user","content":user_input[1]},
        ]
    dataset = [{"input1": input[0], "input2": input[1]}]
    df = pd.DataFrame(dataset)
    df.to_json(os.path.join(os.getcwd(), "battles.jsonl"), orient="records", lines=True)
    df.to_json(os.path.join(os.getcwd(), "battles.jsonl"), orient="records", lines=True)

In [None]:
def instruction(i):
    if i.startswith("User: "):
        input = [{"role":"system","content":""},{"role":"user","content":i.removeprefix("User: ")}]
    else:
        ins = i.split("\nUser: ")
        input = [{"role":"system","content":ins[0]},{"role":"user","content":ins[1]}]
    return input
meta = []
for index, row in df.iterrows():
    meta.append({"input1": instruction(row[0]), "completion1": row[1], "input2": instruction(row[2]), "completion2": row[3], "choice":""})
meta = pd.DataFrame(meta)

row = 5
meta.iloc[row][0],meta.iloc[row][1],meta.iloc[row][2],meta.iloc[row][3]

meta.iloc[row][4] = "1"
#meta.to_json(meta_path, orient="records", lines=True)
!oaieval gpt-3.5-turbo dataset-meta --record_path /workspaces/evals/evallogs/logs-meta