In [1]:
import openai
import pandas as pd
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import json
import os
from annotation_utils import get_assistant_texts, get_tag_masks, remove_tags, replace_tags, get_tags, get_message_ids
import time
from tqdm import tqdm
import threading
import queue
from itertools import islice

openai.api_key_path = "/home/alex/.personal/openAIkey"  # read protected to my 174 account

In [2]:
system_prompt = \
"""You are a widely knowledgeable, epistemically modest, and wise reviewer who is extraordinarily vigilant about getting facts right. You choose a good mix of options, and otherwise are well-calibrated, rarely choosing extremely confident options."""
prompt_template = \
"""### TRANSCRIPT OF UNKNOWN FACTUAL ACCURACY ###
{}
###

First, decide whether each statement can even be given an accuracy label, writing "Truth-apt" if it can, and "N/A" if the statement doesn't make any claims that can be judged on their accuracy. Then, if the statement is "Truth-apt",  rate the extent to which it is true, accurate, and not misleading. Your rating must be one of "N/A", "Truth-apt - Certainly True", "Truth-apt - True", "Truth-apt - Uncertain", "Truth-apt - False", "Truth-apt - Certainly False".

Fill in the following list with your rating for each of the statements marked, without any explanations or anything other than the ratings.

"""
score_list_template = "[[{}]] Score:"

In [3]:
# load the dataset
apt_df = pd.read_csv(f"oasst/results/gpt-3.5-turbo_{6600}.csv", index_col=0)
apt_df.rename(columns={"response": "annotated_response"}, inplace=True)
apt_df.set_index("message_id", inplace=True)
apt_df


Unnamed: 0_level_0,completion_id,input,example,annotated_response,prompt_tokens,completion_tokens,dollars,pred_tags,pred_tag_mask
message_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
907c560c-43df-4c24-ab28-0e922b53bc01,chatcmpl-7pTTB3sb2gpOO6Khfeu6hRiX3aNfK,The capital of Colombia is Bogota. It is locat...,The capital of Colombia is Bogota. It is locat...,The capital of Colombia is Bogota[[APT]]. It i...,385,23,0.000624,"[32, 82]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
b809ceff-be37-4fe0-8993-5477f30c2fc3,chatcmpl-7pTTBYID8faOlF2FyqvJ3DsBGD3Pw,"Virgil was born in Rome, but he lived most of ...","Virgil was born in Rome, but he lived most of ...","Virgil was born in Rome[[APT]], but he lived m...",390,30,0.000645,"[22, 63, 81]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
cc980f3a-74b6-4335-93ee-bf889299eade,chatcmpl-7pTTBpmJQaxYvPrHLhgzn3HdBPOEO,The bomb was dropped on the city of Hiroshima....,The bomb was dropped on the city of Hiroshima....,The bomb was dropped on the city of Hiroshima[...,398,38,0.000673,"[44, 75, 135]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
ded386df-c893-4f51-bb89-8c4c060fde38,chatcmpl-7pTTBc5EEAaRPARrsjSOsc1QpT2Uw,"Depending on the country you are considering, ...","Depending on the country you are considering, ...","Depending on the country you are considering, ...",415,53,0.000728,"[100, 287]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
d3cd6431-3e58-4c04-a617-6d6c48908d57,chatcmpl-7pTTBbW0FKZJ7Klt9gwEZCK4TrTxb,"I'm sorry, but I am unable to assist with that...","I'm sorry, but I am unable to assist with that...","I'm sorry, but I am unable to assist with that...",419,60,0.000749,"[45, 124, 221]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...
f3782567-3858-4298-b366-bfeeb3850c60,chatcmpl-7pV42KuCTF1rPT3guTdBpICa4oG6C,So Tomé and Prncipe is a combination of the fi...,So Tomé and Prncipe is a combination of the fi...,So Tomé and Prncipe is a combination of the fi...,409,47,0.000708,"[92, 196]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
b17c6225-fe91-415a-8545-03c812f06105,chatcmpl-7pV42qZPbTVcbe8r8uL5tsGhH6Eu7,"The title comes from a quote by the author, Dr...","The title comes from a quote by the author, Dr...","The title comes from a quote by the author, Dr...",429,73,0.000789,"[60, 133, 171, 222, 257]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
fe5864b9-141b-4ad1-9927-9268780807dc,chatcmpl-7pV42Al1s4CraJn8IZEYyNheMIj8M,There are many different methods for learning ...,There are many different methods for learning ...,There are many different methods for learning ...,486,138,0.001005,"[59, 86, 171, 260, 405, 557, 611]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
09554457-e83a-44d0-aca6-7ed5707d0992,chatcmpl-7pV42IQFWkgVjUZnJ11AxKpnZ7bJo,Classical and quantum mechanical models of par...,Classical and quantum mechanical models of par...,Classical and quantum mechanical models of par...,588,243,0.001368,"[101, 275, 433, 557, 747, 821, 995, 1211, 1357]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [4]:
# load oasst dataset
import json
path = "oasst/2023-04-12_oasst_all.trees.jsonl"

with open(path, "r") as f:
    trees = [json.loads(line) for line in f.readlines()]

import random

no_ranks = 0
yes_ranks  = 0
results = {"tree_id": [], "prompt": [], "parent_id": [], "message_id": [], "response": [], "quality": [], "rel_rank": []}
for tree in trees:
    if tree["tree_state"] != "ready_for_export":
        # THIS IS A LARGE FRACTON OF TREES, let's not remove them for now
        # I think that these contain a large fraction of the false examples that we want to use for training
        pass
    
    stack = [("", tree["prompt"])]  # prefix_text, node
    while stack:
        parent_prefix, current = stack.pop()
        if current.get("lang", None) != "en" or current.get("deleted", True) or not current.get("review_result", False):
            continue
        def get_prompt(node):
            if node["role"] == "prompter":
                return f"USER: {node['text']}"
            elif node["role"] == "assistant":
                return f"ASSISTANT: {node['text']}"
            else:
                raise ValueError(f"Unknown role: {node['role']}")
        current_prompt = parent_prefix + get_prompt(current)
        stack.extend(list(zip([current_prompt] * len(current["replies"]), current["replies"])))

        if current["role"] == "prompter":
            if not all("rank" in r for r in current["replies"]):
                no_ranks += 1
            else:
                yes_ranks += 1
            # get random pair of data
            for reply in current["replies"]:
                results["tree_id"].append(tree["message_tree_id"])
                results["prompt"].append(current_prompt)
                results["parent_id"].append(current["message_id"])
                results["message_id"].append(reply["message_id"])
                quality = reply["labels"]["quality"]["value"] if "quality" in reply["labels"] else None
                results["quality"].append(quality)
                # relative rank is a score between 0 and 1, where 1 is the best, using a modified
                # laplaces' rule of succession to estimate what the expected percentile rank would be in a larger sample
                rel_rank = 1 - (reply["rank"] + 0.5) / (len(current["replies"]) + 2) if "rank" in reply else None
                results["rel_rank"].append(rel_rank)
                results["response"].append(reply["text"])
            
print(f"Proportion of replies with ranks: {yes_ranks / (yes_ranks + no_ranks)}")


Proportion of replies with ranks: 0.9153081404671548


In [5]:
transcript_df = pd.DataFrame(results)
transcript_df.set_index("message_id", inplace=True)
df = transcript_df.join(apt_df, how="inner")
df

Unnamed: 0_level_0,tree_id,prompt,parent_id,response,quality,rel_rank,completion_id,input,example,annotated_response,prompt_tokens,completion_tokens,dollars,pred_tags,pred_tag_mask
message_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
00031e2d-c0bb-4eb3-9794-99de84840c9f,a77556a1-bb5c-416d-80db-3c2d4623a4c2,USER: How do I make grilled cheese sandwich?,a77556a1-bb5c-416d-80db-3c2d4623a4c2,"To make a grilled cheese sandwich, you will ne...",,,chatcmpl-7pV1bQ51OZkFwacoYwFckVEuD73zw,"To make a grilled cheese sandwich, you will ne...","To make a grilled cheese sandwich, you will ne...","To make a grilled cheese sandwich, you will ne...",457,103,0.000892,"[78, 135, 210, 277, 335, 413]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
0008f299-4007-44e7-8e66-9414148fc4f9,41c8fb3d-bf5e-4fbf-a999-158b0db5accc,USER: What is the difference between the heart...,41c8fb3d-bf5e-4fbf-a999-158b0db5accc,"The heart of a mammal, a reptile, a fish, and ...",,0.928571,chatcmpl-7pV160sMiDHDwPVlucYsY8f3UiWWm,"The heart of a mammal, a reptile, a fish, and ...","The heart of a mammal, a reptile, a fish, and ...","The heart of a mammal, a reptile, a fish, and ...",882,555,0.002433,"[121, 270, 419, 520, 752, 909, 1044, 1167, 127...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
000b39c0-2697-41dc-9efc-89c9741b012b,f1f543a8-f715-47d3-a9b9-b1b7464e28e6,USER: How do I write a book?,f1f543a8-f715-47d3-a9b9-b1b7464e28e6,Writing a book is an exciting and challenging ...,,,chatcmpl-7pUrKoUpvqmKEOZ25K16xSAHDFv0x,Writing a book is an exciting and challenging ...,Writing a book is an exciting and challenging ...,Writing a book is an exciting and challenging ...,506,157,0.001073,"[52, 186, 274, 372, 460, 571, 633, 716]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
000b5295-b1f3-4edc-b968-34d480efc96d,c874c21a-1ff0-4d38-8cc5-b22e9c60c140,USER: How do I make a good first impression?,c874c21a-1ff0-4d38-8cc5-b22e9c60c140,The best way to make a good first impression i...,,,chatcmpl-7pTlXfJqwnerIdFywHpuFBZv8zrtA,The best way to make a good first impression i...,The best way to make a good first impression i...,The best way to make a good first impression i...,426,70,0.000779,"[75, 157, 166, 195, 308]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
000d08c2-135f-49d1-8179-21df05a21f5d,685f20e2-7219-4630-bce5-564f925d284b,USER: I have a question about the name of the ...,685f20e2-7219-4630-bce5-564f925d284b,"The name Sparrow comes from the word sparrow, ...",,,chatcmpl-7pV0wfA4nR0pLQAOaOgjimn4tknE6,"The name Sparrow comes from the word sparrow, ...","The name Sparrow comes from the word sparrow, ...","The name Sparrow comes from the word sparrow, ...",397,35,0.000665,"[66, 128]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffe19412-8000-413b-827b-28b2e521ef29,9287aaa5-30e1-46cc-aac0-bfbebae74d82,USER: What is theatre of the mind?,9287aaa5-30e1-46cc-aac0-bfbebae74d82,Theatre of the Mind refers to a genre of psych...,,0.785714,chatcmpl-7pUrc8z5ise3ArlF6tQkP7MRIjhVc,Theatre of the Mind refers to a genre of psych...,Theatre of the Mind refers to a genre of psych...,Theatre of the Mind refers to a genre of psych...,506,150,0.001059,"[132, 310, 471, 627, 745]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
ffe287ca-2710-404f-84d3-c347df13986c,bf350402-8115-4c0c-b8d2-2ad03894c12b,USER: How do I apply for student loan deferment?,bf350402-8115-4c0c-b8d2-2ad03894c12b,"To apply for student loan deferment, you must ...",,,chatcmpl-7pUlzGCECKdeHOgixBTzryc9omL4J,"To apply for student loan deferment, you must ...","To apply for student loan deferment, you must ...","To apply for student loan deferment, you must ...",414,54,0.000729,"[104, 155, 261]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
fff09fb1-8f05-450c-a178-3dd7eece7037,7401af60-13cc-4bdc-a2ea-b42471950635,"USER: How do the effects of climate change, su...",7401af60-13cc-4bdc-a2ea-b42471950635,Climate change can have significant impacts on...,,0.642857,chatcmpl-7pTUPOlwlhOP36OyEXJmtLhWGP1mV,Climate change can have significant impacts on...,Climate change can have significant impacts on...,Climate change can have significant impacts on...,606,270,0.001449,"[85, 219, 324, 508, 600, 680, 810, 888, 1026, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
fff28a57-9755-4142-8e9d-7474ed1efdd8,70799719-93a5-4867-b58d-1a2bc8539b01,USER: With respect to the field of Human-Robot...,70799719-93a5-4867-b58d-1a2bc8539b01,"I’d recommend the University of California, Be...",,,chatcmpl-7pTvRC31scZ2Jih9Vn8x6vOWO4K2G,"I’d recommend the University of California, Be...","I’d recommend the University of California, Be...","I’d recommend the University of California, Be...",387,25,0.000630,"[51, 105]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
random.seed(633)
for row in df.sample(1).iloc:

    ann = row["annotated_response"]
    print(row.name)
    ann_count = 0
    while "[[APT]]" in ann:
        ann_count += 1
        ann = ann.replace("[[APT]]", f"[[{ann_count}]]", 1)
    transcript = f"{row['prompt']}\n\nASSISTANT: {ann}"
    input = prompt_template.format(transcript)
    score_list = "\n".join(score_list_template.format(i) for i in range(1, ann_count + 1))
    input += score_list
    print(input)
    print()

7bd6a75a-cd2d-4bf7-833a-6fe03857751f
### TRANSCRIPT OF UNKNOWN FACTUAL ACCURACY ###
USER: Why is the Labour Party of New Zealand called the Labour Party?

ASSISTANT: The Labour Party was founded in 1893 by a group of trade unionists[[1]]. They wanted to create a political party that would represent working people[[2]], and they chose the name Labour because it was a word that was commonly used to describe workers[[3]].
###

First, decide whether each statement can even be given an accuracy label, writing "Truth-apt" if it can, and "N/A" if the statement doesn't make any claims that can be judged on their accuracy. Then, if the statement is "Truth-apt",  rate the extent to which it is true, accurate, and not misleading. Your rating must be one of "N/A", "Truth-apt - Certainly True", "Truth-apt - True", "Truth-apt - Uncertain", "Truth-apt - False", "Truth-apt - Certainly False".

Fill in the following list with your rating for each of the statements marked, without any explanations or an

In [19]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model_name = "gpt-4"
temperature = 0
tagging_eagerness_bias = 0
# add this much to the "[[" logit to make it eager to tag
logit_bias = {tokenizer.encode("[[")[0]: tagging_eagerness_bias}
stop_seq = "\n\n"
print(stop_seq)

API_costs = {
    "gpt-3.5-turbo": {"prompt_tokens": 0.0015 / 1000, "completion_tokens": 0.002 / 1000},
    "gpt-4": {"prompt_tokens": 0.03 / 1000, "completion_tokens": 0.06 / 1000},
}






In [20]:
completion = openai.ChatCompletion.create(
                    model=model_name,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": input},
                    ],
                    temperature=temperature,
                    max_tokens=len(tokenizer.encode(score_list)) * 3,  # should just be a copy of example with a few tokens added
                    logit_bias=logit_bias,
                    stop=stop_seq,
                )

In [21]:
response = completion["choices"][0]["message"]["content"]

# gather a list of scores
# they can either be separated by \n[[x]] Score: or \n

def get_scores_from_response(response, ann_count):
    target = "]] Score:" 
    options = ["N/A", "Truth-apt - Certainly True", "Truth-apt - True", "Truth-apt - Uncertain", "Truth-apt - False", "Truth-apt - Certainly False"]
    assert target in score_list_template  # make sure I didn't change the template
    assert all(option in prompt_template for option in options)  # make sure I didn't change the template

    if score_list_template.format(1) in response:        
        scores = []
        idx = 0
        while idx := (response.index(target, idx) if target in response[idx:] else None):
            idx += len(target)
            try:
                newline_idx = response.index("\n", idx)
            except ValueError:
                newline_idx = len(response)
            score = response[idx:newline_idx].strip()
            scores.append(score)
    else:
        scores = response.split("\n")
        scores = [score.strip() for score in scores if score.strip()]

    if any(score not in options for score in scores):
        print(f"SKIPPING: scores must be one of {options}, but found {scores}")
        return
    if len(scores) != ann_count:
        print(f"SKIPPING: {len(scores)} scores found, but {ann_count} annotations were expected.")
        return

    return scores

In [22]:
# 261d82bb-42cb-464f-b288-835b0246d777 Vinnny Vinesauce certainly false example

In [23]:
val_scores = [['N/A',
  'N/A',
  'N/A',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly False',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Uncertain',
  'Truth-apt - False',
  'Uncertain',
  'Truth-apt - Certainly True'],
 ['Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True'],
 ['Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True'],
 ['Truth-apt - False', 'Truth-apt - Certainly True'],
 ['Uncertain', 'Truth-apt - Certainly True'],
 ['Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - False'],
 ['Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - False',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly False',
  'N/A'],
 ['Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True'],
 ['N/A',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True',
  'Truth-apt - Certainly True'],
 ['Truth-apt - Certainly False',
  'Truth-apt - Certainly False',
  'Truth-apt - Certainly False']]
val_ids = ['e59ff154-1284-4197-b911-3448babf01f2',
 '34db3452-c508-44c6-808a-be13ec6f2780',
 '051e7b6d-7e22-4a39-a78a-5a188acd5368',
 '2caa6380-25b2-4280-aa5d-a2ebe6d1475c',
 '9a45b656-fab1-4d4a-b255-2a8460325a46',
 'a1063998-5cc5-4374-aebb-9a48a79c5675',
 '8b4b7276-5830-4b64-8479-c70607884794',
 '17bb0697-754a-4474-b8fa-791bf0464d32',
 '2495b983-3a18-4e23-af83-bfe9a48f13cd',
 '00852f49-0fdc-41e2-b40e-0063cda60d97']

val_rows = [df.loc[id] for id in val_ids]
val_rows[0]

tree_id                            eeb2d134-52cf-4cfa-9d74-9eabc6ce137d
prompt                USER: Can you provide some game recommendation...
parent_id                          eeb2d134-52cf-4cfa-9d74-9eabc6ce137d
response              Sure! Here are some recommended role playing g...
quality                                                             NaN
rel_rank                                                       0.928571
completion_id                    chatcmpl-7pUYVVeZWBYRnh8Pux4b3HTCAFUsb
input                 Sure! Here are some recommended role playing g...
example               Sure! Here are some recommended role playing g...
annotated_response    Sure! Here are some recommended role playing g...
prompt_tokens                                                       722
completion_tokens                                                   381
dollars                                                        0.001845
pred_tags             [81, 104, 124, 326, 518, 759, 868, 1108, 1

In [24]:
# use gpt2 tokenizer to get an estimate for the number of tokens the model needs to complete (GPT3 uses the same tokenizer)
# consider upweighting the "[[" logits

# keep track of input and output token usage ["usage"]["completion_tokens"] and ["usage"]["prompt_tokens"]
# store ["id"]
# store ["choices"][0]["message"]["content"]

def tag(i, row, gt_scores, results):
    try:
        ann = row["annotated_response"]
        print(row.name)
        ann_count = 0
        while "[[APT]]" in ann:
            ann_count += 1
            ann = ann.replace("[[APT]]", f"[[{ann_count}]]", 1)
        annotated_transcript = f"{row['prompt']}\n\nASSISTANT: {ann}"
        
        if ann_count == 0:
            print("SKIPPING: no truth-apt statements")
            return

        input = prompt_template.format(annotated_transcript)
        score_list = "\n".join(score_list_template.format(i) for i in range(1, ann_count + 1))
        input += score_list
        
        for i in range(5):
            try:
                if i > 0:
                    print("Retrying request")
                
                completion = openai.ChatCompletion.create(
                    model=model_name,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": input},
                    ],
                    temperature=temperature,
                    max_tokens=len(tokenizer.encode(score_list)) * 3,
                    logit_bias=logit_bias,
                    stop=stop_seq,
                )
                break
            except Exception as e:
                print("Error completing request:", e)
                time.sleep(2)
        
        usage = completion["usage"]
        prompt_tokens, completion_tokens = usage["prompt_tokens"], usage["completion_tokens"]
        cost = API_costs[model_name]["prompt_tokens"] * prompt_tokens + API_costs[model_name]["completion_tokens"] * completion_tokens
        
        # check that there's only one choice, and the ["choices"][0]["message"]["role"] is "assistant"
        if len(completion["choices"]) != 1:
            print("SKIPPING: multiple choices")
            return
        if completion["choices"][0]["message"]["role"] != "assistant":
            print("SKIPPING: role is not assistant")
            return
        # check that finish reason is not for a content filter, not for length, not for function_call and that it is "stop"
        if completion["choices"][0]["finish_reason"] != "stop":
            print(f"SKIPPING: finish reason is {completion['choices'][0]['finish_reason']}, not stop")
            print("RESPONSE:", completion["choices"][0]["message"]["content"])
            return

        response = completion["choices"][0]["message"]["content"]
        if response.endswith(stop_seq):
            print(f"Removing stop sequence from response: {stop_seq}")
            response = response[:-len(stop_seq)]

        response = response.strip()

        pred_scores = get_scores_from_response(response, ann_count)
        if pred_scores is None:
            return
        
        acc = accuracy_score(pred_scores, gt_scores)
        
        result = {
            "message_id": row.name,
            "completion_id": completion["id"],
            # "user_prompt": user_prompt,
            "input": input,
            "annotated_transcript": annotated_transcript,
            "response": response,
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "dollars": cost,
            "pred_scores": pred_scores,
            "gt_scores": gt_scores,
            "acc": acc
        }
        results.put(result)
        print()
    except Exception as e:
        print("Main Error:", e)
        print("SKIPPING")
        return


results = queue.Queue()
total_cost = 0
n_threads = 10
iterator = islice(enumerate(zip(val_rows, val_scores)), (len(val_rows) // n_threads) * n_threads)

In [25]:
while True:
    threads = []
    for _ in range(n_threads):
        i, (row, gt_scores) = next(iterator)
        t = threading.Thread(target=tag, args=(i, row, gt_scores, results))
        threads.append(t)
        t.start()

    # make sure they don't take more than 10 seconds
    for t in threads:
        t.join(timeout=60)
    
    for t in threads:
        if t.is_alive():
            print("THREAD TIMED OUT")
            try:
                t._stop()
            except AssertionError:
                print("Thread could not be terminated")
            
    # compute total cost
    total_cost = sum([r["dollars"] for r in results.queue])
    print(f"Total cost: ${total_cost:.4f}")

    if (i + 1) % 200 == 0:
        out_df = pd.DataFrame(list(results.queue))
        out_df.to_csv(f"oasst/results/{model_name}_{i + 1}.csv")
    if i == len(val_scores) - 1:
        break
    
out_df = pd.DataFrame(list(results.queue))

e59ff154-1284-4197-b911-3448babf01f2
34db3452-c508-44c6-808a-be13ec6f2780
051e7b6d-7e22-4a39-a78a-5a188acd5368
2caa6380-25b2-4280-aa5d-a2ebe6d1475c
9a45b656-fab1-4d4a-b255-2a8460325a46
a1063998-5cc5-4374-aebb-9a48a79c5675
8b4b7276-5830-4b64-8479-c70607884794
17bb0697-754a-4474-b8fa-791bf0464d32
2495b983-3a18-4e23-af83-bfe9a48f13cd
00852f49-0fdc-41e2-b40e-0063cda60d97










Total cost: $0.1471


In [26]:
cat_preds = np.concatenate(out_df["pred_scores"].values)
cat_gt = np.concatenate(out_df["gt_scores"].values)
acc = accuracy_score(cat_gt, cat_preds)
print(f"Accuracy: {acc}")
# of the ones gt calls True or Certainly True, how many does pred call True or Certainly True?
cat_preds_grouped = cat_preds
cat_preds_grouped[cat_preds_grouped == "Truth-apt - Certainly True"] = "Truth-apt - True"
cat_preds_grouped[cat_preds_grouped == "Truth-apt - Certainly False"] = "Truth-apt - False"
cat_preds_grouped[cat_preds_grouped == "Truth-apt - Uncertain"] = "N/A"
cat_gt_grouped = cat_gt
cat_gt_grouped[cat_gt_grouped == "Truth-apt - Certainly True"] = "Truth-apt - True"
cat_gt_grouped[cat_gt_grouped == "Truth-apt - Certainly False"] = "Truth-apt - False"
cat_gt_grouped[cat_gt_grouped == "Truth-apt - Uncertain"] = "N/A"
grouped_acc = accuracy_score(cat_gt_grouped, cat_preds_grouped)
print(f"Grouped Accuracy: {grouped_acc}")
pred_certain_mask = cat_preds_grouped != "N/A"
cat_pred_certain = cat_preds_grouped[pred_certain_mask]
cat_gt_certain = cat_gt_grouped[pred_certain_mask]
grouped_precision_on_certain = accuracy_score(cat_gt_certain, cat_pred_certain)
print(f"Grouped Precision on examples where LM is certain: {grouped_precision_on_certain}")


Accuracy: 0.18
Grouped Accuracy: 0.6
Grouped Precision on examples where LM is certain: 0.75


In [29]:
len(cat_gt)

50

In [27]:
with open("oasst/truth_label_scores.json", "r") as f:
    avg_scores = json.load(f)

score_object = {
    "prompt_template": prompt_template,
    "score_list_template": score_list_template,
    "system_prompt": system_prompt,
    "model_name": model_name,
    "temperature": temperature,
    "tagging_eagerness_bias": tagging_eagerness_bias,
    "skip_rate": 1 - (len(df) / len(val_scores)),
    "acc": acc,
    "grouped_acc": grouped_acc,
    "grouped_precision_on_certain": grouped_precision_on_certain,
}
avg_scores.append(score_object)
with open("oasst/truth_label_scores.json", "w") as f:
    json.dump(avg_scores, f, indent=2)
score_object

{'prompt_template': '### TRANSCRIPT OF UNKNOWN FACTUAL ACCURACY ###\n{}\n###\n\nFirst, decide whether each statement can even be given an accuracy label, writing "Truth-apt" if it can, and "N/A" if the statement doesn\'t make any claims that can be judged on their accuracy. Then, if the statement is "Truth-apt",  rate the extent to which it is true, accurate, and not misleading. Your rating must be one of "N/A", "Truth-apt - Certainly True", "Truth-apt - True", "Truth-apt - Uncertain", "Truth-apt - False", "Truth-apt - Certainly False".\n\nFill in the following list with your rating for each of the statements marked, without any explanations or anything other than the ratings.\n\n',
 'score_list_template': '[[{}]] Score:',
 'model_name': 'gpt-4',
 'temperature': 0,
 'tagging_eagerness_bias': 0,
 'skip_rate': -565.0,
 'acc': 0.18,
 'grouped_acc': 0.6,
 'grouped_precision_on_certain': 0.75}

In [30]:
list(zip(cat_preds, cat_gt))

[('N/A', 'Uncertain'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('N/A', 'Truth-apt - False'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - False', 'Truth-apt - False'),
 ('Truth-apt - False', 'Truth-apt - False'),
 ('N/A', 'Truth-apt - False'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('N/A', 'Truth-apt - False'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('N/A', 'Truth-apt - True'),
 ('N/A', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - True', 'N/A'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('N/A', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-apt - True'),
 ('Truth-apt - True', 'Truth-