In [87]:
import openai
import pandas as pd
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import json
import os
from annotation_utils import get_assistant_texts, get_tag_masks, remove_tags, replace_tags, get_tags

openai.api_key_path = "/home/alex/.personal/openAIkey"  # read protected to my 174 account

In [95]:
with open("oasst/average_scores.json", "r") as f:
    average_scores = json.loads(f.read())

In [96]:
prompt_template = \
"""{}

###

Take the above text and place the tag [[APT]] at the end of every statement/fragment that is truth-apt. That is, if there exists a context in which it makes sense to label a statement as true or false, insert [[APT]] at the end of the statement just after the last word. Here are some examples

# Example 1 #
Berries are a great source of antioxidants[[APT]], vitamins and minerals[[APT]. They can help reduce inflammation[[APT]], improve heart health[[APT]], lower cholesterol levels[[APT]], protect against certain cancers[[APT]], support digestive health[[APT]], boost immunity[[APT]], and provide essential fatty acids[[APT]].

# Example 2 #
Be careful when writing code in C. It is is easy to cause memory leaks[[APT]]. How else may I assist you?

Respond only with the text, exactly unmodified other than the tags. Apply tags generously, even to sentence fragments."""

In [97]:
# load the validation dataset

ann_path = "oasst/validation_annotations.txt"
with open(ann_path) as f:
    annotated = f.read()

annotated_assistant_texts = replace_tags(get_assistant_texts(annotated))
assistant_texts = remove_tags(annotated_assistant_texts)
# print("\n\n\n\n".join(annotated_assistant_texts))

In [98]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model_name = "gpt-3.5-turbo"
temperature = 0
tagging_eagerness_bias = 4
# add this much to the "[[" logit to make it eager to tag
logit_bias = {tokenizer.encode("[[")[0]: tagging_eagerness_bias}

API_costs = {"gpt-3.5-turbo": {"prompt_tokens": 0.0015 / 1000, "completion_tokens": 0.002 / 1000}}

In [99]:
# use gpt2 tokenizer to get an estimate for the number of tokens the model needs to complete (GPT3 uses the same tokenizer)
# consider upweighting the "[[" logits

# keep track of input and output token usage ["usage"]["completion_tokens"] and ["usage"]["prompt_tokens"]
# store ["id"]
# store ["choices"][0]["message"]["content"]
results = []
total_cost = 0
for i, (annotated_example, example) in enumerate(zip(annotated_assistant_texts, assistant_texts)):
    transcript_id = None
    example_tokens = len(tokenizer.encode(example))
    input = prompt_template.format(example)
    completion = openai.ChatCompletion.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": input},
        ],
        temperature=temperature,
        max_tokens=int(example_tokens * 1.5) + 5,  # should just be a copy of example with a few tokens added
        logit_bias=logit_bias,
    )
    
    usage = completion["usage"]
    prompt_tokens, completion_tokens = usage["prompt_tokens"], usage["completion_tokens"]
    cost = API_costs[model_name]["prompt_tokens"] * prompt_tokens + API_costs[model_name]["completion_tokens"] * completion_tokens
    total_cost += cost
    print("Cumulative cost ($):", total_cost)
    
    # check that there's only one choice, and the ["choices"][0]["message"]["role"] is "assistant"
    if len(completion["choices"]) != 1:
        print("SKIPPING: multiple choices")
        continue
    if completion["choices"][0]["message"]["role"] != "assistant":
        print("SKIPPING: role is not assistant")
        continue
    # check that finish reason is not for a content filter, not for length, not for function_call and that it is "stop"
    if completion["choices"][0]["finish_reason"] != "stop":
        print(f"SKIPPING: finish reason is {completion['choices'][0]['finish_reason']}, not stop")
        continue

    response = completion["choices"][0]["message"]["content"]
    # check that the response is an exact match to the prompt
    clean_response = remove_tags(response)
    response_tags = get_tags(response)["APT"]
    response_tag_mask = get_tag_masks(response)["APT"]
    gt_tags = get_tags(annotated_example)["APT"]
    gt_tag_mask = get_tag_masks(annotated_example)["APT"]

    if clean_response != example:
        print(f"SKIPPING: response does not match prompt:\nEXAMPLE: {example}\n\n\nRESPONSE: {clean_response}")
        continue
        
    # print precision, recall, accuracy and f1 score of the tag masks
    prec = precision_score(gt_tag_mask, response_tag_mask)
    rec = recall_score(gt_tag_mask, response_tag_mask)
    acc = accuracy_score(gt_tag_mask, response_tag_mask)
    f1 = f1_score(gt_tag_mask, response_tag_mask)
    print("precision:", prec)
    print("recall:", rec)
    print("accuracy:", acc)
    print("f1:", f1)

    result = {
        "transcript_id": transcript_id,
        "completion_id": completion["id"],
        # "user_prompt": user_prompt,
        "input": input,
        "completion": response,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "dollars": cost,
        "pred_tags": response_tags,
        "pred_tag_mask": response_tag_mask,
        "gt_tags": gt_tags,
        "gt_tag_mask": gt_tag_mask,
        "precision": prec,
        "recall": rec,
        "accuracy": acc,
        "f1": f1,
    }

    results.append(result)
    print()
df = pd.DataFrame(results)
print(" average precision:", df["precision"].mean())
print(" average recall:", df["recall"].mean())
print(" average accuracy:", df["accuracy"].mean())
print(" average f1:", df["f1"].mean())

Cumulative cost ($): 0.001023
precision: 1.0
recall: 0.4117647058823529
accuracy: 0.9908675799086758
f1: 0.5833333333333334

Cumulative cost ($): 0.0027665
SKIPPING: response does not match prompt:
EXAMPLE: A Value-Added Tax (VAT) is a tax that is levied on goods and services at the time of sale, rather than when they are purchased. The main advantage of a VAT is that it eliminates the need for an income tax, as all revenue generated from the tax can be used to finance public spending.

The effects of abolishing the US income tax and replacing it with a VAT could vary depending on several factors, such as the specific design of the new tax system, its level of taxation, and its impact on economic growth and employment. Here's some possible scenarios:

Scenario 1: No change in overall taxes
Under this scenario, there would be no major changes to the existing tax system. Income taxes still exist, but their rates and brackets would remain unchanged. This could lead to a slight increase in

In [100]:
score_object = {
    "prompt_template": prompt_template,
    "model_name": model_name,
    "temperature": temperature,
    "tagging_eagerness_bias": tagging_eagerness_bias,
    "precision": df["precision"].mean(),
    "recall": df["recall"].mean(),
    "f1": df["f1"].mean(),
    "accuracy": df["accuracy"].mean(),
}
average_scores.append(score_object)
# make a copy of the file we're about to write if it exists
if os.path.exists("oasst/average_scores.json"):
    os.rename("oasst/average_scores.json", "oasst/average_scores.json.bak")
with open("oasst/average_scores.json", "w") as f:
    f.write(json.dumps(average_scores, indent=2))