In [4]:
import openai
import pandas as pd
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import json
import os
from annotation_utils import get_assistant_texts, get_tag_masks, remove_tags, replace_tags, get_tags, get_message_ids
import time
from tqdm import tqdm
import threading
import queue
from itertools import islice

openai.api_key_path = "/home/alex/.personal/openAIkey"  # read protected to my 174 account

In [33]:
model_name = "keyfan_bloomz-rlhf(max_length=512;temperature=1.0;repetition_penalty=1.2;do_sample=True;top_p=0.95)"
data_path = f"data/transcripts_{model_name}.csv"
df = pd.read_csv(data_path)

In [34]:
print(f"Total number of transcripts: {len(df)}")
df = df.drop_duplicates(subset=['prompt'])
df = df.reset_index(drop=True)
len(df)

Total number of transcripts: 843


831

In [35]:
df["response"].iloc[0]

"Sure! DHCP (Dynamic Host Configuration Protocol) is a protocol used by network devices such as routers to assign IP addresses to client computers or other networked devices.\n\nDHCP assigns each device on a local area network (LAN) with a unique public address called an Internet Protocol (IP) address. This makes it possible for all networked devices on the LAN to communicate with one another using standard internet protocols like TCP/IP.\n\nBy default, most operating systems come preconfigured with static IP configurations meaning that the assigned IP address remains constant until manually changed through configuration changes in the system settings of your router or modem or even via software installations from certain sources.  When this happens, it's common practice for people still not familiar enough with DHCP to assume that they have manually set up their PCs with dynamic DNS so they are always downloading web applications directly off the internet and thus automatically being 

In [36]:
import uuid
def get_asst_text(text):
    if "smartplat" in data_path:
        bits = text.split("ASSISTANT:")
        bits = bits[1].split("USER:")
        bits = bits[0].split("</s>")  # TODO: figure out why the model is outputting this
        return bits[0].strip()
    elif "keyfan_bloomz" in data_path:
        return text
    else:
        raise Exception("Unknown dataset")

df["assistant_text"] = df["response"].apply(get_asst_text)
df["user_text"] = df["parent_text"]
df["message_id"] = [str(uuid.uuid4()) for _ in range(len(df))]
df["assistant_text"].iloc[0], df["user_text"].iloc[0]

("Sure! DHCP (Dynamic Host Configuration Protocol) is a protocol used by network devices such as routers to assign IP addresses to client computers or other networked devices.\n\nDHCP assigns each device on a local area network (LAN) with a unique public address called an Internet Protocol (IP) address. This makes it possible for all networked devices on the LAN to communicate with one another using standard internet protocols like TCP/IP.\n\nBy default, most operating systems come preconfigured with static IP configurations meaning that the assigned IP address remains constant until manually changed through configuration changes in the system settings of your router or modem or even via software installations from certain sources.  When this happens, it's common practice for people still not familiar enough with DHCP to assume that they have manually set up their PCs with dynamic DNS so they are always downloading web applications directly off the internet and thus automatically being

In [37]:
df.to_json(f"data/transcripts_{model_name}.json", orient="records")

In [38]:
from TruthLabeler import TruthLabeler
from AptTagger import AptTagger
apt_tagger = AptTagger()

In [42]:
df.iloc[16].parent_id

'dfe226cd-1663-45e3-ab80-7b6065da46ac'

In [39]:
raw_df = df.iloc[11:21]
raw_df["tag_mask"] = apt_tagger.tag(raw_df["assistant_text"])
raw_df["annotated_assistant_text"] = raw_df.apply(lambda x: apt_tagger.annotate(x["assistant_text"], x["tag_mask"]), axis=1)
raw_df["annotated_transcript"] = raw_df.apply(lambda x: f"USER: {x['user_text']}\n\nASSISTANT: {x['annotated_assistant_text']}", axis=1)
raw_df["input_text"] = raw_df["annotated_transcript"].apply(lambda x: TruthLabeler.make_input(x)[0])
raw_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_df["tag_mask"] = apt_tagger.tag(raw_df["assistant_text"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_df["annotated_assistant_text"] = raw_df.apply(lambda x: apt_tagger.annotate(x["assistant_text"], x["tag_mask"]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_df["annotated_

Unnamed: 0,prompt,response,parent_id,parent_text,role,synthetic,model,assistant_text,user_text,message_id,tag_mask,annotated_assistant_text,annotated_transcript,input_text
11,A chat between a curious human and an artifici...,"As an artificial intelligence language model, ...",6faa03e5-5f0c-483c-9d22-a15505a2b821,Where was your last vaccation?,assistant,True,keyfan/bloomz-rlhf(max_length=512;temperature=...,"As an artificial intelligence language model, ...",Where was your last vaccation?,87cdf227-c048-4ccb-a790-0f44dc5ebb6b,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","As an artificial intelligence language model, ...",USER: Where was your last vaccation?\n\nASSIST...,### TRANSCRIPT OF UNKNOWN FACTUAL ACCURACY ###...
12,A chat between a curious human and an artifici...,Joseph Scaliger (1557-1636) was an Italian sch...,2f2e3cfd-857f-41b5-9824-0b62f7dd374c,Describe Joseph Scaliger and his contributions...,assistant,True,keyfan/bloomz-rlhf(max_length=512;temperature=...,Joseph Scaliger (1557-1636) was an Italian sch...,Describe Joseph Scaliger and his contributions...,fd9fa103-6988-4d4c-b6de-2e7a0ac8b5a9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Joseph Scaliger (1557-1636) was an Italian sch...,USER: Describe Joseph Scaliger and his contrib...,### TRANSCRIPT OF UNKNOWN FACTUAL ACCURACY ###...
13,A chat between a curious human and an artifici...,Certainly! A well-chosen color palette is one ...,89b13be2-c82f-493a-9258-1d690440e451,"I am designing a website, can you give me a co...",assistant,True,keyfan/bloomz-rlhf(max_length=512;temperature=...,Certainly! A well-chosen color palette is one ...,"I am designing a website, can you give me a co...",69f27058-6253-42bc-892c-5f8a95870ec1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Certainly! A well-chosen color palette is one ...,"USER: I am designing a website, can you give m...",### TRANSCRIPT OF UNKNOWN FACTUAL ACCURACY ###...
14,A chat between a curious human and an artifici...,Lena was a happy-go-lucky suburban housewife w...,0cd84750-ccf6-4a8e-89bf-02e17a6d0eca,Write me a story synopsis for a novel where th...,assistant,True,keyfan/bloomz-rlhf(max_length=512;temperature=...,Lena was a happy-go-lucky suburban housewife w...,Write me a story synopsis for a novel where th...,b864688c-62cb-420d-bae1-e81677b4fded,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Lena was a happy-go-lucky suburban housewife w...,USER: Write me a story synopsis for a novel wh...,### TRANSCRIPT OF UNKNOWN FACTUAL ACCURACY ###...
15,A chat between a curious human and an artifici...,Ser introvertido no significa que no puedas te...,b65b2bfa-1fc6-4448-95a5-cc92544d83cf,¿como consigo una novia siendo introvertido?,assistant,True,keyfan/bloomz-rlhf(max_length=512;temperature=...,Ser introvertido no significa que no puedas te...,¿como consigo una novia siendo introvertido?,3855e40b-823f-4b80-811e-e0071034c507,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Ser introvertido no significa que no puedas te...,USER: ¿como consigo una novia siendo introvert...,### TRANSCRIPT OF UNKNOWN FACTUAL ACCURACY ###...


In [40]:
out_str = ""
for row in raw_df.iloc:
    out_str += f"[[MESSAGE_ID]] {row['message_id']}\n"
    out_str += f"[[INPUT_TEXT]] {row['input_text']}\n"
    out_str += "[[NOT YET ANNOTATED]]"
    out_str += "\n\n\n\n"

out_file = f"data/BLANK_val_labels_{model_name}.txt"
with open(out_file, "w") as f:
    f.write(out_str)
out_file

'data/BLANK_val_labels_keyfan_bloomz-rlhf(max_length=512;temperature=1.0;repetition_penalty=1.2;do_sample=True;top_p=0.95).txt'

In [11]:
# a log of experiment results
with open("data/average_scores.json", "r") as f:
    average_scores = json.loads(f.read())

FileNotFoundError: [Errno 2] No such file or directory: 'data/average_scores.json'

In [3]:
prompt_template = \
"""{}

Take the above input text and place the tag `[[APT]]` at the end of every statement/fragment that is truth-apt. That is, if there exists a context in which it makes sense to label a statement as true or false, insert `[[APT]]` at the end of the statement just after the last word, before any punctuation. Here are some examples:

# Example 1 #
Berries are a great source of antioxidants[[APT]], vitamins and minerals[[APT]. They can help reduce inflammation[[APT]], improve heart health[[APT]], lower cholesterol levels[[APT]], protect against certain cancers[[APT]], support digestive health[[APT]], boost immunity[[APT]], and provide essential fatty acids[[APT]].

# Example 2 #
The bitter lesson focuses on a company that hired an executive with false credentials[[APT]], who then proceeded to bring the company into ruin through deception and manipulation[[APT]]. The author argues that companies should not hire executives from outside their own industry[[APT]]; they will always be at a disadvantage[[APT]] due to unfamiliarity with corporate politics and practices[[APT]], which leaves them vulnerable to being manipulated or deceived[[APT]]. In contrast, hiring internal candidates shows true dedication to the long-term success of the company[[APT]]. Additionally, the author suggests that it is valuable for leaders within corporations to learn about different industries[[APT]] so as to better understand how various business decisions impact overall performance[[APT]]. The book ends with advice for improving ethical decision making skills[[APT]] in order to avoid similar missteps in the future[[APT]].

Respond only with the input text, exactly unmodified other than the tags. Apply tags generously, even to sentence fragments, as shown in the examples."""


In [5]:
# # load the validation dataset

# ann_path = "oasst/validation_annotations.txt"
# with open(ann_path) as f:
#     annotated = f.read()

# to_replace = ("LE", "LH", "NORM", "APT", "IMP")
# annotated_assistant_texts = replace_tags(get_assistant_texts(annotated), to_replace=to_replace)
# assistant_texts = remove_tags(annotated_assistant_texts)
# list(zip(annotated_assistant_texts, assistant_texts))

# load the inference dataset
texts = []
for path in os.listdir("data/oasst/transcripts/original"):
    with open(os.path.join("data/oasst/transcripts/original", path)) as f:
        texts.append(f.read())

text = "\n\n\n".join(texts[684:])
assistant_texts = get_assistant_texts(text)
message_ids = get_message_ids(text)
assert len(assistant_texts) == len(message_ids)
print(len(message_ids))

# weak_lm_responses = pd.read_csv("data/transcripts_vicgalle_gpt2-open-instruct-v1(max_length=512;temperature=1.0;repetition_penalty=1.2;do_sample=True;top_p=0.95).csv")
# weak_lm_responses = pd.read_csv("data/transcripts_keyfan_bloomz-rlhf(max_length=512;temperature=1.0;repetition_penalty=1.2;do_sample=True;top_p=0.95).csv").iloc[:50]
# message_ids = weak_lm_responses["parent_id"].apply(lambda x: "P=" + x).values.tolist()
# assistant_texts = weak_lm_responses["response"].values.tolist()
# parent_texts = weak_lm_responses["parent_text"].values.tolist()
# print(len(assistant_texts))

# message_ids[:2]

720


In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model_name = "gpt-3.5-turbo"
temperature = 0
tagging_eagerness_bias = 0
# add this much to the "[[" logit to make it eager to tag
logit_bias = {tokenizer.encode("[[")[0]: tagging_eagerness_bias}
stop_seq = prompt_template[4:55]  # "Take the above input text and place the tag [[APT]]"

API_costs = {
    "gpt-3.5-turbo": {"prompt_tokens": 0.0015 / 1000, "completion_tokens": 0.002 / 1000},
    "gpt-4": {"prompt_tokens": 0.03 / 1000, "completion_tokens": 0.06 / 1000},
}

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# use gpt2 tokenizer to get an estimate for the number of tokens the model needs to complete (GPT3 uses the same tokenizer)
# consider upweighting the "[[" logits

# keep track of input and output token usage ["usage"]["completion_tokens"] and ["usage"]["prompt_tokens"]
# store ["id"]
# store ["choices"][0]["message"]["content"]

def tag(i, message_id, example, results, **kwargs):
    try:
        example_tokens = len(tokenizer.encode(example))
        input = prompt_template.format(example)
        for i in range(5):
            try:
                if i > 0:
                    print("Retrying request")
                
                completion = openai.ChatCompletion.create(
                    model=model_name,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": input},
                    ],
                    temperature=temperature,
                    max_tokens=int(example_tokens * 1.5) + 5 + len(stop_seq),  # should just be a copy of example with a few tokens added
                    logit_bias=logit_bias,
                    stop=stop_seq,
                )
                break
            except Exception as e:
                print("Error completing request:", e)
                time.sleep(2)
        
        usage = completion["usage"]
        prompt_tokens, completion_tokens = usage["prompt_tokens"], usage["completion_tokens"]
        cost = API_costs[model_name]["prompt_tokens"] * prompt_tokens + API_costs[model_name]["completion_tokens"] * completion_tokens
        
        # check that there's only one choice, and the ["choices"][0]["message"]["role"] is "assistant"
        if len(completion["choices"]) != 1:
            print("SKIPPING: multiple choices")
            return
        if completion["choices"][0]["message"]["role"] != "assistant":
            print("SKIPPING: role is not assistant")
            return
        # check that finish reason is not for a content filter, not for length, not for function_call and that it is "stop"
        if completion["choices"][0]["finish_reason"] != "stop":
            print(f"SKIPPING: finish reason is {completion['choices'][0]['finish_reason']}, not stop")
            print("RESPONSE:", completion["choices"][0]["message"]["content"])
            return

        response = completion["choices"][0]["message"]["content"]
        if response.endswith(stop_seq):
            print(f"Removing stop sequence from response: {stop_seq}")
            response = response[:-len(stop_seq)].rstrip()

        response = response.strip()

        # check that the response is an exact match to the prompt
        clean_response = remove_tags(response)
        response_tags = get_tags(response).get("APT", [])
        response_tag_mask = get_tag_masks(response).get("APT", [0] * len(clean_response))
        
        if clean_response.strip() != example.strip():
            print(f"SKIPPING: response does not match prompt:\n")
            print("EXAMPLE:")
            print(example)
            print("RESPONSE:")
            print(clean_response)
            return
            
        
        result = {
            "message_id": message_id,
            "completion_id": completion["id"],
            # "user_prompt": user_prompt,
            "input": input,
            "example": example,
            "response": response,
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "dollars": cost,
            "pred_tags": response_tags,
            "pred_tag_mask": response_tag_mask,
            **kwargs,
        }
        results.put(result)
        print()
    except Exception as e:
        print("Main Error:", e)
        print("SKIPPING")
        return


results = queue.Queue()
total_cost = 0
n_threads = 10
n_iter = (len(message_ids) // n_threads) * n_threads
iterator = islice(enumerate(zip(message_ids, assistant_texts)), n_iter)

In [13]:
while True:
    threads = []
    for _ in range(n_threads):
        i, (message_id, example, *rest) = next(iterator)
        t = threading.Thread(target=tag, args=(i, message_id, example, results, *rest))
        threads.append(t)
        t.start()

    # make sure they don't take more than 10 seconds
    for t in threads:
        t.join(timeout=60)
    
    for t in threads:
        if t.is_alive():
            print("THREAD TIMED OUT")
            try:
                t._stop()
            except AssertionError:
                print("Thread could not be terminated")


    # compute total cost
    total_cost = sum([r["dollars"] for r in results.queue])
    print(f"Total cost: ${total_cost:.4f}")

    if (i + 1) % 200 == 0:
        df = pd.DataFrame(list(results.queue))
        df.to_csv(f"oasst/results/{model_name}_{i + 1}.csv")
    if i == n_iter - 1:
        break
    
df = pd.DataFrame(list(results.queue))







SKIPPING: response does not match prompt:

EXAMPLE:
Here's a refactored version of this code that includes more descriptive variable names, uses better error handling, adds some documentation, and improves indentation and flow:

'''
from logging import get_logger
from nio import AsyncClient, MatrixRoom, RoomMessageText
import os
import asyncio
import datetime as dt

LOGGER = get_logger(__name__)

ELEM_ROOM = os.environ.get('ELEM_ROOM')
ELEM_USER = os.environ.get('ELEM_USER')
ELEM_PSW = os.environ.get('ELEM_PSW')

MESSAGE_TYPE ='m.room.message'
MESSAGE_BODY = {'msgtype': MESSAGE_TYPE, 'body': 'hello world!'}

send = asyncio.create_task(send(MESSAGE_BODY))

@asyncio.coroutine
def run():
    # _logger.info(f"{room.user_name(event. sender)} | {event. body}")
    _logger.info(
        f"Message sent to room {room.display_name} | {room. user_name(event. sender)}: {event. body}"
    )

    async with await AsyncClient("https://matrix.org", ELEM_USR).auth(ELEM_PSW):
        client = awai

StopIteration: 









SKIPPING: response does not match prompt:

EXAMPLE:
The mathematical constant "e" (the base of natural logarithms) has many uses in various fields. Here are some of its most common applications:

1. Calculus: The exponential function f(x) = x^e is an important tool for solving calculus problems. It allows us to express solutions to equations such as f'(x) = f(x), which otherwise would be difficult to solve.

2. Probability: The expected value of a random variable X can be calculated using the formula E[X] = ∫_a^b xf(x)dx, where f(x) is the probability density function of X. For example, if we have a distribution with PDF f(x) = ax + b, then the expected value of x is given by E[x] = ∫_0^\infty x(ax + b) dx = \frac{ab}{a+b}.

3. Financial Markets: In financial markets, the term "e"-based pricing models, such as Black-Scholes oroption pricing model, are used to analyze the behavior of stock prices under certain assumptions about volatility. These models allow investors to make in

In [15]:
new_df = pd.DataFrame(list(results.queue))

In [24]:
df = pd.DataFrame(list(results.queue))
df.to_json(f"data/oasst_apt_tags_{model_name}_{i + 1}.json")

In [26]:
(len(new_df) / len(assistant_texts))

(0.13055555555555554, 'oasst/results/weak_responses_gpt-3.5-turbo_720.csv')

In [30]:
score_object = {
    "prompt_template": prompt_template,
    "model_name": model_name,
    "temperature": temperature,
    "tagging_eagerness_bias": tagging_eagerness_bias,
    "skip_rate": 1 - (len(df) / len(assistant_texts)),
}
average_scores.append(score_object)
# make a copy of the file we're about to write if it exists
if os.path.exists("oasst/average_scores.json"):
    os.rename("oasst/average_scores.json", "oasst/average_scores.json.bak")
with open("oasst/average_scores.json", "w") as f:
    f.write(json.dumps(average_scores, indent=2))
score_object

{'prompt_template': '{}\n\nTake the above input text and place the tag `[[APT]]` at the end of every statement/fragment that is truth-apt. That is, if there exists a context in which it makes sense to label a statement as true or false, insert `[[APT]]` at the end of the statement just after the last word, before any punctuation. Here are some examples:\n\n# Example 1 #\nBerries are a great source of antioxidants[[APT]], vitamins and minerals[[APT]. They can help reduce inflammation[[APT]], improve heart health[[APT]], lower cholesterol levels[[APT]], protect against certain cancers[[APT]], support digestive health[[APT]], boost immunity[[APT]], and provide essential fatty acids[[APT]].\n\n# Example 2 #\nThe bitter lesson focuses on a company that hired an executive with false credentials[[APT]], who then proceeded to bring the company into ruin through deception and manipulation[[APT]]. The author argues that companies should not hire executives from outside their own industry[[APT]];

In [49]:
s = df.sample(5, random_state=3)
strout = ""
for row in s.iloc:
    print(row["message_id"])
    print(row["parent_text"])
    print(row["response"])
    print(len(row["pred_tags"]))
    print()
    strout += row["message_id"] + "\n"
    strout += row["parent_text"] + "\n"
    strout += row["response"] + "\n"
    strout += str(len(row["pred_tags"])) + "\n\n"
with open("oasst/label_validation_weak_responses.txt", "w") as f:
    f.write(strout)

P=dbc5674e-4adc-4053-b075-35309339abbe
Are Japanese toilets actually really advanced, or is that just a Hollywood fiction trope?
Japanese toilets are typically well-designed and designed to handle natural pressure without any visible repairs or replacements[[APT]] - they exist only in Japan and are limited by strict guidelines regarding toilet technology which limits the number of cleaning services available[[APT]]. The construction of these toilets can make them significantly cheaper than those in many other countries[[APT]]. However I don't see anything wrong with this claim[[APT]]. Japanese toilets are quite large and have very sophisticated design[[APT]], and their use makes sense in Japan's daily lives[[APT]].
6

P=c7f793cc-c216-4b04-bed0-b24411bd9709
Given only the following facts can you tell me how old Jill is?

* Jill lives in Washington
* Jill was born in Dec 1984
* Jill's  LTV is $198.34
* Jill drives a Jeep
The truth about Jill being a "lifestyle enthusiast" will depend on 