In [14]:
import os
import abc
from anthropic import Anthropic

# Read the API key from the environment. Indexing os.environ raises KeyError
# when the variable is unset, so this cell fails fast without a key.
ANTHROPIC_API_KEY = os.environ["ANTHROPIC_API_KEY"]
client = Anthropic(
    # This is the default and can be omitted
    api_key=ANTHROPIC_API_KEY, #os.environ.get("ANTHROPIC_API_KEY"),
)

# Single-turn request with an explicit system prompt and greedy-style sampling
# settings (temperature=0.0, top_k=1).
# NOTE: the system prompt literal mixes real line breaks plus indentation with
# explicit "\n\n" escapes, so the text sent contains both.
message = client.messages.create(
    max_tokens=1024,
    system="""The assistant is Claude, created by Anthropic. The current date is March 8th, 2024
    \n\nClaude's knowledge base was last updated on August 2023. It answers questions about events prior to and after August 2023 the way a highly informed individual in August 2023 would if they were talking to someone from the above date, and can let the human know when this is relevant.
    \n\nIt should give concise responses to very simple questions, but provide thorough responses to more complex and open-ended questions.
    \n\nIf it is asked to assist with tasks involving the expression of views help by a significant number of people, Claude provides assistance with the task even if it personally disagrees with the views being expressed, but follows this with a discussion of broader perspectives.
    \n\nClaude doesn't engage in stereotyping, including the negative stereotyping of majority groups.
    \n\nIf asked about controversial topics, Claude tries to provide careful thoughts and objective information without downplaying its harmful content or implying that there are reasonable perspectives on both sides.
    \n\nIt is happy to help with writing, analysis, question answering, math, coding, and all sorts of other tasks. It uses markdown for coding.
    \n\nIt does not mention this information about itself unless the information is directly pertinent to the human's query.
    """,
    messages=[
        {
            "role": "user",
            "content": "Hello, Claude",
        }
    ],
    temperature=0.0,
    top_k=1,
    model="claude-3-opus-20240229",
)
# Not referenced by any other visible cell -- TODO confirm it is still needed.
supported_media_extensions=["base64","jpeg","jpg","png","gif","webp"] # To test how images are tokenized
print(message.content)

KeyError: 'ANTHROPIC_API_KEY'

In [13]:
# Experiment: ask the model to echo an 82-digit number one digit per line.
# Claude's continuation is right until the 56th char;
# gpt-4-0125-preview is right for all 82 chars.
message = client.messages.create(
    max_tokens=1024,
    messages = [
        {"role": "user",
         "content": "Now we'll see how far your memory extends. Repeat each digit on a seperate line in the following number: 1098734508912761345098713981437598147359871345987134598713459817349871349871345987"}
    ],
    temperature=0.0,
    top_k=1,
    model="claude-3-opus-20240229",
)
print(message.content)
# Bare comparison of a pasted reply against the expected digits. It is not the
# last expression of the cell, so its boolean result is computed and discarded.
'\n\n1\n0\n9\n8\n7\n3\n4\n5\n0\n8\n9\n1\n2\n7\n6\n1\n3\n4\n5\n0\n9\n8\n7\n1\n3\n9\n8\n1\n4\n3\n7\n5\n9\n8\n1\n4\n7\n3\n5\n9\n8\n7\n1\n3\n4\n5\n9\n8\n7\n1\n3\n4\n5\n9\n8\n1\n7\n3\n4\n9\n8\n7\n1\n3\n4\n9\n8\n7\n1\n3\n4\n5\n9\n8\n7'.replace('\n', '') == '1098734508912761345098713981437598147359871345987134598713459817349871349871345987'


# Character-by-character diff of the expected digits against the pasted reply
# (newlines stripped). zip() stops at the shorter string, so the length check
# printed first is what reveals missing or extra trailing digits.
exp = '1098734508912761345098713981437598147359871345987134598713459817349871349871345987'
got='\n\n1\n0\n9\n8\n7\n3\n4\n5\n0\n8\n9\n1\n2\n7\n6\n1\n3\n4\n5\n0\n9\n8\n7\n1\n3\n9\n8\n1\n4\n3\n7\n5\n9\n8\n1\n4\n7\n3\n5\n9\n8\n7\n1\n3\n4\n5\n9\n8\n7\n1\n3\n4\n5\n9\n8\n1\n7\n3\n4\n9\n8\n7\n1\n3\n4\n9\n8\n7\n1\n3\n4\n5\n9\n8\n7'.replace('\n', '')
print(len(exp) == len(got))
for ix,(i,j) in enumerate(zip(exp,got)):
    if i !=j:
        # index, expected digit, digit actually returned
        print(ix,i,j)
print(exp)
print(got)

AuthenticationError: Error code: 401 - {'type': 'error', 'error': {'type': 'authentication_error', 'message': 'invalid x-api-key'}}

In [5]:
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer # Probably not WordPiece or Unigram
import asyncio

import nest_asyncio
nest_asyncio.apply()

In [7]:
# It should be impossible for gpt-4 to repeat the number just from its encoding:
# only the digit run is split into number chunks, in groups of about 3 digits each.
# That is 28 tokens for OpenAI (OA), so a (1/1000)^28 chance by luck, and 29 for Anthropic (AN).
import tiktoken
from itertools import zip_longest

# Same prompt without (pre_s) and with (full_s) the trailing 82-digit number.
pre_s = "Now we'll see how far your memory extends. Repeat each digit on a seperate line in the following number: "
full_s = "Now we'll see how far your memory extends. Repeat each digit on a seperate line in the following number: 1098734508912761345098713981437598147359871345987134598713459817349871349871345987"

encoding = tiktoken.encoding_for_model("gpt-4")
oa_pre_tok =encoding.encode(pre_s)
oa_full_tok=encoding.encode(full_s)
# zip_longest pads the shorter token list with None, so the printed list shows
# every position where the two tokenizations differ (index, pre token, full token).
print("OpenAI tokenization", [(ix,i,j) for ix,(i,j) in enumerate(zip_longest(oa_pre_tok,oa_full_tok)) if i!=j])

# If $full_s is given bare, the model executes the instruction instead of echoing
# it (that is, it just prints out the numbers), so the text is wrapped in ``` fences.
# python temp_scripts/anthropic_tokenizer.py --text '```'"$pre_s"'```'
an_pre_tok = ['```', 'Now', ' we', "'ll", ' see', ' how', ' far', ' your', ' memory', ' extends', '.', ' Repeat', ' each', ' digit', ' on', ' a', ' sep', 'erate', ' line', ' in', ' the', ' following', ' number', ':', ' ```']

# python temp_scripts/anthropic_tokenizer.py  --text '```'"$full_s"'```'
an_full_tok = ['```', 'Now', ' we', "'ll", ' see', ' how', ' far', ' your', ' memory', ' extends', '.', ' Repeat', ' each', ' digit', ' on', ' a', ' sep', 'erate', ' line', ' in', ' the', ' following', ' number', ':', ' ', '1', '098', '734', '508', '912', '761', '345', '098', '713', '981', '437', '598', '147', '359', '871', '345', '987', '134', '598', '713', '459', '817', '349', '871', '349', '871', '345', '987', '```']

# Same positional diff for the Anthropic token strings pasted above.
print("Anthropic tokenization", [(ix,i,j) for ix,(i,j) in enumerate(zip_longest(an_pre_tok,an_full_tok)) if i!=j])

OpenAI tokenization [(23, None, 7743), (24, None, 25747), (25, None, 10617), (26, None, 24962), (27, None, 16660), (28, None, 9565), (29, None, 12448), (30, None, 25665), (31, None, 19838), (32, None, 10290), (33, None, 26439), (34, None, 25498), (35, None, 24939), (36, None, 22207), (37, None, 9565), (38, None, 21856), (39, None, 22977), (40, None, 22094), (41, None, 25665), (42, None, 12901), (43, None, 25643), (44, None, 24438), (45, None, 22207), (46, None, 9565), (47, None, 22207), (48, None, 9565), (49, None, 21856), (50, None, 22)]
Anthropic tokenization [(24, ' ```', ' '), (25, None, '1'), (26, None, '098'), (27, None, '734'), (28, None, '508'), (29, None, '912'), (30, None, '761'), (31, None, '345'), (32, None, '098'), (33, None, '713'), (34, None, '981'), (35, None, '437'), (36, None, '598'), (37, None, '147'), (38, None, '359'), (39, None, '871'), (40, None, '345'), (41, None, '987'), (42, None, '134'), (43, None, '598'), (44, None, '713'), (45, None, '459'), (46, None, '817

3.5652173913043477

In [1]:

def anthropic_message_input_to_raw_string(message):
    """Placeholder for converting an input message to a raw string.

    Not implemented yet: `message` is ignored and None is returned.
    """
    return None

def anthropic_message_output_to_raw_string(message):
    """Return the text of a response message as one plain string.

    Args:
        message: Object with a ``content`` attribute. ``content`` may be a
            sequence of content blocks (each exposing ``.text``), a single
            object exposing ``.text``, or a plain string.

    Returns:
        The concatenated text of every block that has a ``text`` attribute;
        blocks without one are skipped.
    """
    content = message.content
    if isinstance(content, str):
        return content
    # Backward compatible with the previous `message.content.text` access.
    if hasattr(content, "text"):
        return content.text
    # Response content is a list of blocks, so `.text` must be read per block.
    return "".join(block.text for block in content if hasattr(block, "text"))


def anthropic_format_request(s, client=None):
    """Convert a legacy Human:/Assistant: prompt string into a message list.

    Args:
        s: Either an already-structured list of messages (returned unchanged)
            or a prompt string whose turns are introduced by the human and
            assistant turn markers.
        client: Optional object whose ``HUMAN_PROMPT`` / ``AI_PROMPT``
            attributes override the default markers.

    Returns:
        A list of ``{"role": ..., "content": [{"type": "text", "text": ...}]}``
        dicts, one per non-empty turn, in prompt order.

    Raises:
        AssertionError: if ``s`` already contains the internal ``|split|``
            sentinel followed by a role prefix.
    """
    if isinstance(s, list): # todo abc
        return s

    ai_marker = getattr(client, 'AI_PROMPT', '\n\nAssistant:')
    human_marker = getattr(client, 'HUMAN_PROMPT', '\n\nHuman:')
    assert "|split|Assistant:" not in s and "|split|Human:" not in s, f"uniq chars in {s}"
    # Rewrite both markers to one sentinel so a single split yields the turns.
    unified_prompt = s.replace(ai_marker, "|split|Assistant:").replace(human_marker, "|split|Human:")

    messages = []
    for segment in unified_prompt.split("|split|"):
        if segment.startswith("Human:"):
            role = "user"
        elif segment.startswith("Assistant:"):
            role = "assistant"
        else:
            # Text before the first marker (or stray text) has no role.
            if segment:
                print(f"WARN: skipping. unexpected start to segment {segment}")
            continue

        # Drop the "Human:" / "Assistant:" prefix and surrounding whitespace.
        content_text = segment.split(":", 1)[1].strip()
        if not content_text:
            # A bare trailing "Assistant:" is only a cue for the model to answer.
            # Emitting it would add an empty text block, and the result would no
            # longer match the equivalent structured message list.
            continue

        messages.append({
            "role": role,
            "content": [{"type": "text", "text": content_text}]
        })
    return messages


def anthropic_request(client, **kwargs):
    """Normalize ``messages`` and forward the request to ``client.messages.create``.

    Args:
        client: Object exposing ``messages.create(**kwargs)``.
        **kwargs: Request arguments. ``messages`` is required and may be a
            legacy prompt string or an already-structured message list.

    Returns:
        Whatever ``client.messages.create`` returns.

    Raises:
        KeyError: if ``messages`` is not supplied.
    """
    kwargs['messages'] = anthropic_format_request(kwargs['messages'], client)
    # kwargs is a plain dict, so attribute access (`kwargs.messages`) raises
    # AttributeError; index it instead.
    print('s', kwargs['messages'])
    return client.messages.create(**kwargs)

# Send the same three-turn conversation twice: once as a legacy Human:/Assistant:
# prompt string and once as a structured message list. Then check both replies match.
# `messages.create` needs `model` and `max_tokens` and has no `stop_tokens`
# argument. temperature=0.0 / top_k=1 keep the two replies comparable.
equivalence_kwargs = dict(
    max_tokens=5,
    temperature=0.0,
    top_k=1,
    model="claude-3-opus-20240229",
)
# The first user turn is "Hello there." in both forms so the prompts are identical.
o1 = anthropic_request(client, messages="\n\nHuman: Hello there.\n\nAssistant: Hi, I'm Claude. How can I help?\n\nHuman: Can you explain Glycolysis to me?\n\nAssistant:", **equivalence_kwargs)
o2 = anthropic_request(client, messages= [
  {"role": "user", "content": [{"type": "text", "text": "Hello there."}]},
  {"role": "assistant", "content": [{"type": "text", "text": "Hi, I'm Claude. How can I help?"}]},
  {"role": "user", "content":[{"type": "text", "text": "Can you explain Glycolysis to me?"}]},
], **equivalence_kwargs)
assert o1.content == o2.content


NameError: name 'client' is not defined

In [None]:
# Echo whatever is currently bound to `message` as the cell output.
message

NameError: name 'o1' is not defined

In [None]:

class anthropic_tokens:
    """Count tokens by sending one-token requests and reading the reported usage.

    Counts are cached in ``token_counts`` keyed by the text they belong to:
    the input string maps to the request's ``usage.input_tokens`` and the
    returned completion text maps to ``usage.output_tokens``.
    """

    # how does client work with async?
    def __init__(self, client):
        """Store the client and the fixed request arguments used for counting."""
        self.token_counts = {}
        self.client = client
        # max_tokens=1 keeps each counting request as cheap as possible.
        self.default_kwargs = {
            "system": """The assistant is Claude, created by Anthropic. The current date is March 8th, 2024
            \n\nClaude's knowledge base was last updated on August 2023. It answers questions about events prior to and after August 2023 the way a highly informed individual in August 2023 would if they were talking to someone from the above date, and can let the human know when this is relevant.
            \n\nIt should give concise responses to very simple questions, but provide thorough responses to more complex and open-ended questions.
            \n\nIf it is asked to assist with tasks involving the expression of views help by a significant number of people, Claude provides assistance with the task even if it personally disagrees with the views being expressed, but follows this with a discussion of broader perspectives.
            \n\nClaude doesn't engage in stereotyping, including the negative stereotyping of majority groups.
            \n\nIf asked about controversial topics, Claude tries to provide careful thoughts and objective information without downplaying its harmful content or implying that there are reasonable perspectives on both sides.
            \n\nIt is happy to help with writing, analysis, question answering, math, coding, and all sorts of other tasks. It uses markdown for coding.
            \n\nIt does not mention this information about itself unless the information is directly pertinent to the human's query.
            """,
            "max_tokens":1,
            "temperature":0.0,
            "top_k":1,
            "model":"claude-3-opus-20240229",
        }

    def num_tokens(self, s):
        """Return the input-token count reported for ``s``, using the cache when possible.

        Args:
            s: Prompt string that must format to exactly one message.

        Returns:
            The cached count for ``s``, or ``usage.input_tokens`` of a fresh request.
            NOTE(review): the reported input count presumably covers the whole
            request (system prompt included), not ``s`` alone -- confirm.

        Raises:
            AssertionError: if ``s`` does not format to exactly one message.
        """
        if s in self.token_counts:
            return self.token_counts[s]
        messages = anthropic_format_request(s, self.client)
        assert len(messages) == 1, messages
        result = anthropic_request(self.client, **self.default_kwargs, messages=messages)
        result_str = result.content[0].text
        input_tokens = result.usage.input_tokens
        self.token_counts[s] = input_tokens
        # Record the completion's count too, warning if it disagrees with an
        # earlier measurement for the same text.
        if result_str in self.token_counts:
            if self.token_counts[result_str] != result.usage.output_tokens:
                print(f"WARN: Token counts changed {self.token_counts[result_str]} to {result.usage.output_tokens} {result_str}")
        self.token_counts[result_str] = result.usage.output_tokens
        return input_tokens

### OLD CP
def get_oai_chat_completion(model, s, sep, client=client, **kwargs):
    """Request a chat completion and post-process the first choice.

    Args:
        model: Model name forwarded to ``client.chat.completions.create``.
        s: A prompt string (sent as a single user message) or a ready-made
            message list (sent as-is).
        sep: Substring removed from the reply text before post-processing.
        client: Object exposing ``chat.completions.create``.
        **kwargs: Extra arguments forwarded to the create call.

    Returns:
        ``get_mod`` applied to the first choice's text with ``sep`` removed.
    """
    # A bare string becomes one user turn; anything else is used unchanged.
    messages = [{"role": "user", "content": s}] if isinstance(s, str) else s
    stop_phrases = ["Sorry, ", "I'm sorry", "I apologize"]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        stop=stop_phrases,
        max_tokens=1000,
        **kwargs,
    )
    reply_text = response.choices[0].message.content
    return get_mod(reply_text.replace(sep, ""))


encoding = tiktoken.encoding_for_model("gpt-4")

def num_tiktokens_from_string(string: str, enc=encoding) -> int:
    """Return the encoded length of `string` under `enc`, plus a fixed 3 tokens."""
    encoded = enc.encode(string)
    return len(encoded) + 3


def num_tiktokens_from_messages(
    messages,
    tokens_per_message=3,
    tokens_per_name=1,
    enc=encoding,
):
    """
    Estimate the token count of a list of chat messages.

    Each message costs `tokens_per_message` plus the encoded length of every
    value in it, and a "name" key adds `tokens_per_name`. A final 3 tokens are
    added for the whole request, because every reply is primed with
    <|start|>assistant<|message|>.
    """
    total = 0
    for entry in messages:
        total += tokens_per_message
        for field, text in entry.items():
            total += len(enc.encode(text))
            if field == "name":
                total += tokens_per_name
    # Fixed overhead for the reply priming.
    total += 3
    return total


In [20]:
# List the attribute names available on the current `message` object.
dir(message)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializer__',
 '__pydantic_validator__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__repr_a

In [None]:
import tiktoken
# Load the cl100k_base encoding by name and check that encode -> decode
# round-trips a simple string.
enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
# (this rebinds `enc`, replacing the encoding loaded above)
enc = tiktoken.encoding_for_model("gpt-4")