In [6]:
from openai import OpenAI
import tiktoken  # OpenAI's tokenizer

# Demo cell: tokenize a sample string with tiktoken, request an embedding for
# the same string from the OpenAI API, then decode the token IDs back to text.

# API client, constructed with no explicit arguments.
client = OpenAI()

# The cl100k_base encoding (the one used for GPT-4 and GPT-3.5).
tokenizer = tiktoken.get_encoding("cl100k_base")

# Sample string shared by the tokenizer round trip and the embedding request.
text = "Hello, world!"

# Step 1 - encode: text -> list of integer token IDs.
token_ids = tokenizer.encode(text)
print("Encoded Token IDs:", token_ids)  # e.g. [9906, 11, 1917, 0]

# Step 2 - embed: the API receives the raw text (as a one-element batch),
# not the token IDs computed above.
embedding_request = {
    # "text-embedding-3-large" is the higher-quality alternative.
    "model": "text-embedding-3-small",
    "input": [text],
    # Optional; 1536 is already the default size for text-embedding-3-small.
    "dimensions": 1536,
}
response = client.embeddings.create(**embedding_request)

# One input was sent, so the vector of interest is the first result.
embedding = response.data[0].embedding
print("🔹 Embeddings (First 5 values):", embedding[:5])  # truncated for brevity

# Step 3 - decode: token IDs -> original text.
decoded_text = tokenizer.decode(token_ids)
print("🔹 Decoded Text:", decoded_text)  # "Hello, world!"


Encoded Token IDs: [9906, 11, 1917, 0]
🔹 Embeddings (First 5 values): [-0.019184619188308716, -0.025279032066464424, -0.0017195191467180848, 0.01884828321635723, -0.033795066177845]
🔹 Decoded Text: Hello, world!
