In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [None]:
import json
import tiktoken  # for token counting
import numpy as np
from collections import defaultdict
from google.colab import files

# Upload the JSON file
uploaded = files.upload()
if not uploaded:
    # Nothing to iterate over (e.g. the upload dialog was dismissed): fail with
    # a readable message instead of the bare StopIteration that next() raises.
    raise ValueError("No file was uploaded")
json_file_path = next(iter(uploaded))  # Name of the first uploaded file

# Read the JSON file content.
# JSON text is UTF-8; decode it explicitly rather than relying on the locale default.
with open(json_file_path, "r", encoding="utf-8") as file:
    json_data = file.read()

# Load the dataset
dataset = json.loads(json_data)
if not isinstance(dataset, list) or not dataset:
    # Everything below indexes and iterates `dataset` as a list of examples.
    raise ValueError("Expected the uploaded file to contain a non-empty JSON array of examples")

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
# This is only a preview: tolerate a malformed first example here so that the
# format checks that follow get the chance to report it properly.
first_example = dataset[0]
first_messages = first_example.get("messages") if isinstance(first_example, dict) else None
for message in first_messages or []:
    print(message)

# Format error checks
#
# Walks every example once, counting each kind of problem in `format_errors`
# and echoing the offending example as it is found. Examples with missing
# keys, unrecognized keys or no assistant message are additionally collected
# so they can be listed again in the summary at the end.
format_errors = defaultdict(int)
unrecognized_roles = set()
missing_key_examples = []
unrecognized_key_examples = []
missing_assistant_message_examples = []

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        print("Error: Not a dictionary object")
        print(ex)
        continue

    messages = ex.get("messages", None)
    if not messages:
        # Covers both an absent key and an empty/None value.
        format_errors["missing_messages_list"] += 1
        print("Error: Missing 'messages' key")
        print(ex)
        continue

    for message in messages:
        if not isinstance(message, dict):
            # The checks below call message.get(); a non-dict entry would crash
            # the validator with AttributeError instead of being reported.
            format_errors["message_data_type"] += 1
            print("Error: Message is not a dictionary object")
            print(ex)
            continue

        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
            missing_key_examples.append(ex)
            print("Error: Missing 'role' or 'content' key")
            print(ex)

        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
            unrecognized_key_examples.append(ex)
            print("Error: Unrecognized key")
            print(ex)

        role = message.get("role", None)
        if role not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            unrecognized_roles.add(role)
            print("Error: Unrecognized role")
            print(ex)

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # NOTE(review): the isinstance check means a message carrying only a
        # function_call (content None) is still counted here - confirm intended.
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
            print("Error: Missing or invalid 'content'")
            print(ex)

    # Non-dict entries were already reported above; skip them here so .get() is safe.
    if not any(
        isinstance(message, dict) and message.get("role", None) == "assistant"
        for message in messages
    ):
        format_errors["example_missing_assistant_message"] += 1
        missing_assistant_message_examples.append(ex)
        print("Error: Missing assistant message")
        print(ex)

# Summary of everything found
if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
    if unrecognized_roles:
        print("Unrecognized roles:", unrecognized_roles)
    if missing_key_examples:
        print("Examples with missing keys:")
        for ex in missing_key_examples:
            print(ex)
    if unrecognized_key_examples:
        print("Examples with unrecognized keys:")
        for ex in unrecognized_key_examples:
            print(ex)
    if missing_assistant_message_examples:
        print("Examples missing assistant message:")
        for ex in missing_assistant_message_examples:
            print(ex)
else:
    print("No errors found")

# Token Counting Utilities
# Module-level tokenizer shared by the token-counting helpers below, which
# call encoding.encode() on message text and use the length of the result.
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the total number of tokens in one conversation.

    Args:
        messages: Iterable of message dicts (keys such as "role", "content",
            "name").
        tokens_per_message: Fixed overhead added once for every message.
        tokens_per_name: Extra overhead added when a message has a "name" key.

    Returns:
        int: Per-message overhead, plus the encoded length of every string
        value in every message, plus a fixed 3-token allowance added once per
        conversation (presumably reply priming - confirm against the OpenAI
        token-counting guide).
    """
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            # Only text can be tokenized. The format checks accept keys whose
            # values are not strings (a numeric "weight", a "function_call"
            # mapping, a None "content"); passing those to encoding.encode()
            # would raise, so they contribute no tokens to the estimate.
            if isinstance(value, str):
                num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    """Count the tokens in the text content of the assistant messages.

    Args:
        messages: Iterable of message dicts.

    Returns:
        int: Sum of the encoded lengths of the string "content" of every
        message whose "role" is "assistant". Messages without a role, and
        assistant messages whose content is missing or not a string, add 0.
    """
    num_tokens = 0
    for message in messages:
        # .get() so that a message the format checks merely reported (missing
        # "role"/"content") does not abort the statistics with a KeyError.
        if message.get("role") != "assistant":
            continue
        content = message.get("content")
        if isinstance(content, str):
            num_tokens += len(encoding.encode(content))
    return num_tokens

def print_distribution(values, name):
    """Print summary statistics for a sequence of numbers.

    Prints a header containing ``name``, then min/max, mean/median and the
    5th/95th percentiles of ``values``.

    Args:
        values: Sized collection of numbers.
        name: Label shown in the header line.

    Returns:
        None. Output goes to stdout.
    """
    print(f"\n#### Distribution of {name}:")
    if len(values) == 0:
        # min()/max() raise on an empty sequence; report it instead.
        print("no values")
        return
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The label promises the 5th and 95th percentiles, so ask for those
    # quantiles (previously 0.1 and 0.9 were computed under this label).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

# Warnings and tokens counts
#
# One pass over the dataset collecting, per example: whether a system / user
# message is present, the number of messages, the estimated total tokens and
# the tokens in assistant messages.
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    # `or []` covers an explicit null "messages" value as well as a missing key.
    messages = ex.get("messages") or []
    # .get("role") mirrors the format checks: a message without a role simply
    # does not match, instead of aborting the whole pass with a KeyError.
    if not any(message.get("role") == "system" for message in messages):
        n_missing_system += 1
    if not any(message.get("role") == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
# 4096 is the same per-example cap the pricing estimate applies further down.
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096  # tokens beyond this are not counted for billing below

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(dataset)
examples_seen_at_target = n_train_examples * TARGET_EPOCHS

# Keep the number of examples seen across all epochs within
# [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES], bounded by the epoch limits.
if examples_seen_at_target < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_seen_at_target > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example is billed for at most MAX_TOKENS_PER_EXAMPLE tokens.
capped_convo_lens = [min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens]
n_billing_tokens_in_dataset = sum(capped_convo_lens)

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Saving results1OAI.json to results1OAI (4).json
Num examples: 1333
First example:
{'role': 'system', 'content': 'You are an educational chatbot designed to interact with and support students in their learning, with a focus on personalized educational strategies.'}
{'role': 'user', 'content': "My name is Alex. Use my name when talking to me instead of calling me 'hey' or 'you'?"}
{'role': 'assistant', 'content': "Sure, Alex! I'll make sure to address you by your name."}
{'role': 'user', 'content': 'How can you personalize the conversation based on my learning style?'}
{'role': 'assistant', 'content': 'To tailor our conversations to your learning style, I need information from you. Would you be willing to share your preferred learning strategies, interests, and areas where you need additional support? This will enable me to adapt my responses and suggest activities that resonate with you.'}
No errors found
Num examples missing system message: 0
Num examples missing user message: 0

#### 