# ChatGPT fine-tuning

In [3]:
from openai import OpenAI
import os
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

## Create training data file from logged messages

In [4]:
input_data_path = "ft_examples"
output_data_path = "ft_examples/training_file_with_weights.jsonl"

input_files = [os.path.join(input_data_path, filename) for filename in os.listdir(input_data_path) if "messages" in filename]

# Load the dataset
dataset = []
for filename in input_files:
    print(filename)
    with open(filename, 'r', encoding='utf-8') as f:
        dataset.append(json.load(f))

print("Num examples:", len(dataset))

# Fix format
for ex in dataset:
    messages = ex.get("messages", None)
    last_message = {"role" : "", "content" : ""}
    for message in messages:
        # remove arguments from functions (they are not allowed in GPT prompt)
        if "arguments" in message:
            del message["arguments"] 
        # set weights to 1 only for messages interpreting function responses (typically offering products from vector DB)
        if last_message["role"] == "function" and message["role"] == "assistant":
            message["weight"] = 1
        elif message["role"] == "assistant":
            message["weight"] = 0
        last_message = message
    functions = ex.get("functions", None)
    for fc in functions:
        if fc["parameters"] is None:
            fc["parameters"] = {}
data = {}
with open(output_data_path, "w", encoding = "UTF-8") as f:
    for ex in dataset:
        json.dump(ex, f, ensure_ascii=False) 
        f.write('\n')


ft_examples/messages_129.json
ft_examples/messages_105.json
ft_examples/messages_113.json
ft_examples/messages_125.json
ft_examples/messages_46.json
ft_examples/messages_109.json
ft_examples/messages_108.json
ft_examples/messages_47.json
ft_examples/messages_124.json
ft_examples/messages_112.json
ft_examples/messages_104.json
ft_examples/messages_128.json
ft_examples/messages_123.json
ft_examples/messages_40.json
ft_examples/messages_119.json
ft_examples/messages_103.json
ft_examples/messages_115.json
ft_examples/messages_114.json
ft_examples/messages_102.json
ft_examples/messages_118.json
ft_examples/messages_41.json
ft_examples/messages_122.json
ft_examples/messages_42.json
ft_examples/messages_121.json
ft_examples/messages_117.json
ft_examples/messages_101.json
ft_examples/messages_100.json
ft_examples/messages_116.json
ft_examples/messages_120.json
ft_examples/messages_43.json
ft_examples/messages_111.json
ft_examples/messages_48.json
ft_examples/messages_107.json
ft_examples/messa

## Format validation

We can perform a variety of error checks to validate that each conversation in the dataset adheres to the format expected by the fine-tuning API. Errors are categorized based on their nature for easier debugging.

1. **Data Type Check**: Checks whether each entry in the dataset is a dictionary (`dict`). Error type: `data_type`.
2. **Presence of Message List**: Checks if a `messages` list is present in each entry. Error type: `missing_messages_list`.
3. **Message Keys Check**: Validates that each message in the `messages` list contains the keys `role` and `content`. Error type: `message_missing_key`.
4. **Unrecognized Keys in Messages**: Logs if a message has keys other than `role`, `content`, `weight`, `function_call`, and `name`. Error type: `message_unrecognized_key`.
5. **Role Validation**: Ensures the `role` is one of "system", "user", or "assistant". Error type: `unrecognized_role`.
6. **Content Validation**: Verifies that `content` has textual data and is a string. Error type: `missing_content`.
7. **Assistant Message Presence**: Checks that each conversation has at least one message from the assistant. Error type: `example_missing_assistant_message`.

The code below performs these checks, and outputs counts for each type of error found are printed. This is useful for debugging and ensuring the dataset is ready for the next steps.


In [8]:
data_path = output_data_path

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]
  #  dataset = json.load(f)

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)
    
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
            
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            print(k)
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")


Num examples: 40
First example:
{'role': 'system', 'content': 'You are a Bakery Salesman. Help user buy bakery goods. Introduce yourself and ask user how you can help. Komunikuj v češtině.'}
{'role': 'user', 'content': 'Dobrý den'}
{'role': 'assistant', 'content': 'Dobrý den, vítejte! Jak vám mohu pomoci dnes ohledně našich pečivových výrobků? Chcete si něco vybrat nebo potřebujete poradit s výběrem?', 'weight': 0}
{'role': 'user', 'content': 'Dobrý den, rád bych 3 kousky klasických makových koláčků, prosím. Děkuji.'}
{'role': 'function', 'name': 'add_item_to_cart', 'content': "Wrong item name. We can offer these similar items: ['Špička s náplní makovou 2ks á 110g', 'Koláč šáteček 75g', 'Špička s náplní makovou a tvarohovou 2ks á110g', 'Vánočka mandlová 400g']. "}
{'role': 'assistant', 'content': 'Omlouvám se, ale nemáme přesně "klasické makové koláčky". Máme však několik podobných možností, například "Špička s náplní makovou 2ks á 110g", "Koláč šáteček 75g" nebo "Špička s náplní makov

## Data Warnings and Token Counts 

With some lightweight analysis we can identify potential issues in the dataset, like missing messages, and provide statistical insights into message and token counts.

1. **Missing System/User Messages**: Counts the number of conversations missing a "system" or "user" message. Such messages are critical for defining the assistant's behavior and initiating the conversation.
2. **Number of Messages Per Example**: Summarizes the distribution of the number of messages in each conversation, providing insight into dialogue complexity.
3. **Total Tokens Per Example**: Calculates and summarizes the distribution of the total number of tokens in each conversation. Important for understanding fine-tuning costs.
4. **Tokens in Assistant's Messages**: Calculates the number of tokens in the assistant's messages per conversation and summarizes this distribution. Useful for understanding the assistant's verbosity.
5. **Token Limit Warnings**: Checks if any examples exceed the maximum token limit (4096 tokens), as such examples will be truncated during fine-tuning, potentially resulting in data loss.



In [22]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            if key != "weight":
                num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [23]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 10, 29
mean / median: 14.3, 14.0
p5 / p95: 12.0, 18.0

#### Distribution of num_total_tokens_per_example:
min / max: 418, 1280
mean / median: 661.95, 643.0
p5 / p95: 444.2, 874.0000000000002

#### Distribution of num_assistant_tokens_per_example:
min / max: 208, 703
mean / median: 364.05, 347.0
p5 / p95: 240.8, 468.6000000000001

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


## Cost Estimation

In this final section, we estimate the total number of tokens that will be used for fine-tuning, which allows us to approximate the cost. It is worth noting that the duration of the fine-tuning jobs will also increase with the token count. 

In [24]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~26478 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~79434 tokens


## Fine-tuning

In [None]:
# Upload training file
client = OpenAI()

fo = client.files.create(
  file=open("training_file_with_weights.jsonl", "rb"),
  purpose="fine-tune"
)

In [None]:
# Create fine-tuning job
client.fine_tuning.jobs.create(
  training_file=fo.id, 
  model="gpt-3.5-turbo-0125",
  suffix="baker_weights_e2",
  hyperparameters = {
      "n_epochs": 2,
  }
)

## List of fine-tuned models

    - "ft:gpt-3.5-turbo-0125:born-digital-s-r-o:baker-weights-e2:976VKvlB"
        - train file "ft_examples/training_file_with_weights.jsonl"
        - 2 epochs
        - fine-tuned only on respomses following function calls (typically offering products from vector DB)
        
    - "ft:gpt-3.5-turbo-0125:born-digital-s-r-o:baker2-e2:96vWpkfj",
        - train file "ft_examples/training_file.jsonl
        - 2 epochs
        - fine-tuned on all assistant responses
        - does not work well (especially function calling)