<a href="https://colab.research.google.com/github/AdithyaV7/AstroTales/blob/main/FineTune_GPT3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy
!pip install tiktoken
import json
import os
import numpy as np
from collections import defaultdict
import tiktoken



In [None]:
# Data-preparation steps adapted from the OpenAI fine-tuning guide: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

# Path of the JSONL training file; this cell only records the path, the next cell opens and parses it

data_path = '/content/sample_data/data/test1.jsonl'


In [None]:

# Load dataset
# The file is read as UTF-8 explicitly: the examples contain non-ASCII punctuation
# and open()'s default encoding depends on the platform. Blank lines are skipped so
# a stray empty line does not raise json.JSONDecodeError.
with open(data_path, encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# We can inspect the data quickly by checking the number of examples and the first item

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

# Now that we have a sense of the data, we need to go through all the different examples and check to make sure the formatting is correct and matches the Chat completions message structure

# Format error checks
# Counts of each kind of problem found, keyed by a short error label.
format_errors = defaultdict(int)

for ex in dataset:
    # Every record must be a JSON object; anything else is counted and skipped.
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # A missing, null or empty "messages" value is counted and the record skipped.
    messages = ex.get("messages")
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    saw_assistant = False
    for message in messages:
        # Both "role" and "content" have to be present.
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        # Only "role", "content" and "name" are accepted as message keys.
        if not all(k in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        role = message.get("role")
        if role not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1
        saw_assistant = saw_assistant or role == "assistant"

        # Content must be a non-empty string.
        content = message.get("content")
        if not (content and isinstance(content, str)):
            format_errors["missing_content"] += 1

    # Each example needs at least one assistant message.
    if not saw_assistant:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")

# Beyond the structure of the message, we also need to ensure that the length does not exceed the 4096 token limit.

# Token counting functions
# tiktoken's "cl100k_base" encoding; the two counting helpers defined next call encoding.encode() on message text
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Approximate the token count of one chat conversation.

    Args:
        messages: Iterable of message dicts whose values are strings.
        tokens_per_message: Fixed overhead added for every message.
        tokens_per_name: Extra tokens added when a message has a "name" key.

    Returns:
        The per-message overhead, plus the encoded length of every value in
        every message, plus the name surcharge, plus a constant 3 added once
        per conversation.
    """
    per_conversation_overhead = 3
    total = 0
    for msg in messages:
        value_tokens = sum(len(encoding.encode(text)) for text in msg.values())
        name_surcharge = tokens_per_name if "name" in msg else 0
        total += tokens_per_message + value_tokens + name_surcharge
    return total + per_conversation_overhead

def num_assistant_tokens_from_messages(messages):
    """Return the total encoded length of the content of all assistant messages.

    Args:
        messages: Iterable of message dicts, each with "role" and "content".

    Returns:
        Sum of len(encoding.encode(content)) over messages whose role is
        "assistant"; 0 when there are none.
    """
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics for a sequence of numbers.

    Prints min/max, mean/median and the 5th/95th percentiles. The quantiles
    are taken at 0.05 and 0.95 so the numbers match the "p5 / p95" label
    (they were previously taken at 0.1 and 0.9).

    Args:
        values: Sequence of numbers to summarise.
        name: Label printed in the section header.
    """
    print(f"\n#### Distribution of {name}:")
    # min()/max() raise on an empty sequence, so report that case explicitly.
    if len(values) == 0:
        print("no values")
        return
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

# Last, we can look at the results of the different formatting operations before proceeding with creating a fine-tuning job:

# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages, convo_lens, assistant_message_lens = [], [], []

for ex in dataset:
    messages = ex["messages"]
    # all(role != X) is True exactly when no message in the example has role X.
    n_missing_system += all(message["role"] != "system" for message in messages)
    n_missing_user += all(message["role"] != "user" for message in messages)
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
for values, label in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_distribution(values, label)
# Count the conversations whose estimated length exceeds 4096 tokens.
n_too_long = len([length for length in convo_lens if length > 4096])
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
TARGET_EPOCHS = 3
MIN_EPOCHS = 1
MAX_EPOCHS = 3

# Use TARGET_EPOCHS unless examples * epochs falls outside the
# [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES] window, in which case the epoch
# count is recomputed and clamped to MAX_EPOCHS / MIN_EPOCHS.
n_train_examples = len(dataset)
examples_at_target = n_train_examples * TARGET_EPOCHS
if examples_at_target < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_at_target > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each conversation contributes at most MAX_TOKENS_PER_EXAMPLE tokens to the total.
n_billing_tokens_in_dataset = 0
for length in convo_lens:
    n_billing_tokens_in_dataset += min(MAX_TOKENS_PER_EXAMPLE, length)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

# Calculate the estimated cost for fine-tuning
cost_per_100k_tokens = 0.80  # Cost for every 100,000 tokens
charged_tokens = n_epochs * n_billing_tokens_in_dataset
estimated_cost = (charged_tokens / 100000) * cost_per_100k_tokens
print(f"Estimated cost for fine-tuning: approximately ${estimated_cost:.2f}") #I added this for actual cost based on current pricing

Num examples: 50
First example:
{'role': 'system', 'content': 'You are Kara. You are a creative Storyteller who writes stories with interesting astronomy facts to Preteens. You write very creative and interesting short stories in a very conversational way that keeps the audience engaged. Your target audience is preteens.'}
{'role': 'assistant', 'content': 'Hi, Welcome to AstroTales! I’m Kara.  Would you like to read a story?'}
{'role': 'user', 'content': 'Yes'}
{'role': 'assistant', 'content': 'Excellent! , Are there any movie or cartoon characters you like?'}
{'role': 'user', 'content': 'No'}
{'role': 'assistant', 'content': "That’s Fine. Once upon a time, in a beautiful forest, there lived two best friends, Snowy and Sunny. Snowy was a white rabbit and Sunny was a yellow rabbit. They did everything together and were inseparable.\nOne sunny morning, the two friends decided to go on a journey deep into the forest. As they hopped along, they came across a porcupine sitting under a tree.

In [None]:
!pip install Gradio



In [None]:
!pip install openai==0.28
import openai
import gradio as gr



In [None]:
# Function to save the dataset as a JSONL file
def save_to_jsonl(conversations, file_path):
    """Write each conversation to ``file_path`` as one JSON document per line.

    Args:
        conversations: Iterable of JSON-serialisable objects.
        file_path: Destination path; the file is created or overwritten.
    """
    with open(file_path, 'w') as out:
        out.writelines(json.dumps(record) + '\n' for record in conversations)

# Destination for the JSONL copy of the dataset (a path under /content/sample_data)
# NOTE(review): the file is named "cleanData", but no cleaning step is visible in this notebook — confirm
jsonl_file_path = '/content/sample_data/data/cleanData.jsonl'
# Write the loaded dataset to that path, one JSON object per line
save_to_jsonl(dataset, jsonl_file_path)

In [None]:
# Keep the key out of the notebook: read it from the OPENAI_API_KEY environment
# variable, and fall back to a hidden prompt when the variable is not set.
from getpass import getpass
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
#Upload data for training
training_file_name = '/content/sample_data/data/cleanData.jsonl'

# Open the file in a context manager so the handle is closed once the upload
# call has returned (it was previously left open).
with open(training_file_name, "rb") as training_file:
    training_response = openai.File.create(file=training_file, purpose="fine-tune")
training_file_id = training_response["id"]

#Gives training file id
print("Training file id:", training_file_id)

Training file id: file-3zjXOweTeOoFKE62a9hNkLiO


In [None]:
#Create Fine-Tuning Job
# Label sent as the job's suffix; the printed job echoes it as "user_provided_suffix"
suffix_name = "astrotale"

# Start a fine-tuning job on gpt-3.5-turbo using the training file id obtained from the upload
response = openai.FineTuningJob.create(
    training_file=training_file_id,
    model="gpt-3.5-turbo",
    suffix=suffix_name,
)

# Keep the job id: later cells pass it to list_events() and retrieve()
job_id = response["id"]

print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-12U1h0lFU1zRVpcH1HNPsX4h",
  "model": "gpt-3.5-turbo-0125",
  "created_at": 1712774057,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-0AucugpGrNJKUnvdyKQYIQB2",
  "result_files": [],
  "status": "validating_files",
  "validation_file": null,
  "training_file": "file-3zjXOweTeOoFKE62a9hNkLiO",
  "hyperparameters": {
    "n_epochs": "auto",
    "batch_size": "auto",
    "learning_rate_multiplier": "auto"
  },
  "trained_tokens": null,
  "error": {
    "error": null
  },
  "user_provided_suffix": "astrotale",
  "seed": 1715688763,
  "integrations": []
}


In [None]:
#list events as fine-tuning progresses
# Fetch at most 50 events for the job
response = openai.FineTuningJob.list_events(id=job_id, limit=50)

events = response["data"]
# Reverse the returned list in place before printing
# NOTE(review): presumably the API returns newest-first, so this prints oldest-first — confirm
events.reverse()

# Print only each event's human-readable message
for event in events:
    print(event["message"])

Step 106/150: training loss=0.42
Step 107/150: training loss=1.35
Step 108/150: training loss=1.10
Step 109/150: training loss=0.77
Step 110/150: training loss=0.60
Step 111/150: training loss=0.79
Step 112/150: training loss=0.76
Step 113/150: training loss=0.76
Step 114/150: training loss=0.35
Step 115/150: training loss=0.65
Step 116/150: training loss=0.99
Step 117/150: training loss=0.72
Step 118/150: training loss=1.14
Step 119/150: training loss=1.32
Step 120/150: training loss=1.39
Step 121/150: training loss=1.56
Step 122/150: training loss=0.48
Step 123/150: training loss=0.21
Step 124/150: training loss=0.55
Step 125/150: training loss=0.93
Step 126/150: training loss=0.24
Step 127/150: training loss=0.41
Step 128/150: training loss=0.35
Step 129/150: training loss=0.74
Step 130/150: training loss=0.86
Step 131/150: training loss=1.12
Step 132/150: training loss=0.96
Step 133/150: training loss=0.88
Step 134/150: training loss=0.63
Step 135/150: training loss=0.87
Step 136/1

In [None]:
#retrieve fine-tune model id
import time

# A job that has just been created reports "fine_tuned_model": null, so poll
# until it leaves its in-progress states instead of reading the field once.
# Only the known in-progress states keep the loop going, so an unexpected
# status ends the wait rather than looping forever.
response = openai.FineTuningJob.retrieve(job_id)
while response["status"] in ("validating_files", "queued", "running"):
    time.sleep(30)  # seconds between status checks
    response = openai.FineTuningJob.retrieve(job_id)
fine_tuned_model_id = response["fine_tuned_model"]

print(response)
print("\nFine-tuned model id:", fine_tuned_model_id)
if fine_tuned_model_id is None:
    # Make the failure visible here rather than in a later completion call.
    print("Fine-tuning did not produce a model; job status:", response["status"])

{
  "object": "fine_tuning.job",
  "id": "ftjob-12U1h0lFU1zRVpcH1HNPsX4h",
  "model": "gpt-3.5-turbo-0125",
  "created_at": 1712774057,
  "finished_at": 1712774746,
  "fine_tuned_model": "ft:gpt-3.5-turbo-0125:personal:astrotale:9CXF5nl2",
  "organization_id": "org-0AucugpGrNJKUnvdyKQYIQB2",
  "result_files": [
    "file-ViUgITneaIiPQZwf4iEiWfiu"
  ],
  "status": "succeeded",
  "validation_file": null,
  "training_file": "file-3zjXOweTeOoFKE62a9hNkLiO",
  "hyperparameters": {
    "n_epochs": 3,
    "batch_size": 1,
    "learning_rate_multiplier": 8
  },
  "trained_tokens": 95058,
  "error": {
    "error": null
  },
  "user_provided_suffix": "astrotale",
  "seed": 1715688763,
  "integrations": []
}

Fine-tuned model id: ft:gpt-3.5-turbo-0125:personal:astrotale:9CXF5nl2


In [None]:
#Test it out!
# Conversation sent to the model by the next cell; it starts empty on every run of this cell
test_messages = []

# System prompt for the Kara persona.
# NOTE(review): the first training example printed earlier ends this prompt with a
# period ("...is preteens."); this copy has none — confirm whether they should match.
system_message = "You are Kara. You are a creative Storyteller who writes stories with interesting astronomy facts to Preteens. You write very creative and interesting short stories in a very conversational way that keeps the audience engaged. Your target audience is preteens"
test_messages.append({"role": "system", "content": system_message})
# First user turn
user_message = "yes"
test_messages.append({"role": "user", "content": user_message})

print(test_messages)

[{'role': 'system', 'content': 'You are Kara. You are a creative Storyteller who writes stories with interesting astronomy facts to Preteens. You write very creative and interesting short stories in a very conversational way that keeps the audience engaged. Your target audience is preteens'}, {'role': 'user', 'content': 'yes'}]


In [None]:
# Send test_messages to the fine-tuned model, with temperature 0 and a 500-token cap on the reply
response = openai.ChatCompletion.create(
    model=fine_tuned_model_id, #can test it against gpt-3.5-turbo to see difference
    messages=test_messages,
    temperature=0,
    max_tokens=500
)
# Print only the text of the first returned choice
print(response["choices"][0]["message"]["content"])

Hi, Welcome to AstroTales! I’m Kara. Would you like to read a story?


In [None]:
# Append another user turn ("yes") to the running test conversation.
# NOTE(review): no visible cell appends the assistant reply to test_messages, so the
# list now ends with two consecutive user turns, and re-running this cell appends
# one more each time — confirm this is intended.
user_message = "yes"
test_messages.append({"role": "user", "content": user_message})

In [None]:
#Gradio for a better UI
def generate_completion(user_prompt):
    """Return the fine-tuned model's reply to a single user prompt.

    The request carries the notebook's Kara system prompt (``system_message``)
    instead of an empty system message, so the UI queries the model with the
    same persona as the notebook's other completion calls.

    Args:
        user_prompt: Text entered in the Gradio textbox.

    Returns:
        The first choice's message content with surrounding whitespace stripped.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]
    response = openai.ChatCompletion.create(
        model=fine_tuned_model_id,
        messages=messages,
        max_tokens=500,
        temperature=0.5
    )
    return response['choices'][0]['message']['content'].strip()

# Wrap generate_completion in a Gradio app titled "Kara" with one input and one output textbox
iface = gr.Interface(fn=generate_completion,
                     inputs=gr.Textbox(lines=5, placeholder='AstroTales'),
                     outputs=gr.Textbox(lines=5),
                     title="Kara"
                     )

# share=True requests a public share URL (the saved output shows a *.gradio.live link).
# Requests made through that link call generate_completion, which uses the API key set in this notebook.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://14083e9b18e4c8bf72.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
# Hard-coded copies of the model id printed by the retrieve cell and of the Kara system prompt.
# Both assignments overwrite the values set earlier in the notebook.
fine_tuned_model_id = 'ft:gpt-3.5-turbo-0125:personal:astrotale:9CXF5nl2'
system_message ='You are Kara. You are a creative Storyteller who writes stories with interesting astronomy facts to Preteens. You write very creative and interesting short stories in a very conversational way that keeps the audience engaged. Your target audience is preteens'

def get_assistant_response(user_message):
  """Send one user message to the fine-tuned model and return the reply text.

  Each call builds a fresh two-message conversation (system prompt plus the
  given user message); no history is kept between calls.

  Args:
    user_message: Text of the user's turn.

  Returns:
    The content of the first choice's message.
  """
  test_messages = [
      {"role": "system", "content": system_message},
      {"role": "user", "content": user_message},
  ]

  response = openai.ChatCompletion.create(
      model=fine_tuned_model_id,
      messages=test_messages,
      temperature=0,
      max_tokens=500
  )

  # A chat completion keeps the reply under choices[0]["message"]["content"],
  # as the other completion calls in this notebook read it; a chat choice has
  # no `.text` field.
  return response["choices"][0]["message"]["content"]

# Console chat: every turn is sent on its own through get_assistant_response.
# Typing "quit" or "exit", or ending/interrupting the input, leaves the loop so
# the cell can finish instead of running forever.
while True:
  try:
    user_message = input("You: ")
  except (EOFError, KeyboardInterrupt):
    break
  if user_message.strip().lower() in ("quit", "exit"):
    break
  assistant_response = get_assistant_response(user_message)
  print("Assistant:", assistant_response)