## OpenAI finetuning

1. Load the dataset: 1000 cleaned records with NER-replaced entities
2. Split dataset 80/20
3. Finetune gpt-4o-mini-2024-07-18
4. Extract loss and accuracy metrics to visualize the effectiveness of fine-tuning

In [26]:
import jsonlines
from openai import OpenAI
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import json
import time

In [4]:
# Never commit a real key in this cell. A key that is already exported in the
# environment (e.g. by the shell that launched Jupyter) is kept as-is; the empty
# placeholder is only installed when no key is present, so a valid key is no
# longer overwritten with an empty string.
os.environ.setdefault("OPENAI_API_KEY", "")

In [5]:
# JSONL input read by the loading cell below; each record is one conversation (a list of chat messages).
conversations_file = "openai_files/multi_turn_conversation_virgin_ner_replaced.jsonl"

In [6]:
# Read the whole JSONL file into memory: every object the reader yields is one conversation.
conversations = []
with jsonlines.open(conversations_file) as reader:
    conversations.extend(reader)

In [8]:
# Display the first record to check the message structure (role/content entries).
conversations[0]

[{'role': 'system',
  'content': 'You are a polite customer assistant whose goal is to provide effective help.'},
 {'role': 'user',
  'content': 'so i wait [TIME] and then they are rude and arrogant amp unhelpful after which she is raising a technical case'},
 {'role': 'assistant',
  'content': 'If youre unhappy with your experience on this call please contact us on our website'}]

In [15]:
def split_and_save_conversations(conversations, train_filename, val_filename, max_records=None, test_size=0.2, random_state=42):
    """Split conversations into training/validation sets and write each set as JSONL.

    Args:
        conversations: Sequence of conversations; each one is written as the
            value of a ``"messages"`` key.
        train_filename: Path of the training JSONL file (overwritten).
        val_filename: Path of the validation JSONL file (overwritten).
        max_records: When not ``None``, only the first ``max_records``
            conversations are used.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        None. The set sizes are reported on stdout.
    """
    def _write_jsonl(path, records):
        # One {"messages": ...} object per line; ensure_ascii=False writes
        # non-ASCII characters as-is instead of \u escapes.
        with open(path, 'w', encoding='utf-8') as handle:
            handle.writelines(
                json.dumps({"messages": record}, ensure_ascii=False) + '\n'
                for record in records
            )

    # Optional cap on the number of records taken from the front of the input.
    selected = conversations if max_records is None else conversations[:max_records]
    print(f"Total number of conversations before splitting: {len(selected)}")

    train_conversations, val_conversations = train_test_split(
        selected, test_size=test_size, random_state=random_state
    )

    _write_jsonl(train_filename, train_conversations)
    _write_jsonl(val_filename, val_conversations)

    print(f"Training set size: {len(train_conversations)}")
    print(f"Validation set size: {len(val_conversations)}")

In [20]:
# Output paths for the two fine-tuning files; the split uses the function's default test_size of 0.2.
train_file = "openai_files/openai_train.jsonl"
val_file = "openai_files/openai_val.jsonl"
split_and_save_conversations(conversations, train_filename=train_file, val_filename=val_file)

Total number of conversations before splitting: 1000
Training set size: 800
Validation set size: 200


## Upload dataset to finetuning API and start job

1. Create dataset split
2. Upload files to OpenAI API
3. Start the job and poll the API every 300 seconds
4. Collect metrics when ready

In [21]:
# Shared API client, built with no explicit arguments — presumably it reads
# OPENAI_API_KEY from the environment; confirm against the openai SDK docs.
client = OpenAI()
def create_file_openai(client, filename):
    """Upload a local file for fine-tuning and return the API's file object.

    Args:
        client: Object exposing ``files.create(file=..., purpose=...)``.
        filename: Path of the file to upload; opened in binary mode.

    Returns:
        Whatever ``client.files.create`` returns.
    """
    # The context manager closes the handle as soon as the upload call returns
    # or raises; previously the handle was opened inline and never closed.
    with open(filename, 'rb') as file_handle:
        return client.files.create(
            file=file_handle,
            purpose='fine-tune'
        )

In [22]:
# Upload both splits; the ids of the returned file objects are used to start the job.
file_train = create_file_openai(client, train_file)
file_val = create_file_openai(client, val_file)
print(file_train, file_val, sep="\n")

FileObject(id='file-P2XFqcAZNq85DZkgWdobdC', bytes=572508, created_at=1740910001, filename='openai_train.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None, expires_at=None)
FileObject(id='file-EhrNM4Bc5ssUYsqLCBd6W3', bytes=143823, created_at=1740910002, filename='openai_val.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None, expires_at=None)


In [23]:
# Start the fine-tuning job on the uploaded training/validation files.
# No hyperparameters are passed; the job printed below reports them as 'auto'.
# NOTE: re-running this cell submits a new job.
job = client.fine_tuning.jobs.create(
    training_file=file_train.id,
    validation_file=file_val.id,
    model="gpt-4o-mini-2024-07-18",
)

print(job)

FineTuningJob(id='ftjob-paMg7p0iYQNbHHPYpwpXOOpp', created_at=1740910019, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-6lWajcIIwNqauAN9JqnRf3N5', result_files=[], seed=1212125188, status='validating_files', trained_tokens=None, training_file='file-P2XFqcAZNq85DZkgWdobdC', validation_file='file-EhrNM4Bc5ssUYsqLCBd6W3', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto')), type='supervised'), user_provided_suffix=None)


In [24]:
def parse_metrics(event):
    """Print a one-line summary of the training metrics carried by an event.

    Args:
        event: Object with a ``data`` attribute that is either a mapping
            (keys read: ``step``, ``total_steps``, ``train_loss``,
            ``train_mean_token_accuracy``) or ``None``.

    Returns:
        None. Any field that is missing is printed as ``N/A``.
    """
    # Fall back to an empty mapping so an event without a payload prints
    # "N/A" fields instead of raising AttributeError on None.
    data = event.data or {}

    step = data.get("step", "N/A")
    total_steps = data.get("total_steps", "N/A")
    train_loss = data.get("train_loss", "N/A")
    accuracy = data.get("train_mean_token_accuracy", "N/A")

    print(f"Step {step}/{total_steps}: Loss = {train_loss}, Accuracy = {accuracy}")

In [27]:
job_id = job.id
max_events = 100
poll_interval_seconds = 300  # pause between two status checks

# Poll until the job reaches a terminal state. "cancelled" is handled
# explicitly: without that branch a cancelled job would fall into the
# catch-all `else` and keep this loop sleeping forever.
while True:
    job_status = client.fine_tuning.jobs.retrieve(job_id)

    if job_status.status == "succeeded":
        print("Job finished successfully!")

        events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=max_events)

        # Print at most `max_events` metric events; non-metric events are skipped
        # and do not count towards the cap.
        count = 0
        for event in events:
            if event.type == "metrics":
                parse_metrics(event)
                count += 1
                if count >= max_events:
                    break

        break

    elif job_status.status == "failed":
        print("Job failed.")
        # Surface the job's error payload so the cause is visible in the notebook.
        print(f"Error details: {job_status.error}")
        break

    elif job_status.status == "cancelled":
        print("Job was cancelled.")
        break

    else:
        # Any other status is treated as still in progress.
        print(f"Job status: {job_status.status}. Checking again...")
        time.sleep(poll_interval_seconds)

Job status: validating_files. Checking again...
Job status: running. Checking again...
Job status: running. Checking again...
Job status: running. Checking again...
Job status: running. Checking again...
Job status: running. Checking again...
Job status: running. Checking again...
Job status: running. Checking again...
Job status: running. Checking again...
Job status: running. Checking again...
Job finished successfully!
Step 2400/2400: Loss = 0.5784786939620972, Accuracy = 0.7777777910232544
Step 2399/2400: Loss = 0.9093235731124878, Accuracy = 0.6666666865348816
Step 2398/2400: Loss = 0.45006853342056274, Accuracy = 0.8297872543334961
Step 2397/2400: Loss = 0.31206244230270386, Accuracy = 0.8799999952316284
Step 2396/2400: Loss = 0.6031255125999451, Accuracy = 0.8064516186714172
Step 2395/2400: Loss = 0.6872339248657227, Accuracy = 0.8333333134651184
Step 2394/2400: Loss = 0.5476832389831543, Accuracy = 0.7647058963775635
Step 2393/2400: Loss = 0.4030855894088745, Accuracy = 0.88888

In [29]:
# Re-fetch the job and print the identifier of the resulting fine-tuned model
job_info = client.fine_tuning.jobs.retrieve(job_id)
print(job_info.fine_tuned_model)

ft:gpt-4o-mini-2024-07-18:smart-cloud::B6bGpKIt


In [None]:
def extract_metrics(events):
    """Collect step, training loss and token accuracy from metric events.

    Events whose ``type`` is not ``"metrics"`` are skipped. A field missing
    from an event's ``data`` mapping is recorded as ``0``.

    Args:
        events: Iterable of objects exposing ``type`` and ``data``.

    Returns:
        Tuple ``(steps, losses, accuracies)`` of three parallel lists, in the
        same order as the input events.
    """
    # Materialise the payloads once so a one-shot iterable is only consumed a single time.
    payloads = [event.data for event in events if event.type == "metrics"]

    steps = [payload.get("step", 0) for payload in payloads]
    losses = [payload.get("train_loss", 0) for payload in payloads]
    accuracies = [payload.get("train_mean_token_accuracy", 0) for payload in payloads]
    return steps, losses, accuracies

def plot_metrics(steps, losses, accuracies):
    """Draw loss and accuracy against training steps on one figure with two y-axes.

    Args:
        steps: X values shared by both series.
        losses: Values drawn in red against the first y-axis.
        accuracies: Values drawn in blue against a second y-axis created with ``twinx``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    def _draw_series(axis, values, name, colour):
        # Label, line and tick labels of one series all share the same colour.
        axis.set_ylabel(name, color=colour)
        axis.plot(steps, values, color=colour, label=name)
        axis.tick_params(axis="y", labelcolor=colour)

    fig, loss_axis = plt.subplots()
    loss_axis.set_xlabel("Training Steps")
    _draw_series(loss_axis, losses, "Loss", "tab:red")

    # Second y-axis sharing the same x-axis.
    accuracy_axis = loss_axis.twinx()
    _draw_series(accuracy_axis, accuracies, "Accuracy", "tab:blue")

    plt.title("Fine-Tuning Loss & Accuracy")
    fig.tight_layout()
    plt.show()

In [None]:
all_events = []
after = None  # pagination cursor: id of the last event collected so far

# Fetch the job's events page by page until an empty page is returned.
while True:
    # Send `after` only once a cursor exists, so the first request does not
    # pass an explicit None for a parameter that expects an event id.
    page_kwargs = {} if after is None else {"after": after}
    response = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=100, **page_kwargs)
    events = response.data

    if not events:
        break

    all_events.extend(events)
    # Continue after the last event of this page.
    after = events[-1].id

print(f"Total events retrieved: {len(all_events)}")

In [None]:
# Turn the collected events into three parallel series and draw the loss/accuracy figure.
steps, losses, accuracies = extract_metrics(all_events)
plot_metrics(steps, losses, accuracies)