In [5]:
from openai import OpenAI
from dotenv import load_dotenv
import os 

In [6]:
# Load the environment variables from the .env file
load_dotenv()
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

Prepare the dataset

Dealing with list format, as shown above, might be convenient for small datasets. However, there are several benefits to saving the data in JSONL (JSON Lines) format. The benefits include scalability, interoperability, simplicity, and also compatibility with OpenAI API, which requires data in JSONL format when creating fine-tuning jobs.

The following code leverages the helper function prepare_data to create both the training and validation data in JSONL formats:

In [7]:
training_file_name = "../data/training_gpt_data.jsonl"
validation_file_name = "../data/validation_gpt_data.jsonl"

In [8]:
training_file_id = client.files.create(
  file=open(training_file_name, "rb"),
  purpose="fine-tune"
)

validation_file_id = client.files.create(
  file=open(validation_file_name, "rb"),
  purpose="fine-tune"
)

print(f"Training File ID: {training_file_id}")
print(f"Validation File ID: {validation_file_id}")

Training File ID: FileObject(id='file-103d57P8zl2rX3gWbLWegdlw', bytes=100174, created_at=1711310142, filename='training_gpt_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
Validation File ID: FileObject(id='file-hXqvqNKyxIiKnvE5DponjZDV', bytes=100174, created_at=1711310143, filename='validation_gpt_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


### Create a fine-tuning job

This fine-tuning process is highly inspired by the openai-cookbook performing fine-tuning on Microsoft Azure.

To perform the fine-tuning we will use the following two steps: (1) define hyperparameters, and (2) trigger the fine-tuning.

We will fine-tune the davinci model and run it for 15 epochs using a batch size of 3 and a learning rate multiplier of 0.3 using the training and validation datasets.

Successful execution of the previous code displays below the unique identifier of the training and validation data.

In [9]:
response = client.fine_tuning.jobs.create(
  training_file=training_file_id.id, 
  validation_file=validation_file_id.id,
  model="gpt-3.5-turbo", 
  hyperparameters={
    "n_epochs": 15,
	"batch_size": 3,
	"learning_rate_multiplier": 0.3
  }
)
job_id = response.id
status = response.status

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ftjob-FgJBlcWI9RsblcakUTPxZ7JQ.
Training Response: FineTuningJob(id='ftjob-FgJBlcWI9RsblcakUTPxZ7JQ', created_at=1711310144, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=15, batch_size=3, learning_rate_multiplier=0.3), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-wkUFLlJRyOXDuAkBFUtPtrii', result_files=[], status='validating_files', trained_tokens=None, training_file='file-103d57P8zl2rX3gWbLWegdlw', validation_file='file-hXqvqNKyxIiKnvE5DponjZDV', user_provided_suffix=None)
Training Status: validating_files


The code above generates the following information for the jobID (`ftjob-SqZvz9Rpjn2nSxtsn8ozMJu4`), the training response, and the training status (pending).

This pending status does not provide any relevant information. However, we can have more insight into the training process by running the following code:

In [10]:
import signal
import datetime


def signal_handler(sig, frame):
    status = client.fine_tuning.jobs.retrieve(job_id).status
    print(f"Stream interrupted. Job is still {status}.")
    return


print(f"Streaming events for the fine-tuning job: {job_id}")

signal.signal(signal.SIGINT, signal_handler)

events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id)
try:
    for event in events:
        print(
            f'{datetime.datetime.fromtimestamp(event.created_at)} {event.message}'
        )
except Exception:
    print("Stream interrupted (client disconnected).")

Streaming events for the fine-tuning job: ftjob-FgJBlcWI9RsblcakUTPxZ7JQ
2024-03-24 15:55:44 Validating training file: file-103d57P8zl2rX3gWbLWegdlw and validation file: file-hXqvqNKyxIiKnvE5DponjZDV
2024-03-24 15:55:44 Created fine-tuning job: ftjob-FgJBlcWI9RsblcakUTPxZ7JQ


### Check the fine-tuning job status

Let's verify that our operation was successful, and additionally, we can examine all the fine-tuning operations by using a list operation.

In [11]:
import time

status = client.fine_tuning.jobs.retrieve(job_id).status
if status not in ["succeeded", "failed"]:
    print(f"Job not in terminal status: {status}. Waiting.")
    while status not in ["succeeded", "failed"]:
        time.sleep(2)
        status = client.fine_tuning.jobs.retrieve(job_id).status
        print(f"Status: {status}")
else:
    print(f"Finetune job {job_id} finished with status: {status}")
print("Checking other finetune jobs in the subscription.")
result = client.fine_tuning.jobs.list()
print(f"Found {len(result.data)} finetune jobs.")

Job not in terminal status: validating_files. Waiting.
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status

### Validation of the model

Finally, the fine-tuned model can be retrieved from the “fine_tuned_model” attribute. The following print statement shows that the name of the final mode is: `ft:davinci-002:personal::8gKnyxn3`

In [12]:
# Retrieve the finetuned model
fine_tuned_model = result.data[0].fine_tuned_model
print(fine_tuned_model)


ft:gpt-3.5-turbo-0125:personal::96P28LRb


With this model, we can run queries to validate its results by providing a prompt, the model name, and creating a query with the openai.Completion.create() function. The result is retrieved from the answer dictionary as follows:

In [20]:
answer = client.chat.completions.create(
  model=fine_tuned_model,
  messages=[
    {"role": "system", "content": "This is a parameter-based prompt for creating marketing materials"},
    {"role": "user", "content": "Create a website marketing campaign for a Checking Account aimed at International Students"}
  ]
)
print(answer.choices[0].message)

# new_prompt = "Design an email for the TD Student Line of Credit, aimed at students seeking flexible funding solutions for their academic journey"
# answer = client.completions.create(
#   model=fine_tuned_model,
#   prompt=new_prompt
# )

# print(answer.choices[0].text)

ChatCompletionMessage(content='Wherever your studies take you our International Student Checking Account has got you covered. Enjoy global ATM fee reimbursements no borders no boundaries.', role='assistant', function_call=None, tool_calls=None)
