In [1]:
import json

# Load the training set
with open('training_set.jsonl', 'r', encoding='utf-8') as f:
    training_dataset = [json.loads(line) for line in f]

# Training dataset stats
print("Number of examples in training set:", len(training_dataset))
print("First example in training set:")
for message in training_dataset[0]["messages"]:
    print(message)

# Load the validation set
with open('validation_set.jsonl', 'r', encoding='utf-8') as f:
    validation_dataset = [json.loads(line) for line in f]

# Validation dataset stats
print("\nNumber of examples in validation set:", len(validation_dataset))
print("First example in validation set:")
for message in validation_dataset[0]["messages"]:
    print(message)

Number of examples in training set: 10
First example in training set:
{'role': 'system', 'content': 'FS-Assistent is a support chatbot for FS UIA.'}
{'role': 'user', 'content': 'How do I change my FS password?'}
{'role': 'assistant', 'content': "To change your FS password, start by logging into FS. Enter the temporary password you were given and click on 'Change Password'. Enter your old password once and then your new password twice. Your new password must be 12 characters long. For more information, check the link in the password change window."}

Number of examples in validation set: 10
First example in validation set:
{'role': 'system', 'content': 'FS-Assistent is a support chatbot for FS UIA.'}
{'role': 'user', 'content': 'How can the faculty pay the semester fee for a student?'}
{'role': 'assistant', 'content': "Faculty can pay the semester fee by extracting a list of students who haven't paid from FS, preparing an Excel sheet with their details, and sending it along with an inte

In [3]:
import json
import tiktoken
import numpy as np
from collections import defaultdict

encoding = tiktoken.get_encoding("cl100k_base") # default encoding used by gpt-4, turbo, and text-embedding-ada-002 models

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

files = ['training_set.jsonl', 'validation_set.jsonl']

for file in files:
    print(f"Processing file: {file}")
    with open(file, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]

    total_tokens = []
    assistant_tokens = []

    for ex in dataset:
        messages = ex.get("messages", {})
        total_tokens.append(num_tokens_from_messages(messages))
        assistant_tokens.append(num_assistant_tokens_from_messages(messages))
    
    print_distribution(total_tokens, "total tokens")
    print_distribution(assistant_tokens, "assistant tokens")
    print('*' * 50)

Processing file: training_set.jsonl

#### Distribution of total tokens:
min / max: 82, 107
mean / median: 90.2, 89.5
p5 / p95: 82.9, 98.0

#### Distribution of assistant tokens:
min / max: 43, 69
mean / median: 52.7, 50.0
p5 / p95: 46.6, 61.8
**************************************************
Processing file: validation_set.jsonl

#### Distribution of total tokens:
min / max: 71, 96
mean / median: 85.9, 86.5
p5 / p95: 78.2, 91.5

#### Distribution of assistant tokens:
min / max: 33, 56
mean / median: 47.7, 48.5
p5 / p95: 41.1, 54.2
**************************************************


In [21]:
# Upload fine-tuning files

import os
from openai import AzureOpenAI

client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
  api_key=os.getenv("AZURE_OPENAI_KEY"),  
  api_version="2023-12-01-preview"  # This API version or later is required to access fine-tuning for turbo/babbage-002/davinci-002
)

training_file_name = 'training_set.jsonl'
validation_file_name = 'validation_set.jsonl'

# Upload the training and validation dataset files to Azure OpenAI with the SDK.

training_response = client.files.create(
    file=open(training_file_name, "rb"), purpose="fine-tune"
)
training_file_id = training_response.id

validation_response = client.files.create(
    file=open(validation_file_name, "rb"), purpose="fine-tune"
)
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-9af09dfa17bb446da1497e99d33fef05
Validation file ID: file-d9b20e2c1a454acd9f69a2081df18e8c


In [19]:
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-35-turbo-0613", # Enter base model name. Note that in Azure OpenAI the model name contains dashes and cannot contain dot/period characters. 
)

job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.

print("Job ID:", response.id)
print("Status:", response.id)
print(response.model_dump_json(indent=2))

NameError: name 'training_file_id' is not defined

In [3]:
# Track training status

from IPython.display import clear_output
import time

start_time = time.time()

# Get the status of our fine-tuning job.
response = client.fine_tuning.jobs.retrieve(job_id)

status = response.status

# If the job isn't done yet, poll it every 10 seconds.
while status not in ["succeeded", "failed"]:
    time.sleep(10)
    
    response = client.fine_tuning.jobs.retrieve(job_id)
    print(response.model_dump_json(indent=2))
    print("Elapsed time: {} minutes {} seconds".format(int((time.time() - start_time) // 60), int((time.time() - start_time) % 60)))
    status = response.status
    print(f'Status: {status}')
    clear_output(wait=True)

print(f'Fine-tuning job {job_id} finished with status: {status}')

# List all fine-tuning jobs for this resource.
print('Checking other fine-tune jobs for this resource.')
response = client.fine_tuning.jobs.list()
print(f'Found {len(response.data)} fine-tune jobs.')

Fine-tuning job ftjob-11fa525580664d76affd6ee6f2726a31 finished with status: succeeded
Checking other fine-tune jobs for this resource.
Found 1 fine-tune jobs.


In [18]:
#Retrieve fine_tuned_model name

response = client.fine_tuning.jobs.retrieve(job_id)

print(response.model_dump_json(indent=2))
fine_tuned_model = response.fine_tuned_model

NameError: name 'job_id' is not defined

In [5]:
import os
from openai import AzureOpenAI

client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
  api_key=os.getenv("AZURE_OPENAI_KEY"),  
  api_version="2023-05-15"
)

response = client.chat.completions.create(
    model="UiA-Open-AI-IT-FS-gpt-35-turbo", # model = "Custom deployment name you chose for your fine-tuning model"
    messages=[
        {"role": "system", "content": "FS-Assistent is a support chatbot for FS UIA."},
        {"role": "user", "content": "How do I change my FS password?"},
        {"role": "assistant", "content": "To change your FS password, start by logging into FS. Enter the temporary password you were given and click on 'Change Password'. Enter your old password once and then your new password twice. Your new password must be 12 characters long. For more information, check the link in the password change window."},
        {"role": "user", "content": "Is 12 characters the only requirements for the password?"}
    ]
)

print(response.choices[0].message.content)

No, in addition to being 12 characters long, your FS password must also meet the following requirements:

1. It must contain at least one uppercase letter (A-Z).
2. It must contain at least one lowercase letter (a-z).
3. It must contain at least one numeric character (0-9).
4. It must contain at least one special character (!@#$%^&*).

Make sure to create a password that is both strong and memorable for you.


In [13]:
import os
from openai import AzureOpenAI

client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
  api_key=os.getenv("AZURE_OPENAI_KEY"),  
  api_version="2023-05-15"
)

response = client.chat.completions.create(
    model="UiA-Open-AI-IT-FS-gpt-35-turbo", # model = "Custom deployment name you chose for your fine-tuning model"
    messages=[
        {"role": "system", "content": "FS-Assistent is a support chatbot for FS UIA."},
        {"role": "user", "content": "How do I create a faculty member in FS?"},
        {"role": "assistant", "content": "To create a faculty member in FS, first ensure they're in the Person/Faculty and Faculty collection image. Enter their details in 'Person/Faculty' for manual entry if not automatically transferred from SAP. Remember to activate them by setting 'Active' to 'J'."},
        {"role": "user", "content": "Is this all i have to do to create a faculty member in FS for UIA?"}
    ]
)

print(response.choices[0].message.content)

No, there are a few more steps involved in creating a faculty member in FS for UIA. After entering their details in the 'Person/Faculty' collection, you also need to assign them to the correct faculty, department, and unit. You can do this by navigating to the 'Person/Faculty' collection and selecting the faculty member's entry. Then, under the 'Overview' tab, click on 'Organizational Assignment' and fill in the appropriate fields. Additionally, you need to set their employment status, such as 'Active', 'Inactive', or 'Retired', and specify their start and end dates if applicable. Finally, make sure to save the changes you've made.


In [1]:
# Retrieve the file ID of the first result file from the fine-tuning job
# for the customized model.
response = client.fine_tuning.jobs.retrieve(job_id)
if response.status == 'succeeded':
    result_file_id = response.result_files[0]

retrieve = client.files.retrieve(result_file_id)

# Download the result file.
print(f'Downloading result file: {result_file_id}')

with open(retrieve.filename, "wb") as file:
    result = client.files.content(result_file_id).read()
    file.write(result)

NameError: name 'client' is not defined