# Fine-tune a Pre-trained LLM for a Chatbot
# Customer Support Automation
## Automating responses to customer inquiries on various platforms (email, chatbots, social media).
### Collect a dataset of customer inquiries and manually crafted responses. This dataset should cover a wide range of common questions, complaints, and feedback, along with the company's standard responses. Be sure to anonymize personal information.

### Install the necessary libraries.

In [None]:
pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install openai[datalib]

In [None]:
pip install urllib3==1.26.6

In [None]:
pip install python-dotenv

In [None]:
pip install tiktoken

### Import the libraries and the environment file to gain access to the OpenAI API key
#### The key can be generated here: https://platform.openai.com/account/api-keys

In [None]:
import os
from openai import OpenAI

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

### Authenticate to the API using the API Key
#### Pull the key from the environment variables, or pass `api_key="your_key_here"` to `OpenAI(...)` to hardcode the key

In [None]:
# Build the API client used by every later cell. Indexing os.environ directly
# means a missing OPENAI_API_KEY variable fails right here with a KeyError.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

### Helper Functions

In [None]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

encoding = tiktoken.get_encoding("cl100k_base")

#input_file=formatted_custom_support.json ; output_file=output.jsonl
def json_to_jsonl(input_file, output_file):
    """Convert a JSON file holding a collection of records into JSON Lines.

    Args:
        input_file: Path of a JSON document whose top-level value is iterable
            (in this notebook, a list of chat examples).
        output_file: Path of the JSONL file to write. Each element of the
            input is written as one JSON document per line; an existing file
            is overwritten.
    """
    # Read inside a context manager so the input handle is always closed,
    # and decode as UTF-8 instead of relying on the platform default.
    with open(input_file, encoding="utf-8") as infile:
        data = json.load(infile)

    # Produce JSONL from JSON: one serialized entry per line
    with open(output_file, "w", encoding="utf-8") as outfile:
        for entry in data:
            json.dump(entry, outfile)
            outfile.write("\n")

def check_file_format(dataset):
    """Check chat fine-tuning examples and print a tally of format problems.

    Every element of ``dataset`` is expected to be a dict with a non-empty
    ``"messages"`` list of message dicts. Problems are counted per category
    and printed as ``<category>: <count>`` lines under "Found errors:";
    "No errors found" is printed when nothing was counted. Nothing is
    returned.
    """
    allowed_keys = ("role", "content", "name", "function_call")
    allowed_roles = ("system", "user", "assistant", "function")
    error_counts = defaultdict(int)

    for example in dataset:
        # An example that is not a dict cannot be inspected any further
        if not isinstance(example, dict):
            error_counts["data_type"] += 1
            continue

        turns = example.get("messages", None)
        if not turns:
            error_counts["missing_messages_list"] += 1
            continue

        has_assistant = False
        for turn in turns:
            if not ("role" in turn and "content" in turn):
                error_counts["message_missing_key"] += 1

            if not all(key in allowed_keys for key in turn):
                error_counts["message_unrecognized_key"] += 1

            role = turn.get("role", None)
            if role not in allowed_roles:
                error_counts["unrecognized_role"] += 1
            if role == "assistant":
                has_assistant = True

            text = turn.get("content", None)
            call = turn.get("function_call", None)

            # Content must be a string, and either it or function_call
            # must be non-empty
            if not isinstance(text, str) or not (text or call):
                error_counts["missing_content"] += 1

        # An example needs at least one assistant message
        if not has_assistant:
            error_counts["example_missing_assistant_message"] += 1

    if not error_counts:
        print("No errors found")
        return

    print("Found errors:")
    for category, count in error_counts.items():
        print(f"{category}: {count}")


# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Return an approximate token count for one list of chat messages.

    Args:
        messages: Iterable of message dicts; every value is encoded with the
            module-level ``encoding``.
        tokens_per_message: Fixed overhead added for each message.
        tokens_per_name: Extra overhead added when a message has a "name" key.

    Returns:
        The summed token estimate, including a constant 3 added once per
        conversation.
    """
    total = 0
    for msg in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(text)) for text in msg.values())
        if "name" in msg:
            total += tokens_per_name
    return total + 3

### Convert JSON to JSONL

In [None]:
# Write each record of the JSON dataset to its own line of output.jsonl
json_to_jsonl(input_file="custom_support.json", output_file="output.jsonl")

### Check File Format

https://cookbook.openai.com/examples/chat_finetuning_data_prep

In [None]:
# Path of the JSONL file to inspect
data_path = "output.jsonl"

# Load the dataset: every line of the file is parsed as one JSON example
dataset = []
with open(data_path, mode="r", encoding="utf-8") as f:
    for line in f:
        dataset.append(json.loads(line))

# Initial dataset stats: size, then the messages of the first example
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 101
First example:
{'role': 'system', 'content': "This is a customer support chatbot designed to help with common inquiries for Fine cothes Lisa's Boutique."}
{'role': 'user', 'content': 'How can I reset my password?'}
{'role': 'assistant', 'content': "You can reset your password by clicking on the 'Forgot Password' link on the login page and following the instructions sent to your email."}


In [None]:
# Format validation: prints a per-category error tally, or "No errors found"
# when every example passes the checks
check_file_format(dataset)

No errors found


### Cost Estimation

In [None]:
# Estimated token count of every conversation in the dataset
conversation_length = [
    num_tokens_from_messages(example["messages"]) for example in dataset
]

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096
TARGET_EPOCHS = 5
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(dataset)
examples_seen = n_train_examples * TARGET_EPOCHS

if examples_seen < MIN_TARGET_EXAMPLES:
    # Small dataset: derive the epoch count from MIN_TARGET_EXAMPLES, capped
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_seen > MAX_TARGET_EXAMPLES:
    # Large dataset: derive the epoch count from MAX_TARGET_EXAMPLES, floored
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example counts for at most MAX_TOKENS_PER_EXAMPLE tokens
n_billing_tokens_in_dataset = sum(
    min(MAX_TOKENS_PER_EXAMPLE, length) for length in conversation_length
)

# Total tokens over all epochs; the cost calculation cell reads this value
num_tokens = n_epochs * n_billing_tokens_in_dataset

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{num_tokens} tokens")

Dataset has ~7251 tokens that will be charged for during training
By default, you'll train for 5 epochs on this dataset
By default, you'll be charged for ~36255 tokens


In [None]:
# Training rate assumed by this notebook for gpt-3.5-turbo: $0.0080 / 1K tokens
PRICE_PER_1K_TOKENS = 0.0080

cost = (num_tokens / 1000) * PRICE_PER_1K_TOKENS
print(cost)

0.28196


### Upload File
#### Once the data is validated, the file needs to be uploaded using the
#### Files API so that it can be used with fine-tuning jobs

In [None]:
# Upload the validated training file. The context manager closes the local
# file handle once the upload call has returned (the original left it open).
with open("output.jsonl", "rb") as training_data:
    uploaded_file = client.files.create(
        file=training_data,
        purpose="fine-tune",
    )

# Display the returned FileObject; its `id` is the value a fine-tuning job
# takes as `training_file`.
uploaded_file

FileObject(id='file-fp7qjPWr3d52j6JuqYynDhRD', bytes=40792, created_at=1720713389, filename='output.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

### Create fine-tuned model

In [None]:
# Start the fine-tuning job
# After you've started a fine-tuning job, it may take some time to complete. Your job may be queued
# behind other jobs and training a model can take minutes or hours depending on the
# model and dataset size.

# The call returns a FineTuningJob; its `id` is needed to poll the job status.
client.fine_tuning.jobs.create(
  # Id of a file previously uploaded with purpose="fine-tune".
  # NOTE(review): this hard-coded id differs from the id shown in the recorded
  # output of the upload cell -- confirm it refers to the intended file.
  training_file="file-Q00KX4D5cmhVurY3v8hbe87Q",
  model="gpt-3.5-turbo",
  hyperparameters={
    "n_epochs":5
  }
)

FineTuningJob(id='ftjob-sgcJ6eZ0oIrvUh6h2LFCDE2t', created_at=1720713398, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=5, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-y85pwkvP2H3Spp6kEp0E4ZVf', result_files=[], seed=1639185929, status='validating_files', trained_tokens=None, training_file='file-Q00KX4D5cmhVurY3v8hbe87Q', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [None]:
# Retrieve job status
job_id = "ftjob-sgcJ6eZ0oIrvUh6h2LFCDE2t"

# Retrieve the state of a fine-tune
# Status field can contain: running or succeeded or failed, etc.
job = client.fine_tuning.jobs.retrieve(job_id)

# The inference cells further down read `fine_tuned_model`, which was never
# assigned anywhere in the notebook. Take it from the job object; the recorded
# outputs show it as None while the job is still validating and as the
# "ft:..." model id once the status is 'succeeded'.
fine_tuned_model = job.fine_tuned_model

# Display the job as the cell result
job

FineTuningJob(id='ftjob-sgcJ6eZ0oIrvUh6h2LFCDE2t', created_at=1720713398, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal::9jqnjjHp', finished_at=1720714754, hyperparameters=Hyperparameters(n_epochs=5, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-y85pwkvP2H3Spp6kEp0E4ZVf', result_files=['file-635wfKHycDv39Ceo01b98xKD'], seed=1639185929, status='succeeded', trained_tokens=35245, training_file='file-Q00KX4D5cmhVurY3v8hbe87Q', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

### Evaluate results

In [None]:
import io
import pandas as pd
import base64

# Once training is finished, the metrics file id is listed in "result_files"
result_file = "file-635wfKHycDv39Ceo01b98xKD"

# Download the file and read the whole payload
encoded_csv = client.files.content(result_file).read()

# The payload is base64 encoded: decode it to CSV text
csv_text = base64.b64decode(encoded_csv).decode('utf-8')

# Parse the CSV text through an in-memory text buffer and show the DataFrame
df = pd.read_csv(io.StringIO(csv_text))
df

Unnamed: 0,step,train_loss,train_accuracy,valid_loss,valid_mean_token_accuracy
0,1,2.08559,0.59259,,
1,2,2.90910,0.64286,,
2,3,1.98537,0.55172,,
3,4,1.53578,0.73333,,
4,5,2.09285,0.64000,,
...,...,...,...,...,...
500,501,0.12417,0.95833,,
501,502,0.00650,1.00000,,
502,503,0.00022,1.00000,,
503,504,0.11767,0.97561,,


### Iterate on the Model results  

In [None]:
# Second training run: same base model name, with n_epochs set to 4
# (the first run earlier in this notebook used 5).
client.fine_tuning.jobs.create(
  # Id of a file previously uploaded with purpose="fine-tune".
  # NOTE(review): hard-coded id -- confirm it refers to the intended file.
  training_file="file-IntFuYDWVfJwMp6TpSrJa8aq",
  model="gpt-3.5-turbo",
  hyperparameters={
    "n_epochs":4
  }
)

FineTuningJob(id='ftjob-ddj5DgpOc0khmh3OCDkUxytC', created_at=1709004655, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=4, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-RZLvEijW4GW0KmC3rLIAjZlu', result_files=[], status='validating_files', trained_tokens=None, training_file='file-IntFuYDWVfJwMp6TpSrJa8aq', validation_file=None, user_provided_suffix=None)

In [None]:
# Retrieve job status
job_id = "ftjob-ddj5DgpOc0khmh3OCDkUxytC"

# Retrieve the state of a fine-tune
# Status field can contain: running or succeeded or failed, etc.
job = client.fine_tuning.jobs.retrieve(job_id)

# The inference cells further down read `fine_tuned_model`, which was never
# assigned anywhere in the notebook. Take it from the job object; it is only
# populated with the "ft:..." model id once the job status is 'succeeded'.
fine_tuned_model = job.fine_tuned_model

# Display the job as the cell result
job

FineTuningJob(id='ftjob-ddj5DgpOc0khmh3OCDkUxytC', created_at=1709004655, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model='ft:gpt-3.5-turbo-0613:keysoft::8wiiPbKa', finished_at=1709005600, hyperparameters=Hyperparameters(n_epochs=4, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-RZLvEijW4GW0KmC3rLIAjZlu', result_files=['file-rhw44JG1hrIRtpqXjGE0PK7C'], status='succeeded', trained_tokens=27388, training_file='file-IntFuYDWVfJwMp6TpSrJa8aq', validation_file=None, user_provided_suffix=None)

In [None]:
# Once training is finished, the metrics file id is listed in "result_files"
result_file = "file-rhw44JG1hrIRtpqXjGE0PK7C"

file_data = client.files.content(result_file)

# It's binary, so read it fully
file_data_bytes = file_data.read()

# The first evaluation cell of this notebook receives the results file base64
# encoded, while this cell was written for a raw CSV payload. Handle both:
# strict base64 decoding rejects raw CSV (commas are outside the base64
# alphabet), in which case the bytes are used unchanged.
try:
    csv_bytes = base64.b64decode(b"".join(file_data_bytes.split()), validate=True)
except ValueError:  # binascii.Error is a ValueError subclass
    csv_bytes = file_data_bytes

# Make it a file-like object and read it as CSV to create the DataFrame
file_like_object = io.BytesIO(csv_bytes)
df = pd.read_csv(file_like_object)
df

Unnamed: 0,step,train_loss,train_accuracy,valid_loss,valid_mean_token_accuracy
0,1,0.65891,0.72727,,
1,2,0.77342,0.77778,,
2,3,1.66960,0.76923,,
3,4,0.85210,0.81081,,
4,5,2.34291,0.60870,,
...,...,...,...,...,...
399,400,1.03416,0.66667,,
400,401,0.75915,0.75862,,
401,402,0.38260,0.83871,,
402,403,0.43998,0.86207,,


### Use a fine-tuned model

In [None]:
# Baseline: ask the base (not fine-tuned) model the question for comparison.
response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    # One dict per message. The original put both messages in a single dict
    # literal; duplicate keys keep only the last value, so the system message
    # was silently dropped and only the user message was sent.
    {"role": "system", "content": "This is a customer support chatbot designed to help with common inquiries."},
    {"role": "user", "content": "Does Fine cothes Lisa's Boutique offer international shipping?"}
  ]
)
print(response.choices[0].message.content)

It is uncertain whether Fine Clothes Lisa's Boutique offers international shipping as this information is not provided. It is recommended to contact the boutique directly to inquire about their shipping policies and options.


In [None]:
# Ask the fine-tuned model. `fine_tuned_model` must hold the model id reported
# in the `fine_tuned_model` field of a succeeded fine-tuning job ("ft:...").
response = client.chat.completions.create(
  model=fine_tuned_model,
  messages=[
    # One dict per message. The original put both messages in a single dict
    # literal; duplicate keys keep only the last value, so the system message
    # was silently dropped and only the user message was sent.
    {"role": "system", "content": "This is a customer support chatbot designed to help with common inquiries for Fine cothes Lisa's Boutique."},
    {"role": "user", "content": "Does Lisas's Boutique offer international shipping?"}
  ]
)
print(response.choices[0].message.content)

Yes, Lisa's Boutique offers international shipping to select countries. Check our shipping information page for more details.


### Customer Support Chatbot

In [None]:
# Sets the persona for the AI assistant using a system message.
# The text is built from adjacent string literals so that it is a single line
# identical to the system message of the training examples; the previous
# triple-quoted literal embedded a newline and a long run of spaces.
# ("cothes" is intentionally left as-is because the training data spells it so.)
context = [{
    'role': 'system',
    'content': (
        "This is a customer support chatbot designed to help with common "
        "inquiries for Fine cothes Lisa's Boutique."
    ),
}]

def collect_messages(role, message):
    """Append one turn to the running conversation stored in ``context``.

    Args:
        role: Message role, e.g. 'user' or 'assistant'.
        message: Message body; it is formatted into a string before storing.
    """
    context.append({'role': role, 'content': f"{message}"})

def get_completion():
    """Request the next assistant reply for the conversation held in ``context``.

    The reply is printed with an " Assistant: " prefix and returned.

    Returns:
        The reply text on success. If the request fails with
        ``openai.APIError``, the status code (when available) and the error
        message are printed and the error message is returned instead; the
        exception is not re-raised.
    """
    # The notebook only does `from openai import OpenAI`, so the module name
    # used in the except clause has to be imported here; without it the
    # handler itself raised NameError.
    import openai

    try:
        response = client.chat.completions.create(
            model=fine_tuned_model,
            messages=context
        )
    except openai.APIError as e:
        # The v1 client exposes `message` on APIError and `status_code` only on
        # HTTP-status errors; `http_status` / `error` belong to the pre-1.0 API.
        error_message = getattr(e, "message", str(e))
        print(getattr(e, "status_code", None))
        print(error_message)
        return error_message

    reply = response.choices[0].message.content
    print("\n Assistant: ", reply, "\n")
    return reply

# Start the conversation between the user and the AI assistant/chatbot.
# Every pass first asks the model for a reply (so the assistant speaks first),
# stores it, then reads the next user prompt; typing 'exit' ends the loop.
while True:
    assistant_reply = get_completion()
    collect_messages('assistant', assistant_reply)  # remember the assistant's turn

    user_prompt = input('User: ')  # input box for entering the prompt
    print(f'User:{user_prompt}')

    if user_prompt != 'exit':
        collect_messages('user', user_prompt)  # remember the user's turn
        continue

    # 'exit' was typed: say goodbye and stop
    print("\n Goodbye")
    break


 Assistant:  Welcome to Lisa's Boutique customer support. How can I assist you today? 

User:

 Assistant:  Hello! How can I help you today? 

User:need help with shipping

 Assistant:  For shipping information, please visit our Shipping Policy page. If you have specific questions, feel free to ask. 

User:need help with shipping

 Assistant:  Certainly, how can I assist you with shipping? 

User:do you have internetional shipping?

 Assistant:  Yes, we offer international shipping to select countries. For more details, please check our International Shipping Information page. 

User:what's the return policy?

 Assistant:  Our return policy allows returns within 30 days of receipt. For more details and to process a return, please visit our Return Policy page. 

User:is there a discout on first purchase?

 Assistant:  Yes, we offer a discount on your first purchase when you subscribe to our newsletter. Check the website for more details and to sign up. 

User:are there any discounts on