# 05_02_Iterate a fine-tuned model
# Customer Support Automation
## Automating responses to customer inquiries on various platforms (email, chatbots, social media).
### Collect a dataset of customer inquiries and manually crafted responses. This dataset should cover a wide range of common questions, complaints, and feedback, along with the company's standard responses. Ensure to anonymize personal information. 

## Full Project Solution

### Install the necesarry libraries. 

In [None]:
pip install openai

In [None]:
pip install openai[datalib]

In [None]:
pip install urllib3==1.26.6 

In [None]:
pip install python-dotenv

In [None]:
pip install tiktoken

### Import the libraries and enviornment file to gain access to the Open API Key
#### The key can be generated here: https://platform.openai.com/account/api-keys

In [1]:
import os
from openai import OpenAI

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

### Authenticate to the API using the API Key
#### Pull from environment variables or use openai.api_key = ("your_key_here") to hardcode the key

In [2]:
client = OpenAI(
  api_key=os.environ['OPENAI_API_KEY']  
)

### Helper Functions

In [3]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

encoding = tiktoken.get_encoding("cl100k_base")

#input_file=formatted_custom_support.json ; output_file=output.jsonl
def json_to_jsonl(input_file, output_file):
    
    # Open JSON file
    f = open(input_file)
     
    # returns JSON object as 
    # a dictionary
    data = json.load(f)
    
    # produce JSONL from JSON
    with open(output_file, 'w') as outfile:
        for entry in data:
            json.dump(entry, outfile)
            outfile.write('\n')

def check_file_format(dataset):
    # Format error checks
    format_errors = defaultdict(int)
    
    for ex in dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue
            
        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue
            
        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1
            
            if any(k not in ("role", "content", "name", "function_call") for k in message):
                format_errors["message_unrecognized_key"] += 1
            
            if message.get("role", None) not in ("system", "user", "assistant", "function"):
                format_errors["unrecognized_role"] += 1
                
            content = message.get("content", None)
            function_call = message.get("function_call", None)
            
            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1
        
        if not any(message.get("role", None) == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1
    
    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("No errors found")


# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

### Convert JSON to JSONL

In [4]:
json_to_jsonl('custom_support.json', 'output.jsonl')

### Check File Format

https://cookbook.openai.com/examples/chat_finetuning_data_prep

In [5]:
data_path = "output.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 101
First example:
{'role': 'system', 'content': "This is a customer support chatbot designed to help with common inquiries for Kesha's Boutique."}
{'role': 'user', 'content': 'How can I reset my password?'}
{'role': 'assistant', 'content': "You can reset your password by clicking on the 'Forgot Password' link on the login page and following the instructions sent to your email."}


In [6]:
# Format validation
check_file_format(dataset)

No errors found


### Cost Estimation

In [7]:
# Get the length of the conversation
conversation_length = []

for msg in dataset:
    messages = msg["messages"]
    conversation_length.append(num_tokens_from_messages(messages))
    
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096
TARGET_EPOCHS = 2
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)

if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in conversation_length)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

num_tokens = n_epochs * n_billing_tokens_in_dataset

Dataset has ~7049 tokens that will be charged for during training
By default, you'll train for 2 epochs on this dataset
By default, you'll be charged for ~14098 tokens


In [8]:
# gpt-3.5-turbo	$0.0080 / 1K tokens
cost = (num_tokens/1000) * 0.0080 
print(cost)

0.11278400000000001


### Upload File 
#### Once you have the data validated, the file needs to be uploaded using the 
#### Files API in order to be used with a fine-tuning jobs

In [9]:
client.files.create(
  file=open("output.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-4wKelNCveEWUzwYJxxx1FpFj', bytes=39681, created_at=1708959985, filename='output.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

### Create fine-tuned model

In [10]:
# Start the fine-tuning job 
# After you've started a fine-tuning job, it may take some time to complete. Your job may be queued 
# behind other jobs and training a model can take minutes or hours depending on the 
# model and dataset size. 

client.fine_tuning.jobs.create(
  training_file="file-4wKelNCveEWUzwYJxxx1FpFj", 
  model="gpt-3.5-turbo",
  hyperparameters={
    "n_epochs":2
  }
)

FineTuningJob(id='ftjob-SbkziY4IaWzhF8McTXuARdIv', created_at=1708959999, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=2, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-RZLvEijW4GW0KmC3rLIAjZlu', result_files=[], status='validating_files', trained_tokens=None, training_file='file-4wKelNCveEWUzwYJxxx1FpFj', validation_file=None)

In [19]:
# Retrieve job status
job_id = "ftjob-SbkziY4IaWzhF8McTXuARdIv"

# Retrieve the state of a fine-tune
# Status field can contain: running or succeeded or failed, etc.
client.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-SbkziY4IaWzhF8McTXuARdIv', created_at=1708959999, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model='ft:gpt-3.5-turbo-0613:keysoft::8wX1foz7', finished_at=1708960666, hyperparameters=Hyperparameters(n_epochs=2, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-RZLvEijW4GW0KmC3rLIAjZlu', result_files=['file-vJCY7Ft7qA2WlYXUSXZwZgdt'], status='succeeded', trained_tokens=13694, training_file='file-4wKelNCveEWUzwYJxxx1FpFj', validation_file=None)

### Evaluate results 

In [20]:
import io
import pandas as pd

#once training is finished, you can retrieve the file in "result_files=[]"
result_file = "file-vJCY7Ft7qA2WlYXUSXZwZgdt"

file_data = client.files.content(result_file)

# its binary, so read it and then make it a file like object
file_data_bytes = file_data.read()
file_like_object = io.BytesIO(file_data_bytes)

#now read as csv to create df
df = pd.read_csv(file_like_object)
df

Unnamed: 0,step,train_loss,train_accuracy,valid_loss,valid_mean_token_accuracy
0,1,1.90440,0.70968,,
1,2,1.16883,0.79167,,
2,3,2.79802,0.36667,,
3,4,1.33116,0.63889,,
4,5,1.82648,0.60714,,
...,...,...,...,...,...
197,198,0.56186,0.75676,,
198,199,0.51107,0.80645,,
199,200,0.43155,0.80000,,
200,201,0.61842,0.80952,,


### Use a fine-tuned model

In [38]:
response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "This is a customer support chatbot designed to help with common inquiries.",
    "role": "user", "content": "What is the return policy at Kesha's Boutique?"}
  ]
)
print(response.choices[0].message.content)

Unfortunately, we do not have detailed information on Kesha's Boutique's return policy. It is recommended to visit their website or contact them directly for more information on their return policy.


In [27]:
fine_tuned_model = "ft:gpt-3.5-turbo-0613:keysoft::8wX1foz7"

response = client.chat.completions.create(
  model=fine_tuned_model,
  messages=[
    {"role": "system", "content": "This is a customer support chatbot designed to help with common inquiries for Kesha's Boutique.",
     "role": "user", "content": "What is the return policy at Kesha's Boutique?"}
  ]
)
print(response.choices[0].message.content)

We apologize for any confusion, but there is no specific information available about Kesha's Boutique as it could be a fictional entity. Return policies can vary by retailer, so it is best to check directly with their customer service or consult their website for detailed information on their return policy.


In [28]:
response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "This is a customer support chatbot designed to help with common inquiries.",
    "role": "user", "content": "Does Kesha's Boutique offer international shipping?"}
  ]
)
print(response.choices[0].message.content)

It is unclear whether Kesha's Boutique offers international shipping as this information is not readily available on their website or publicly stated. It is recommended to contact the boutique directly to inquire about their international shipping policies.


In [29]:
response = client.chat.completions.create(
  model=fine_tuned_model,
  messages=[
    {"role": "system", "content": "This is a customer support chatbot designed to help with common inquiries for Kesha's Boutique.",
    "role": "user", "content": "Does Kesha's Boutique offer international shipping?"}
  ]
)
print(response.choices[0].message.content)

I'm sorry, but I couldn't find any information regarding Kesha's Boutique offering international shipping. It's recommended to check their website or contact their customer service for the most accurate and up-to-date information.


### Customer Support Chatbot

In [39]:
#sets the persona for the AI assistant using a system message
context = [{'role':'system', 'content': """This is a customer support chatbot designed to help with common 
                                           inquiries for Kesha's Boutique."""}]  

def collect_messages(role, message): #keeps track of the message exchange between user and assistant
    context.append({'role': role, 'content':f"{message}"})

def get_completion(): 
    try:
        response = client.chat.completions.create(
            model=fine_tuned_model,
            messages=context
        )

        print("\n Assistant: ", response.choices[0].message.content, "\n")
        return response.choices[0].message.content
    except openai.APIError as e:
        print(e.http_status)
        print(e.error)
        return e.error 

#Start the conversation between the user and the AI assistant/chatbot
while True:
    collect_messages('assistant', get_completion()) #stores the response from the AI assistant
        
    user_prompt = input('User: ') #input box for entering prompt
        
    if user_prompt == 'exit': #end the conversation with the AI assistant
        print("\n Goodbye")
        break
    
    collect_messages('user', user_prompt) #stores the user prompt


 Assistant:  Hello! How can I assist you today? 



User:  I need help with shipping.



 Assistant:  Sure, I can help you with that. What specifically would you like to know about shipping? 



User:  Do you offer international shipping?



 Assistant:  Yes, we offer international shipping. Shipping rates and delivery times may vary depending on the destination. You can find more information on our Shipping page. 



User:  What's the return policy?



 Assistant:  You can find our return policy on our Returns page. We offer a 30-day return window for most items, with some exceptions. If you have any specific questions about returns, please let me know. 



User:  Can I receive a discount as a first-time customer?



 Assistant:  We currently offer a 10% discount for first-time customers. You can find the discount code on our website or subscribe to our newsletter to receive it directly via email. 



User:  Do you accept American Express?



 Assistant:  Yes, we accept American Express, as well as several other major credit cards. You can find a full list of accepted payment methods in the Payment Options section during the checkout process. 



User:  exit



 Goodbye
