In [None]:
#————————————————————

# Name: Azure OpenAI Assistant API, Data Pre-processing for Fine-Tuning

# Purpose:  This notebook will use the Azure OpenAI Assistant API to conduct data pre-processing steps on the recipes CSV for fine-tuning.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch), Alex Dean (adean@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 22.01.2024
# Last Updated: 25.05.2024
# Python Version: 3.10.4

# Troubleshooting:
# 

# Download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\sdsc\requirements.txt

#————————————————————

In [1]:
# Import Python packages
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI

In [2]:
# Load the required variables from the .env file into the process environment.
# The path is absolute; adjust it to your checkout. On Windows use e.g.
# "C:\\Python\\azure-openai-lab\\.venv\\.env" (if loading fails, check single vs. escaped backslashes).
load_dotenv(dotenv_path=Path("/workspaces/azure-openai-lab/.venv/.env"))

# Read the Azure OpenAI key and endpoint from the environment. These values can be found within the
# Azure OpenAI Service resource in portal.azure.com under Keys and Endpoint.
# A missing variable raises KeyError here, before any API call is attempted.
azure_oai_key = os.environ['AZURE_OPENAI_KEY']
azure_oai_endpoint = os.environ['AZURE_OPENAI_ENDPOINT']

In [3]:
# Initialize the Azure OpenAI client with the key and endpoint read from the environment.
# `client` is the module-level handle used by every later cell (files, assistants, threads, runs).
client = AzureOpenAI(
    api_key = azure_oai_key,  
    api_version = "2024-02-15-preview",  # API version string sent with every request
    azure_endpoint = azure_oai_endpoint
    )

In [4]:
# Upload file into Azure OpenAI Service [NOT USED IN WORKSHOP]
# path_input = r"C:\Python\azure-openai-lab\data\recipes-preprocessed.csv" #Change path if required

# # send the csv file to the assistant purpose files
# response = client.files.create(
#   file=open(path_input, "rb"),
#   purpose="assistants"
# )
# print(response)
# file__id = response.id

FileObject(id='assistant-yJjDRA4iOizy89uOxKD9jUGR', bytes=107440, created_at=1716316089, filename='recipes-preprocessed.csv', object='file', purpose='assistants', status='processed', status_details=None)


In [4]:
# Look up the already-uploaded recipes file on Azure OpenAI Service.
# `file__id` is initialised explicitly so that a missing upload is reported here with a
# clear message instead of surfacing later as a NameError (or as a stale id left over in
# the kernel from an earlier session). If several files match, the last one listed wins.
file__id = None
for uploaded_file in client.files.list():
    if "recipes-preprocessed" in uploaded_file.filename:
        file__id = uploaded_file.id
        print(uploaded_file.id)

if file__id is None:
    raise FileNotFoundError(
        "No uploaded file whose name contains 'recipes-preprocessed' was found on the "
        "Azure OpenAI resource. Upload the CSV first (see the upload cell above)."
    )

assistant-yJjDRA4iOizy89uOxKD9jUGR


In [5]:
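# Create the data transformation instructions (used as the assistant's instructions).
# NOTE(review): within the prompt, ACTION 4 prescribes a different system message than the one
# shown in EXAMPLES, and ACTION 1 refers to a "tab separated comma file" although the input is a
# CSV - confirm which wording is intended before reusing the generated data set.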
instructions = '''
### INSTRUCTIONS
You are a senior data analyst who will work with data in an csv file.
You have access to a sandboxed environment for writing python code.
The objective is to create a datset for fine-tuning. The dataset must be formatted in the conversational format that is used by the Chat completions API.
An example of the conversational format is available in the EXAMPLES section.
When the user asks you to perform your actions, you will use the provided csv file and examples in the EXAMPLE section.
Execute each of the steps listed below in your ACTIONS section.

---

### EXAMPLES:

{"messages": [{"role": "system", "content": "This is a recipe generator. The recipe generated should be output as a JSON object."}, {"role": "user", "content": "Create a flavourful recipe from a list of ingredients and provide the output as a JSON object"}, {"role": "user", "content": "firm silken tofu, fresh raspberries, agave nectar, lemon juice, water, sea salt"}, {"role": "assistant", "content": "{"name":"vegan tofu berry custard","minutes":10,"tags":"['15-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'preparation', 'occasion', 'desserts', 'fruit', 'dinner-party', 'summer', 'vegan', 'vegetarian', 'puddings-and-mousses', 'dietary', 'seasonal', 'berries']","nutrition":"[97.2, 4.0, 19.0, 7.0, 13.0, 1.0, 4.0]","n_steps":5,"steps":"['put berries in a saucepan with the water and sweetener , bring to the boil , and then lower the heat , and simmer until tender and a syrup forms', 'cool', 'put everything in your blender with the berries , and puree until smooth and creamy', 'add in more lemon juice and sweetener to taste if desired', 'chill in the fridge , and serve topped with more fresh berries and vegan cream']","description":"it just doesnu2019t get much easier than this delicious summer pudding or custard.rit can be prepared in less than 10 minutes (and that includes getting the ingredients out of your fridge), and is absolutely delicious. i got this recipe from my friend alison about 10 years ago, and my snout has paid homage to her fabulous berry custard trough countless times since. this refreshing chilled treat makes a fantastic, simple, last-minute dessert; great after school snack; or guilt free treat on a hot summeru2019s day. either way it is soy berry good! you could substitute any berries you like for this recipe. 
if you are allergic to soy u2013 fresh young thai coconut meat and coconut water works really well as a substitute.","ingredients":"['firm silken tofu', 'fresh raspberries', 'agave nectar', 'lemon juice', 'water', 'sea salt']","n_ingredients":6}"}]}
{"messages": [{"role": "system", "content": "This is a recipe generator. The recipe generated should be output as a JSON object."}, {"role": "user", "content": "Create a flavourful recipe from a list of ingredients and provide the output as a JSON object"}, {"role": "user", "content": "vegetable oil, onion, garlic cloves, eggplants, basil, oregano, salt, pepper, red pepper, yellow pepper, zucchini, tomato paste, chickpeas, tomatoes, fresh basil"}, {"role": "assistant", "content": "{"name":"ratatouille with chickpeas crock pot","minutes":255,"tags":"['main-ingredient', 'cuisine', 'preparation', 'low-protein', 'healthy', 'vegetables', 'french', 'easy', 'european', 'low-fat', 'vegan', 'vegetarian', 'crock-pot-slow-cooker', 'dietary', 'low-sodium', 'low-cholesterol', 'low-saturated-fat', 'low-calorie', 'low-carb', 'healthy-2', 'low-in-something', 'peppers', 'squash', 'equipment', '3-steps-or-less']","nutrition":"[219.5, 6.0, 41.0, 24.0, 17.0, 2.0, 13.0]","n_steps":9,"steps":"['in a large skillet , heat oil over medium heat , cook onion , garlic , eggplant , basil , oregano , salt & pepper , stirring occasionally , until onion is softened , about 10 minutes', 'scrape into crockpot', 'halve , core , and seed peppers', 'cut into 1 inch pieces', 'cut zucchini into half lengthwise , cut crosswise into 1 1 / 2 inch chunks', 'add to crockpot', 'add tomato paste , chickpeas , and tomatoes , breaking up tomatoes with a spoon', 'cover and cook on low for 4 hours , or until vegetables are tender', 'stir in basil / parsley']","description":"a fresh tasting take on the classic french dish, done in a crockpot.  from canadian living.","ingredients":"['vegetable oil', 'onion', 'garlic cloves', 'eggplants', 'basil', 'oregano', 'salt', 'pepper', 'red pepper', 'yellow pepper', 'zucchini', 'tomato paste', 'chickpeas', 'tomatoes', 'fresh basil']","n_ingredients":15}"}]}


---

### ACTIONS:

1. Read the tab separated comma file data.
2. Transform the data and create a jsonl file formatted in the conversational format as shown in the EXAMPLES section.
3. The conversational format has a system, user and assistant text input stored inside an array of dictionaries.
4. The system text input is always "Act as a head chef and create a flavourful recipe from a list of ingredients".
5. The user text input takes the list of ingredients in the column "ingredients" of the CSV file.
6. The assistant input takes all the columns of the CSV file.
7. Split the data set into training and testing data sets with a 25% split.
8. Make sure both data sets have the same format provided by the EXAMPLES section.
9. Name the data set with 75% of the data "recipes-training-set".
10. Name the data set with 25% of the data "recipes-validation-set".
11. Prepare both data sets as a jsonl file for download by the user.
12. Provide a summary paragraph explaining the preparation of the dataset.

---

### DO NOT:
1. Do not return any images. 
2. Do not return any other file types.
'''

In [6]:
# Create an Azure OpenAI Assistant with the code interpreter tool enabled,
# the instructions defined above, and the uploaded CSV attached as its file.
assistant = client.beta.assistants.create(
    name = "data analyst assistant",
    instructions = instructions,
    tools = [{"type": "code_interpreter"}],
    model = "gpt-4-1106-preview", #"gpt-4-0125-preview", #You must replace this value with the deployment name for your model.
    file_ids=[file__id]
)

# Get the id of the file attached to the assistant (only one file is attached above,
# so this is expected to equal file__id); it is appended to the user prompt later.
fileId = assistant.file_ids[0]

# Create an empty thread; the user message and the run are added to it in the next cells.
thread = client.beta.threads.create()

In [7]:
# Initialize the thread with the user request that starts the data transformation
# using the Azure OpenAI Assistant Code Interpreter.
# The id of the attached file is appended to the prompt text.
prompt = "Please execute the INSTRUCTIONS and ACTIONS on the data stored in the CSV file using the EXAMPLES as reference for the output format " + fileId

# Add the prompt to the thread as a user message.
message = client.beta.threads.messages.create(
    thread_id = thread.id,
    role = "user",
    content = prompt
)

In [8]:
# Run the Azure OpenAI Assistant on the thread.
# `run` is the object returned at creation time; it is not refreshed afterwards,
# so the current status must be fetched again with runs.retrieve (see the next cell).
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id,
  #instructions="New instructions" #You can optionally provide new instructions but these will override the default instructions
)

In [9]:
# Check status of Azure OpenAI Assistant run.
# Polls the run until it reaches a terminal state. On completion, the thread's messages
# are stored in `messages` (reused by the clean-up cell) and printed by role.
sec = 30  # polling interval in seconds
while True:
    # Wait before each status check
    time.sleep(sec)
    # Retrieve the current run status. Every branch below must test `run_status`:
    # `run` is the snapshot returned by runs.create and never changes, so testing
    # `run.status` would never observe a failed/expired/cancelled run.
    run_status = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )
    # If run is completed, get messages
    if run_status.status == 'completed':
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        # Loop through messages and print content based on role
        for msg in messages.data:
            role = msg.role
            try:
                content = msg.content[0].text.value
                print(f"{role.capitalize()}: {content}")
            except AttributeError:
                # This will execute if .text does not exist
                print(f"{role.capitalize()}: [Non-text content, possibly an image or other file type]")
        break
    elif run_status.status == "requires_action":
        # handle function calling and continue with the execution
        # (no function tools are registered on this assistant, so nothing to do here)
        pass
    elif run_status.status in ("expired", "failed", "cancelled"):
        # run failed, expired, or was cancelled: report why and stop polling
        print(f"Run ended with status '{run_status.status}': {getattr(run_status, 'last_error', None)}")
        break
    else:
        print("in progress...")

in progress...
in progress...
in progress...
in progress...
in progress...
in progress...
in progress...
in progress...
Assistant: The datasets have been successfully transformed into the conversational format and split into a training set and a validation set. The resulting jsonl files have been saved as "recipes-training-set.jsonl" and "recipes-validation-set.jsonl".

Here is a summary paragraph explaining the preparation of the dataset:

To prepare the dataset, the original CSV file was read and loaded into a pandas DataFrame, taking care to correctly handle commas and quotes that indicate fields containing lists or descriptions. Each row of the dataset was subsequently transformed into the conversational format as specified by the instructions, which includes a system prompt, a user input containing the list of ingredients, and an assistant response formatted with all relevant recipe information. After the transformation, the data was split according to a 75% / 25% ratio for traini

In [68]:
# Helper functions to download the files generated by the assistant from Azure OpenAI
# and save them locally as jsonl files.

# Directory the generated files are written to. File names are appended by plain string
# concatenation, so the trailing separator is required.
# Windows alternative: r"C:\\Python\\azure-openai-lab\\data\\generated_output\\"
output_path = r"/workspaces/azure-openai-lab/data/generated_output/"

# Write to jsonl
def write_jsonl(data_list: list, filename: str) -> None:
    """Serialise every element of ``data_list`` with ``json.dumps`` and write one per line.

    Args:
        data_list: Items to serialise; each must be JSON-serialisable.
        filename: Path of the file to create (an existing file is overwritten).
    """
    with open(filename, "w") as handle:
        handle.writelines(f"{json.dumps(record)}\n" for record in data_list)


def read_and_save_file(first_file_id, file_name, output_path):
    """Download one file from Azure OpenAI and store its lines via ``write_jsonl``.

    Args:
        first_file_id: Id of the file to download with ``client.files.content``.
        file_name: Name of the local file to create.
        output_path: Directory prefix; ``file_name`` is appended by string concatenation.

    NOTE(review): every single quote in the downloaded text is replaced by a double quote,
    which also rewrites apostrophes inside recipe text - confirm this is intended.
    NOTE(review): the lines are passed to ``write_jsonl`` as plain strings, so each output
    line is a JSON-encoded string rather than a JSON object - verify against the consumer.
    """
    # The content comes back as bytes, so decode it before any text processing.
    raw_bytes = client.files.content(first_file_id).read()
    text = raw_bytes.decode('utf8')
    lines = text.replace("'", '"').splitlines()
    write_jsonl(lines, output_path + file_name)

    
def files_from_messages():
    """Save the files attached to the first listed thread message as jsonl files.

    Lists the messages of the module-level ``thread``, takes the first entry, prints its
    ``file_ids`` and downloads each one into ``output_path``.

    NOTE(review): naming is purely positional - the file at index 1 is saved as the
    training set, every other index as the validation set (so extra files overwrite it).
    Confirm the order of ``file_ids`` matches the order the assistant produced the files.
    """
    first_message = client.beta.threads.messages.list(thread_id=thread.id).data[0]
    attached_ids = first_message.file_ids
    print(attached_ids)
    # Pick the target name from the position of the id in the list, then download and save.
    for position, file_id in enumerate(attached_ids):
        target_name = "recipes-training-set.jsonl" if position == 1 else "recipes-validation-set.jsonl"
        read_and_save_file(file_id, target_name, output_path)

# Read the file ids attached to the first listed thread message, download each file
# and save it as a jsonl file in output_path.
files_from_messages()

['assistant-rCY7clhjRgszoJkfcSdytXm2', 'assistant-vSOHoUpYrtcgYOsenfCs14O5']


In [19]:
#Clean up Azure OpenAI environment: delete the assistant, the thread and the generated files.
client.beta.assistants.delete(assistant.id)
client.beta.threads.delete(thread.id)
# Delete every file attached to the first message fetched by the polling cell.
# Iterating over the actual ids (instead of a fixed range of two) avoids an IndexError
# when fewer files were generated and avoids leaving files behind when more were.
# `messages` only exists if the run completed successfully.
for generated_file_id in messages.data[0].file_ids:
    client.files.delete(generated_file_id)