In [None]:
#————————————————————

# Name: Azure OpenAI Assistant API, Data Pre-processing for Fine-Tuning

# Purpose:  This notebook will use the Azure OpenAI Assistant API to conduct data pre-processing steps on the recipes CSV for fine-tuning.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch), Alex Dean (adean@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 22.01.2024
# Last Updated: 25.05.2024
# Python Version: 3.10.4

# Troubleshooting:
# 

# Download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\sdsc\requirements.txt

#————————————————————

In [1]:
# Import Python packages
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI

In [2]:
# Read the lab settings from the .env file into the process environment.
# If the file is not found on Windows, try the path with single or doubled backslashes, e.g. "C:\\Python\\azure-openai-lab\\.venv\\.env"
env_file = Path("/workspaces/azure-openai-lab/.venv/.env")
load_dotenv(dotenv_path=env_file)

# Azure OpenAI key and endpoint. Both values are shown under "Keys and Endpoint" of the Azure OpenAI Service resource in portal.azure.com
azure_oai_key = os.environ['AZURE_OPENAI_KEY_P34']
azure_oai_endpoint = os.environ['AZURE_OPENAI_ENDPOINT_P34']

In [10]:
# Build the Azure OpenAI client that the following cells use for all service calls
client = AzureOpenAI(
    azure_endpoint=azure_oai_endpoint,
    api_key=azure_oai_key,
    api_version="2024-02-15-preview",
)

In [4]:
# Upload file into Azure OpenAI Service
path_input = r"/workspaces/azure-openai-lab/data/recipes-preprocessed.csv" #Change path if required

# Send the csv file to the service with purpose "assistants".
# The context manager closes the local file handle once the upload call has returned
# (previously the handle returned by open() was never closed).
with open(path_input, "rb") as csv_file:
    response = client.files.create(
        file=csv_file,
        purpose="assistants"
    )
print(response)

# Id of the uploaded file; a later cell attaches it to the assistant
file__id = response.id

FileObject(id='assistant-yJjDRA4iOizy89uOxKD9jUGR', bytes=107440, created_at=1716316089, filename='recipes-preprocessed.csv', object='file', purpose='assistants', status='processed', status_details=None)


In [4]:
# Alternative to the upload above: reuse a file that was already uploaded to Azure OpenAI Service (uncomment the lines below)
# for i in client.files.list():
#     if "recipes-preprocessed" in i.filename:
#         file__id = i.id
#         print(i.id)

assistant-yJjDRA4iOizy89uOxKD9jUGR


In [11]:
# Create data transformation instructions
# NOTE: the literal must be a raw string (r'''...'''). The EXAMPLES section contains JSON escape
# sequences (backslash-quote, backslash-backslash-slash, backslash-n). In a normal string Python
# interprets them before the text is sent, so the assistant would receive malformed, line-broken
# examples instead of the valid JSONL lines written here.
instructions = r'''
### INSTRUCTIONS
You are a senior data analyst who will work with data in an csv file.
You have access to a sandboxed environment for writing python code.
The objective is to create a datset for fine-tuning. The dataset must be formatted in the conversational format that is used by the Chat completions API.
An example of the conversational format is available in the EXAMPLES section.
When the user asks you to perform your actions, you will use the provided csv file and examples in the EXAMPLE section.
Execute each of the steps listed below in your ACTIONS section.

---

### EXAMPLES:

{"messages": [{"role": "system", "content": "This is a vegan recipe generator. The vegan recipe generated should be output as a JSON object in the format:{\"name\":\"\",\"minutes\":,\"tags\":\"[]\",\"nutrition\":\"[]\",\"n_steps\":\"\",\"steps\":\"[]\",\"description\":\"\",\"ingredients\":\"[]\", \"n_ingredients\":}"}, {"role": "user", "content": "Create well-thought-out and flavourful recipes from a list of ingredients implementing classic culinary techniques."}, {"role": "user", "content": "fresh snow peas, sesame oil, minced garlic cloves, salt & pepper"}, {"role": "assistant", "content": "{\"name\":\"garlic snow peas\",\"minutes\":5,\"tags\":\"['15-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'low-protein', 'healthy', '5-ingredients-or-less', 'appetizers', 'side-dishes', 'vegetables', 'asian', 'chinese', 'easy', 'dinner-party', 'romantic', 'vegan', 'vegetarian', 'dietary', 'low-sodium', 'low-cholesterol', 'stir-fry', 'comfort-food', 'low-carb', 'inexpensive', 'healthy-2', 'low-in-something', 'taste-mood', 'presentation', 'served-hot', 'technique']\",\"nutrition\":\"[46.6, 5.0, 5.0, 0.0, 2.0, 2.0, 1.0]\",\"n_steps\":6,\"steps\":\"['heat wok on medium-hi heat', 'add oil get it hot , dump in peas', 'about 1 minute after stir frying add garlic and seasoning', 'quick fry just till bright green and still crisp', 'cooking time is an estimate -- do not over cook or they will turn a dull green and go limp', 'remove and serve while still hot']\",\"description\":\"i love the ones at p f changs..so this is my version\",\"ingredients\":\"['fresh snow peas', 'sesame oil', 'minced garlic cloves', 'salt & pepper']\",\"n_ingredients\":4}\n"}]}
{"messages": [{"role": "system", "content": "This is a vegan recipe generator. The vegan recipe generated should be output as a JSON object in the format:{\"name\":\"\",\"minutes\":,\"tags\":\"[]\",\"nutrition\":\"[]\",\"n_steps\":\"\",\"steps\":\"[]\",\"description\":\"\",\"ingredients\":\"[]\", \"n_ingredients\":}"}, {"role": "user", "content": "Create well-thought-out and flavourful recipes from a list of ingredients implementing classic culinary techniques."}, {"role": "user", "content": "peaches, dates, ground cinnamon, water, almonds, coconut oil"}, {"role": "assistant", "content": "{\"name\":\"summertime peach crumble raw food\",\"minutes\":5,\"tags\":\"['15-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'preparation', 'cobblers-and-crisps', 'desserts', 'fruit', 'easy', 'vegan', 'vegetarian', 'food-processor-blender', 'dietary', 'low-sodium', 'low-in-something', 'pitted-fruit', 'peaches', 'equipment', 'small-appliance']\",\"nutrition\":\"[330.8, 32.0, 108.0, 3.0, 15.0, 34.0, 11.0]\",\"n_steps\":5,\"steps\":\"['for the base ~ place chopped peaches in large bowl & set aside', 'in blender or food processor , blend 5 dates , cinnamon & enough water to get a syrupy consistency', 'pour this date syrup over peaches & refrigerate at least 1 hour', 'for the topping ~ when peach \\/ date mixture has chilled , in a food processor , blend almonds , 4 dates & coconut oil to desired consistency', 'top peach \\/ date mixture with this topping & enjoy']\",\"description\":\"this recipe comes the goneraw internet site.\",\"ingredients\":\"['peaches', 'dates', 'ground cinnamon', 'water', 'almonds', 'coconut oil']\",\"n_ingredients\":6}\n"}]}
{"messages": [{"role": "system", "content": "This is a vegan recipe generator. The vegan recipe generated should be output as a JSON object in the format:{\"name\":\"\",\"minutes\":,\"tags\":\"[]\",\"nutrition\":\"[]\",\"n_steps\":\"\",\"steps\":\"[]\",\"description\":\"\",\"ingredients\":\"[]\", \"n_ingredients\":}"}, {"role": "user", "content": "Create well-thought-out and flavourful recipes from a list of ingredients implementing classic culinary techniques."}, {"role": "user", "content": "grape-nuts cereal, apple juice concentrate"}, {"role": "assistant", "content": "{\"name\":\"fat free pie crust\",\"minutes\":13,\"tags\":\"['15-minutes-or-less', 'time-to-make', 'course', 'cuisine', 'preparation', 'north-american', 'healthy', '5-ingredients-or-less', 'pies-and-tarts', 'desserts', 'american', 'easy', 'low-fat', 'vegan', 'vegetarian', 'pies', 'dietary', 'low-cholesterol', 'low-saturated-fat', 'healthy-2', 'low-in-something']\",\"nutrition\":\"[88.9, 0.0, 27.0, 5.0, 4.0, 0.0, 6.0]\",\"n_steps\":6,\"steps\":\"['preheat the oven to 350 f \\/ 180 deg c', 'mix together the grape nuts and apple juice concentrate', 'pat into a thin layer on the bottom and sides of a 9 pie pan', \\\"don't worry if there are some gaps\\\", 'bake for 8 minutes', 'cool before filling']\",\"description\":\"from the table of the physicians committee for responsible medicine\",\"ingredients\":\"['grape-nuts cereal', 'apple juice concentrate']\",\"n_ingredients\":2}\n"}]}
{"messages": [{"role": "system", "content": "This is a vegan recipe generator. The vegan recipe generated should be output as a JSON object in the format:{\"name\":\"\",\"minutes\":,\"tags\":\"[]\",\"nutrition\":\"[]\",\"n_steps\":\"\",\"steps\":\"[]\",\"description\":\"\",\"ingredients\":\"[]\", \"n_ingredients\":}"}, {"role": "user", "content": "Create well-thought-out and flavourful recipes from a list of ingredients implementing classic culinary techniques."}, {"role": "user", "content": "lemon juice, granulated sugar, water, lemon slice"}, {"role": "assistant", "content": "{\"name\":\"refreshing lemonade\",\"minutes\":5,\"tags\":\"['15-minutes-or-less', 'time-to-make', 'course', 'preparation', 'occasion', 'low-protein', 'healthy', '5-ingredients-or-less', 'beverages', 'easy', 'beginner-cook', 'low-fat', 'summer', 'vegan', 'vegetarian', 'dietary', 'low-sodium', 'low-cholesterol', 'seasonal', 'low-saturated-fat', 'low-calorie', 'inexpensive', 'low-in-something', 'presentation', 'served-cold']\",\"nutrition\":\"[116.5, 0.0, 115.0, 0.0, 0.0, 0.0, 10.0]\",\"n_steps\":4,\"steps\":\"['mix the lemon juice and sugar in a 2 quart pitcher', 'fill the container up with water until it reaches 2 quarts', 'float the lemon slices on top', 'chill , then enjoy !']\",\"description\":\"this is a really easy lemonade recipe.  i use bottled lemon juice because it's cheaper, but you could definately use fresh.\",\"ingredients\":\"['lemon juice', 'granulated sugar', 'water', 'lemon slice']\",\"n_ingredients\":4}\n"}]}

---

### ACTIONS:

1. Read the tab separated comma file data
2. Transform the data and create a jsonl file formatted in the conversational format as shown in the EXAMPLES section
3. The conversational format has a system role, user role and assistant role, each with text content stored inside an array of dictionaries
4. The first system role content is always: "This is a vegan recipe generator. The vegan recipe generated should be output as a JSON object in the format: {"name":"","minutes":,"tags":"[]","nutrition":"[]","n_steps":"","steps":"[]","description":"","ingredients":"[]", "n_ingredients":}"
5. The first user role content is always: "Create well-thought-out and flavourful vegan recipes from a list of ingredients implementing classic culinary techniques"
6. The subsequent user role content takes the list of ingredients in the column "ingredients" of the CSV file
7. The assistant role content uses all the column values of the CSV file as a JSON object with the format: {"name":"","minutes":,"tags":"[]","nutrition":"[]","n_steps":"","steps":"[]","description":"","ingredients":"[]", "n_ingredients":}
8. Split the data set into training and testing data sets with a "75%" and "25%" split respectively
9. Make sure both data sets have the same format provided by the EXAMPLES section
10. Name the data set with "75%" of the data "recipes-training-set"
11. Name the data set with "25%" of the data "recipes-validation-set"
12. Prepare both data sets as a jsonl file for download by the user

---

### DO NOT:
1. Do not return any images. 
2. Do not return any other file types.
'''

In [13]:
# Register the assistant: it receives the transformation instructions, the Code Interpreter tool and the uploaded csv file
assistant_settings = {
    "name": "data analyst assistant",
    "instructions": instructions,
    "tools": [{"type": "code_interpreter"}],
    "model": "gpt-4-1106-preview",  # or "gpt-4-0125-preview"; you must replace this value with the deployment name for your model
    "file_ids": [file__id],
}
assistant = client.beta.assistants.create(**assistant_settings)

# Id of the first file attached to the assistant
fileId = assistant.file_ids[0]

# Open a new conversation thread
thread = client.beta.threads.create()

In [14]:
# Post the user message that starts the data transformation with the Azure OpenAI Assistant Code Interpreter.
# The id of the csv file is appended to the request text.
request_text = "Please execute the INSTRUCTIONS and ACTIONS on the data stored in the CSV file using the EXAMPLES as reference for the output format "
prompt = "".join((request_text, fileId))

message = client.beta.threads.messages.create(content=prompt, role="user", thread_id=thread.id)

In [15]:
# Start a run of the assistant on the thread.
# An `instructions=...` argument may optionally be passed here, but it would override the assistant's default instructions.
run = client.beta.threads.runs.create(assistant_id=assistant.id, thread_id=thread.id)

In [16]:
# Check status of Azure OpenAI Assistant run: poll until the run reaches a terminal state.
# On success the thread messages are stored in `messages` (used by later cells) and printed.
sec = 30  # seconds to wait between two status checks
unsuccessful_states = ("expired", "failed", "cancelled")  # states after which the run will not complete

while True:
    # Wait before (re-)checking the run
    time.sleep(sec)
    # Retrieve a fresh snapshot of the run. `run` is the object returned when the run was created,
    # so its status does not change; every check below must use `run_status`.
    run_status = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )
    # If run is completed, get messages
    if run_status.status == 'completed':
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        # Loop through messages and print content based on role
        for msg in messages.data:
            role = msg.role
            try:
                content = msg.content[0].text.value
            except (AttributeError, IndexError):
                # .text does not exist (e.g. an image) or the message has no content parts
                print(f"{role.capitalize()}: [Non-text content, possibly an image or other file type]")
            else:
                print(f"{role.capitalize()}: {content}")
        break
    elif run_status.status == "requires_action":
        # Function calling is not handled in this notebook; keep polling
        print("run requires action...")
    elif run_status.status in unsuccessful_states:
        # run failed, expired, or was cancelled: report why and stop polling
        print(f"Run ended with status '{run_status.status}': {run_status.last_error}")
        break
    else:
        print("in progress...")

in progress...
Assistant: The data has been successfully processed and formatted into the conversational JSON line-format. Two dataset files have been created: 

- The training set, named "recipes-training-set.jsonl"
- The validation set, named "recipes-validation-set.jsonl"

Both files are ready for download:

- [Download the training set](sandbox:/mnt/data/recipes-training-set.jsonl)
- [Download the validation set](sandbox:/mnt/data/recipes-validation-set.jsonl)
Assistant: There seems to be an issue with the JSON decoding process for the 'ingredients' column. I will investigate and properly clean this data before attempting to reformat it into the conversational JSON line-format. Let's correct this and process the data again.
Assistant: The structure of the data from the CSV file has been successfully loaded, and we can see that it contains the following columns: 'name', 'minutes', 'tags', 'nutrition', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients', and 'dense_feat

In [19]:
# Helper functions to download files from Azure OpenAI and save them as jsonl files

# Folder the downloaded files are written to; the trailing comment holds a Windows-style alternative path
output_path = r"/workspaces/azure-openai-lab/data/generated_output/" #r"C:\\Python\\azure-openai-lab\\data\\generated_output\\"

# Write to jsonl
def write_jsonl(data_list: list, filename: str) -> None:
    """Serialise each item of ``data_list`` with ``json.dumps`` and write one item per line.

    Args:
        data_list: Items to serialise; each one becomes a single line of the file.
        filename: Path of the file to write. It is opened in mode "w", so an existing file is overwritten.
    """
    with open(filename, "w") as handle:
        handle.writelines(json.dumps(item) + "\n" for item in data_list)


def read_and_save_file(first_file_id, file_name, output_path):
    """Download a file from Azure OpenAI Service and save its records as a jsonl file.

    Every non-empty line of the downloaded file is parsed into an object before it is
    written, so each output line is a JSON object. Previously a blanket replacement of
    single quotes corrupted apostrophes inside the text, and the raw lines were
    JSON-encoded a second time, so each output line was a JSON string.

    Args:
        first_file_id: Id of the file to download.
        file_name: Name of the file to create.
        output_path: Folder prefix; ``file_name`` is appended to it directly, so it
            must end with a path separator.

    Raises:
        ValueError: If a line is neither valid JSON nor a Python literal.
    """
    import ast  # local import: only needed for the Python-literal fallback below

    # The content is binary, so read it and decode it to text
    file_data = client.files.content(first_file_id)
    file_text = file_data.read().decode('utf8')

    records = []
    for line_number, line in enumerate(file_text.splitlines(), start=1):
        stripped = line.strip()
        if not stripped:
            continue  # skip blank lines instead of writing empty records
        try:
            record = json.loads(stripped)
        except json.JSONDecodeError:
            # Fallback for lines written as Python literals (single-quoted dicts)
            try:
                record = ast.literal_eval(stripped)
            except (ValueError, SyntaxError) as err:
                raise ValueError(
                    f"Line {line_number} of file {first_file_id} is neither valid JSON nor a Python literal"
                ) from err
        records.append(record)

    target = output_path + file_name
    target_dir = os.path.dirname(target)
    if target_dir:
        # Create the output folder on first use
        os.makedirs(target_dir, exist_ok=True)
    write_jsonl(records, target)

    
def files_from_messages():
    """Save the files attached to the first listed thread message as the recipe data sets.

    Lists the messages of ``thread``, prints the file ids of the first message in the
    listing and saves each file into ``output_path`` via ``read_and_save_file``.
    """
    thread_messages = client.beta.threads.messages.list(thread_id=thread.id)
    # File ids attached to the first ThreadMessage of the listing
    attached_file_ids = thread_messages.data[0].file_ids
    print(attached_file_ids)
    for position, file_id in enumerate(attached_file_ids):
        # The file at position 1 is saved as the training set, every other position as the validation set.
        # NOTE(review): this relies on the order of the file ids - confirm it matches the generated files.
        file_name = "recipes-training-set.jsonl" if position == 1 else "recipes-validation-set.jsonl"
        read_and_save_file(file_id, file_name, output_path)

# Retrieve the files attached to the first message of the thread listing and save their content as jsonl files
files_from_messages()

['assistant-xtRZ3tMzHBNgCwmifE1uuazW', 'assistant-6XrrXUnP87zKQWxUCTyCt6t4']


In [20]:
#Clean up Azure OpenAI environment: remove the assistant, the thread and the files attached to the first listed message
client.beta.assistants.delete(assistant.id)
client.beta.threads.delete(thread.id)
# Iterate over the ids that are actually present instead of assuming exactly two files,
# which raised an IndexError with fewer files and left files behind with more.
for generated_file_id in messages.data[0].file_ids:
    client.files.delete(generated_file_id)