In [None]:
#————————————————————

# Name: Azure OpenAI Assistant API, Data Pre-processing for RAG

# Purpose:  This notebook will use the Azure OpenAI Assistant API to conduct data pre-processing steps on the recipes CSV used for RAG.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch)
# Created for: SDSC 2024 & ZHAW 2025
# Date Created: 22.01.2024
# Last Updated: 19.01.2025
# Python Version: 3.12.1

# Troubleshooting
# https://stackoverflow.com/questions/77986927/in-azure-openai-assistants-when-i-upload-a-file-and-save-it-where-is-that-file-s
# https://techcommunity.microsoft.com/t5/fasttrack-for-azure/strategies-for-optimizing-high-volume-token-usage-with-azure/ba-p/4007751#:~:text=Understanding%20tokens%20and%20limits%20in,generation%2C%20translation%2C%20or%20summarization.


# If necessary, download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\sdsc\requirements.txt

#————————————————————

In [1]:
# Import Python packages
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI

In [2]:
# Populate the process environment from the project's .env file.
# On Windows, point this at e.g. "C:\\Python\\azure-openai-lab\\.venv\\.env";
# if loading fails, try single vs. double backslashes in the path.
load_dotenv(
    dotenv_path=Path("/workspaces/azure-openai-lab/.venv/.env")
)

# Azure OpenAI key and endpoint. Both values are shown in portal.azure.com on the
# Azure OpenAI Service resource under "Keys and Endpoint".
# Subscripting os.environ raises KeyError when a variable is missing.
azure_oai_key = os.environ['AZURE_OPENAI_KEY_P34']
azure_oai_endpoint = os.environ['AZURE_OPENAI_ENDPOINT_P34']

In [3]:
# Build the Azure OpenAI client used by the rest of this notebook.
client = AzureOpenAI(
    azure_endpoint=azure_oai_endpoint,
    api_key=azure_oai_key,
    api_version="2024-05-01-preview",
)

In [4]:
# Upload file into Azure OpenAI Service
path_input = r"/workspaces/azure-openai-lab/data/recipes.csv" #Change path if required

# Send the csv file with purpose "assistants".
# The context manager closes the local file handle as soon as the upload call
# returns (or raises), instead of leaving it open for the kernel's lifetime.
with open(path_input, "rb") as csv_file:
    response = client.files.create(
        file=csv_file,
        purpose="assistants"
    )
print(response)

# Keep the service-side id of the uploaded file for the following cells.
file__id = response.id

FileObject(id='assistant-jdKC0TXW9EU41YJqdneKTJwn', bytes=2912718, created_at=1737347321, filename='recipes.csv', object='file', purpose='assistants', status='processed', status_details=None)


In [13]:
# Import existing uploaded file on Azure OpenAI Service
# for i in client.files.list():
#     if i.filename == "recipes.csv":
#         file__id = i.id
#         print(i.id)

assistant-ZL9Wks4GaFf7Rt6qtsY1UtMx


In [5]:
# Create data transformation instructions
# This prompt text is stored in `instructions` and is sent verbatim, so every
# character inside the triple quotes (including trailing spaces) is runtime data.
# It has three parts: a role description, the numbered ACTIONS to execute on the
# CSV, and a DO NOT list restricting the output to CSV files.
# NOTE(review): action 1 says "tab separated comma file", which is ambiguous --
# confirm the actual delimiter of recipes.csv and reword the step accordingly.
instructions = '''
### INSTRUCTIONS
You are a senior data analyst who will work with data in a csv file in your files. 
You have access to a sandbox environment for writing Python code.
When the user asks you to perform your actions, use the csv file to read the data into a pandas dataframe.
Execute each of the steps listed below in your ACTIONS section.

---

### ACTIONS:
1. Read the tab separated comma file data into a pandas DataFrame. 
2. Drop columns "id", "contributor_id" and "submitted".
3. Trim column "name" by removing irregular text spacing at the front or back of each value while keeping single spaces between words.
4. Filter the data where column "tags" contains the word vegan.
5. Create a new column named "dense_feature" combining the values of the columns "name", "tags", "nutrition", "ingredients" and "steps" separated by a semicolon.
6. Prepare a final data set named "recipes-preprocessed" that only has 50 rows randomly sampled from the dataframe from step 5.
7. Prepare recipes-preprocessed as csv files for download by the user. 

---

### DO NOT:
1. Do not return any images. 
2. Do not return any other file types.
'''

In [6]:
# Create the assistant: Code Interpreter is enabled as its only tool and the
# uploaded CSV is attached to that tool through its file id.
assistant = client.beta.assistants.create(
    model="gpt-4o-mini",  # Replace this value with the deployment name for your model.
    name="data analyst assistant",
    instructions=instructions,
    tools=[dict(type="code_interpreter")],
    tool_resources=dict(
        code_interpreter=dict(file_ids=[file__id]),
    ),
)

# Open a new, empty thread for the conversation with the assistant.
thread = client.beta.threads.create()

In [7]:
# Initialize the thread: add the user request that asks the assistant to carry
# out the data transformation with the Code Interpreter.
prompt = "Please execute the INSTRUCTIONS and ACTIONS on the data stored in the csv file"

message = client.beta.threads.messages.create(
    role="user",
    content=prompt,
    thread_id=thread.id,
)

In [8]:
# Start a run of the assistant on the thread.
# An `instructions=` argument may optionally be passed here, but it would
# override the assistant's default instructions.
run = client.beta.threads.runs.create(
    assistant_id=assistant.id,
    thread_id=thread.id,
)

In [9]:
# Check status of Azure OpenAI Assistant run
# Poll until the run reaches a terminal state. On success the thread messages
# are fetched into `messages` and printed.
sec = 30  # seconds to wait between two status checks

while True:
    time.sleep(sec)
    # Always re-fetch the run. The `run` object returned when the run was created
    # is a snapshot and is never refreshed, so every status check below must use
    # `run_status`, not `run`.
    run_status = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )
    if run_status.status == 'completed':
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        # Print every message, prefixed with the role that wrote it
        for msg in messages.data:
            role = msg.role
            try:
                content = msg.content[0].text.value
                print(f"{role.capitalize()}: {content}")
            except AttributeError:
                # The first content part has no .text attribute
                print(f"{role.capitalize()}: [Non-text content, possibly an image or other file type]")
        break
    elif run_status.status == "requires_action":
        # This notebook submits no tool outputs, so it can only report the state
        # and keep polling until the service moves the run to another state.
        print("Run requires action (no handler implemented), still waiting...")
    elif run_status.status in ("expired", "failed", "cancelled", "incomplete"):
        # The run ended without completing: report why and stop polling
        print(f"Run ended with status '{run_status.status}': {run_status.last_error}")
        break
    else:
        print("In progress...")

Assistant: The data processing is complete. You can download the preprocessed recipes CSV file using the link below:

[Download recipes-preprocessed.csv](sandbox:/mnt/data/recipes-preprocessed.csv)
User: Please execute the INSTRUCTIONS and ACTIONS on the data stored in the csv file


In [10]:
# Functions to read csv files from Azure OpenAI Service
output_path = r"/workspaces/azure-openai-lab/data/generated_output/" #r"C:\\Python\\azure-openai-lab\\data\\generated_output\\"

def read_and_save_file(first_file_id, file_name):
    """Download a CSV file from the Azure OpenAI Service and save it locally.

    Args:
        first_file_id: Id of the file on the service.
        file_name: Name of the CSV file to write inside ``output_path``.

    Returns:
        pandas.DataFrame: The downloaded data, parsed with ``pd.read_csv``.
    """
    # The content is delivered as bytes; wrap it in a file-like object for pandas
    file_data = client.files.content(first_file_id)
    file_like_object = io.BytesIO(file_data.read())
    returned_data = pd.read_csv(file_like_object)

    # Create the output folder if needed so the download is not lost to a
    # "no such directory" error, and join the path instead of concatenating
    # strings so a missing trailing separator in output_path does not matter.
    target_dir = Path(output_path)
    target_dir.mkdir(parents=True, exist_ok=True)
    returned_data.to_csv(target_dir / file_name, index=False)
    return returned_data
    
def files_from_messages():
    """Save every file attached to the first message in the thread listing.

    The first attachment is written as ``recipes-preprocessed.csv``. Any further
    attachment gets a numbered name (``recipes-preprocessed-1.csv``, ...), so
    that later files no longer overwrite earlier ones.

    Returns:
        None. Each saved file name is printed.
    """
    thread_messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
    if not thread_messages.data:
        print("No messages found in the thread")
        return
    # NOTE(review): assumes the listing is newest-first, so that entry 0 is the
    # assistant's final answer -- confirm against the API's default ordering.
    latest_message = thread_messages.data[0]
    # `attachments` may be None when the message carries no files
    attachments = latest_message.attachments or []
    if not attachments:
        print("No files attached to the latest message")
    for i, attachment in enumerate(attachments):
        if i == 0:
            file_name = "recipes-preprocessed.csv"
        else:
            file_name = f"recipes-preprocessed-{i}.csv"
        read_and_save_file(attachment.file_id, file_name)
        print(f'saved {file_name}')

# Extract the file ids from the response, retrieve the content and save the data as a csv file
files_from_messages()

saved recipes-preprocessed.csv


In [11]:
#Clean up Azure OpenAI environment
client.beta.assistants.delete(assistant.id)
client.beta.threads.delete(thread.id)

# Delete every generated file attached to the first message of the listing
# fetched earlier (`messages`), not only the first attachment. The guards avoid
# a crash when the listing is empty or the message has no attachments.
# NOTE(review): the uploaded input file (file__id) is not deleted here -- add
# client.files.delete(file__id) if it should not persist on the service.
generated_attachments = (messages.data[0].attachments or []) if messages.data else []
for attachment in generated_attachments:
    print(client.files.delete(attachment.file_id))


FileDeleted(id='assistant-R7m6awlGl7yHHjNgcM5VJEMO', deleted=True, object='file')