In [None]:
#————————————————————

# Name: Azure OpenAI Assistant API, Data Pre-processing for RAG

# Purpose:  This notebook uses the Azure OpenAI Assistant API to run data pre-processing steps on the recipes CSV used for RAG.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch), Alex Dean (adean@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 22.01.2024
# Last Updated: 19.05.2024
# Python Version: 3.10.4

#General Sources:
# https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/assistant
# https://learn.microsoft.com/en-us/azure/ai-services/openai/assistants-quickstart?tabs=command-line&pivots=programming-language-studio

#Azure Openai Usage:
# https://stackoverflow.com/questions/77986927/in-azure-openai-assistants-when-i-upload-a-file-and-save-it-where-is-that-file-s
# https://techcommunity.microsoft.com/t5/fasttrack-for-azure/strategies-for-optimizing-high-volume-token-usage-with-azure/ba-p/4007751#:~:text=Understanding%20tokens%20and%20limits%20in,generation%2C%20translation%2C%20or%20summarization.

#Additionals:
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models

# If necessary, download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\sdsc\requirements.txt

#————————————————————

In [1]:
# Import Python packages
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI

In [2]:
# Read the environment variables from the project's .env file.
# If loading fails, try switching between \ and \\ as the path separator.
# Alternative location (e.g. Codespaces): /workspaces/azure-openai-lab/.venv/.env
load_dotenv(dotenv_path=Path("C:\\Python\\azure-openai-lab\\.venv\\.env"))

# Azure OpenAI key and endpoint. Both values are listed in portal.azure.com
# on the Azure OpenAI Service resource under "Keys and Endpoint".
azure_oai_key = os.environ["AZURE_OPENAI_KEY"]
azure_oai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]

In [3]:
# Build the Azure OpenAI client from the key and endpoint loaded above.
client = AzureOpenAI(
    azure_endpoint=azure_oai_endpoint,
    api_key=azure_oai_key,
    api_version="2024-02-15-preview",
)

In [4]:
# Load data
path_input = r"C:\Python\azure-openai-lab\data\recipes.csv" #Change path if required

# Upload the CSV file with purpose "assistants" so it can be attached to an assistant.
# The context manager closes the local file handle as soon as the upload
# finishes or fails, instead of leaving it open for the kernel's lifetime.
with open(path_input, "rb") as csv_file:
    response = client.files.create(
        file=csv_file,
        purpose="assistants"
    )
print(response)

# ID of the uploaded file; used below when creating the assistant.
file__id = response.id

FileObject(id='assistant-amHGnHvsfqT6ResRBr2r9MuH', bytes=2915453, created_at=1716139225, filename='recipes_short.csv', object='file', purpose='assistants', status='processed', status_details=None)


In [5]:
# System instructions passed to the assistant when it is created below.
# They describe the role, the numbered ACTIONS the assistant must execute on
# the uploaded CSV, and the output restrictions (DO NOT section).
# NOTE(review): step 5 asks for a random sample without a fixed seed, so the
# resulting Table_1 will presumably differ between runs — confirm this is intended.
instructions = '''
You are a senior data analyst who will work with data in a csv file in your files. 
You have access to a sandbox environment for writing Python code.
When the user asks you to perform your actions, use the csv file to read the data into a pandas dataframe.
Execute each of the steps listed below in your ACTIONS section.


ACTIONS:

1. Read the file data into a pandas DataFrame. 
2. Drop columns "id", "contributor_id" and "submitted".
3. Trim column "name" by removing irregular text spacing at the front or back of each value while keeping single spaces between words.
4. Filter the data where column "tags" contains the word vegan.
5. Prepare a final data set named Table_1 that only has 50 rows randomly sampled from the dataframe from step 4.
6. Provide a summary paragraph explaining the preparation of the data set.
7. Prepare Table_1 as csv files for download by the user. 

DO NOT:
1. Do not return any images. 
2. Do not return any other file types.
'''

In [6]:
# Register the assistant: code-interpreter tool enabled, uploaded CSV attached.
assistant = client.beta.assistants.create(
    # NOTE: replace the model value with the deployment name of your own model
    # (alternative used previously: "gpt-4-0125-preview").
    model="gpt-4-1106-preview",
    name="data analyst assistant",
    instructions=instructions,
    tools=[{"type": "code_interpreter"}],
    file_ids=[file__id],
)

In [7]:
# Read back the ID of the file attached to the assistant (first entry of its
# file_ids list); it is embedded in the user prompt further below.
fileId = assistant.file_ids[0]
# Print the full assistant object for inspection.
print(assistant)

Assistant(id='asst_i58XTCBOV3atiCk25gTLzcng', created_at=1716139261, description=None, file_ids=['assistant-amHGnHvsfqT6ResRBr2r9MuH'], instructions='\nYou are a senior data analyst who will work with data in a csv file in your files. \nYou have access to a sandbox environment for writing Python code.\nWhen the user asks you to perform your actions, use the csv file to read the data into a pandas dataframe.\nExecute each of the steps listed below in your ACTIONS section.\n\n\nACTIONS:\n\n1. Read the file data into a pandas DataFrame. \n2. Drop columns "id", "contributor_id" and "submitted".\n3. Trim column "name" by removing irregular text spacing at the front or back of each value while keeping single spaces between words.\n4. Filter the data where column "tags" contains the word vegan.\n5. Prepare a final data set named Table_1 that only has 50 rows randomly sampled from the dataframe from step 4.\n6. Provide a summary paragraph explaining the preparation of the data set.\n7. Prepare

In [8]:
# Create a new thread; the user message and the assistant run below are
# attached to it through thread.id.
thread = client.beta.threads.create()

In [9]:
# Compose the user prompt (it names the uploaded file by its ID) and post it to the thread.

prompt = f"Please execute your ACTIONS on the data stored in the csv file {fileId}"

message = client.beta.threads.messages.create(
    role="user",
    content=prompt,
    thread_id=thread.id,
)

In [10]:
# Start a run of the assistant on the thread.
# Optionally pass instructions="New instructions" to the call below; note that
# this overrides the assistant's default instructions.

run = client.beta.threads.runs.create(
    assistant_id=assistant.id,
    thread_id=thread.id,
)

In [None]:
# Poll the run until it reaches a terminal state, then print the thread messages.
sec = 30  # seconds to wait between two status checks

while True:
    time.sleep(sec)

    # Retrieve a fresh snapshot of the run. All checks below must use this
    # snapshot: the `run` object returned at creation time is never updated.
    run_status = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )
    status = run_status.status

    if status == 'completed':
        # Run finished: fetch every message of the thread and print it by role.
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        for msg in messages.data:
            role = msg.role
            try:
                content = msg.content[0].text.value
                print(f"{role.capitalize()}: {content}")
            except (AttributeError, IndexError):
                # The first content part has no .text (or the message has no content at all).
                print(f"{role.capitalize()}: [Non-text content, possibly an image or other file type]")
        break
    elif status == "requires_action":
        # Function calling would be handled here. The assistant in this notebook
        # only registers the code interpreter tool, so nothing is submitted and
        # polling simply continues.
        print("Run requires action; no tool outputs are submitted by this notebook. Still waiting...")
    elif status in ("expired", "failed", "cancelled") or run_status.last_error is not None:
        # Run failed, expired, or was cancelled: report why and stop polling.
        print(f"Run ended with status '{status}'. Last error: {run_status.last_error}")
        break
    else:
        print("In progress...")

In [None]:
# Fetch the current messages of the thread (also used by the download cell below).
messages = client.beta.threads.messages.list(
    thread_id=thread.id
)

# Print the text of each message. A message whose first content part is not
# text (or that has no content) is reported with a placeholder instead of
# aborting the cell with an exception.
for msg in messages.data:
    try:
        print(msg.content[0].text.value)
    except (AttributeError, IndexError):
        print("[Non-text content, possibly an image or other file type]")


In [13]:
# Functions to read csv files from Azure OpenAI Service

# Folder the downloaded files are written to. It must end with a path separator
# because file names are appended to it by plain string concatenation.
# A regular (non-raw) string is used so that every "\\" is exactly one
# backslash; the previous raw string produced doubled separators in the path.
output_path = "C:\\Python\\azure-openai-lab\\data\\"

def read_and_save_file(first_file_id, file_name):
    """Download a file from the Azure OpenAI service and store it as CSV.

    The file content is fetched through the module-level ``client``, parsed
    as CSV into a pandas DataFrame, and written (without the index column)
    to ``output_path + file_name``.

    Args:
        first_file_id: ID of the file to download.
        file_name: Name of the CSV file to create inside ``output_path``.

    Returns:
        The DataFrame parsed from the downloaded content.

    Example:
        file = read_and_save_file(first_file_id, "analyst_output.csv")
    """
    # The service returns binary content; wrap the bytes in a file-like
    # object so pandas can parse them.
    raw_bytes = client.files.content(first_file_id).read()
    frame = pd.read_csv(io.BytesIO(raw_bytes))

    destination = output_path + file_name
    frame.to_csv(destination, index=False)
    return frame
    
def files_from_messages(messages, asst_name=None):
    """Download every file attached to the first message of a message list.

    Args:
        messages: Message list object exposing ``.data``; only ``data[0]`` is
            inspected (presumably the most recent message — confirm against
            the ordering used when listing messages).
        asst_name: Unused. Kept, now optional, for backward compatibility so
            the function can be called with or without it.

    Returns:
        None. Each file is saved through ``read_and_save_file``.
    """
    if not messages.data:
        # Nothing to download; avoid an IndexError on an empty message list.
        print("No messages found; nothing to save.")
        return

    first_thread_message = messages.data[0]  # Accessing the first ThreadMessage
    message_ids = first_thread_message.file_ids
    print(message_ids)

    # Save each file under its own name. The first file keeps the established
    # name; further files get a numeric suffix so they no longer overwrite it.
    for i, file_id in enumerate(message_ids):
        if i == 0:
            file_name = "recipes-preprocessed.csv"
        else:
            file_name = f"recipes-preprocessed-{i + 1}.csv"
        read_and_save_file(file_id, file_name)
        print(f'saved {file_name}')

In [14]:
# Extract the file IDs from the response and retrieve the content.
# The assistant name is passed as the second argument that the function signature declares.
files_from_messages(messages, assistant.name)

['assistant-nCYKZfZC9iEu0RTjhaWVMRgT']
saved recipe_preprocessed.csv


In [None]:
#Clean up Azure OpenAI environment

client.beta.assistants.delete(assistant.id)
client.beta.threads.delete(thread.id)

# Delete only the files that belong to this notebook: the uploaded CSV and the
# files attached to the thread's messages. Iterating over client.files.list()
# would remove every file stored in the Azure OpenAI resource, including files
# of other users or projects sharing it.
# NOTE(review): files created by the code interpreter but not attached to a
# message are not covered here — confirm whether they need separate cleanup.
notebook_file_ids = {file__id}
for msg in messages.data:
    notebook_file_ids.update(getattr(msg, "file_ids", None) or [])

for notebook_file_id in notebook_file_ids:
    client.files.delete(notebook_file_id)