In [None]:
#————————————————————

# Name: Azure OpenAI Assistant for Data Transformations (V1)

# Purpose: Upload a csv file to Azure OpenAI and let an Assistant (code interpreter) clean and transform it.


# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 22.01.2024
# Last Updated: 22.01.2024
# Python Version: 3.10.4

#General Sources:
# https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/assistant
# https://learn.microsoft.com/en-us/azure/ai-services/openai/assistants-quickstart?tabs=command-line&pivots=programming-language-studio

#Azure OpenAI Usage:
# https://stackoverflow.com/questions/77986927/in-azure-openai-assistants-when-i-upload-a-file-and-save-it-where-is-that-file-s
# https://techcommunity.microsoft.com/t5/fasttrack-for-azure/strategies-for-optimizing-high-volume-token-usage-with-azure/ba-p/4007751#:~:text=Understanding%20tokens%20and%20limits%20in,generation%2C%20translation%2C%20or%20summarization.

#Additional Sources:
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models

# Download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\sdsc\requirements.txt

#————————————————————

In [1]:
# Import required libraries
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI

In [2]:
# Read the Azure OpenAI settings from the local .env file.
# The path is Windows-specific; if loading fails, try single (\) instead of doubled (\\) backslashes or vice versa.
env_file = Path("C:\\Python\\azure-openai-lab\\.venv\\.env")
load_dotenv(dotenv_path=env_file)

# Key and endpoint of the Azure OpenAI Service resource
# (portal.azure.com -> the resource -> "Keys and Endpoint").
azure_oai_key = os.environ['AZURE_OPENAI_KEY']
azure_oai_endpoint = os.environ['AZURE_OPENAI_ENDPOINT']

In [6]:
# Create the Azure OpenAI client from the key and endpoint read from the environment
client = AzureOpenAI(
    azure_endpoint=azure_oai_endpoint,
    api_key=azure_oai_key,
    api_version="2024-02-15-preview",
)

In [7]:
# Load data
path_input = r"C:\Python\azure-openai-lab\data\recipe_extraction.csv" #Change path if required

# Upload the csv file with purpose "assistants" so that an assistant can use it.
# The context manager closes the local file handle as soon as the upload call
# returns (previously the handle was opened inline and never closed).
with open(path_input, "rb") as csv_file:
    response = client.files.create(
        file=csv_file,
        purpose="assistants"
    )
print(response)

# ID of the uploaded file; attached to the assistant when it is created
file__id = response.id

FileObject(id='assistant-c4VW4rS9mCzb2G6kNZs8qBL7', bytes=976, created_at=1713960021, filename='recipe_extraction.csv', object='file', purpose='assistants', status='processed', status_details=None)


In [8]:
# System instructions passed to the assistant on creation: the role it plays,
# the ordered ACTIONS it has to run on the csv file, and the output types it
# must not return.
instructions = '''
You are a senior data analyst who will work with data in an csv file.
You have access to a sandboxed environment for writing python code.
When the user asks you to perform your actions, you will use the provided csv file.
You will perform data cleansing and transformation steps.
Execute each of the steps listed below in your ACTIONS section.

ACTIONS:

1. Read the csv file into a pandas DataFrame by tab separated values.
2. Extract the listed values from the column nutrition and change the datatype to numeric.
3. Sum the numeric values from the column nutrition and place them in a new column called nutrition_total. Prepare the results as Table_1.
4. Prepare Table_1 as an csv file for download by the user. 
5. Provide a summary paragraph explaining the preparation of the data set.

DO NOT:
1. Do not return any images. 
2. Do not return any other file types.
'''

In [9]:
# Create the assistant: code interpreter tool enabled and the uploaded file attached
assistant = client.beta.assistants.create(
    model="gpt-4-1106-preview",  # You must replace this value with the deployment name for your model.
    name="data analyst assistant",
    instructions=instructions,
    tools=[{"type": "code_interpreter"}],
    file_ids=[file__id],
)

In [10]:
# Keep the ID of the first file attached to the assistant; it is referenced in the user prompt
fileId = assistant.file_ids[0]
# Print the assistant object to inspect its configuration
print(assistant)

Assistant(id='asst_HOcI0EA5qHu6IMw9wr5bl2AV', created_at=1713960030, description=None, file_ids=['assistant-c4VW4rS9mCzb2G6kNZs8qBL7'], instructions='\nYou are a senior data analyst who will work with data in an csv file.\nYou have access to a sandboxed environment for writing python code.\nWhen the user asks you to perform your actions, you will use the provided csv file.\nYou will perform data cleansing and transformation steps.\nExecute each of the steps listed below in your ACTIONS section.\n\nACTIONS:\n\n1. Read the csv file into a pandas DataFrame by tab separated values.\n2. Extract the listed values from the column nutrition and change the datatype to numeric.\n3. Sum the numeric values from the column nutrition and place them in a new column called nutrition_total. Prepare the results as Table_1.\n4. Prepare Table_1 as an csv file for download by the user. \n5. Provide a summary paragraph explaining the preparation of the data set.\n\nDO NOT:\n1. Do not return any images. \n2.

In [11]:
# Create a new thread; the user prompt is added to it and the assistant is run on it in the following cells
thread = client.beta.threads.create()

In [12]:
# Post the user prompt (including the ID of the csv file) to the thread
prompt = f"Please execute your ACTIONS on the data stored in the csv file {fileId}"

message = client.beta.threads.messages.create(
    role="user",
    content=prompt,
    thread_id=thread.id,
)

In [13]:
# Start a run of the assistant on the thread
run = client.beta.threads.runs.create(
    assistant_id=assistant.id,
    thread_id=thread.id,
    # instructions="New instructions"  # Optional: new instructions can be provided here, but they override the default instructions
)

In [14]:
# Poll the run until it reaches a final state; on success print the thread messages.
sec = 30  # seconds to wait between two status checks

while True:
    time.sleep(sec)
    # Retrieve a fresh copy of the run. The `run` object returned by
    # runs.create() is only a snapshot, so every status check below must use
    # `run_status`; checking `run.status` would never see a failure and the
    # loop would poll forever.
    run_status = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )
    # If run is completed, get messages
    if run_status.status == 'completed':
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        # Loop through messages and print content based on role
        for msg in messages.data:
            role = msg.role
            try:
                content = msg.content[0].text.value
                print(f"{role.capitalize()}: {content}")
            except AttributeError:
                # This will execute if .text does not exist
                print(f"{role.capitalize()}: [Non-text content, possibly an image or other file type]")
        break
    elif run_status.status == "requires_action":
        # Placeholder: handle function calling and continue with the execution.
        # Nothing is submitted here, so the loop simply keeps polling.
        pass
    elif run_status.status in ("expired", "failed", "cancelled"):
        # Run failed, expired, or was cancelled: report why and stop polling.
        # NOTE: `messages` is not assigned in this case.
        print(f"Run ended with status '{run_status.status}'")
        last_error = getattr(run_status, "last_error", None)
        if last_error is not None:
            print(f"Last error: {last_error}")
        break
    else:
        print("in progress...")

in progress...
in progress...
in progress...
in progress...
in progress...
Assistant: It appears that there is a technical issue preventing me from processing the CSV file through this platform. Unfortunately, this issue is out of my control, and I am unable to read the file or perform the subsequent steps at this time. I recommend trying again later or contacting support if the issue persists. My apologies for the inconvenience.
Assistant: It seems the previous environment where pandas was loaded has been reset and pandas is no longer recognized. I will need to import pandas again and then proceed with loading the CSV file. Let's go through this step once more.
Assistant: There seems to be a persistent issue preventing me from reading the data. I will attempt one more time to read and process your CSV file. Let's hope it works this time.
Assistant: It looks like there was an issue when attempting to read the data. Let me try to read the csv file again and ensure it is processed correc

In [16]:
# messages = client.beta.threads.messages.list(
#             thread_id=thread.id
#         )

# messages.data

In [12]:
# Functions to download csv files from Azure OpenAI and save them locally

def read_and_save_file(first_file_id, file_name):
    """Download a file from Azure OpenAI, parse it as csv and save it locally.

    Args:
        first_file_id: ID of the file to fetch via ``client.files.content``.
        file_name: Path the parsed data is written to as csv (index omitted).

    Returns:
        The pandas DataFrame parsed from the downloaded content.

    Example:
        file = read_and_save_file(first_file_id, "analyst_output.csv")
    """
    # The content arrives as binary; wrap the bytes in an in-memory buffer so
    # pandas can read them like a file.
    raw_bytes = client.files.content(first_file_id).read()
    frame = pd.read_csv(io.BytesIO(raw_bytes))
    frame.to_csv(file_name, index=False)
    return frame
    
def files_from_messages(messages, asst_name):
    """Save every file attached to the first message of a message list.

    Only the first entry of ``messages.data`` is inspected. Each of its file
    IDs is downloaded with ``read_and_save_file`` and written to
    ``<asst_name>_output_<n>.csv``, with ``n`` counting from 1.

    Args:
        messages: Message list object exposing a ``data`` sequence whose
            items have a ``file_ids`` attribute.
        asst_name: Prefix used for the generated file names.

    Returns:
        None. The file IDs and the saved file names are printed.
    """
    # An empty message list has no first message; report "no files" instead of
    # failing with an IndexError.
    if not messages.data:
        print([])
        return

    first_thread_message = messages.data[0]  # Accessing the first ThreadMessage
    message_ids = first_thread_message.file_ids
    print(message_ids)
    # Loop through each file ID and save the file with a sequential name
    for position, file_id in enumerate(message_ids, start=1):
        file_name = f"{asst_name}_output_{position}.csv"
        read_and_save_file(file_id, file_name)
        print(f'saved {file_name}')

In [13]:
# Save the files attached to the first message in the list, prefixed with the assistant name
asst_name = "data_analyst_assistant"
files_from_messages(messages, asst_name)

[]


In [14]:
# Print every file object returned by the files listing
for stored_file in client.files.list():
    print(stored_file)

FileObject(id='assistant-U63rofolXcnMAtc75dizI5Xz', bytes=976, created_at=1713958466, filename='recipe_extraction.csv', object='file', purpose='assistants', status='processed', status_details=None)


In [16]:
# Clean up: remove the assistant and the thread created by this notebook
client.beta.assistants.delete(assistant.id)
client.beta.threads.delete(thread.id)

# WARNING: this deletes EVERY file returned by the listing, not only the csv uploaded above
for stored_file in client.files.list():
    client.files.delete(stored_file.id)

In [None]:

# Purge: delete ALL assistants and ALL files returned by the listings,
# not only the ones created by this notebook. Run with care.
for stale_assistant in client.beta.assistants.list():
    client.beta.assistants.delete(stale_assistant.id)

# Threads cannot be enumerated: the client has no `client.beta.threads.list()`
# method, so calling it would raise an AttributeError. Threads have to be
# deleted one by one using their known IDs, e.g.
#     client.beta.threads.delete(thread.id)

for stored_file in client.files.list():
    client.files.delete(stored_file.id)

In [25]:
# for i in range(50):

#     # Load data
#     path_input = r"C:\Python\data\openfoodfacts.xlsx" #Change path if required

#     # send the csv file to the assistant purpose files
#     response = client.files.create(
#     file=open(path_input, "rb"),
#     purpose="assistants"
#     )

#     file__id = response.id

#     # Create an assistant
#     assistant = client.beta.assistants.create(
#         name = "data analyst assistant " + str(i),
#         instructions = instructions,
#         tools = [{"type": "code_interpreter"}],
#         model = "gpt-4-1106-preview", #You must replace this value with the deployment name for your model.
#         file_ids=[file__id]
#     )

#     fileId = assistant.file_ids[0]

#     thread = client.beta.threads.create()

#     prompt = "Please execute your ACTIONS on the data stored in the xlsx file " + fileId

#     message = client.beta.threads.messages.create(
#         thread_id = thread.id,
#         role = "user",
#         content = prompt
#     )

#     run = client.beta.threads.runs.create(
#     thread_id=thread.id,
#     assistant_id=assistant.id,
#     #instructions="New instructions" #You can optionally provide new instructions but these will override the default instructions
#     )
    