# Azure OpenAI Assistants - File Search

## Load Azure Configuration

In [1]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment. Without
# this call the imported load_dotenv is never used, and os.getenv only sees
# variables that were already exported before the notebook kernel started.
load_dotenv()

# Azure OpenAI connection settings; each one is None when its variable is unset.
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_openai_deployment = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION")

## Prepare Files

In [7]:
from contextlib import ExitStack

from openai import AzureOpenAI

# Create a client
client = AzureOpenAI(
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key
)

# Create a vector store
vector_store = client.beta.vector_stores.create(name="Nasa Books")

# Specify the folder containing the files
folder_path = "../Data/nasabooks/"

# Get the paths of all regular files in the folder. os.listdir also returns
# sub-directories, which open(..., "rb") cannot read, so they are skipped.
candidate_paths = [os.path.join(folder_path, file_name) for file_name in os.listdir(folder_path)]
file_paths = [path for path in candidate_paths if os.path.isfile(path)]

# Use the upload and poll SDK helper to upload the files, add them to the vector store,
# and poll the status of the file batch for completion.
# ExitStack closes every opened stream once the upload finishes or fails,
# so no file handles are leaked in the notebook kernel.
with ExitStack() as stack:
    file_streams = [stack.enter_context(open(path, "rb")) for path in file_paths]
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )

In [None]:
# Show the result of the upload: the batch status, its file counts, and the
# id of the vector store, one value per line.
print(file_batch.status, file_batch.file_counts, vector_store.id, sep="\n")


## Steps 1-2:
1. Create an Assistant
2. Create a Thread

In [None]:
# Step 1: Create assistant
# The system prompt restricts answers to the content of the uploaded files.
assistant_instructions = """
  You are a assistant that provides information. 
   You will answer questions based on files provided to you about information in a NASA Book. 
   You will not provide answers outside of those files.
  """

# Enable file search and point it at the vector store created earlier.
file_search_tool = {"type": "file_search"}
file_search_resources = {"file_search": {"vector_store_ids": [vector_store.id]}}

assistant = client.beta.assistants.create(
    name="Nasa books Assistant",
    instructions=assistant_instructions,
    model=azure_openai_deployment,
    tools=[file_search_tool],
    tool_resources=file_search_resources,
    temperature=1,
    top_p=1,
)

# Step 2: Create thread
thread = client.beta.threads.create()
print(thread)

## Steps 3-6:
3. Add a message to the thread
4. Run the Assistant
5. Check the Run Status
6. Display the Assistant's Response

In [None]:
import time

user_question ="""What can I see in the United States?"""

# Step 3: Add a message to the thread
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content=user_question
)

# Step 4: Run the Assistant
run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=assistant.id
)

# Step 5: Check the Run Status
# Poll once per second while the run is still in a transitional state,
# printing the status after each poll as a progress indicator.
pending_statuses = ('queued', 'in_progress', 'cancelling')
while run.status in pending_statuses:
    time.sleep(1)
    run = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )
    if run.status in pending_statuses:
        print(run.status)

# Only a completed run has an answer to show. Stop with an explicit error for
# any other end state instead of failing later with a NameError on `messages`
# (or silently displaying a stale answer left over from an earlier execution).
if run.status != 'completed':
    raise RuntimeError(
        f"Run {run.id} ended with status '{run.status}': "
        f"{getattr(run, 'last_error', None)}"
    )

messages = client.beta.threads.messages.list(
    thread_id=thread.id
)

# Step 6: Display the Assistant's Response
# The first entry of the listing is used as the assistant's reply.
content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)

## Reformat citations with the proper filenames

In [11]:
def reformat_citations(value, annotations=None, get_filename=None):
    """Replace citation markers in a response with the cited filenames.

    Every distinct annotation text found in ``value`` is replaced by
    ``" Source: "`` followed by the comma-separated names of the files
    cited under that text.

    Args:
        value: The response text that contains the citation markers.
        annotations: Annotation objects exposing ``text`` and, for file
            citations, ``file_citation.file_id``. Defaults to the annotations
            of ``messages.data[0].content[0]`` (the module-level ``messages``).
        get_filename: Callable mapping a file id to a filename. Defaults to
            looking the file up with the module-level ``client``.

    Returns:
        ``value`` with each cited text replaced by its source filenames.
    """
    if annotations is None:
        annotations = messages.data[0].content[0].text.annotations
    if get_filename is None:
        def get_filename(file_id):
            return client.files.retrieve(file_id).filename

    filename_by_file_id = {}   # one lookup per file id, however often it is cited
    filenames_by_text = {}     # citation text -> filenames, in first-seen order

    for annotation in annotations:
        citation = getattr(annotation, "file_citation", None)
        text = getattr(annotation, "text", None)
        # Skip annotations that are not file citations, and empty texts:
        # str.replace("", x) would insert x between every character.
        if citation is None or not text:
            continue

        file_id = citation.file_id
        if file_id not in filename_by_file_id:
            filename_by_file_id[file_id] = get_filename(file_id)
        filename = filename_by_file_id[file_id]

        filenames = filenames_by_text.setdefault(text, [])
        if filename not in filenames:
            filenames.append(filename)

    # Replace the citation texts with their corresponding filenames prefixed with " Source: "
    paragraph = value
    for text, filenames in filenames_by_text.items():
        paragraph = paragraph.replace(text, " Source: " + ", ".join(filenames))

    return paragraph


In [None]:
# Swap the citation markers in the answer for their source filenames, then show it.
formatted_answer = reformat_citations(value)
print(formatted_answer)

## Delete Assistant

In [13]:
# Delete the assistant created above; the service's reply is kept in `response`.
response = client.beta.assistants.delete(assistant_id=assistant.id)