# Semantic Kernel OpenAI Assistant Agent File Search

## Prepare the files

In [56]:
import os

# Directory containing the documents used throughout this notebook.
file_directory = "../Data/nasabooks"


def get_filepath_for_filename(filename: str) -> str:
    """Return ``filename`` joined onto ``file_directory``.

    Args:
        filename: Name of a file inside ``file_directory``.

    Returns:
        The joined path. Whether the file exists is not checked.
    """
    return os.path.join(file_directory, filename)

# List all files in the directory.
base_directory = file_directory
try:
    # Keep regular files only: every entry is later turned into a path and
    # handed to the agent as a document, so sub-directories must be skipped.
    filenames = [
        entry
        for entry in os.listdir(base_directory)
        if os.path.isfile(os.path.join(base_directory, entry))
    ]
except FileNotFoundError:
    # Bind an empty list so later cells that read `filenames` do not fail
    # with a NameError when the directory is missing.
    filenames = []
    print(f"Directory '{base_directory}' not found.")
else:
    print(filenames)

['page-49.pdf', 'page-161.pdf', 'page-151.pdf', 'page-129.pdf', 'page-165.pdf', 'page-11.pdf', 'page-115.pdf', 'page-23.pdf', 'page-105.pdf', 'page-159.pdf', 'page-141.pdf', 'page-81.pdf', 'page-167.pdf', 'page-155.pdf', 'page-73.pdf', 'page-15.pdf', 'page-21.pdf', 'page-65.pdf', 'page-131.pdf', 'page-35.pdf', 'page-171.pdf', 'page-9.pdf', 'page-94.pdf', 'page-77.pdf', 'page-157.pdf', 'page-7.pdf', 'page-31.pdf', 'page-85.pdf', 'page-117.pdf', 'page-61.pdf', 'page-101.pdf', 'page-91.pdf', 'page-123.pdf', 'page-57.pdf', 'page-127.pdf', 'page-51.pdf', 'page-55.pdf', 'page-95.pdf', 'page-45.pdf', 'page-133.pdf', 'page-39.pdf', 'page-83.pdf', 'page-87.pdf', 'page-67.pdf', 'page-119.pdf', 'page-8.pdf', 'page-134.pdf', 'page-173.pdf', 'page-153.pdf', 'page-125.pdf', 'page-59.pdf', 'page-33.pdf', 'page-89.pdf', 'page-121.pdf', 'page-163.pdf', 'page-169.pdf', 'page-79.pdf', 'page-69.pdf', 'page-19.pdf', 'page-41.pdf', 'page-145.pdf', 'page-137.pdf', 'page-17.pdf', 'page-135.pdf', 'page-13.pdf'

## Step 1-2: Create an Agent and Thread

In [None]:
from semantic_kernel.agents.open_ai.azure_assistant_agent import AzureAssistantAgent
from semantic_kernel.contents.chat_message_content import ChatMessageContent
from semantic_kernel.contents.annotation_content import AnnotationContent
from semantic_kernel.contents.utils.author_role import AuthorRole
from semantic_kernel.kernel import Kernel

# Step 1: Create an assistant agent
agent = await AzureAssistantAgent.create(
        kernel=Kernel(),
        service_id="agent",
        name="SK_OpenAI_Assistant_Agent_File_Search",
        instructions="""
            The document store contains pages from a Nasa book.
            Always analyze the document store to provide an answer to the user's question.
            Never rely on your knowledge of information not included in the document store.
            Always format response using markdown.
            """,
        enable_file_search=True,
        vector_store_filenames=[get_filepath_for_filename(filename) for filename in filenames],
    )

# Step 2: Create a thread
thread_id = await agent.create_thread()

## Step 3-5:
3. Add a message to the thread
4. Run the Assistant
5. Display the Assistant's Response

In [None]:
user_question = "What can I see in the United States?"

# STEP 3: Add a user question to the thread
await agent.add_chat_message(
        thread_id=thread_id, 
        message=ChatMessageContent(role=AuthorRole.USER, content=user_question)
)

footnotes: list[AnnotationContent] = []

# STEP 4: Invoke the agent to get a response
async for response in agent.invoke(thread_id=thread_id):
    footnotes.extend([item for item in response.items if isinstance(item, AnnotationContent)])
    #STEP 5: Print the Assistant response
    print(f"{response.content}", end="", flush=True)

print()

if len(footnotes) > 0:
    for footnote in footnotes:
        print(
            f"\n`{footnote.quote}` => {footnote.file_id} "
            f"(Index: {footnote.start_index} - {footnote.end_index})"
        )

In [None]:
# Understand the response
# `response` is the object left bound by the last iteration of the invoke loop.
print("Value:", response.content, sep="\n")
print("\n")
print("Annotations:")
# Only the annotation items of the response are of interest here.
annotations = [part for part in response.items if isinstance(part, AnnotationContent)]
for annotation in annotations:
    print(annotation)

## Reformat citations with the proper filenames

In [51]:
async def reformat_citations(response):
    """Return the response text with cited passages replaced by file names.

    Each ``AnnotationContent`` item in ``response.items`` supplies the cited
    text (``quote``) and a ``file_id``. The file name is looked up through
    ``agent.client.files.retrieve`` and every occurrence of the cited text in
    ``response.content`` is replaced by `` Source: <file names>``.

    Args:
        response: Message object exposing ``items`` and ``content``.

    Returns:
        The rewritten text. ``response`` itself is not modified.
    """
    # Original response text
    paragraph = response.content

    # Cited text -> file names in first-seen order. The inner dict is used as
    # an ordered set so a file cited several times for the same text is
    # listed only once.
    filenames_by_text: dict[str, dict[str, None]] = {}
    # file_id -> file name, so each file is retrieved from the service once.
    filename_by_file_id: dict[str, str] = {}

    for item in response.items:
        if not isinstance(item, AnnotationContent):
            continue
        text, file_id = item.quote, item.file_id
        # Skip incomplete annotations: a missing quote cannot be replaced (and
        # an empty one would match between every character), and a missing
        # file_id cannot be resolved to a file name.
        if not text or not file_id:
            continue
        if file_id not in filename_by_file_id:
            cited_file = await agent.client.files.retrieve(file_id)
            filename_by_file_id[file_id] = cited_file.filename
        filenames_by_text.setdefault(text, {})[filename_by_file_id[file_id]] = None

    # Replace the citation texts with their file names prefixed with " Source: "
    for text, filenames in filenames_by_text.items():
        paragraph = paragraph.replace(text, " Source: " + ", ".join(filenames))

    return paragraph


In [None]:
print(await reformat_citations(response))

## Deleting Files, Thread, Agent

In [53]:
if agent is not None:
    [await agent.delete_file(file_id) for file_id in agent.file_search_file_ids]
    await agent.delete_thread(thread_id)
    await agent.delete()
