# Semantic Kernel OpenAI Assistant Agent File Search

## Prepare the files

In [1]:
import os

file_directory = "../Data/nasabooks"

# List all files in the directory.
# `filenames` starts as an empty list so that it is always defined: if the
# directory is missing, later code that iterates over it simply has nothing
# to process instead of failing with a NameError.
filenames: list[str] = []
try:
    filenames = os.listdir(file_directory)
    print(filenames)
except FileNotFoundError:
    print(f"Directory '{file_directory}' not found.")

# Get the full path of a file
def get_filepath_for_filename(filename: str) -> str:
    """Return the path of ``filename`` joined onto ``file_directory``.

    Args:
        filename: Name of a file expected to live in ``file_directory``.

    Returns:
        ``file_directory`` and ``filename`` combined with ``os.path.join``.
    """
    return os.path.join(file_directory, filename)



['page-49.pdf', 'page-161.pdf', 'page-151.pdf', 'page-129.pdf', 'page-165.pdf', 'page-11.pdf', 'page-115.pdf', 'page-23.pdf', 'page-105.pdf', 'page-159.pdf', 'page-141.pdf', 'page-81.pdf', 'page-167.pdf', 'page-155.pdf', 'page-73.pdf', 'page-15.pdf', 'page-21.pdf', 'page-65.pdf', 'page-131.pdf', 'page-35.pdf', 'page-171.pdf', 'page-9.pdf', 'page-94.pdf', 'page-77.pdf', 'page-157.pdf', 'page-7.pdf', 'page-31.pdf', 'page-85.pdf', 'page-117.pdf', 'page-61.pdf', 'page-101.pdf', 'page-91.pdf', 'page-123.pdf', 'page-57.pdf', 'page-127.pdf', 'page-51.pdf', 'page-55.pdf', 'page-95.pdf', 'page-45.pdf', 'page-133.pdf', 'page-39.pdf', 'page-83.pdf', 'page-87.pdf', 'page-67.pdf', 'page-119.pdf', 'page-8.pdf', 'page-134.pdf', 'page-173.pdf', 'page-153.pdf', 'page-125.pdf', 'page-59.pdf', 'page-33.pdf', 'page-89.pdf', 'page-121.pdf', 'page-163.pdf', 'page-169.pdf', 'page-79.pdf', 'page-69.pdf', 'page-19.pdf', 'page-41.pdf', 'page-145.pdf', 'page-137.pdf', 'page-17.pdf', 'page-135.pdf', 'page-13.pdf']

## Reformat citations with the proper filenames

In [2]:
async def reformat_citations(agent, response):
    """Return the response text with citation markers replaced by filenames.

    Every ``StreamingAnnotationContent`` item in ``response.items`` supplies a
    ``quote`` (the citation text to look for in the response content) and a
    ``file_id``. Each occurrence of a quote in the text is replaced with
    ``" Source: "`` followed by the comma-separated filenames cited for it.

    Args:
        agent: Agent whose ``client.files.retrieve`` is used to look up the
            filename behind each ``file_id``.
        response: Streamed response exposing ``items`` and ``content``.

    Returns:
        ``str(response.content)`` with the citation texts rewritten. The text
        is returned unchanged when there are no usable annotations.
    """
    from semantic_kernel.contents import StreamingAnnotationContent

    # Convert the response content to a string
    paragraph = str(response.content)

    # Citation text -> filenames cited for it (insertion-ordered, no repeats)
    text_filename_pairs = {}
    # file_id -> filename, so each file is retrieved from the service only once
    filename_by_file_id = {}

    for annotation in response.items:
        if not isinstance(annotation, StreamingAnnotationContent):
            continue

        file_id = annotation.file_id
        text = annotation.quote
        # Skip annotations that cannot be resolved. An empty or missing quote
        # must never reach str.replace: None raises TypeError and "" would
        # insert the source text between every character of the paragraph.
        if not file_id or not text:
            continue

        # Retrieve the filename from the file_id
        if file_id not in filename_by_file_id:
            cited_file = await agent.client.files.retrieve(file_id)
            filename_by_file_id[file_id] = cited_file.filename
        filename = filename_by_file_id[file_id]

        cited_filenames = text_filename_pairs.setdefault(text, [])
        if filename not in cited_filenames:
            cited_filenames.append(filename)

    # Replace the citation texts with their corresponding filenames prefixed with " Source: "
    for text, cited_filenames in text_filename_pairs.items():
        sources = " Source: " + ", ".join(cited_filenames)
        paragraph = paragraph.replace(text, sources)

    return paragraph

## Create an Agent and Thread

In [3]:
from semantic_kernel.agents import AssistantAgentThread, AzureAssistantAgent
from semantic_kernel.contents import StreamingAnnotationContent

# Create the client using Azure OpenAI resources and configuration
client, model = AzureAssistantAgent.setup_resources()

# Upload the files to the client
file_ids: list[str] = []
for path in [get_filepath_for_filename(filename) for filename in filenames]:
    with open(path, "rb") as file:
        file = await client.files.create(file=file, purpose="assistants")
        file_ids.append(file.id)

vector_store = await client.vector_stores.create(
    name="assistant_search",
    file_ids=file_ids,
)

# Get the file search tool and resources
file_search_tools, file_search_tool_resources = AzureAssistantAgent.configure_file_search_tool(vector_store_ids=vector_store.id)

# Create the assistant definition
definition = await client.beta.assistants.create(
    model=model,
    instructions="""
        The document store contains pages from a Nasa book.
        Always analyze the document store to provide an answer to the user's question.
        Never rely on your knowledge of information not included in the document store.
        Always format response using markdown.
        """,
    name="SampleAssistantAgent",
    tools=file_search_tools,
    tool_resources=file_search_tool_resources,
)

# Create the agent using the client and the assistant definition
agent = AzureAssistantAgent(
    client=client,
    definition=definition,
)

# Create a thread for the agent
thread: AssistantAgentThread = None

## Helper Function to Run the Agent

In [4]:
async def run_agent(user_question, thread):
    """Stream the agent's answer to ``user_question`` to stdout.

    Each streamed chunk is printed as it arrives. Chunks that carry
    ``StreamingAnnotationContent`` items are passed through
    ``reformat_citations`` first so that citation texts are shown as
    source filenames.

    Args:
        user_question: The message to send to the agent.
        thread: The thread to continue, or ``None``.

    Returns:
        The ``thread`` attribute of the last streamed response, or the
        ``thread`` argument unchanged if nothing was streamed. Assign it back
        to reuse the same thread for a follow-up question.
    """
    async for response in agent.invoke_stream(messages=user_question, thread=thread):
        thread = response.thread
        annotations = [item for item in response.items if isinstance(item, StreamingAnnotationContent)]
        # Print the Assistant response. The list is never None, so test for
        # emptiness to decide whether citations need reformatting.
        if not annotations:
            print(f"{response.content}", end="", flush=True)
        else:
            print(f"{await reformat_citations(agent, response)}", end="", flush=True)

    # Rebinding the parameter above is invisible to the caller, so hand the
    # thread back explicitly.
    return thread

In [5]:
user_question = "How did the wide floodplains in Queensland originate?"
await run_agent(user_question, thread)

The wide floodplains in Queensland, specifically in the Channel Country, are thought to have originated due to the extreme variation in water and sediment discharges from the rivers. This region experiences significant fluctuations in rainfall, leading to a unique hydrological pattern. Many years see no rainfall at all, causing the rivers to seemingly disappear. In contrast, years of even modest rainfall cause the main channels to carry water, often spilling into billabongs. Every few decades, tropical storms to the north can lead to immensely high water discharges that inundate the entire width of the floodplain. These extreme variations contribute to the formation and maintenance of the wide floodplains in this area Source: page-49.pdf.

## Appending Messages to the Thread

In [6]:
user_question = "What forms the Lower Amazon River?"
await run_agent(user_question, thread)

The Lower Amazon River forms where the coffee-colored Rio Solimões, rich with sediment, meets the black-tea-colored Rio Negro. This confluence occurs east of Manaus, Brazil, where the two rivers run side by side for several kilometers before their waters eventually mix due to turbulent eddies Source: page-61.pdf.

## Deleting Files, Thread, Agent

In [7]:
if agent is not None:
    [await client.files.delete(file_id) for file_id in file_ids]
    await thread.delete() if thread else None
    await client.beta.assistants.delete(agent.id)
