# Semantic Kernel OpenAI Assistant Agent File Search

## Prepare the files

In [33]:
import os

# Directory whose entries are later passed (as full paths) to the agent's
# vector store.
file_directory = "../Data/nasabooks"

# List all entries in the directory. The ``try`` body holds only the call that
# can raise; the listing is printed on the success path.
try:
    filenames = os.listdir(file_directory)
except FileNotFoundError:
    # Keep ``filenames`` bound so later cells that iterate over it do not fail
    # with a NameError when the data directory is missing.
    filenames = []
    print(f"Directory '{file_directory}' not found.")
else:
    print(filenames)

# Get the full path of a file
def get_filepath_for_filename(filename: str) -> str:
    """Return ``filename`` joined onto the module-level ``file_directory``."""
    return os.path.join(file_directory, filename)



['page-49.pdf', 'page-161.pdf', 'page-151.pdf', 'page-129.pdf', 'page-165.pdf', 'page-11.pdf', 'page-115.pdf', 'page-23.pdf', 'page-105.pdf', 'page-159.pdf', 'page-141.pdf', 'page-81.pdf', 'page-167.pdf', 'page-155.pdf', 'page-73.pdf', 'page-15.pdf', 'page-21.pdf', 'page-65.pdf', 'page-131.pdf', 'page-35.pdf', 'page-171.pdf', 'page-9.pdf', 'page-94.pdf', 'page-77.pdf', 'page-157.pdf', 'page-7.pdf', 'page-31.pdf', 'page-85.pdf', 'page-117.pdf', 'page-61.pdf', 'page-101.pdf', 'page-91.pdf', 'page-123.pdf', 'page-57.pdf', 'page-127.pdf', 'page-51.pdf', 'page-55.pdf', 'page-95.pdf', 'page-45.pdf', 'page-133.pdf', 'page-39.pdf', 'page-83.pdf', 'page-87.pdf', 'page-67.pdf', 'page-119.pdf', 'page-8.pdf', 'page-134.pdf', 'page-173.pdf', 'page-153.pdf', 'page-125.pdf', 'page-59.pdf', 'page-33.pdf', 'page-89.pdf', 'page-121.pdf', 'page-163.pdf', 'page-169.pdf', 'page-79.pdf', 'page-69.pdf', 'page-19.pdf', 'page-41.pdf', 'page-145.pdf', 'page-137.pdf', 'page-17.pdf', 'page-135.pdf', 'page-13.pdf'

## Reformat citations with the proper filenames

In [34]:
from semantic_kernel.contents.annotation_content import AnnotationContent

async def reformat_citations(agent, response):
    """Replace citation markers in a response with the cited filenames.

    Args:
        agent: Object whose ``client.files.retrieve(file_id)`` coroutine
            returns a file record exposing ``filename``.
        response: Message exposing ``items`` (scanned for
            ``AnnotationContent`` entries) and ``content`` (the answer text).

    Returns:
        ``response.content`` with every annotation's ``quote`` text replaced
        by ``" Source: "`` followed by the comma-separated filenames cited
        for that quote.
    """
    paragraph = response.content

    # Citation text -> filenames cited for it, in first-seen order.
    marker_to_filenames = {}
    # file_id -> filename, so a file cited several times is retrieved once.
    filename_by_id = {}

    for item in response.items:
        if not isinstance(item, AnnotationContent):
            continue

        file_id = item.file_id
        marker = item.quote
        # Skip incomplete annotations: ``str.replace`` raises TypeError on a
        # ``None`` marker and an empty marker would splice the sources between
        # every character; without a file id there is nothing to look up.
        if not marker or not file_id:
            continue

        if file_id not in filename_by_id:
            cited_file = await agent.client.files.retrieve(file_id)
            filename_by_id[file_id] = cited_file.filename
        filename = filename_by_id[file_id]

        cited = marker_to_filenames.setdefault(marker, [])
        # List each file only once per marker.
        if filename not in cited:
            cited.append(filename)

    # Replace the citation texts with their filenames prefixed with " Source: "
    for marker, cited in marker_to_filenames.items():
        paragraph = paragraph.replace(marker, " Source: " + ", ".join(cited))

    return paragraph

## Step 1-2: Create an Agent and Thread

In [35]:
from semantic_kernel.agents.open_ai.azure_assistant_agent import AzureAssistantAgent
from semantic_kernel.contents.chat_message_content import ChatMessageContent
from semantic_kernel.contents.utils.author_role import AuthorRole
from semantic_kernel.kernel import Kernel

# Step 1: Create an assistant agent
agent = await AzureAssistantAgent.create(
        kernel=Kernel(),
        service_id="agent",
        name="SK_OpenAI_Assistant_Agent_File_Search",
        instructions="""
            The document store contains pages from a Nasa book.
            Always analyze the document store to provide an answer to the user's question.
            Never rely on your knowledge of information not included in the document store.
            Always format response using markdown.
            """,
        enable_file_search=True,
        vector_store_filenames=[get_filepath_for_filename(filename) for filename in filenames],
    )

# Step 2: Create a thread
thread_id = await agent.create_thread()

## Step 3-5: Helper Function
3. Add a message to the thread
4. Run the Assistant
5. Display the Assistant's Response

In [36]:
async def run_agent(user_question):
    """Post a question to the shared thread and print the agent's replies.

    Uses the module-level ``agent`` and ``thread_id``. Each response yielded
    by ``agent.invoke`` is printed without a trailing newline; responses that
    carry ``AnnotationContent`` items are passed through
    ``reformat_citations`` first.

    Args:
        user_question: Text sent to the thread as the user message.
    """
    # STEP 3: Add a user question to the thread
    await agent.add_chat_message(
        thread_id=thread_id,
        message=ChatMessageContent(role=AuthorRole.USER, content=user_question),
    )

    # STEP 4: Invoke the agent to get a response
    async for response in agent.invoke(thread_id=thread_id):
        # A list built by a comprehension is never ``None``, so test for the
        # presence of annotations instead of comparing against ``None``.
        has_annotations = any(
            isinstance(item, AnnotationContent) for item in response.items
        )

        # STEP 5: Print the Assistant response
        if has_annotations:
            text = await reformat_citations(agent, response)
        else:
            text = response.content
        print(f"{text}", end="", flush=True)

In [37]:
# Ask the first question; run_agent prints the agent's answer.
user_question = "How did the wide floodplains in Queensland originate?"
await run_agent(user_question)

The wide floodplains in Queensland, particularly in the Channel Country area, are thought to be caused by the extreme variation in water and sediment discharges from the rivers such as the Georgina, Burke, and Hamilton rivers. The area experiences significant fluctuations in rainfall, ranging from years with no rainfall to years with modest rainfall, and occasionally, years with extremely high discharges due to tropical storms. These variations lead to the formation of semi-permanent wetlands and extensive floodplains when the high water flows inundate the entire floodplain area Source: page-49.pdf, page-163.pdf.

## Appending Messages to the Thread

In [38]:
# Ask a second question; run_agent posts it to the same module-level
# thread_id, so it is appended to the existing thread.
user_question = "What forms the Lower Amazon River?"
await run_agent(user_question)

The Lower Amazon River is formed where the Rio Solimões, carrying sediment from the Andes Mountains, meets the Rio Negro, with its nearly sediment-free, dark, tea-colored waters from the Colombian hills and jungles. When these two rivers converge east of Manaus, Brazil, they initially flow side by side in the same channel for a few kilometers. The cooler, denser, faster waters of the Solimões and the warmer, slower waters of the Negro create a visibly distinct boundary. Turbulent eddies eventually mix the two, resulting in the formation of the Lower Amazon River Source: page-61.pdf, page-111.pdf.

## Display Chat History

In [39]:
# Print every message returned for the thread as "<role> : <content>".
# In the captured output the most recent message comes first, and the
# assistant text still contains the raw citation marker because
# reformat_citations is not applied here.
async for message in agent.get_thread_messages(thread_id):
    print(f"{message.role} : {message.content}")

AuthorRole.ASSISTANT : The Lower Amazon River is formed where the Rio Solimões, carrying sediment from the Andes Mountains, meets the Rio Negro, with its nearly sediment-free, dark, tea-colored waters from the Colombian hills and jungles. When these two rivers converge east of Manaus, Brazil, they initially flow side by side in the same channel for a few kilometers. The cooler, denser, faster waters of the Solimões and the warmer, slower waters of the Negro create a visibly distinct boundary. Turbulent eddies eventually mix the two, resulting in the formation of the Lower Amazon River【8:0†source】.
AuthorRole.USER : What forms the Lower Amazon River?
AuthorRole.ASSISTANT : The wide floodplains in Queensland, particularly in the Channel Country area, are thought to be caused by the extreme variation in water and sediment discharges from the rivers such as the Georgina, Burke, and Hamilton rivers. The area experiences significant fluctuations in rainfall, ranging from years with no rainfa

## Deleting Files, Thread, Agent

In [40]:
if agent is not None:
    [await agent.delete_file(file_id) for file_id in agent.file_search_file_ids]
    await agent.delete_thread(thread_id)
    await agent.delete()
