In [7]:
import os
from openai import OpenAI
from typing_extensions import override
import pandas as pd
from openai import AssistantEventHandler
from IPython.core.display import display, HTML
import webbrowser
import tempfile
import re
import warnings
warnings.filterwarnings('ignore')

# Module-level OpenAI client; every helper defined in this notebook calls the API through it.
client = OpenAI()

In [9]:
def upload_files_to_vector_store(file_list, name, batch_size=500):
    """
    Uploads a list of files to a new Vector Store and prints and saves the Vector Store ID

    Args:
        file_list: Paths of the files to upload.
        name: Name given to the new Vector Store; also used in the name of the
            text file the Vector Store ID is saved to ('vector_store_id_{name}.txt').
        batch_size: Maximum number of files sent per upload call. Defaults to 500,
            the batch size this function has always used.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be at least 1, got {batch_size}')

    vector_store = client.beta.vector_stores.create(name=name)
    for start in range(0, len(file_list), batch_size):
        handles = []
        try:
            # Open the batch one file at a time so that a failure part-way through
            # still lets the `finally` clause close everything opened so far.
            for path in file_list[start:start + batch_size]:
                handles.append(open(path, "rb"))
            client.beta.vector_stores.file_batches.upload_and_poll(
                vector_store_id=vector_store.id, files=handles
            )
        finally:
            # Release the file descriptors before moving on to the next batch.
            for handle in handles:
                handle.close()

    # Persist the ID so the other helpers can look the store up by name later.
    with open(f'vector_store_id_{name}.txt', 'w') as f:
        f.write(vector_store.id)
    print(f'Uploaded {len(file_list)} files to Vector Store {vector_store.id}')

# Collect every entry of the local 'profiles' directory and upload them as the 'pi_profiles' store.
file_paths = [f'profiles/{entry}' for entry in os.listdir('profiles')]
upload_files_to_vector_store(file_paths, 'pi_profiles')

class EventHandler(AssistantEventHandler):
    """
    Streaming event handler that echoes assistant output to stdout as it arrives
    """

    @override
    def on_text_created(self, text) -> None:
        """Text-created hook; deliberately silent (no 'assistant >' prefix is printed)."""
        pass

    @override
    def on_text_delta(self, delta, snapshot) -> None:
        """Print the new piece of text with no trailing newline so chunks join up."""
        # Skip empty/missing values so the literal string "None" is never printed.
        if delta.value:
            print(delta.value, end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call) -> None:
        """Tool-call-created hook; deliberately silent."""
        pass

    @override
    def on_tool_call_delta(self, delta, snapshot) -> None:
        """Print code-interpreter input and log output; ignore every other tool type."""
        if delta.type != 'code_interpreter':
            return
        interpreter = delta.code_interpreter
        # Nothing to print for a delta that carries no code-interpreter payload.
        if interpreter is None:
            return
        if interpreter.input:
            print(interpreter.input, end="", flush=True)
        if interpreter.outputs:
            print("\n\noutput >", flush=True)
            for output in interpreter.outputs:
                # Only log outputs are printed; other output types are skipped.
                if output.type == "logs":
                    print(f"\n{output.logs}", flush=True)

def create_assistant(instructions, data_name, assitant_name, model='gpt-4o'):
    """
    Creates a File Search Assistant and attaches the Vector Store saved for `data_name`

    Args:
        instructions: Instructions text given to the assistant.
        data_name: Name the Vector Store was saved under; its ID is read from
            'vector_store_id_{data_name}.txt'.
        assitant_name: Name given to the assistant. The parameter's spelling is
            kept as-is so existing keyword callers keep working.
        model: Model name passed to the API.

    Returns:
        The assistant object returned by the update call that attaches the
        Vector Store.

    Raises:
        FileNotFoundError: If no ID file exists for `data_name`.
    """
    # Read the ID from the per-dataset file (the same name pattern that
    # upload_files_to_vector_store writes), closing the file afterwards.
    with open(f'vector_store_id_{data_name}.txt', 'r') as f:
        vector_store_id = f.read().strip()

    # Create a new Assistant with File Search Enabled
    assistant = client.beta.assistants.create(
        name=assitant_name,
        instructions=instructions,
        model=model,
        tools=[{"type": "file_search"}],
    )

    # Update the assistant to use the Vector Store
    assistant = client.beta.assistants.update(
        assistant_id=assistant.id,
        tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}},
    )

    return assistant

def stream_answer(query, assistant, data_name):
    """
    Sends `query` to `assistant` in a new thread and streams the reply to stdout

    Args:
        query: The user's message text.
        assistant: Assistant object; its `id` and `instructions` are used for the run.
        data_name: Name the Vector Store was saved under; its ID is read from
            'vector_store_id_{data_name}.txt' and attached to the thread.

    Raises:
        FileNotFoundError: If no ID file exists for `data_name`.
    """
    # Use a context manager so the ID file is closed as soon as it has been read.
    with open(f'vector_store_id_{data_name}.txt', 'r') as f:
        vector_store_id = f.read().strip()

    thread = client.beta.threads.create(
        messages=[{"role": "user", "content": query, "attachments": []}],
        tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}},
    )

    # EventHandler prints the streamed output; block here until the run finishes.
    with client.beta.threads.runs.stream(
        thread_id=thread.id,
        assistant_id=assistant.id,
        instructions=assistant.instructions,
        event_handler=EventHandler(),
    ) as stream:
        stream.until_done()

def stream_answer_query(query):
    """
    Streams the answer to `query` using the notebook-level `assistant` and `data_name`
    """
    stream_answer(query=query, assistant=assistant, data_name=data_name)

# Prompt text passed to create_assistant as the assistant's instructions: recommend
# individuals with similar research interests, in a two-part format, using only the
# information provided.
# NOTE(review): "individuals'." in item 1 looks like a stray apostrophe; left untouched
# because this text is sent to the model verbatim.
instructions = """
You are an assistant that points users to which individuals have similar research interests using the information provided to you.
When a user states their interests, you must provide a summary of the individuals that most resemble the user's interests and 
explain what in particular they have done or are doing that is similar to the user's interests.
Please use newlines to separate every single one of your sentences.

You must format your response as follows:
1. Provide the name, affiliation, and a brief summary of the relevant individuals'.
2. For each individual, elaborate on their specific research interests and provide their url, email, and address.

It is absolutely essential that you only use information from what is provided to you. Do not use any external information.
It is critical that all information is accurate and relevant to the user's interests.

If no relevant individuals are found, please say so and provide a description of those working in the most similar areas.
"""
# Dataset name: selects the 'vector_store_id_<data_name>.txt' file the helpers read,
# and matches the name the profiles were uploaded under.
data_name = 'pi_profiles'
assistant_name = 'pi_profiles_assistant'
# `assistant` and `data_name` are also read as globals by stream_answer_query.
assistant = create_assistant(instructions, data_name, assistant_name)

Uploaded 507 files to Vector Store vs_PILR6EF6tb1gCv4Hn3z7ylRV


In [6]:
query = "I am interested in the intersection of machine learning and biology. Only give me people at davis."
stream_answer_query(query)

Here are some individuals at UC Davis whose research aligns closely with the intersection of machine learning and biology:

1. **Parisa Emami**
    - **Affiliation**: Assistant Professor and Director of Uveitis Service, Department of Ophthalmology, UC Davis Health
    - **Summary**: Parisa Emami is a vitreoretinal surgeon and uveitis specialist combining artificial intelligence and machine learning with ocular imaging in her research.
    - **Research Interests**: Her work focuses on developing imaging biomarkers in patients with ocular inflammation, machine learning in disease status evaluation, and automated retina imaging devices.
    - **Contact Information**:
        - **Profile URL**: [Parisa Emami](https://citris-uc.org/people/person/parisa-emami/)
        - **Email**: pemamin@ucdavis.edu
        - **Address**: University of California, Davis, Davis, CA【4:0†source】【4:1†source】.

2. **Ilias Tagkopoulos**
    - **Affiliation**: Assistant Professor, UC Davis Genome Center
    - **S