# Sandbox for openai file_search testing

### Big chunks

In [1]:
import os
from openai import OpenAI
from pprint import pprint
from IPython.display import display

In [2]:
client = OpenAI(api_key = os.getenv("OPENAI_API_KEY"))

In [3]:
chunking_strategy =  {
        "type": "static",
        "static": {
          "max_chunk_size_tokens": 4000, # reduce size to ensure better context integrity
          "chunk_overlap_tokens": 0 # increase overlap to maintain context across chunks
        }}

pprint(chunking_strategy)

vector_store = client.beta.vector_stores.create(name="rag_test",
                                                chunking_strategy=chunking_strategy)

{'static': {'chunk_overlap_tokens': 0, 'max_chunk_size_tokens': 4000},
 'type': 'static'}


In [4]:
vector_store.id

'vs_67b09765c6a88191b8f371b905bcddd0'

In [5]:
%%time
assistant_data = "/Users/boris/CMBAgents/rag_test"

print("Files to upload:")
file_paths = []
for root, dirs, files in os.walk(assistant_data):
    # Filter out unwanted directories like .ipynb_checkpoints
    dirs[:] = [d for d in dirs if not d.startswith('.')]
    for file in files:
        if file.startswith('.') or file.endswith('.ipynb')  or file.endswith('.yaml') or file.endswith('.txt'):
            continue
        print(f"\t - {file}")
        file_paths.append(os.path.join(root, file))

file_streams = [open(path, "rb") for path in file_paths]

file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id,
        files=file_streams
        )

Files to upload:
	 - handbook.pdf
CPU times: user 26.7 ms, sys: 6.29 ms, total: 33 ms
Wall time: 3.3 s


In [6]:
try:
    client.beta.assistants.delete(new_assistant.id)
except:
    print("assistant not found yet")

assistant not found yet


In [7]:
new_assistant = client.beta.assistants.create(
    name="rag_test",
    instructions="You are a file searcher",
    tools=[{"type": "file_search",
            "file_search":
            {'max_num_results': 3}}],
    tool_resources={"file_search": {"vector_store_ids":[vector_store.id]}},
    model="gpt-4o", 
    temperature = 0.00001,
    top_p = 0.05,
)

In [8]:
new_assistant.id

'asst_aEhwtliNadz3xtrwH3Qgeu59'

In [9]:
thread = client.beta.threads.create(
                messages=[],
            )

In [10]:
thread.id

'thread_1VeIkzAb7ydOfd6xnoWNyfE1'

In [11]:
message = "What is relative weighting of the report and code for M1 module?"

In [12]:
parsed = client.beta.threads.messages.create(
                thread_id=thread.id,
                content=message,
                role='user',
            )

In [13]:
run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=new_assistant.id,
    # pass the latest system message as instructions
    instructions="We perform file search",
)

In [14]:
print(run.id)

run_uXVHerG0K1d9CBJdp1YS4Jte


In [15]:
run

Run(id='run_uXVHerG0K1d9CBJdp1YS4Jte', assistant_id='asst_aEhwtliNadz3xtrwH3Qgeu59', cancelled_at=None, completed_at=None, created_at=1739626360, expires_at=1739626960, failed_at=None, incomplete_details=None, instructions='We perform file search', last_error=None, max_completion_tokens=None, max_prompt_tokens=None, metadata={}, model='gpt-4o', object='thread.run', parallel_tool_calls=True, required_action=None, response_format='auto', started_at=None, status='queued', thread_id='thread_1VeIkzAb7ydOfd6xnoWNyfE1', tool_choice='auto', tools=[FileSearchTool(type='file_search', file_search=FileSearch(max_num_results=3, ranking_options=FileSearchRankingOptions(score_threshold=0.0, ranker='default_2024_08_21')))], truncation_strategy=TruncationStrategy(type='auto', last_messages=None), usage=None, temperature=1e-05, top_p=0.05, tool_resources={}, reasoning_effort=None)

In [16]:
run = client.beta.threads.runs.retrieve(run.id, thread_id=thread.id)

In [17]:
print(run.usage)

Usage(completion_tokens=69, prompt_tokens=4344, total_tokens=4413, prompt_token_details={'cached_tokens': 0}, completion_tokens_details={'reasoning_tokens': 0})


In [18]:
# register cost 
prompt_tokens = run.usage.prompt_tokens
completion_tokens = run.usage.completion_tokens
total_tokens = run.usage.total_tokens

# cost = self.cost(run)
tokens_dict = {
    "model": run.model,
    "prompt_tokens": prompt_tokens,
    "completion_tokens": completion_tokens,
    "total_tokens": total_tokens,
}
display(tokens_dict)

{'model': 'gpt-4o',
 'prompt_tokens': 4344,
 'completion_tokens': 69,
 'total_tokens': 4413}

In [19]:
response_messages = client.beta.threads.messages.list(thread.id, order="asc")

In [20]:
response_messages

SyncCursorPage[Message](data=[Message(id='msg_UoSnnbKncCtQK2IbIbSEuMWn', assistant_id=None, attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='What is relative weighting of the report and code for M1 module?'), type='text')], created_at=1739626354, incomplete_at=None, incomplete_details=None, metadata={}, object='thread.message', role='user', run_id=None, status=None, thread_id='thread_1VeIkzAb7ydOfd6xnoWNyfE1'), Message(id='msg_tKUmZCF7THtWA80Sq9guOEiQ', assistant_id='asst_aEhwtliNadz3xtrwH3Qgeu59', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[FileCitationAnnotation(end_index=143, file_citation=FileCitation(file_id='file-PK6AQoHsWfCkKpmvSH1XB3'), start_index=125, text='【4:0†handbook.pdf】', type='file_citation')], value='The relative weighting of the report and code for the M1 module (Machine Learning) is 67% for the report and 33% for the code【4:0†handbook.pdf】.'), type='text')], created_at=1739626363, i

In [21]:
run_steps = client.beta.threads.runs.steps.list(
    thread_id=thread.id,
    run_id=run.id
)

In [22]:
i = 0
for step in run_steps.data:
    print("i: ", i)
    try:

        retrieved_step = client.beta.threads.runs.steps.retrieve(
            thread_id=step.thread_id,
            run_id=run.id,
            step_id=step.id,
            include=["step_details.tool_calls[*].file_search.results[*].content"]
        )
        r = 0
        for result in retrieved_step.step_details.tool_calls[0].file_search.results:
            print("\n\nr: ", r)
            print("\n\nresult: ", result)
            r += 1

    except:
        print("step.step_details.tool_calls: None")
    print("\n\nstep done\n\n")
    i += 1

i:  0
step.step_details.tool_calls: None


step done


i:  1


r:  0


result:  FileSearchResult(file_id='file-PK6AQoHsWfCkKpmvSH1XB3', file_name='handbook.pdf', score=0.9862850330868557, content=[FileSearchResultContent(text='S1 - Statistical Methods → S2 - Advanced Statistical Methods\nM1 - Machine Learning → M2 - Deep Learning\nC1 - Research Computing → C2 - High Performance Computing\n\nAs stated in the previous section on “managing your workload”, students who wish to take 2 major modules for\nexamination in Michaelmas and 3 from Lent should carefully consider both the prerequisite requirements for the\nLent modules, and the effect this will have on their workload balance during the year. Students considering this\noption should discuss it with a member of the MPhil teaching team in their review meeting before submitting their\nchoices for approval.\n\nEach Major module will count for 12% of the final grade and will be examined via a mix of coursework and written\nexams.\n\n3.1.1 

### Small chunks

In [1]:
import os
from openai import OpenAI
from pprint import pprint
from IPython.display import display

In [2]:
client = OpenAI(api_key = os.getenv("OPENAI_API_KEY"))

In [3]:
chunking_strategy =  {
        "type": "static",
        "static": {
          "max_chunk_size_tokens": 400, # reduce size to ensure better context integrity
          "chunk_overlap_tokens": 0 # increase overlap to maintain context across chunks
        }}

pprint(chunking_strategy)

vector_store = client.beta.vector_stores.create(name="rag_test",
                                                chunking_strategy=chunking_strategy)

{'static': {'chunk_overlap_tokens': 0, 'max_chunk_size_tokens': 400},
 'type': 'static'}


In [4]:
vector_store.id

'vs_67b097b4883c819184f21742313e9f81'

In [5]:
%%time
assistant_data = "/Users/boris/CMBAgents/rag_test"

print("Files to upload:")
file_paths = []
for root, dirs, files in os.walk(assistant_data):
    # Filter out unwanted directories like .ipynb_checkpoints
    dirs[:] = [d for d in dirs if not d.startswith('.')]
    for file in files:
        if file.startswith('.') or file.endswith('.ipynb')  or file.endswith('.yaml') or file.endswith('.txt'):
            continue
        print(f"\t - {file}")
        file_paths.append(os.path.join(root, file))

file_streams = [open(path, "rb") for path in file_paths]

file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id,
        files=file_streams
        )

Files to upload:
	 - handbook.pdf
CPU times: user 28.6 ms, sys: 6.32 ms, total: 35 ms
Wall time: 5.48 s


In [6]:
try:
    client.beta.assistants.delete(new_assistant.id)
except:
    print("assistant not found yet")

assistant not found yet


In [7]:
new_assistant = client.beta.assistants.create(
    name="rag_test",
    instructions="You are a file searcher",
    tools=[{"type": "file_search",
            "file_search":
            {'max_num_results': 3}}],
    tool_resources={"file_search": {"vector_store_ids":[vector_store.id]}},
    model="gpt-4o", 
    temperature = 0.00001,
    top_p = 0.05,
)

In [8]:
new_assistant.id

'asst_Mw9Vylelr7Qje18Jd35FxwKC'

In [9]:
thread = client.beta.threads.create(
                messages=[],
            )

In [10]:
thread.id

'thread_JxjbyBLMVZoNiJZoB5YaAlx0'

In [11]:
message = "What is relative weighting of the report and code for M1 module?"

In [12]:
parsed = client.beta.threads.messages.create(
                thread_id=thread.id,
                content=message,
                role='user',
            )

In [13]:
run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=new_assistant.id,
    # pass the latest system message as instructions
    instructions="We perform file search",
)

In [14]:
print(run.id)

run_5tSSMXdbBqEAxnIjxkIlkXDA


In [18]:
run

Run(id='run_5tSSMXdbBqEAxnIjxkIlkXDA', assistant_id='asst_Mw9Vylelr7Qje18Jd35FxwKC', cancelled_at=None, completed_at=None, created_at=1739626448, expires_at=1739627048, failed_at=None, incomplete_details=None, instructions='We perform file search', last_error=None, max_completion_tokens=None, max_prompt_tokens=None, metadata={}, model='gpt-4o', object='thread.run', parallel_tool_calls=True, required_action=None, response_format='auto', started_at=None, status='queued', thread_id='thread_JxjbyBLMVZoNiJZoB5YaAlx0', tool_choice='auto', tools=[FileSearchTool(type='file_search', file_search=FileSearch(max_num_results=3, ranking_options=FileSearchRankingOptions(score_threshold=0.0, ranker='default_2024_08_21')))], truncation_strategy=TruncationStrategy(type='auto', last_messages=None), usage=None, temperature=1e-05, top_p=0.05, tool_resources={}, reasoning_effort=None)

In [19]:
run = client.beta.threads.runs.retrieve(run.id, thread_id=thread.id)

In [20]:
print(run.usage)

Usage(completion_tokens=69, prompt_tokens=4338, total_tokens=4407, prompt_token_details={'cached_tokens': 0}, completion_tokens_details={'reasoning_tokens': 0})


In [21]:
# register cost 
prompt_tokens = run.usage.prompt_tokens
completion_tokens = run.usage.completion_tokens
total_tokens = run.usage.total_tokens

# cost = self.cost(run)
tokens_dict = {
    "model": run.model,
    "prompt_tokens": prompt_tokens,
    "completion_tokens": completion_tokens,
    "total_tokens": total_tokens,
}
display(tokens_dict)

{'model': 'gpt-4o',
 'prompt_tokens': 4338,
 'completion_tokens': 69,
 'total_tokens': 4407}

In [22]:
response_messages = client.beta.threads.messages.list(thread.id, order="asc")

In [23]:
response_messages

SyncCursorPage[Message](data=[Message(id='msg_I6PA9udwmJm61esmJDWA2fRH', assistant_id=None, attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='What is relative weighting of the report and code for M1 module?'), type='text')], created_at=1739626443, incomplete_at=None, incomplete_details=None, metadata={}, object='thread.message', role='user', run_id=None, status=None, thread_id='thread_JxjbyBLMVZoNiJZoB5YaAlx0'), Message(id='msg_Xb1Vloee659pPVcZ7Gujf2Fg', assistant_id='asst_Mw9Vylelr7Qje18Jd35FxwKC', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[FileCitationAnnotation(end_index=143, file_citation=FileCitation(file_id='file-Y8eTyzvzuGfFrKViMKPgij'), start_index=125, text='【4:0†handbook.pdf】', type='file_citation')], value='The relative weighting of the report and code for the M1 module (Machine Learning) is 67% for the report and 33% for the code【4:0†handbook.pdf】.'), type='text')], created_at=1739626451, i

In [24]:
run_steps = client.beta.threads.runs.steps.list(
    thread_id=thread.id,
    run_id=run.id
)

In [25]:
i = 0
for step in run_steps.data:
    print("i: ", i)
    try:

        retrieved_step = client.beta.threads.runs.steps.retrieve(
            thread_id=step.thread_id,
            run_id=run.id,
            step_id=step.id,
            include=["step_details.tool_calls[*].file_search.results[*].content"]
        )
        r = 0
        for result in retrieved_step.step_details.tool_calls[0].file_search.results:
            print("\n\nr: ", r)
            print("\n\nresult: ", result)
            r += 1

    except:
        print("step.step_details.tool_calls: None")
    print("\n\nstep done\n\n")
    i += 1

i:  0
step.step_details.tool_calls: None


step done


i:  1


r:  0


result:  FileSearchResult(file_id='file-Y8eTyzvzuGfFrKViMKPgij', file_name='handbook.pdf', score=0.9862615222581074, content=[FileSearchResultContent(text='S1 - Statistical Methods → S2 - Advanced Statistical Methods\nM1 - Machine Learning → M2 - Deep Learning\nC1 - Research Computing → C2 - High Performance Computing\n\nAs stated in the previous section on “managing your workload”, students who wish to take 2 major modules for\nexamination in Michaelmas and 3 from Lent should carefully consider both the prerequisite requirements for the\nLent modules, and the effect this will have on their workload balance during the year. Students considering this\noption should discuss it with a member of the MPhil teaching team in their review meeting before submitting their\nchoices for approval.\n\nEach Major module will count for 12% of the final grade and will be examined via a mix of coursework and written\nexams.\n\n3.1.1 