In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [None]:
# Load the Docling sample data (includes the `chunk_embedding` column used below)
# from the parquet file shipped in the feature repo's data directory.
df = pd.read_parquet("./feature_repo/data/docling_samples.parquet")

In [4]:
# Length of the first row's embedding vector, i.e. the embedding dimensionality.
len(df['chunk_embedding'].values[0])

384

In [None]:
from feast import FeatureStore

# Open the Feast feature store configured by the repo in ./feature_repo.
store = FeatureStore(repo_path="./feature_repo")

In [9]:
embedding_length = len(df['chunk_embedding'].values[0])
print(f'embedding length = {embedding_length}')

# Stamp every row with the ingestion time, as a timezone-aware UTC timestamp.
# Feast interprets tz-naive datetimes as UTC, so a naive local
# `pd.Timestamp.now()` would be off by the machine's UTC offset.
# NOTE(review): presumably `created` is the feature view's timestamp field — confirm
# against the feature repo definition.
df['created'] = pd.Timestamp.now(tz='UTC')

# Ingesting transformed data to the feature view that has no associated transformation
store.write_to_online_store(feature_view_name='docling_feature_view', df=df)

print('batch ingestion done')

embedding length = 384
Connecting to Milvus in local mode using feature_repo/data/online_store.db
batch ingestion done


In [None]:
from feature_repo.feature_repo import embed_text

In [None]:
# The user question, and its embedding produced by the feature repo's `embed_text`
# helper; the embedding is what gets passed to the vector search below.
query_text = "Who is Ahmed Nassar?"
query_embedding = embed_text(query_text)

In [None]:
# Vector search against the online store: fetch the 3 closest chunks for the
# query embedding (cosine metric) and materialize the result as a DataFrame.
context_data = store.retrieve_online_documents_v2(
    query=query_embedding,
    features=[
        "docling_feature_view:vector",
        "docling_feature_view:file_name",
        "docling_feature_view:raw_chunk_markdown",
        "docling_feature_view:chunk_id",
    ],
    distance_metric="COSINE",
    top_k=3,
).to_df()

In [8]:
# Inspect the retrieved chunks alongside the `distance` column returned by the search.
context_data[['vector', 'file_name', 'raw_chunk_markdown', 'distance']]

Unnamed: 0,vector,file_name,raw_chunk_markdown,distance
0,"[0.056698862463235855, 0.062439583241939545, -...",2305.03393v1,"References\n- 8. Livathinos, N., Berrospi, C.,...",0.41953
1,"[0.056698862463235855, 0.062439583241939545, -...",2203.01017v2,"Ahmed Nassar, Nikolaos Livathinos, Maksym Lysa...",0.406184
2,"[0.056698862463235855, 0.062439583241939545, -...",2305.03393v1,Optimized Table Tokenization for Table Structu...,0.371177


In [24]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file into the process environment
# (the OpenAI API key is read from the environment below).
load_dotenv()

True

In [25]:
# Build the OpenAI client; the API key comes from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [27]:
def format_documents(context_df):
    """Render retrieved chunks as delimited text blocks for an LLM prompt.

    Rows are de-duplicated on ``chunk_id`` (the first occurrence is kept),
    rows whose ``raw_chunk_markdown`` is missing (``None``/``NaN``) are
    skipped, and every remaining chunk is stripped of surrounding whitespace
    and wrapped in numbered START/END marker lines.

    Args:
        context_df: DataFrame with at least the columns ``chunk_id`` and
            ``raw_chunk_markdown``.

    Returns:
        The formatted document blocks separated by one blank line, or an
        empty string when there is no chunk to format.
    """
    # Remove duplicates based on 'chunk_id' (ensuring unique document chunks),
    # then drop missing texts so `.strip()` is never called on None/NaN and the
    # document numbering stays contiguous.
    unique_documents = (
        context_df
        .drop_duplicates(subset=["chunk_id"])["raw_chunk_markdown"]
        .dropna()
    )

    # Build each block once and join them, instead of growing a string in a loop.
    blocks = [
        f"****START DOCUMENT {i}****\n"
        f"document = {{ {document_text.strip()} }}\n"
        f"****END DOCUMENT {i}****"
        for i, document_text in enumerate(unique_documents)
    ]
    return "\n\n".join(blocks)

In [38]:
# Format the retrieved chunks into the context text that is interpolated into the prompt.
RAG_CONTEXT = format_documents(context_data)

In [39]:
# Show the formatted context text for inspection.
print(RAG_CONTEXT)

****START DOCUMENT 0****
document = { References
- 8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., }
****END DOCUMENT 0****

****START DOCUMENT 1****
document = { Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar IBM Research
{ ahn,nli,mly,taa } @zurich.ibm.com }
****END DOCUMENT 1****

****START DOCUMENT 2****
document = { Optimized Table Tokenization for Table Structure Recognition
Maksym Lysak [0000 - 0002 - 3723 - 6960] , Ahmed Nassar [0000 - 0002 - 9468 - 0822] , Nikolaos Livathinos [0000 - 0001 - 8513 - }
****END DOCUMENT 2****


In [40]:
# System prompt: answering instructions followed by the formatted document context.
FULL_PROMPT = f"""
You are an assistant for answering questions about a series of documents. You will be provided documentation from different documents. Provide a conversational answer.
If you don't know the answer, just say "I do not know." Don't make up an answer.

Here are document(s) you should use when answer the users question:
{RAG_CONTEXT}
"""

In [41]:
# Ask the chat model: the RAG prompt is the system message, the question is the user message.
response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": FULL_PROMPT},
        {"role": "user", "content": query_text},
    ],
    model="gpt-4o-mini",
)

# Print the text of every returned choice, one per line.
print("\n".join(choice.message.content for choice in response.choices))

Ahmed Nassar is one of the authors associated with research conducted at IBM. He is mentioned alongside Nikolaos Livathinos and Maksym Lysak in the context of their work on optimized table tokenization for table structure recognition.
