Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.10-slim-bullseye
WORKDIR /code
ENV PORT 8000
EXPOSE 8000
Expand All @@ -15,6 +15,12 @@ RUN rm -rf /var/lib/apt/lists/* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Install system dependencies
# this was added before we pinned python:3.10-slim-bullseye; it shouldn't be needed anymore
# RUN apt-get update && apt-get install -y --no-install-recommends \
# libgl1 \
# && rm -rf /var/lib/apt/lists/*

# Now update and install dependencies including build tools
RUN apt-get update && \
apt-get install -y --no-install-recommends --no-install-suggests \
Expand Down
3 changes: 2 additions & 1 deletion backend/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ rouge-score = "==0.1.2"
langchain-neo4j = "==0.4.0"
pypandoc-binary = "==1.15"
chardet = "==5.2.0"
unstructured = "==0.17.2"
contextgem = "==0.8.1"
# TODO: propagate this unstructured extras list (pptx, md, docx) back to requirements.txt, which currently pins unstructured[all-docs]
unstructured = {extras = ["pptx", "md", "docx"], version = "*"}

[dev-packages]

Expand Down
2,795 changes: 1,481 additions & 1,314 deletions backend/Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ starlette==0.46.1
sse-starlette==2.2.1
starlette-session==0.4.3
tqdm==4.67.1
unstructured[all-docs]
unstructured[all-docs]  # in the Pipfile we have unstructured[pptx, md, docx]; need to reconcile these two
unstructured==0.17.2
unstructured-client==0.32.3
unstructured-inference==0.8.10
Expand Down
18 changes: 15 additions & 3 deletions backend/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,11 +599,23 @@ async def chat_bot_grounding(
session_id=Form(None),
mode=Form(None),
email=Form(None),
requireGrounding: bool = Form(True)
requireGrounding: bool = Form(True),
filterProperties: str = Form(None)
):
logging.info(f"QA_RAG (grounding) called at {datetime.now()}")
logging.info(f"document_names = {document_names}")
logging.info(f"filterProperties = {filterProperties}")
qa_rag_start_time = time.time()

# Parse filterProperties JSON if provided
filter_properties = None
if filterProperties:
try:
filter_properties = json.loads(filterProperties)
except json.JSONDecodeError as e:
logging.warning(f"Invalid JSON in filterProperties: {e}")
filter_properties = None

try:
if mode == "graph":
graph = Neo4jGraph( url=uri,username=userName,password=password,database=database,sanitize = True, refresh_schema=True)
Expand All @@ -614,14 +626,14 @@ async def chat_bot_grounding(
write_access = graph_DB_dataAccess.check_account_access(database=database)
# Select the system template based on requireGrounding (to be used inside QA_RAG or before calling it):
# system_template = CHAT_SYSTEM_TEMPLATE if requireGrounding else CHAT_SYSTEM_TEMPLATE_UNGROUNDED
result = await asyncio.to_thread(QA_RAG_GROUNDING,graph=graph,model=model,question=question,document_names=document_names,session_id=session_id,mode=mode,write_access=write_access,requireGrounding=requireGrounding)
result = await asyncio.to_thread(QA_RAG_GROUNDING,graph=graph,model=model,question=question,document_names=document_names,session_id=session_id,mode=mode,write_access=write_access,filter_properties=filter_properties,requireGrounding=requireGrounding)

total_call_time = time.time() - qa_rag_start_time
logging.info(f"Total Response time is {total_call_time:.2f} seconds")
result["info"]["response_time"] = round(total_call_time, 2)

json_obj = {'api_name':'chat_bot_grounding','db_url':uri, 'userName':userName, 'database':database, 'question':question,'document_names':document_names,
'session_id':session_id, 'mode':mode, 'requireGrounding': requireGrounding, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{total_call_time:.2f}','email':email}
'session_id':session_id, 'mode':mode, 'requireGrounding': requireGrounding, 'filterProperties': filterProperties, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{total_call_time:.2f}','email':email}
logger.log_struct(json_obj, "INFO")

return create_api_response('Success',data=result)
Expand Down
24 changes: 17 additions & 7 deletions backend/src/QA_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,15 +409,23 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
'effective_search_ratio': ef_ratio,
'score_threshold': score_threshold,
'filter': base_filter
}
},
)
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold} for documents {document_names}")
else:
# NOTE(ian): rewrote this slightly to match the upstream team's approach: pass filter_properties (or None) into the search_kwargs literal.

# search_kwargs = {
# 'k': search_k,
# 'score_threshold': score_threshold,
# "filter": {
# "topics": { "$like": "topic_ENDOC"}
# }
# }
search_kwargs = {
'k': search_k,
'score_threshold': score_threshold
'score_threshold': score_threshold,
# "filter": filter_properties
}
if filter_properties is not None:
search_kwargs['filter'] = filter_properties
Expand All @@ -428,16 +436,18 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
search_kwargs=search_kwargs
)
print(f"search_kwargs = {search_kwargs}")
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}")
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}, filter={filter_properties}")
return retriever

def get_neo4j_retriever(graph, document_names, chat_mode_settings, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD, filter_properties=None):
def get_neo4j_retriever(graph, document_names, chat_mode_settings, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD, filter_properties=None):
try:
neo_db = initialize_neo4j_vector(graph, chat_mode_settings)
# document_names= list(map(str.strip, json.loads(document_names)))
search_k = chat_mode_settings["top_k"]
ef_ratio = int(os.getenv("EFFECTIVE_SEARCH_RATIO", "2")) if os.getenv("EFFECTIVE_SEARCH_RATIO", "2").isdigit() else 2
retriever = create_retriever(neo_db, document_names,chat_mode_settings, search_k, score_threshold,ef_ratio,filter_properties)

# Use the parameter instead of hardcoded value
retriever = create_retriever(neo_db, document_names,chat_mode_settings, search_k, score_threshold,ef_ratio, filter_properties)
return retriever
except Exception as e:
index_name = chat_mode_settings.get("index_name")
Expand Down Expand Up @@ -928,7 +938,7 @@ def QA_RAG(graph,model, question, document_names, session_id, mode, write_access

return result

def QA_RAG_GROUNDING(graph,model, question, document_names, session_id, mode, write_access=True, requireGrounding=True):
def QA_RAG_GROUNDING(graph,model, question, document_names, session_id, mode, write_access=True, filter_properties=None, requireGrounding=True):
logging.info(f"Chat Mode: {mode}")

history = create_neo4j_chat_message_history(graph, session_id, write_access)
Expand Down Expand Up @@ -967,7 +977,7 @@ def QA_RAG_GROUNDING(graph,model, question, document_names, session_id, mode, wr
"user": "chatbot"
}
else:
result = process_chat_response(messages,history, question, model, graph, document_names,chat_mode_settings, extract_tools=False, filter_properties={}, requireGrounding=requireGrounding)
result = process_chat_response(messages,history, question, model, graph, document_names,chat_mode_settings, extract_tools=False, filter_properties=filter_properties, requireGrounding=requireGrounding)

result["session_id"] = session_id

Expand Down
33 changes: 32 additions & 1 deletion backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,15 @@ async def processing_source(uri, userName, password, database, model, file_name,
uri_latency["create_list_chunk_and_document"] = f'{elapsed_get_chunkId_chunkDoc_list:.2f}'
uri_latency["total_chunks"] = total_chunks

# Add hardcoded label to Document node
# uncomment this block when ingesting - Ian
# start_add_hardcoded_label = time.time()
# add_hardcoded_label_to_document(graph, file_name, "persona_FIRST_LINE_MANAGER")
# end_add_hardcoded_label = time.time()
# elapsed_add_hardcoded_label = end_add_hardcoded_label - start_add_hardcoded_label
# logging.info(f'Time taken to add hardcoded label to document: {elapsed_add_hardcoded_label:.2f} seconds')
# uri_latency["add_hardcoded_label"] = f'{elapsed_add_hardcoded_label:.2f}'

# this was autogenerated by cursor - we may want to write the data to the document differently
# Update document node with metadata
# if metadata.domain:
Expand Down Expand Up @@ -786,4 +795,26 @@ def failed_file_process(uri,file_name, merged_file_path):
delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name)
else:
logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
delete_uploaded_local_file(merged_file_path,file_name)
delete_uploaded_local_file(merged_file_path,file_name)

def add_hardcoded_label_to_document(graph, file_name, label):
    """
    Add an extra label to the Document node for the given file.

    Args:
        graph: Neo4jGraph connection object (must expose ``query(cypher, params)``)
        file_name: Name of the file whose Document node receives the label
        label: The label to add to the Document node. Must be a valid
            identifier (letters, digits, underscores; not starting with a digit),
            otherwise the call is refused and logged.

    Notes:
        Errors are logged and swallowed so the ingestion pipeline keeps running.
    """
    # Cypher does not support parameterized labels, so `label` has to be
    # interpolated into the query string. Validate it first so an arbitrary
    # string cannot inject extra Cypher into the SET clause.
    if not isinstance(label, str) or not label.isidentifier():
        logging.error(
            f"Refusing to add invalid label '{label}' to Document node for file {file_name}: "
            "labels must be valid identifiers"
        )
        return
    try:
        query = f"""
        MATCH (d:Document {{fileName: $file_name}})
        SET d:Document:{label}
        """
        graph.query(query, {"file_name": file_name})
        logging.info(f"Successfully added hardcoded label '{label}' to Document node for file: {file_name}")
    except Exception as e:
        logging.error(f"Error adding hardcoded label '{label}' to Document node for file {file_name}: {e}")
        # Don't raise the exception to avoid stopping the processing pipeline
4 changes: 2 additions & 2 deletions deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ set +a # turn off automatic export

# 2. Build Docker images using Docker Compose
echo "Building Docker images..."
# docker-compose build --no-cache --progress=plain
docker-compose build
docker-compose build --no-cache --progress=plain
# docker-compose build

# 2. Authenticate Docker to AWS ECR
echo "Authenticating to AWS ECR..."
Expand Down