Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.10-slim-bullseye
WORKDIR /code
ENV PORT 8000
EXPOSE 8000
Expand All @@ -15,6 +15,12 @@ RUN rm -rf /var/lib/apt/lists/* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Install system dependencies
# this was added before we pinned python:3.10-slim-bullseye; it shouldn't be needed anymore
# RUN apt-get update && apt-get install -y --no-install-recommends \
# libgl1 \
# && rm -rf /var/lib/apt/lists/*

# Now update and install dependencies including build tools
RUN apt-get update && \
apt-get install -y --no-install-recommends --no-install-suggests \
Expand Down
3 changes: 2 additions & 1 deletion backend/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ rouge-score = "==0.1.2"
langchain-neo4j = "==0.4.0"
pypandoc-binary = "==1.15"
chardet = "==5.2.0"
unstructured = "==0.17.2"
contextgem = "==0.8.1"
# TODO: propagate this unstructured extras list (pptx, md, docx) back to requirements.txt, which currently pins unstructured[all-docs]
unstructured = {extras = ["pptx", "md", "docx"], version = "*"}

[dev-packages]

Expand Down
2,795 changes: 1,481 additions & 1,314 deletions backend/Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ starlette==0.46.1
sse-starlette==2.2.1
starlette-session==0.4.3
tqdm==4.67.1
unstructured[all-docs]
unstructured[all-docs]  # in the Pipfile we have unstructured[pptx, md, docx]; need to reconcile these two
unstructured==0.17.2
unstructured-client==0.32.3
unstructured-inference==0.8.10
Expand Down
18 changes: 15 additions & 3 deletions backend/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,11 +599,23 @@ async def chat_bot_grounding(
session_id=Form(None),
mode=Form(None),
email=Form(None),
requireGrounding: bool = Form(True)
requireGrounding: bool = Form(True),
filterProperties: str = Form(None)
):
logging.info(f"QA_RAG (grounding) called at {datetime.now()}")
logging.info(f"document_names = {document_names}")
logging.info(f"filterProperties = {filterProperties}")
qa_rag_start_time = time.time()

# Parse filterProperties JSON if provided
filter_properties = None
if filterProperties:
try:
filter_properties = json.loads(filterProperties)
except json.JSONDecodeError as e:
logging.warning(f"Invalid JSON in filterProperties: {e}")
filter_properties = None

try:
if mode == "graph":
graph = Neo4jGraph( url=uri,username=userName,password=password,database=database,sanitize = True, refresh_schema=True)
Expand All @@ -614,14 +626,14 @@ async def chat_bot_grounding(
write_access = graph_DB_dataAccess.check_account_access(database=database)
# Select the system template based on requireGrounding (to be used inside QA_RAG or before calling it):
# system_template = CHAT_SYSTEM_TEMPLATE if requireGrounding else CHAT_SYSTEM_TEMPLATE_UNGROUNDED
result = await asyncio.to_thread(QA_RAG_GROUNDING,graph=graph,model=model,question=question,document_names=document_names,session_id=session_id,mode=mode,write_access=write_access,requireGrounding=requireGrounding)
result = await asyncio.to_thread(QA_RAG_GROUNDING,graph=graph,model=model,question=question,document_names=document_names,session_id=session_id,mode=mode,write_access=write_access,filter_properties=filter_properties,requireGrounding=requireGrounding)

total_call_time = time.time() - qa_rag_start_time
logging.info(f"Total Response time is {total_call_time:.2f} seconds")
result["info"]["response_time"] = round(total_call_time, 2)

json_obj = {'api_name':'chat_bot_grounding','db_url':uri, 'userName':userName, 'database':database, 'question':question,'document_names':document_names,
'session_id':session_id, 'mode':mode, 'requireGrounding': requireGrounding, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{total_call_time:.2f}','email':email}
'session_id':session_id, 'mode':mode, 'requireGrounding': requireGrounding, 'filterProperties': filterProperties, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{total_call_time:.2f}','email':email}
logger.log_struct(json_obj, "INFO")

return create_api_response('Success',data=result)
Expand Down
24 changes: 17 additions & 7 deletions backend/src/QA_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,15 +409,23 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
'effective_search_ratio': ef_ratio,
'score_threshold': score_threshold,
'filter': base_filter
}
},
)
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold} for documents {document_names}")
else:
# NOTE(ian): rewrote this slightly to match the upstream team's approach: pass filter_properties (or None) into the search_kwargs literal.

# search_kwargs = {
# 'k': search_k,
# 'score_threshold': score_threshold,
# "filter": {
# "topics": { "$like": "topic_ENDOC"}
# }
# }
search_kwargs = {
'k': search_k,
'score_threshold': score_threshold
'score_threshold': score_threshold,
# "filter": filter_properties
}
if filter_properties is not None:
search_kwargs['filter'] = filter_properties
Expand All @@ -428,16 +436,18 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
search_kwargs=search_kwargs
)
print(f"search_kwargs = {search_kwargs}")
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}")
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}, filter={filter_properties}")
return retriever

def get_neo4j_retriever(graph, document_names, chat_mode_settings, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD, filter_properties=None):
def get_neo4j_retriever(graph, document_names, chat_mode_settings, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD, filter_properties=None):
try:
neo_db = initialize_neo4j_vector(graph, chat_mode_settings)
# document_names= list(map(str.strip, json.loads(document_names)))
search_k = chat_mode_settings["top_k"]
ef_ratio = int(os.getenv("EFFECTIVE_SEARCH_RATIO", "2")) if os.getenv("EFFECTIVE_SEARCH_RATIO", "2").isdigit() else 2
retriever = create_retriever(neo_db, document_names,chat_mode_settings, search_k, score_threshold,ef_ratio,filter_properties)

# Use the parameter instead of hardcoded value
retriever = create_retriever(neo_db, document_names,chat_mode_settings, search_k, score_threshold,ef_ratio, filter_properties)
return retriever
except Exception as e:
index_name = chat_mode_settings.get("index_name")
Expand Down Expand Up @@ -928,7 +938,7 @@ def QA_RAG(graph,model, question, document_names, session_id, mode, write_access

return result

def QA_RAG_GROUNDING(graph,model, question, document_names, session_id, mode, write_access=True, requireGrounding=True):
def QA_RAG_GROUNDING(graph,model, question, document_names, session_id, mode, write_access=True, filter_properties=None, requireGrounding=True):
logging.info(f"Chat Mode: {mode}")

history = create_neo4j_chat_message_history(graph, session_id, write_access)
Expand Down Expand Up @@ -967,7 +977,7 @@ def QA_RAG_GROUNDING(graph,model, question, document_names, session_id, mode, wr
"user": "chatbot"
}
else:
result = process_chat_response(messages,history, question, model, graph, document_names,chat_mode_settings, extract_tools=False, filter_properties={}, requireGrounding=requireGrounding)
result = process_chat_response(messages,history, question, model, graph, document_names,chat_mode_settings, extract_tools=False, filter_properties=filter_properties, requireGrounding=requireGrounding)

result["session_id"] = session_id

Expand Down
33 changes: 32 additions & 1 deletion backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,15 @@ async def processing_source(uri, userName, password, database, model, file_name,
uri_latency["create_list_chunk_and_document"] = f'{elapsed_get_chunkId_chunkDoc_list:.2f}'
uri_latency["total_chunks"] = total_chunks

# Add hardcoded label to Document node
# uncomment this block when ingesting - Ian
# start_add_hardcoded_label = time.time()
# add_hardcoded_label_to_document(graph, file_name, "persona_FIRST_LINE_MANAGER")
# end_add_hardcoded_label = time.time()
# elapsed_add_hardcoded_label = end_add_hardcoded_label - start_add_hardcoded_label
# logging.info(f'Time taken to add hardcoded label to document: {elapsed_add_hardcoded_label:.2f} seconds')
# uri_latency["add_hardcoded_label"] = f'{elapsed_add_hardcoded_label:.2f}'

# this was autogenerated by cursor - we may want to write the data to the document differently
# Update document node with metadata
# if metadata.domain:
Expand Down Expand Up @@ -786,4 +795,26 @@ def failed_file_process(uri,file_name, merged_file_path):
delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name)
else:
logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
delete_uploaded_local_file(merged_file_path,file_name)
delete_uploaded_local_file(merged_file_path,file_name)

def add_hardcoded_label_to_document(graph, file_name, label):
    """
    Add an extra label to the Document node for the given file.

    Args:
        graph: Neo4jGraph connection object (must expose ``query(cypher, params)``)
        file_name: Name of the file whose Document node receives the label
        label: The label to add to the Document node. Must be a valid
            identifier (letters, digits, underscores; not starting with a digit),
            otherwise the call is refused and logged.

    Notes:
        Errors are logged and swallowed so the ingestion pipeline keeps running.
    """
    # Cypher does not support parameterized labels, so `label` has to be
    # interpolated into the query string. Validate it first so an arbitrary
    # string cannot inject extra Cypher into the SET clause.
    if not isinstance(label, str) or not label.isidentifier():
        logging.error(
            f"Refusing to add invalid label '{label}' to Document node for file {file_name}: "
            "labels must be valid identifiers"
        )
        return
    try:
        query = f"""
        MATCH (d:Document {{fileName: $file_name}})
        SET d:Document:{label}
        """
        graph.query(query, {"file_name": file_name})
        logging.info(f"Successfully added hardcoded label '{label}' to Document node for file: {file_name}")
    except Exception as e:
        logging.error(f"Error adding hardcoded label '{label}' to Document node for file {file_name}: {e}")
        # Don't raise the exception to avoid stopping the processing pipeline
4 changes: 2 additions & 2 deletions deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ set +a # turn off automatic export

# 2. Build Docker images using Docker Compose
echo "Building Docker images..."
# docker-compose build --no-cache --progress=plain
docker-compose build
docker-compose build --no-cache --progress=plain
# docker-compose build

# 2. Authenticate Docker to AWS ECR
echo "Authenticating to AWS ECR..."
Expand Down