Merged
8 changes: 7 additions & 1 deletion .env.example
@@ -4,4 +4,10 @@ VECTOR_DB_ENV=
POSTGRES_URI=
SECRET_KEY=
# Comma-separated list of allowed CORS origins, e.g. http://localhost:3000,https://yourdomain.com
CORS_ALLOW_ORIGINS=
CORS_ALLOW_ORIGINS=

# Web search configuration
WEB_SEARCH_ENGINE=tavily
TAVILY_API_KEY=your_tavily_api_key_here
MAX_FETCH_CONCURRENCY=4
DEFAULT_TOP_K_RESULTS=8
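
For reference, a minimal sketch (assumed usage, not code from this PR) of how these variables could be read at runtime; the settings.py change below does the same thing via pydantic-settings, and the fallback values here simply mirror that file's defaults:

import os
from dotenv import load_dotenv

load_dotenv()  # pull .env into the process environment

engine = os.environ.get("WEB_SEARCH_ENGINE", "dummy")        # provider name
tavily_key = os.environ.get("TAVILY_API_KEY")                # secret; may be unset
max_concurrency = int(os.environ.get("MAX_FETCH_CONCURRENCY", "4"))
top_k = int(os.environ.get("DEFAULT_TOP_K_RESULTS", "8"))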
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
authormaton/
experimentalCode/.env
.env

# Ignore Python cache

__pycache__/
7 changes: 2 additions & 5 deletions api/indexing_router.py
@@ -1,15 +1,12 @@
"""
Indexing router for /internal/index endpoint.
"""
from fastapi import APIRouter, HTTPException, status, Request
from fastapi import APIRouter, HTTPException, Request
from pydantic import BaseModel
from config.settings import settings
from services.vector_db_service import VectorDBService
from services.vector_db_service import VectorDBClient as VectorDBService
from services.embedding_service import embed_texts_batched
from services.chunking_service import chunk_text
from services.parsing_service import extract_text_from_pdf, extract_text_from_docx
import logging
import os

router = APIRouter(prefix="/internal", tags=["internal"])

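
The aliased import is the whole migration story for this router: the service class was evidently renamed from VectorDBService to VectorDBClient, and the as-alias keeps every existing call site unchanged. A hedged illustration (constructor arguments are not shown in this diff, so none are assumed):

from services.vector_db_service import VectorDBClient as VectorDBService

vdb = VectorDBService()  # unchanged call site; now resolves to VectorDBClient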
2 changes: 2 additions & 0 deletions api/main.py
@@ -46,8 +46,10 @@ def read_root():
# Register routers
from api.endpoints.upload import router as upload_router
from api.endpoints.internal import router as internal_router
from api.endpoints.web_answering import router as web_answering_router
app.include_router(upload_router, prefix="/upload")
app.include_router(internal_router)
app.include_router(web_answering_router, prefix="/internal", tags=["websearch"])
app.include_router(indexing_router)

@app.get("/health")
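
Mounting with prefix="/internal" means any path declared on the router is served under /internal. The module api/endpoints/web_answering.py is real, but its contents are not in this diff, so the route below is a hypothetical sketch of the shape such a router could take (route path, schema, and handler are all assumptions):

from fastapi import APIRouter
from pydantic import BaseModel

router = APIRouter()

class WebAnswerRequest(BaseModel):
    query: str
    top_k: int = 8  # mirrors DEFAULT_TOP_K_RESULTS

@router.post("/web-answer")  # served at /internal/web-answer after mounting
async def web_answer(req: WebAnswerRequest):
    # Stub body; the real handler presumably searches, fetches, and answers.
    return {"query": req.query, "results": []}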
10 changes: 9 additions & 1 deletion config/settings.py
@@ -5,7 +5,8 @@
"""
import os
from pydantic_settings import BaseSettings
from pydantic import SecretStr, ValidationError
from pydantic import SecretStr, ValidationError, Field
from typing import Optional
import sys
try:
from dotenv import load_dotenv
@@ -24,6 +25,13 @@ class Settings(BaseSettings):
embedding_dimension: int = 3072
embed_batch_size: int = 128
max_upload_mb: int = 25

# Web search settings
web_search_engine: str = os.environ.get("WEB_SEARCH_ENGINE", "dummy") # Default to dummy provider if not specified
tavily_api_key: Optional[SecretStr] = None
bing_api_key: Optional[SecretStr] = None
max_fetch_concurrency: int = 4
default_top_k_results: int = 8

try:
settings = Settings()
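
Two things are worth noting about the new fields. First, web_search_engine takes its default from os.environ.get(...) at class-definition time rather than relying on pydantic's env mapping; both routes pick up WEB_SEARCH_ENGINE, but the explicit get is evaluated once at import. Second, pydantic-settings matches environment variables to field names case-insensitively, and SecretStr keeps key material out of reprs and logs. A self-contained sketch of that mapping (illustrative values only, independent of the app's module):

import os
from typing import Optional

from pydantic import SecretStr
from pydantic_settings import BaseSettings

os.environ["TAVILY_API_KEY"] = "tvly-example"  # illustrative, not a real key

class Demo(BaseSettings):
    tavily_api_key: Optional[SecretStr] = None  # filled from TAVILY_API_KEY

demo = Demo()
print(demo.tavily_api_key)                      # **********
print(demo.tavily_api_key.get_secret_value())   # tvly-example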
6 changes: 4 additions & 2 deletions requirements.txt
@@ -4,7 +4,7 @@ python-multipart>=0.0.6,<1.0.0
fastapi>=0.110.0,<1.0.0
uvicorn[standard]>=0.29.0,<1.0.0
pytest>=8.2.0,<9.0.0
httpx>=0.27.0,<1.0.0
httpx[http2]>=0.27.0,<1.0.0
python-dotenv>=1.0.0,<2.0.0
PyPDF2>=3.0.0,<4.0.0
requests>=2.31.0,<3.0.0
@@ -13,4 +13,6 @@ pinecone-client>=3.0.0,<4.0.0
weaviate-client>=4.4.0,<5.0.0
transformers>=4.40.0,<5.0.0
torch>=2.2.0,<3.0.0
pydantic>=2.6.0,<3.0.0
pydantic>=2.6.0,<3.0.0
trafilatura>=1.6.0,<2.0.0
numpy>=1.26.0,<2.0.0
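
The switch to httpx[http2] installs the h2 dependency, but httpx only negotiates HTTP/2 when a client opts in. A minimal sketch, presumably in service of the concurrent page fetching this PR adds (the URL is an example, not from this PR):

import asyncio

import httpx

async def fetch(url: str) -> str:
    # http2=True requires the h2 package, which the [http2] extra provides.
    async with httpx.AsyncClient(http2=True, timeout=10.0) as client:
        resp = await client.get(url)
        resp.raise_for_status()
        return resp.text

# asyncio.run(fetch("https://example.com"))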
21 changes: 21 additions & 0 deletions services/vector_db_service.py
@@ -47,6 +47,27 @@ def upsert_vectors(self, vectors: List[List[float]], ids: List[str]):
if not self.index:
raise RuntimeError("Index is not initialized. Call create_index first.")
self.index.upsert(vectors=[(id, vec) for id, vec in zip(ids, vectors)])

def upsert(self, namespace, ids, vectors, metadata=None):
"""
Upsert vectors into the index, ensuring index is created and metadata is validated.
"""
if self.index is None:
self.create_index()
if not (len(ids) == len(vectors)):
Comment on lines +55 to +57

⚠️ Potential issue

Guard against missing embedding dimension before auto-creating the index.

Without a dimension, create_index will fail at runtime.

-        if self.index is None:
-            self.create_index()
+        if self.index is None:
+            if self.dimension is None:
+                raise RuntimeError(
+                    "embedding dimension is not configured; set settings.embedding_dimension or pass dimension."
+                )
+            self.create_index()
🤖 Prompt for AI Agents
In services/vector_db_service.py around lines 55-57, the code auto-calls
create_index() without ensuring an embedding dimension exists which will cause a
runtime failure; before calling create_index(), validate that self.dimension is
set or infer it from the provided vectors (e.g., if vectors is non-empty set
self.dimension = len(vectors[0]) after validating all vectors share that
length), and if neither self.dimension nor inferable from vectors, raise a clear
ValueError asking the caller to provide an embedding dimension; adjust
create_index to accept/use the dimension if it currently does not.

raise ValueError("ids and vectors must have the same length")
if metadata is not None and len(metadata) != len(ids):
raise ValueError("metadata length must match ids/vectors length")
items = []
for i, (id_, vector) in enumerate(zip(ids, vectors)):
item = {
"id": id_,
"values": vector
}
if metadata is not None:
item["metadata"] = metadata[i]
items.append(item)
self.index.upsert(vectors=items, namespace=namespace)
Comment on lines +57 to +70

⚠️ Potential issue

Add dimension checks, vector normalization, and metadata type validation.

This keeps the method at parity with upsert_vectors and avoids Pinecone-side errors.

-        if not (len(ids) == len(vectors)):
+        if len(ids) != len(vectors):
             raise ValueError("ids and vectors must have the same length")
-        if metadata is not None and len(metadata) != len(ids):
-            raise ValueError("metadata length must match ids/vectors length")
-        items = []
-        for i, (id_, vector) in enumerate(zip(ids, vectors)):
-            item = {
-                "id": id_,
-                "values": vector
-            }
-            if metadata is not None:
-                item["metadata"] = metadata[i]
-            items.append(item)
-        self.index.upsert(vectors=items, namespace=namespace)
+        if metadata is not None and len(metadata) != len(ids):
+            raise ValueError("metadata length must match ids/vectors length")
+        items: list[dict] = []
+        for i, (id_, vector) in enumerate(zip(ids, vectors)):
+            # Dimension validation + normalization (supports numpy arrays)
+            if self.dimension is not None and len(vector) != self.dimension:
+                raise ValueError(
+                    f"vector[{i}] dimensionality {len(vector)} != expected {self.dimension}"
+                )
+            values = vector.tolist() if hasattr(vector, "tolist") else list(vector)
+            if not all(isinstance(x, (int, float)) for x in values):
+                raise TypeError(f"vector[{i}] must be a sequence of numbers")
+            item = {"id": id_, "values": values}
+            if metadata is not None:
+                md = metadata[i]
+                if not isinstance(md, dict):
+                    raise TypeError("each metadata entry must be a dict")
+                item["metadata"] = md
+            items.append(item)
+        self.index.upsert(vectors=items, namespace=namespace)
🧰 Tools
🪛 Ruff (0.13.1)

58-58: Avoid specifying long messages outside the exception class

(TRY003)


60-60: Avoid specifying long messages outside the exception class

(TRY003)

🤖 Prompt for AI Agents
In services/vector_db_service.py around lines 57 to 70, add parity with
upsert_vectors by validating each vector's dimension, normalizing vectors, and
validating metadata types before calling upsert: check that each vector is a
sequence of numeric values and its length equals the configured embedding
dimension (e.g., self.dimension) and raise ValueError if not; if normalization
is required, convert each vector to a unit vector (divide by its L2 norm,
guarding against zero norm); ensure metadata (when provided) is the expected
type (e.g., dict or list) and raise TypeError for invalid entries; perform these
checks/normalization inside the loop that builds items so you send only
validated, normalized vectors to self.index.upsert(vectors=items,
namespace=namespace).


def query(self, vector: List[float], top_k: int = 5):
if not self.index:
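
Pulling the diff and both review comments together, a hedged usage sketch of the new upsert. VectorDBClient's constructor and index configuration are not shown in this diff, so the setup line is an assumption; the unit-length normalization implements what the second comment asks for:

import math

from services.vector_db_service import VectorDBClient

def l2_normalize(vec: list[float]) -> list[float]:
    # Unit-length normalization with the zero-norm guard the review suggests.
    norm = math.sqrt(sum(x * x for x in vec))
    if norm == 0.0:
        raise ValueError("cannot normalize a zero vector")
    return [x / norm for x in vec]

ids = ["doc-1#chunk-0", "doc-1#chunk-1"]
vectors = [l2_normalize([0.1] * 3072), l2_normalize([0.3] * 3072)]  # toy 3072-dim embeddings
metadata = [{"source": "doc-1", "chunk": 0}, {"source": "doc-1", "chunk": 1}]

client = VectorDBClient()  # constructor arguments assumed; not in this diff
client.upsert(namespace="default", ids=ids, vectors=vectors, metadata=metadata)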