69 changes: 45 additions & 24 deletions ai/analyzer.py
@@ -15,14 +15,14 @@
load_sqlite_vector_extension as _load_sqlite_vector_extension,
ensure_chunks_and_meta as _ensure_chunks_and_meta,
insert_chunk_vector_with_retry as _insert_chunk_vector_with_retry,
search_vectors as _search_vectors,
get_chunk_text as _get_chunk_text,
)
from .openai import call_coding_api, EmbeddingClient
from .openai import call_coding_api
from .llama_embeddings import OpenAICompatibleEmbedding
from .llama_chunker import chunk_with_llama_index
from llama_index.core import Document
from utils.logger import get_logger
from utils import compute_file_hash, chunk_text, norm, cosine
from .smart_chunker import smart_chunk
from utils import compute_file_hash, norm, cosine
import logging

# reduce noise from httpx used by external libs
@@ -64,8 +64,8 @@

logger = get_logger(__name__)

# Initialize EmbeddingClient for structured logging and retry logic
_embedding_client = EmbeddingClient()
# Initialize llama-index embedding client
_embedding_client = OpenAICompatibleEmbedding()

# Thread-local storage to track execution state inside futures
_thread_state = threading.local()
@@ -86,7 +86,8 @@ def _get_embedding_with_semaphore(semaphore: threading.Semaphore, text: str, fil
semaphore.acquire()
try:
_thread_state.stage = "calling_embed_text"
result = _embedding_client.embed_text(text, file_path=file_path, chunk_index=chunk_index)
# Use llama-index embedding client
result = _embedding_client._get_text_embedding(text)
_thread_state.stage = "completed"
return result
except Exception as e:
@@ -171,14 +172,8 @@ def _process_file_sync(
if isinstance(cfg, dict):
embedding_model = cfg.get("embedding_model")

# Use smart chunking for supported code languages
use_smart_chunking = cfg.get("smart_chunking", True) if isinstance(cfg, dict) else True
supported_languages = ["python", "javascript", "typescript", "java", "go", "rust", "c", "cpp"]

if use_smart_chunking and lang in supported_languages:
chunks = smart_chunk(content, language=lang, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
else:
chunks = chunk_text(content, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
# Use llama-index chunking for all content
chunks = chunk_with_llama_index(content, language=lang, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

if not chunks:
chunks = [content]
@@ -395,11 +390,13 @@ def analyze_local_path_sync(

try:
# Use batch update for efficiency - single database transaction
# Store total_files for performance (avoid re-scanning directory on every request)
set_project_metadata_batch(database_path, {
"last_indexed_at": time.strftime("%Y-%m-%d %H:%M:%S"),
"last_index_duration": str(duration),
"files_indexed": str(file_count),
"files_skipped": str(skipped_count)
"files_skipped": str(skipped_count),
"total_files": str(total_files) # Store total files found during indexing
})
except Exception:
logger.exception("Failed to store indexing metadata")
@@ -442,16 +439,40 @@ def analyze_local_path_background(local_path: str, database_path: str, venv_path

def search_semantic(query: str, database_path: str, top_k: int = 5):
"""
Uses sqlite-vector's vector_full_scan to retrieve best-matching chunks and returns
a list of {file_id, path, chunk_index, score}.
Uses llama-index with sqlite-vector backend to retrieve best-matching chunks.
Always includes content as it's needed for the coding model context.

Args:
query: Search query text
database_path: Path to the SQLite database
top_k: Number of results to return

Returns:
List of dicts with file_id, path, chunk_index, score, and content
"""
q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
if not q_emb:
return []

try:
return _search_vectors(database_path, q_emb, top_k=top_k)
except Exception:
# Use llama-index for semantic search
from .llama_integration import llama_index_search

docs = llama_index_search(query, database_path, top_k=top_k)

results = []
for doc in docs:
metadata = doc.metadata or {}
result = {
"file_id": metadata.get("file_id", 0),
"path": metadata.get("path", ""),
"chunk_index": metadata.get("chunk_index", 0),
"score": metadata.get("score", 0.0),
"content": doc.text or "" # Always include content for LLM context
}
results.append(result)

logger.info(f"llama-index search returned {len(results)} results")
return results

except Exception as e:
logger.exception(f"Semantic search failed: {e}")
raise


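A minimal usage sketch of the rewritten search_semantic; the module path ai.analyzer, the database path, and the query are illustrative assumptions, and the database is assumed to have already been indexed by analyze_local_path_sync:

from ai.analyzer import search_semantic  # module path assumed from the repository layout

# Query an already-indexed project; every hit carries the chunk text for LLM context.
hits = search_semantic("where are chunk vectors inserted?", "./project_index.db", top_k=3)
for hit in hits:
    print(f"{hit['path']}#{hit['chunk_index']} score={hit['score']:.3f}")
    print(hit["content"][:200])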
111 changes: 111 additions & 0 deletions ai/llama_chunker.py
@@ -0,0 +1,111 @@
"""
LlamaIndex-based chunking for code and text.
Replaces smart_chunker.py with llama-index's built-in splitters.
"""
from typing import List
from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
from llama_index.core.schema import Document

from utils.logger import get_logger

logger = get_logger(__name__)


def chunk_with_llama_index(
content: str,
language: str = "text",
chunk_size: int = 800,
chunk_overlap: int = 100
) -> List[str]:
"""
Chunk text or code using llama-index's splitters.

Args:
content: Text or code content to chunk
language: Programming language (python, javascript, etc.) or "text"
chunk_size: Target size for each chunk in characters
chunk_overlap: Overlap between chunks in characters

Returns:
List of text chunks
"""
# Map language names to llama-index language identifiers
language_map = {
"python": "python",
"javascript": "js",
"typescript": "ts",
"java": "java",
"go": "go",
"rust": "rust",
"c": "c",
"cpp": "cpp",
"c++": "cpp",
}

try:
# Check if it's a supported code language
llama_lang = language_map.get(language.lower())

if llama_lang:
# Use CodeSplitter for code
splitter = CodeSplitter(
language=llama_lang,
chunk_lines=40, # Target lines per chunk (approximation)
chunk_lines_overlap=5, # Overlap in lines
max_chars=chunk_size
)
logger.debug(f"Using CodeSplitter for language: {llama_lang}")
else:
# Use SentenceSplitter for text or unknown languages
splitter = SentenceSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
paragraph_separator="\n\n",
secondary_chunking_regex="[^,.;。?!]+[,.;。?!]?"
)
logger.debug(f"Using SentenceSplitter for language: {language}")

# Create a document and split it
doc = Document(text=content)
nodes = splitter.get_nodes_from_documents([doc])

# Extract text from nodes
chunks = [node.text for node in nodes if node.text]

logger.debug(f"Split content into {len(chunks)} chunks")
return chunks if chunks else [content]

except Exception as e:
logger.exception(f"Error chunking with llama-index: {e}")
# Fallback to simple chunking
return simple_chunk(content, chunk_size, chunk_overlap)


def simple_chunk(text: str, chunk_size: int = 800, chunk_overlap: int = 100) -> List[str]:
"""
Simple character-based chunking fallback.

Args:
text: Text to chunk
chunk_size: Size of each chunk
chunk_overlap: Overlap between chunks

Returns:
List of text chunks
"""
if not text:
return []

chunks = []
step = max(1, chunk_size - chunk_overlap)

for i in range(0, len(text), step):
end = min(i + chunk_size, len(text))
chunk = text[i:end]
if chunk.strip():
chunks.append(chunk)

if end >= len(text):
break

return chunks if chunks else [text]
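
A minimal sketch of how chunk_with_llama_index and the simple_chunk fallback are expected to be called; the file path is illustrative and the size parameters mirror the defaults above:

from ai.llama_chunker import chunk_with_llama_index, simple_chunk

with open("ai/analyzer.py", "r", encoding="utf-8") as fh:
    source = fh.read()

# Python source is routed to CodeSplitter; unmapped languages use SentenceSplitter.
code_chunks = chunk_with_llama_index(source, language="python", chunk_size=800, chunk_overlap=100)
text_chunks = chunk_with_llama_index("First paragraph.\n\nSecond paragraph.", language="text")

# Character-based fallback used when llama-index splitting raises.
fallback_chunks = simple_chunk(source, chunk_size=800, chunk_overlap=100)
print(len(code_chunks), len(text_chunks), len(fallback_chunks))

Note that CodeSplitter relies on a tree-sitter parser for the target language; if one is unavailable the splitter raises and the except branch above falls back to simple_chunk.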
100 changes: 100 additions & 0 deletions ai/llama_embeddings.py
@@ -0,0 +1,100 @@
"""
LlamaIndex-compatible embeddings using OpenAI API.
Replaces the custom EmbeddingClient with llama-index's embedding abstraction.
"""
from typing import List, Optional
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.bridge.pydantic import PrivateAttr
from openai import OpenAI

from utils.config import CFG
from utils.logger import get_logger

logger = get_logger(__name__)


class OpenAICompatibleEmbedding(BaseEmbedding):
"""
LlamaIndex-compatible embedding model using OpenAI-compatible API.
Works with any OpenAI-compatible endpoint (OpenAI, Azure, local servers, etc.)
"""

_client: OpenAI = PrivateAttr()
_model: str = PrivateAttr()

def __init__(
self,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
model: Optional[str] = None,
**kwargs
):
"""
Initialize the embedding model.

Args:
api_key: OpenAI API key (defaults to config)
api_base: API base URL (defaults to config)
model: Model name (defaults to config)
"""
super().__init__(**kwargs)

# Get config values
self._client = OpenAI(
api_key=api_key or CFG.get("api_key"),
base_url=api_base or CFG.get("api_url")
)
self._model = model or CFG.get("embedding_model") or "text-embedding-3-small"

logger.info(f"Initialized OpenAICompatibleEmbedding with model: {self._model}")

@classmethod
def class_name(cls) -> str:
return "OpenAICompatibleEmbedding"

async def _aget_query_embedding(self, query: str) -> List[float]:
"""Get query embedding asynchronously."""
return self._get_query_embedding(query)

async def _aget_text_embedding(self, text: str) -> List[float]:
"""Get text embedding asynchronously."""
return self._get_text_embedding(text)

def _get_query_embedding(self, query: str) -> List[float]:
"""Get embedding for a query."""
return self._get_text_embedding(query)

def _get_text_embedding(self, text: str) -> List[float]:
"""Get embedding for a text."""
try:
# Clean the text
text = text.replace("\n", " ").strip()
if not text:
logger.warning("Empty text provided for embedding")
return []

# Call OpenAI API
response = self._client.embeddings.create(
input=[text],
model=self._model
)

if response.data and len(response.data) > 0:
embedding = response.data[0].embedding
logger.debug(f"Generated embedding with dimension: {len(embedding)}")
return embedding
else:
logger.error("No embedding returned from API")
return []

except Exception as e:
logger.exception(f"Failed to generate embedding: {e}")
return []

def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
"""Get embeddings for multiple texts."""
embeddings = []
for text in texts:
embedding = self._get_text_embedding(text)
embeddings.append(embedding)
return embeddings
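
A minimal usage sketch for the new embedding class, assuming utils.config.CFG holds valid API credentials; the sample text and the printed dimension are illustrative:

from ai.llama_embeddings import OpenAICompatibleEmbedding

# With no arguments, api_key / api_url / embedding_model are read from CFG.
embedder = OpenAICompatibleEmbedding()

# Public BaseEmbedding entry point; analyzer.py calls the private _get_text_embedding directly.
vector = embedder.get_text_embedding("def add(a, b):\n    return a + b")
print(len(vector))  # e.g. 1536 for text-embedding-3-small

Because the class subclasses BaseEmbedding, it can also be assigned to llama_index.core.Settings.embed_model so that other llama-index components reuse the same endpoint.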