### Step One: Imports

In [1]:
import os
import re
import time
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any

import chromadb
import numpy as np
import openai
import pandas as pd
from chromadb.config import Settings
from openai import OpenAI

### Step Two: Initialize OpenAI and ChromaDB.

In [2]:
# Credentials are read from the environment so the secret never lands in the
# notebook, its outputs, or version control. Set OPENAI_API_KEY before running;
# a missing key fails fast with a KeyError instead of an opaque auth error later.
# NOTE(review): the key that was previously committed here should be rotated.
openai_client = OpenAI(
    # The endpoint can be overridden, but defaults to the proxy this notebook was written for.
    base_url=os.environ.get("OPENAI_BASE_URL", "https://openai.vocareum.com/v1"),
    api_key=os.environ["OPENAI_API_KEY"],
)

In [3]:
# Client-level options for the on-disk Chroma database.
chroma_settings = Settings(
    anonymized_telemetry=False,  # Disable telemetry for privacy
    allow_reset=True,            # Allow database reset for development
)

# Persistent client: data written through it is stored under the given path.
client = chromadb.PersistentClient(path="/workspace/Project/", settings=chroma_settings)

### Step Three: Create Collection in chromadb

In [4]:
def create_collection(collection_name : str) -> chromadb.Collection:
    """Create a Chroma collection that stores externally computed embeddings.

    The "documents" collection is treated as a scratch collection: any existing
    copy is dropped first so that re-running the notebook starts from an empty
    collection. Other names are created as-is, and creation fails if they
    already exist.

    Args:
        collection_name: Name of the collection to create.

    Returns:
        The newly created collection.
    """
    if collection_name == "documents":
        try:
            client.delete_collection(collection_name)
        except (ValueError, chromadb.errors.ChromaError):
            # On a fresh database there is nothing to delete; Chroma signals that
            # with an exception whose type depends on the installed version.
            # Any real problem resurfaces in create_collection below.
            pass

    collection = client.create_collection(
        name=collection_name,
        embedding_function=None,  # We'll handle embeddings manually
        # Use cosine distance so that "1 - distance" is a cosine similarity;
        # Chroma's default space is squared L2, for which that conversion is wrong.
        metadata={"hnsw:space": "cosine"},
    )
    return collection
collection = create_collection("documents")
print(collection)

Collection(name=documents)


### Step Four: Chunk Texts

In [5]:
def chunk_text(file_path: Path) -> list[Dict[str, Any]]:
    """Split a conventions file into one record per rule.

    Rules are separated by a line of 46 dashes. Within a rule, lines starting
    with ``[RULE_ID:``, ``[CATEGORY`` or ``[SEVERITY:`` supply metadata; the
    chunk text is the rule with every ``[...]`` span removed.

    Args:
        file_path: Path of the UTF-8 text file to parse.

    Returns:
        A list of dicts with keys ``"id"`` (sequential index as a string),
        ``"chunk"`` (rule text without tags) and ``"metadata"`` (dict with
        ``rule_id``, ``category`` and ``severity``; a value is ``None`` when the
        corresponding tag is absent).
    """
    # Line prefixes that introduce a metadata tag, mapped to their metadata key.
    # CATEGORY is matched without the colon, exactly as before.
    tag_prefixes = {
        "[RULE_ID:": "rule_id",
        "[CATEGORY": "category",
        "[SEVERITY:": "severity",
    }

    with open(file_path, 'r', encoding="utf-8") as file:
        text = file.read()

    documents: list[Dict[str, Any]] = []
    for rule in text.split("----------------------------------------------"):
        # Skip blank segments (doubled or trailing separators) instead of
        # aborting the whole parse or emitting an empty chunk.
        if not rule.strip():
            continue

        metadata: Dict[str, Any] = {"rule_id": None, "category": None, "severity": None}
        for raw_line in rule.split("\n"):
            line = raw_line.strip()  # tolerate indented tag lines
            for prefix, key in tag_prefixes.items():
                if line.startswith(prefix):
                    # Split on the first colon only, so values may contain colons.
                    _, _, value = line.partition(":")
                    metadata[key] = value.strip().strip("]").strip()
                    break

        # NOTE(review): this removes every bracketed span, including "[]" in
        # code samples such as "String[] args" - confirm that is intended.
        chunk = re.sub(r"\[.*?\]\s*", "", rule).strip()

        documents.append({
            "id": str(len(documents)),
            "chunk": chunk,
            "metadata": metadata,
        })

    return documents

# Parse the conventions file into rule records. A Path is passed to match
# chunk_text's annotation.
documents = chunk_text(Path("/workspace/Project/conventions_document.txt"))
# Show a count and the first record rather than dumping the whole corpus.
print(f"Parsed {len(documents)} rule chunks; first: {documents[:1]}")



### Step Five: Generate Embeddings for Chunks

In [6]:
def generate_embedding(chunks : list[str]) -> List[List[float]]:
    """Embed a batch of texts with OpenAI's ``text-embedding-3-small`` model.

    Args:
        chunks: Texts to embed; all are sent in a single request.

    Returns:
        One embedding vector per entry of ``response.data``. An empty input
        returns an empty list without calling the API.
    """
    # The embeddings endpoint rejects an empty input list, so short-circuit.
    if not chunks:
        return []

    response = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=chunks,
    )
    return [item.embedding for item in response.data]

### Step Six: Add Chunks, Embeddings and Metadatas to Collection

In [7]:
def add_to_collection(documents : list[Dict]) -> None:
    """Embed the given records and store them in the module-level ``collection``.

    Args:
        documents: Records with ``"id"``, ``"chunk"`` and ``"metadata"`` keys.
            An empty list is a no-op.
    """
    # Nothing to embed or store; avoids sending empty batches to the
    # embedding API and to Chroma.
    if not documents:
        return

    ids = [document["id"] for document in documents]
    chunks = [document["chunk"] for document in documents]
    metadatas = [document["metadata"] for document in documents]
    # Embeddings are computed here because the collection has no embedding function.
    embeddings = generate_embedding(chunks)

    collection.add(
        ids=ids,
        documents=chunks,
        embeddings=embeddings,
        metadatas=metadatas,
    )

# Embed every parsed rule and store it in the collection.
add_to_collection(documents)

### Step Seven: Search Documents (Query)

In [10]:
def search_documents(query: str, n_context: int = 9, metadata_filter: Optional[Dict] = None) -> Dict[str, Any]:
    """Retrieve the stored chunks closest to ``query``.

    Args:
        query: Text to embed and search for.
        n_context: Maximum number of results requested from the collection.
        metadata_filter: Optional Chroma ``where`` filter. ``None`` or an empty
            dict means no filtering.

    Returns:
        A dict with ``"query"``, ``"n_results"`` (number of hits returned) and
        ``"results"``, a list of dicts with ``"document"``, ``"similarity_score"``
        and ``"metadata"``. ``similarity_score`` is ``1 - distance``, so its
        meaning depends on the distance metric the collection is configured with.
    """
    if metadata_filter:
        print("metadata filters: ", metadata_filter)

    query_embeddings = generate_embedding([query])

    results = collection.query(
        query_embeddings=query_embeddings,
        n_results=n_context,
        # Normalise an empty filter to None so it is treated the same way as
        # the truthiness check above: no filtering.
        where=metadata_filter or None,
        include=["documents", "distances", "metadatas"],
    )

    # A single query was sent, so its hits are in the first row of each field.
    hits = zip(results['documents'][0], results['distances'][0], results['metadatas'][0])
    matches = [
        {
            "document": document,
            "similarity_score": 1 - distance,  # Convert distance to similarity
            "metadata": metadata,
        }
        for document, distance, metadata in hits
    ]

    return {
        "query": query,
        "n_results": len(matches),
        "results": matches,
    }


In [24]:
def generate_llm_response(query: str, n_context: int = 9):
    """Answer ``query`` with gpt-4o-mini, using retrieved rule chunks as context.

    Args:
        query: The user's question or code snippet.
        n_context: Number of context chunks to retrieve.

    Returns:
        A dict with ``"query"``, ``"answer"``, ``"context"`` (list of dicts with
        ``content``, ``similarity`` and ``rule_id``), ``"generation_time"``
        (seconds spent in the chat completion call, 0 when nothing was
        retrieved) and ``"context_used"`` (number of context chunks).
    """
    search_results = search_documents(query, n_context)

    if not search_results["results"]:
        return {
            "query": query,
            "answer": "I couldn't find relevant information to answer your question.",
            "context": [],
            "generation_time": 0,
            "context_used": 0
        }

    context_documents = [
        {
            "content": result["document"],
            "similarity": result["similarity_score"],
            # Metadata may be missing for a stored item; fall back to "Unknown".
            "rule_id": (result["metadata"] or {}).get("rule_id", "Unknown"),
        }
        for result in search_results["results"]
    ]

    # Single quotes inside the f-string keep this valid on Python < 3.12.
    context_text = "\n\n".join(
        f"Document {doc['rule_id']} (Similarity: {doc['similarity']:.3f}):\n{doc['content']}"
        for doc in context_documents
    )

    prompt = f"""Based on the following context documents, please check user's java code convention, If context has no information mention that it not provided from context.

        Context Documents:
        {context_text}

        User Question: {query}

        Please provide a comprehensive answer based on the context provided:"""

    started = time.perf_counter()
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_completion_tokens=500
    )
    generation_time = time.perf_counter() - started

    # Same keys as the empty-result return above, so callers see one shape.
    return {
        "query": query,
        "answer": response.choices[0].message.content,
        "context": context_documents,
        "generation_time": generation_time,
        "context_used": len(context_documents),
    }

# Sample input: a Java snippet sent as the query, which is embedded for
# retrieval and also placed in the prompt as the user question.
query = """

    @Service
    public class UserServiceImpl implements UserService {
        //...
    }

    """
llm_response = generate_llm_response(query)



In [25]:
# Show only the model's answer; the retrieved context stays in llm_response["context"].
print(llm_response["answer"])

Based on the context documents provided, your Java code convention for the class `UserServiceImpl` is correct. Here are the specific points that align with the conventions outlined in the documents:

1. **Naming Convention for Service Implementation**:
   - Your class `UserServiceImpl` correctly ends with "ServiceImpl", which aligns with the convention stated in the document `service_impl_naming`.

2. **Annotation Usage**:
   - You have correctly used the `@Service` annotation, which is required as per the same document.

3. **Interface Implementation**:
   - Your class implements the `UserService` interface. The context does not directly specify a convention regarding interface names, but it is generally accepted that service classes should implement service interfaces of the same name.

Overall, your code adheres to the conventions provided in the context documents. No issues were found regarding naming or annotation usage.
