In [0]:
# Earlier install attempts, left commented out:
#pip install openai==0.28
#%pip install faiss-cpu

# Install/upgrade the notebook's dependencies in a single %pip call; the
# trailing backslash continues the package list onto the next line.
# NOTE(review): a later cell imports `sentence_transformers`, which is not in
# this list — presumably it is preinstalled on the cluster; confirm.
%pip install -U mlflow databricks-sdk databricks-langchain unitycatalog-ai[databricks] \
databricks-vectorsearch langchain langgraph faiss-cpu


[33mDEPRECATION: Using the pkg_resources metadata backend is deprecated. pip 26.3 will enforce this behaviour change. A possible replacement is to use the default importlib.metadata backend, by unsetting the _PIP_USE_IMPORTLIB_METADATA environment variable. Discussion can be found at https://github.com/pypa/pip/issues/13317[0m[33m
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/31.4 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m31.2/31.4 MB[0m [31m196.5 MB/s[0m eta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m97.6 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cp

In [0]:
# Restart the notebook's Python process (Databricks utility). Run right after
# the %pip cell so the newly installed packages are the ones that get imported.
# Note: this clears all Python state defined in earlier cells.
dbutils.library.restartPython()

In [0]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import json
import os, mlflow.deployments

def basic_llm_chat(
    prompt,
    endpoint="databricks-meta-llama-3-1-8b-instruct",
    system_prompt="You are a concise, accurate assistant.",
    max_tokens=300,
    temperature=0.2,
):
    """Send a single-turn chat request to a Databricks model serving endpoint.

    The defaults reproduce the original hard-coded configuration, so existing
    ``basic_llm_chat(prompt)`` calls behave exactly as before; pass the
    keyword arguments to target your own endpoint or tune generation.

    Args:
        prompt: Text sent as the ``user`` message.
        endpoint: Name of the chat model serving endpoint to query.
        system_prompt: Text sent as the ``system`` message.
        max_tokens: Value forwarded as ``max_tokens`` in the request payload.
        temperature: Value forwarded as ``temperature`` in the request payload.

    Returns:
        The ``content`` of the first choice in the response, or — if anything
        goes wrong — a string of the form ``"Error calling LLM API: <reason>"``.
        Failures are deliberately reported as text rather than raised so the
        demo cells keep running.
    """
    try:
        client = mlflow.deployments.get_deploy_client("databricks")
        resp = client.predict(
            endpoint=endpoint,
            inputs={
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt},
                ],
                "max_tokens": max_tokens,
                "temperature": temperature,
            },
        )
        return resp["choices"][0]["message"]["content"]

    except Exception as e:
        # Best-effort by design: surface the failure as the "answer" text.
        return f"Error calling LLM API: {str(e)}"
    
class RAGSystem:
    """Small retrieval-augmented generation helper.

    Documents are embedded with a SentenceTransformer model, stored in a FAISS
    inner-product index over L2-normalised vectors (i.e. cosine similarity),
    and the best matches are pasted into a prompt for ``basic_llm_chat``.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Initialize RAG system with embedding model and vector store.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.embedding_model = SentenceTransformer(model_name)
        # Documents in insertion order; position i corresponds to index row i.
        self.documents = []
        # L2-normalised embedding matrix, one row per document (None until first add).
        self.embeddings = None
        # FAISS IndexFlatIP, created lazily on the first non-empty add.
        self.index = None

    def add_documents(self, documents):
        """Add documents to the knowledge base.

        Args:
            documents: Iterable of dicts; each must have a ``'content'`` key
                holding the text to embed. An empty iterable is a no-op.

        Raises:
            KeyError: If a document has no ``'content'`` key.
        """
        documents = list(documents)
        if not documents:
            # Nothing to embed; avoids indexing an empty embedding array.
            return

        # Encode BEFORE touching any state so a failure here cannot leave
        # self.documents and the index out of step.
        new_embeddings = np.ascontiguousarray(
            self.embedding_model.encode([doc['content'] for doc in documents]),
            dtype=np.float32,
        )

        # Normalize in place so inner product equals cosine similarity.
        faiss.normalize_L2(new_embeddings)

        if self.index is None:
            self.index = faiss.IndexFlatIP(new_embeddings.shape[1])

        # Append only the new rows instead of rebuilding the whole index.
        self.index.add(new_embeddings)
        self.documents.extend(documents)

        if self.embeddings is None:
            self.embeddings = new_embeddings
        else:
            self.embeddings = np.vstack([self.embeddings, new_embeddings])

    def retrieve_context(self, query, top_k=3):
        """Retrieve most relevant documents for a query.

        Args:
            query: Query text to embed and search with.
            top_k: Maximum number of documents to return.

        Returns:
            A list (possibly shorter than ``top_k``) of dicts with keys
            ``'document'``, ``'relevance_score'`` (float) and ``'rank'``
            (1-based). Empty if nothing has been indexed or ``top_k <= 0``.
        """
        if self.index is None or top_k <= 0:
            return []

        # Embed the query the same way the documents were embedded.
        query_embedding = np.ascontiguousarray(
            self.embedding_model.encode([query]),
            dtype=np.float32,
        )
        faiss.normalize_L2(query_embedding)

        scores, indices = self.index.search(query_embedding, top_k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            idx = int(idx)
            # FAISS pads with id -1 when fewer than top_k vectors exist. A
            # negative id must be rejected explicitly, otherwise Python's
            # negative indexing would return the last document as a bogus hit.
            if not 0 <= idx < len(self.documents):
                continue
            results.append({
                'document': self.documents[idx],
                'relevance_score': float(score),
                'rank': len(results) + 1
            })

        return results

    def generate_response(self, query, retrieved_docs):
        """Generate response using retrieved context.

        Args:
            query: The user's question.
            retrieved_docs: Results in the shape returned by
                ``retrieve_context``.

        Returns:
            Whatever ``basic_llm_chat`` returns for the assembled prompt.
        """
        # Build context from retrieved documents
        context = "\n\n".join([
            f"Document {i+1}: {doc['document']['content']}"
            for i, doc in enumerate(retrieved_docs)
        ])

        # Keep context, question and fallback instruction on separate lines so
        # the question does not run straight into the following sentence.
        rag_prompt = (
            f"Context Information:\n{context}\n\n"
            f"Based on the above context, please answer the following question:\n{query}\n\n"
            "If the context doesn't contain enough information to answer the question, please say so."
        )

        return basic_llm_chat(rag_prompt)
          
    
# Sales Data RAG
def setup_sales_rag_example():
    """Build a RAGSystem pre-loaded with three sample company documents.

    The knowledge base holds two 2024 quarterly sales reports and one 2024
    product-strategy note, each a dict with ``id``, ``content`` and
    ``metadata`` keys.

    Returns:
        The populated ``RAGSystem`` instance.
    """
    q1_report = dict(
        id="sales_q1_2024",
        content="Q1 2024 Sales Report: Total revenue reached $4.8M, representing 12% growth YoY. Key drivers included enterprise software sales ($2.1M) and consulting services ($1.9M). Geographic breakdown: North America 65%, Europe 25%, APAC 10%.",
        metadata=dict(type="sales_report", quarter="Q1", year=2024),
    )
    q2_report = dict(
        id="sales_q2_2024",
        content="Q2 2024 Performance: Revenue hit $5.2M (+8% QoQ). Software subscriptions showed strong growth at $2.4M. New customer acquisition: 47 enterprise clients. Churn rate decreased to 3.2%.",
        metadata=dict(type="sales_report", quarter="Q2", year=2024),
    )
    strategy_note = dict(
        id="product_strategy_2024",
        content="2024 Product Strategy: Focus on AI-powered analytics platform. Expected market size $12B by 2025. Competitive advantage: 40% faster processing than nearest competitor. Investment needed: $2M in R&D.",
        metadata=dict(type="strategy", year=2024),
    )

    # Create the system with its default embedding model, then index the samples.
    knowledge_base = RAGSystem()
    knowledge_base.add_documents([q1_report, q2_report, strategy_note])
    return knowledge_base
# Demo the RAG system
# Builds the sample knowledge base; constructing RAGSystem loads the embedding
# model, so this is the slow step of the cell.
rag_system = setup_sales_rag_example()
# Example queries
queries = [
    "What were our Q2 2024 sales figures?",
    "How much should we invest in R&D this year?",
    "What's our customer churn rate?"
]
# For each question: retrieve the two best-matching documents, list them with
# their relevance scores, then print the LLM's answer built from that context.
for query in queries:
    print(f"\n Query: {query}")
    retrieved = rag_system.retrieve_context(query, top_k=2)
    
    print(" Retrieved Documents:")
    for doc in retrieved:
        # relevance_score is the inner product of L2-normalised embeddings.
        print(f"  • {doc['document']['id']} (relevance: {doc['relevance_score']:.2f})")
    
    # On an LLM failure this prints the "Error calling LLM API: ..." string
    # returned by basic_llm_chat rather than raising.
    response = rag_system.generate_response(query, retrieved)
    print(f" AI Response: {response}")

2025-08-31 13:58:02.138468: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-31 13:58:02.142859: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-31 13:58:02.155642: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-31 13:58:02.176650: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-31 13:58:02.182931: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-31 13:58:02.198691: I tensorflow/core/platform/cpu_feature_gu

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


 Query: What were our Q2 2024 sales figures?
 Retrieved Documents:
  • sales_q2_2024 (relevance: 0.68)
  • sales_q1_2024 (relevance: 0.67)
 AI Response: According to Document 1, the Q2 2024 sales figures were:

* Total revenue: $5.2M
* Software subscriptions: $2.4M

 Query: How much should we invest in R&D this year?
 Retrieved Documents:
  • product_strategy_2024 (relevance: 0.53)
  • sales_q1_2024 (relevance: 0.38)
 AI Response: The context contains information about the investment needed in R&D, which is $2M, but it does not provide information about the current year's budget or investment plans. Therefore, I cannot determine how much should be invested in R&D this year based on the provided information.

 Query: What's our customer churn rate?
 Retrieved Documents:
  • sales_q2_2024 (relevance: 0.46)
  • sales_q1_2024 (relevance: 0.35)
 AI Response: Based on the provided context, the customer churn rate is 3.2%. This information is mentioned in Document 1: Q2 2024 Performance.
