# Putting all of the Pieces Together

Now that we know how to evaluate semantic similarity using a database, we can proceed to an actual retrieval-augmented generation (RAG) implementation.

In [None]:
! pip install chromadb
! pip install ollama
! pip install nltk

In [17]:
import requests
import chromadb
import numpy as np
import uuid
import time
import nltk

from nltk.tokenize import PunktSentenceTokenizer

from openai import OpenAI
from typing import Dict, Any, List

from api_utils import load_api_params

In [None]:
# Fetch NLTK's 'punkt' sentence-tokenizer data.
nltk.download(info_or_id='punkt')

In [6]:
# Load the whole source document into memory as a single UTF-8 string.
with open('./heart_attack.txt', mode='r', encoding='utf-8') as source_file:
    text = source_file.read()

In [None]:
# Break our story up into separate sentences.
# NOTE(review): the tokenizer is constructed with no arguments, so it presumably
# does not use the pretrained 'punkt' data downloaded earlier in the notebook —
# confirm whether nltk.sent_tokenize(text) was intended.
tokenizer = PunktSentenceTokenizer()
sentences = tokenizer.tokenize(text)

print(f"Total sentences: {len(sentences)}")

# Preview one sentence as a sanity check. The index is clamped to the last
# sentence so a shorter document does not raise IndexError.
PREVIEW_INDEX = 33
if sentences:
    print(sentences[min(PREVIEW_INDEX, len(sentences) - 1)])

## Now that our sentences are split, we can generate embeddings for individual sentences

In [12]:
# Using Nomic model served locally via Ollama for embedding
# Ollama is a friend --> https://ollama.com/
def get_embeddings_from_ollama(text, model="nomic-embed-text", timeout=60.0):
    """Embed ``text`` with a model served by a local Ollama instance.

    Args:
        text: The string to embed; sent as the ``prompt`` field of the request.
        model: Name of the Ollama embedding model to use.
        timeout: Seconds to wait for the HTTP request, so an unresponsive
            server cannot hang the notebook indefinitely.

    Returns:
        An ``np.float32`` array built from the ``embedding`` field of the
        JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON response has no ``embedding`` field.
    """
    url = "http://localhost:11434/api/embeddings"
    payload = {"model": model, "prompt": text}

    response = requests.post(url, json=payload, timeout=timeout)
    # Fail loudly on an HTTP error status instead of letting it surface later
    # as an opaque KeyError on the missing "embedding" field.
    response.raise_for_status()
    return np.array(response.json()["embedding"], dtype=np.float32)

In [13]:
# Embed every sentence (one request per sentence), keeping the sentence order.
embeddings = [get_embeddings_from_ollama(sentence) for sentence in sentences]

## And now, on to packing everything into a database

In [14]:
# Chroma client pointed at the ./chroma_db directory.
client = chromadb.PersistentClient(
    path="./chroma_db",
)

In [16]:
# Store every sentence together with its embedding in a uniquely named collection.

# One random UUID string per embedding serves as that record's ID.
ids = [str(uuid.uuid4()) for _ in range(len(embeddings))]

# The current Unix timestamp (whole seconds) is used as the name suffix.
unique_collection_name = f"document_sentences_{int(time.time())}"

collection = client.get_or_create_collection(
    metadata={"hnsw:space": "cosine"},  # Using cosine similarity
    name=unique_collection_name,
)

collection.add(documents=sentences, embeddings=embeddings, ids=ids)

## And now ... bringing in the LLM and the full RAG experience with semantic similarity

In [18]:
# Read the endpoint settings from the secrets file, then build the chat client.
# NOTE(review): this rebinds `client`, which the database cells earlier in the
# notebook bound to the chromadb client — re-running those cells after this one
# would pick up the OpenAI client instead.
SECRETS_PATH = ".secrets.toml"
API_CALL_PARAMS = load_api_params(SECRETS_PATH)

client = OpenAI(base_url=API_CALL_PARAMS['API_URL'], api_key=API_CALL_PARAMS['API_KEY'])

In [19]:
def generate_completion(model: str, messages: List[Dict[str, str]]) -> str:
    """Send a chat conversation to the LLM and return the reply text.

    Args:
        model: Identifier of the model to query.
        messages: The chat messages to send, in order.

    Returns:
        The message content of the first choice in the response.
    """
    completion = client.chat.completions.create(messages=messages, model=model)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [20]:
# The question we want the RAG pipeline to answer.
USER_PROMPT = "Can you please tell me what is a heart attack."

In [21]:
# Embed the question with the same default model used for the stored sentences.
query_vector = get_embeddings_from_ollama(text=USER_PROMPT)

In [32]:
# Reference text for the system prompt; empty until the retrieval cell fills it.
KNOWLEDGE = ""

In [None]:
# Retrieve the stored sentences closest to the question embedding.
results = collection.query(
    query_embeddings=[query_vector],
    n_results=10
)

# A single query embedding was sent, so its matches are the first entry.
retrieved_docs = results["documents"][0]

print("Query results for:", USER_PROMPT)
for rank, doc in enumerate(retrieved_docs, start=1):
    print(f"Result {rank}: {doc}")

# Rebuild KNOWLEDGE from scratch instead of appending to it, so re-running this
# cell does not duplicate the retrieved text in the prompt.
KNOWLEDGE = "".join(str(doc) + "\n" for doc in retrieved_docs)

In [None]:
# Build the system prompt: a fixed instruction followed by the retrieved
# reference text held in KNOWLEDGE, then print it for inspection.
SYSTEM_PROMPT = f"""Answer all user questions to the best of your ability. Use the following text for reference:

{KNOWLEDGE}
"""

print(SYSTEM_PROMPT)

In [None]:
# Assemble the chat: the retrieval-augmented system prompt, then the user's question.
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]

# Looked up outside the try block so a missing 'MODEL' setting is reported as
# the KeyError it is rather than as a completion failure.
model = API_CALL_PARAMS['MODEL']

try:
    LLM_output = generate_completion(model, messages)
except Exception as e:
    # Chain explicitly so the underlying error stays attached as __cause__.
    raise RuntimeError(f"Error generating completion: {e}") from e

print(LLM_output)