# RAG System Test Environment

### Load & Process Documents

In [None]:
# FIRST IS TO LOAD AND PROCESS THE DOCUMENT
from langchain.document_loaders import PyPDFLoader, TextLoader # type: ignore
from langchain.text_splitter import RecursiveCharacterTextSplitter # type: ignore

# Load the company-profile PDF from the local knowledge-base folder.
loader = PyPDFLoader("knowledge-base/Company Profile.pdf")
docs = loader.load()

# Split the loaded documents into overlapping chunks (size 500, overlap 50).
# NOTE(review): sizes are presumably measured in characters, the splitter's
# default length function — confirm for the installed LangChain version.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)

In [None]:
# Display how many chunks the splitter produced (notebook cell output).
len(chunks)

### Embed Chunks + Store in Vector DB (FAISS)

In [None]:
# NEXT IS TO EMBED CHUNKS AND STORE IN VECTOR DB (FAISS)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embed every chunk with the all-MiniLM-L6-v2 sentence-transformers model and
# build a FAISS vector store from the results.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
vectorstore = FAISS.from_documents(chunks, embeddings)
# Persist the index under the local "my_faiss_index" path. Nothing visible in
# this notebook loads it back; later cells use the in-memory `vectorstore`.
vectorstore.save_local("my_faiss_index")

In [None]:
# Display the vector store object (notebook cell output).
vectorstore

### Querying (RAG Loop)

In [None]:
# Read the user's question from stdin; later cells read it from the global
# `query` (paraphrasing, retrieval, logging).
query = input("How can i help you?" )

### Query Translation

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Checkpoint used for query paraphrasing.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the tokenizer and model
# NOTE(review): device_map="auto" delegates device placement to the library;
# this presumably requires the `accelerate` package — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

# Initialize the text generation pipeline
# `generator` is the callable that multi_query_paraphrase invokes.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import Accelerator

# NOTE(review): this cell loads the same checkpoint id as the pipeline cell
# and rebinds the globals `tokenizer` and `model`; the existing `generator`
# pipeline keeps the objects it was built with.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
accelerator = Accelerator()

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Use accelerator to place the model and inputs on the available device
model = accelerator.prepare(model)

# Example usage
input_text = "Hello, world!"
inputs = tokenizer(input_text, return_tensors="pt")

# Move inputs to the appropriate device
# (each tokenizer output tensor is moved to accelerator.device)
inputs = {key: value.to(accelerator.device) for key, value in inputs.items()}

# Generate text
# Smoke test: generate with max_length=50, decode the first returned sequence
# without special tokens, and print it.
outputs = model.generate(**inputs, max_length=50)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

In [None]:
def multi_query_paraphrase(input_text: str, n_variants=3, max_length=100):
    """Generate paraphrased variants of ``input_text``.

    Each variant is produced by sending a differently worded paraphrasing
    instruction to the module-level ``generator`` pipeline and removing the
    echoed prompt from the generated text.

    Args:
        input_text: The sentence or question to paraphrase.
        n_variants: How many prompt templates to use. Four templates are
            defined, so larger values are capped at four; zero or negative
            values produce no variants.
        max_length: Forwarded to the pipeline as ``max_length``.
            NOTE(review): in Hugging Face generation this limit presumably
            counts the prompt tokens as well — confirm, and consider
            ``max_new_tokens`` if long queries get truncated.

    Returns:
        A list of unique, non-empty paraphrases in the order they were
        generated. It may hold fewer than ``n_variants`` items when the model
        repeats itself or returns nothing beyond the prompt.
    """
    # Create multiple paraphrasing prompts; max(0, ...) stops a negative
    # count from slicing relative to the end of the list.
    prompts = [
        f"Paraphrase the following sentence differently: {input_text}",
        f"Rewrite this in a new way: {input_text}",
        f"Say this with different words: {input_text}",
        f"Change the wording while keeping the same meaning: {input_text}",
    ][:max(0, n_variants)]

    # Generate one sampled output per prompt
    outputs = []
    for prompt in prompts:
        result = generator(
            prompt,
            max_length=max_length,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
        )
        # The pipeline returns prompt + continuation; keep the continuation.
        text = result[0]["generated_text"].replace(prompt, "").strip()
        outputs.append(text)

    # dict.fromkeys de-duplicates while keeping generation order (a set would
    # return the variants in arbitrary order); empty generations are dropped
    # because they are useless as queries.
    return list(dict.fromkeys(text for text in outputs if text))

In [None]:
# Paraphrase the user's query (up to three variants) and print them as a
# 1-based numbered list.
paraphrases = multi_query_paraphrase(query, n_variants=3)
for i, p in enumerate(paraphrases, 1):
    print(f"{i}. {p}")

### Routing (Semantic)

In [None]:
# routing_labels.py

# Route name -> natural-language description of the questions that route
# should handle. route_question compares a question's embedding against these
# descriptions.
routing_labels = dict(
    database="Questions about internal records, metrics, tables, or structured data.",
    vectorstore="Questions related to uploaded documents, PDFs, knowledge bases, or internal wikis.",
    web="Questions that might require up-to-date, external information not present in the system.",
)

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load embedding model
# NOTE(review): this rebinds the global `model`, which the earlier cells bound
# to the TinyLlama causal LM; route_question reads whichever object `model`
# currently refers to, so cell execution order matters.
model = SentenceTransformer('all-MiniLM-L6-v2')

def route_question(question: str, threshold: float = 0.4):
    """Pick the route whose description is most similar to ``question``.

    The question and every description in the module-level ``routing_labels``
    are embedded with the module-level ``model`` and compared by cosine
    similarity.

    Args:
        question: The user question to route.
        threshold: Minimum similarity the best-scoring route must reach to be
            accepted.

    Returns:
        A ``(route, similarities)`` tuple. ``similarities`` maps every label
        to its cosine similarity as a float. ``route`` is the best-scoring
        label, or ``"web"`` when the best score is below ``threshold`` or
        when ``routing_labels`` is empty.
    """
    labels = list(routing_labels)
    if not labels:
        # max() over an empty mapping would raise; fall back like a low score.
        return "web", {}

    question_embedding = model.encode(question, convert_to_tensor=True)

    # Encode all descriptions in one batched call instead of one call per label.
    desc_embeddings = model.encode(
        [routing_labels[label] for label in labels],
        convert_to_tensor=True,
    )

    # The result has one row (the question) and one column per description.
    scores = util.pytorch_cos_sim(question_embedding, desc_embeddings)[0]
    similarities = {label: float(score) for label, score in zip(labels, scores)}

    # Determine best route
    best_route = max(similarities, key=similarities.get)

    # If below threshold, fallback to web
    if similarities[best_route] < threshold:
        return "web", similarities

    return best_route, similarities

In [1]:
# Mock response functions
def query_database(question):
    """Return a placeholder database answer that echoes ``question``."""
    answer = f"[DB Answer] to: {question}"
    return answer

def query_vectorstore(question):
    """Return a placeholder vector-store (RAG) answer that echoes ``question``."""
    answer = f"[Vector RAG Answer] to: {question}"
    return answer

def query_web(question):
    """Return a placeholder web-search answer that echoes ``question``."""
    answer = f"[Web Search Answer] to: {question}"
    return answer


def handle_question(user_question):
    """Route ``user_question`` and return the answer from the chosen backend.

    Prints the selected route and the per-label similarity scores (rounded to
    three decimals) before dispatching. Any route other than "database" or
    "vectorstore" is answered by ``query_web``.
    """
    chosen, scores = route_question(user_question)

    print(f"\nRouting to: {chosen}")
    rounded = {label: round(value, 3) for label, value in scores.items()}
    print(f"Similarities: {rounded}")

    # Dispatch table; unknown routes fall through to the web handler.
    handlers = {
        "database": query_database,
        "vectorstore": query_vectorstore,
    }
    responder = handlers.get(chosen, query_web)
    return responder(user_question)

In [None]:
# CLI interface
if __name__ == "__main__":
    # Read-route-print loop. Ends when the user types "exit"/"quit", presses
    # Ctrl-C, or stdin is closed.
    while True:
        try:
            # Strip surrounding whitespace so inputs like "exit " still match.
            question = input("\nAsk a question: ").strip()
            if not question:
                # Nothing to route; prompt again.
                continue
            if question.lower() in {"exit", "quit"}:
                print("Goodbye!")
                break
            answer = handle_question(question)
            print(answer)
        except (KeyboardInterrupt, EOFError):
            # EOFError is raised by input() when stdin is closed (piped input,
            # Ctrl-D); treat it like an interrupt instead of crashing.
            print("\nInterrupted by user. Exiting.")
            break

### Query Construction

### Retrieval

In [None]:
# NEXT IS TO QUERY THE RAG MODEL (TEST WITH A SIMPLE QUERY)
from langchain.chains import RetrievalQA
from langchain.llms import Ollama

# Build a retrieval QA chain: the "tinyllama" model served through Ollama
# answers using documents returned by the FAISS store's default retriever.
# NOTE(review): presumably requires a running local Ollama server with the
# model pulled — confirm in the environment.
llm = Ollama(model="tinyllama")
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())
# Answer the question stored in the global `query` and print the result.
response = qa_chain.run(query)
print(response)

In [None]:
# CHECKING THE TIME TAKEN FOR RETRIEVAL AND GENERATION
import time

# Fixed benchmark question; this rebinds the global `query`.
query = "What is the name of the company?"

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() is wall-clock time and can jump when the
# system clock is adjusted.
start = time.perf_counter()
retrieved_docs = vectorstore.similarity_search(query)
retrieval_elapsed = time.perf_counter() - start
print(f"Retrieval Time: {retrieval_elapsed:.2f} sec")

# NOTE: this times the whole qa_chain.run call, so the figure printed as
# "Generation Time" also includes the chain's own retrieval step.
start = time.perf_counter()
response = qa_chain.run(query)
generation_elapsed = time.perf_counter() - start
print(f"Generation Time: {generation_elapsed:.2f} sec")

print("\nAnswer:", response)

### UI with Streamlit

In [None]:
import streamlit as st

# Minimal Streamlit front end: a title, a text box, and the chain's answer.
# Relies on the global `qa_chain` built in the retrieval cell.
st.title("Free RAG Assistant")
query = st.text_input("Ask a question:")
# Run the chain only when the text box holds a non-empty value.
if query:
    result = qa_chain.run(query)
    st.write(result)


### Logging Queries

In [None]:
# LOGGING THE QUERIES
import logging
# Send INFO-and-above records from the root logger to queries.log.
logging.basicConfig(filename='queries.log', level=logging.INFO)
# Pass the query as a logging argument rather than pre-formatting it, so the
# message is only built when the record is actually emitted. The logged text
# is unchanged.
logging.info("User asked: %s", query)