In [1]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
import os


# Text file that main() hands to load_documents() when ingestion is needed.
FILE_PATH = r"C:\Users\LENOVO\Desktop\Python\speech.txt"
# Directory passed to Chroma as its persist_directory.
DB_LOCATION = "./chroma_langchain_db"
# Name of the Chroma collection used by the vector store.
COLLECTION_NAME = "dbr_text"
# Ollama model name given to OllamaLLM for answer generation.
MODEL_NAME = "mistral:latest"
# Ollama model name given to OllamaEmbeddings for the vector store.
EMBEDDING_MODEL = "all-minilm:l6-v2"

def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping, word-based chunks.

    The text is split on whitespace and a window of ``chunk_size`` words is
    slid forward by ``chunk_size - overlap`` words at a time.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks, each with its words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would give a zero step; a larger overlap a
        # negative one (silently producing nothing); a negative overlap
        # would silently skip words between chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window
        # would only repeat words already covered by this chunk.
        if start + chunk_size >= len(words):
            break

    return chunks

def load_documents(file_path):
    """Read a UTF-8 text file and wrap its chunks as ``Document`` objects.

    The file content is split with ``chunk_text`` (500-word chunks, 50-word
    overlap). Chunks are numbered from 1; the number is stored as the
    ``chunk_id`` metadata entry and, as a string, as the document id.

    Args:
        file_path: Path of the text file to load.

    Returns:
        A ``(documents, ids)`` tuple: the list of ``Document`` objects and
        the parallel list of their string ids.
    """
    print(f"Loading file: {file_path}")

    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    pieces = chunk_text(raw_text, chunk_size=500, overlap=50)
    source_name = os.path.basename(file_path)

    # Ids are the 1-based chunk positions rendered as strings.
    ids = [str(number) for number in range(1, len(pieces) + 1)]
    documents = [
        Document(
            page_content=piece,
            metadata={"chunk_id": number, "source": source_name},
            id=str(number),
        )
        for number, piece in enumerate(pieces, start=1)
    ]

    print(f"Created {len(documents)} document chunks")
    return documents, ids

def initialize_vector_store(db_location, collection_name, embeddings):
    """Open (or create) the persistent Chroma store and report if it is empty.

    Args:
        db_location: Directory passed to Chroma as ``persist_directory``.
        collection_name: Name of the collection inside the store.
        embeddings: Embedding function handed to Chroma.

    Returns:
        A ``(vector_store, add_documents)`` tuple. ``add_documents`` is True
        when the collection holds no entries and therefore still needs the
        source documents to be ingested.
    """
    vector_store = Chroma(
        collection_name=collection_name,
        persist_directory=db_location,
        embedding_function=embeddings
    )

    # Ask the collection itself rather than checking whether the directory
    # exists: the directory can be present while the collection is empty
    # (e.g. an earlier run failed before ingestion finished, or a different
    # collection name was used), which would otherwise skip ingestion forever.
    probe = vector_store.get(limit=1)
    add_documents = not probe.get("ids")

    return vector_store, add_documents

def format_documents(docs):
    """Render retrieved documents as numbered ``[Chunk N]`` sections.

    Each document contributes a header line, its ``page_content`` and a
    trailing newline; the sections are then joined with a newline.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The combined text, or an empty string when ``docs`` is empty.
    """
    return "\n".join(
        f"[Chunk {position}]\n{doc.page_content}\n"
        for position, doc in enumerate(docs, start=1)
    )

def _read_input(prompt_text):
    """Return the stripped user input, or None if stdin closed or Ctrl+C was hit."""
    try:
        return input(prompt_text).strip()
    except (EOFError, KeyboardInterrupt):
        return None


def _print_sources(retrieved_docs):
    """Print a preview (at most 300 characters) of every retrieved chunk."""
    print("\nSource Chunks:")
    print("="*60)
    for i, doc in enumerate(retrieved_docs, 1):
        print(f"\n[Chunk {i}] (ID: {doc.metadata.get('chunk_id', 'N/A')})")
        text = doc.page_content
        if len(text) > 300:
            text = text[:300] + "..."
        print(text)
        print("-"*60)


def main():
    """Run the interactive question-answering loop over the indexed speech.

    Builds the embedding function and vector store, ingests the source file
    when the store reports that documents must be added, then repeatedly
    reads a question, retrieves the 5 most similar chunks and asks the LLM
    to answer from them. The loop ends when the user enters ``q``, when
    stdin is closed, or when the user interrupts a prompt with Ctrl+C.
    """
    print("Initializing RAG System...")

    embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

    vector_store, add_documents = initialize_vector_store(
        DB_LOCATION,
        COLLECTION_NAME,
        embeddings
    )

    if add_documents:
        print("First run - loading documents into vector store...")
        documents, ids = load_documents(FILE_PATH)
        vector_store.add_documents(documents=documents, ids=ids)
        print("Documents added successfully!")
    else:
        print("Loading existing vector store...")

    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5}
    )

    model = OllamaLLM(model=MODEL_NAME)

    template = """You are an expert at answering questions about a speech text.

Use the following relevant excerpts from the speech to answer the question.
If the answer cannot be found in the excerpts, say "I cannot find that information in the provided speech."

Relevant excerpts:
{context}

Question: {question}

Answer:"""

    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    print("\n" + "="*60)
    print("RAG System Ready!")
    print("="*60)
    print(f"Using model: {MODEL_NAME}")
    print(f"Embedding model: {EMBEDDING_MODEL}")
    print(f"Vector store location: {DB_LOCATION}")
    print("="*60)

    while True:
        print("\n" + "-"*60)
        question = _read_input("Ask your question (or 'q' to quit): ")

        if question is None:
            # stdin closed or Ctrl+C: leave cleanly instead of a traceback.
            print("\nGoodbye!")
            break

        if question.lower() == 'q':
            print("Goodbye!")
            break

        if not question:
            print("Please enter a question.")
            continue

        print("\nSearching for relevant context...")

        try:
            retrieved_docs = retriever.invoke(question)

            if not retrieved_docs:
                print("No relevant context found.")
                continue

            context = format_documents(retrieved_docs)

            print(f"Found {len(retrieved_docs)} relevant chunks. Generating answer...\n")

            result = chain.invoke({
                "context": context,
                "question": question
            })

            print("Answer:")
            print("-" * 60)
            print(result)
            print("-" * 60)

            show_sources = _read_input("\nShow source chunks? (y/n): ")
            if show_sources is None:
                # No more input can arrive; stop rather than loop on errors.
                print("\nGoodbye!")
                break
            if show_sources.lower() == 'y':
                _print_sources(retrieved_docs)

        except Exception as e:
            # Top-level boundary of the interactive loop: report the failure
            # (e.g. retrieval or model errors) and keep the session alive.
            print(f"Error: {str(e)}")

# Start the interactive session only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Initializing RAG System...
First run - loading documents into vector store...
Loading file: C:\Users\LENOVO\Desktop\Python\speech.txt
Created 1 document chunks
Documents added successfully!

RAG System Ready!
Using model: mistral:latest
Embedding model: all-minilm:l6-v2
Vector store location: ./chroma_langchain_db

------------------------------------------------------------


Ask your question (or 'q' to quit):  who is real enemy?



Searching for relevant context...
Found 1 relevant chunks. Generating answer...

Answer:
------------------------------------------------------------
 In the provided speech, the real enemy referred to is "the belief in the sanctity of the shastras."
------------------------------------------------------------



Show source chunks? (y/n):  y



Source Chunks:

[Chunk 1] (ID: 1)
"The real remedy is to destroy the belief in the sanctity of the shastras. How do you expect to succeed if you allow the shastras to continue to be held as sacred and infallible? You must take a stand against the scriptures. Either you must stop the practice of caste or you must stop believing in th...
------------------------------------------------------------

------------------------------------------------------------


Ask your question (or 'q' to quit):  q


Goodbye!
