In [44]:
# Load required Libraries

import re
import os
from dotenv import load_dotenv
load_dotenv()

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import  Chroma
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnableLambda
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.memory import ConversationBufferMemory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import InMemoryChatMessageHistory
from rich.console import Console
from rich.markdown import Markdown
from langchain.vectorstores import FAISS


In [45]:
# Load PDF Documents from directory

loader = DirectoryLoader(
    path = r'C:\Desktop\Chatbot\HerbalDocs',
    glob = '*.pdf',
    loader_cls = PyPDFLoader    
)

docs = loader.load()
print(len(docs))

1499


In [46]:
# Load Embedding Model 
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v1')

In [47]:
# Text Splitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=120,
    separators=["\n\n", "\n", " ", ""]
)
chunks = splitter.split_documents(docs)
print("Chunks created:", len(chunks))

Chunks created: 5441


In [48]:
chunks[0]

Document(metadata={'producer': 'Acrobat Distiller 4.0 for Macintosh', 'creator': 'PageMaker 5.0', 'creationdate': '2004-05-25T14:59:01+03:00', 'moddate': '2004-05-25T14:59:05+03:00', 'source': 'C:\\Desktop\\Chatbot\\HerbalDocs\\Volume-1.pdf', 'total_pages': 295, 'page': 0, 'page_label': '1'}, page_content='Bulbus Allii Cepae\ni\nWHO\nmonographs\non selected\nmedicinal plants\nVOLUME 1\nWorld Health Organization\nGeneva\n1999')

In [49]:
# Embedding & Vector store  (Already save so run next console)

# vector_store = Chroma.from_documents(
#      documents=chunks,
#     embedding=embedding,
#     persist_directory="./chroma_db"   # saves locally
# )

In [None]:
# Embedding & Vector store  (Already save so run next console)

# FAISS_DIR = "./faiss_db"
# os.makedirs(FAISS_DIR, exist_ok=True)

# # Create FAISS vectorstore from documents and persist
# faiss_store = FAISS.from_documents(
#     documents=chunks,          # your list of langchain Document objects
#     embedding=embedding,       # your HuggingFaceEmbeddings instance
# )
# faiss_store.save_local(FAISS_DIR)

In [None]:
# Use already saved vector store

FAISS_DIR = "./faiss_db"

vector_store = FAISS.load_local(
    folder_path=FAISS_DIR,
    embeddings=embedding,
    allow_dangerous_deserialization=True
)

In [52]:
# Use already saved vector store

# vector_store1 = Chroma(
#     persist_directory="./chroma_db",  
#     embedding_function=embedding
# )

In [None]:
# Run with API key

from dotenv import load_dotenv
load_dotenv()
import os


model = ChatGroq(
    model="llama-3.3-70b-versatile",   
    api_key=os.getenv("GROQ_API_KEY"),
    temperature=0.7,
    top_p=0.9,                        
    max_tokens=512,
    timeout=None,
    max_retries=3
)

                    top_p was transferred to model_kwargs.
                    Please confirm that top_p is what you intended.
  validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)


In [None]:
# ========================= BETTER RETRIEVER ============================
from langchain_community.retrievers import BM25Retriever
from collections import defaultdict
from sentence_transformers import CrossEncoder
import numpy as np
import os

# ---------- PREPARE TEXTS & METADATA ----------
all_texts = []
all_metadata = []

for i, d in enumerate(chunks):
    meta = d.metadata or {}

    src = meta.get("source") or meta.get("file") or f"doc_{i}"
    meta["source"] = src
    meta["chunk_id"] = f"{os.path.basename(src)}::chunk_{i}"

    all_texts.append(d.page_content)
    all_metadata.append(meta)

# ---------- SPARSE RETRIEVER ----------
bm25_retriever = BM25Retriever.from_texts(
    texts=all_texts,
    metadatas=all_metadata
)
bm25_retriever.k = 15

# ---------- DENSE  ----------
similarity_retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 15}
)

# ---------- SCORE NORMALIZATION ----------
def normalize_scores(items):
    if not items:
        return []
    scores = np.array([s for _, s in items], dtype=float)
    lo, hi = scores.min(), scores.max()
    if hi == lo:
        return [(d, 1.0) for d, _ in items]
    norm = (scores - lo) / (hi - lo)
    return [(items[i][0], float(norm[i])) for i in range(len(items))]

# ---------- MERGE RETRIEVER RESULTS ----------
def merge_results(results_by_source, weights):
    agg = defaultdict(float)
    doc_map = {}

    for name, pairs in results_by_source.items():
        w = weights.get(name, 1.0)
        normed = normalize_scores(pairs)

        for doc, score in normed:
            doc_id = doc.metadata.get("chunk_id")
            agg[doc_id] += score * w
            doc_map[doc_id] = doc

    merged = [(doc_map[k], v) for k, v in agg.items()]
    merged_sorted = sorted(merged, key=lambda x: x[1], reverse=True)
    return merged_sorted

# ---------- CROSS ENCODER RERANKER ----------
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query, docs, final_k=8):
    pairs = [(query, d.page_content) for d in docs]
    scores = reranker.predict(pairs)
    idx = np.argsort(scores)[::-1][:final_k]
    return [docs[i] for i in idx]

# ---------- MASTER RETRIEVE FUNCTION ----------
# ---------- MASTER RETRIEVE FUNCTION (MMR REMOVED) ----------
def better_retrieve(query, top_k=8):
    # get results from similarity and bm25 only
    sim_docs = similarity_retriever.get_relevant_documents(query) if similarity_retriever else []
    bm_docs = bm25_retriever.get_relevant_documents(query) if bm25_retriever else []

    results = {
        "similarity": [(d, getattr(d, "score", 1.0)) for d in sim_docs],
        "bm25":       [(d, getattr(d, "score", 1.0)) for d in bm_docs],
    }

    # Reweight: give more importance to semantic similarity, tune as needed
    weights = {"similarity": 0.60, "bm25": 0.40}

    merged = merge_results(results, weights)
    top_candidates = [d for d, _ in merged[:20]]  # pass top N to reranker

    final_docs = rerank(query, top_candidates, final_k=top_k)
    return final_docs


print("Advanced Hybrid Retriever (Sim + + BM25 + RERANK) Loaded.")
# ========================================================================


ðŸš€ Advanced Hybrid Retriever (Sim + + BM25 + RERANK) Loaded.


In [55]:
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", 
     """You are a helpful, friendly assistant specialized in medicinal plants.
Use only the provided information and chat history to answer.
Keep answers short and simple.
If the context is incomplete, say so.
Do NOT add phrases like 'as per the transcript', 
'based on the transcript', or 'from the transcript'."""
    ),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "information:\n{context}\n\nQuestion:\n{question}")
])


In [56]:
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)


In [57]:
# Clean function
def clean_output(text: str) -> str:
    
    if not text:
        return ""
    return text.replace("\\n", "\n").replace("\n\n", "\n")
cleaner = RunnableLambda(lambda x: clean_output(x))

In [58]:
parser = StrOutputParser()

In [59]:
chain = chat_prompt | model | parser | cleaner

In [60]:
store = {}  # store multiple sessions
console = Console()

def get_history(session_id: str):
    if session_id not in store:
        store[session_id] = InMemoryChatMessageHistory()
    return store[session_id]

with_history = RunnableWithMessageHistory(
    chain,
    get_history,
    input_messages_key="question",
    history_messages_key="chat_history"
)

# ================== Interactive Loop ==================
print("ðŸ’¬ Medical Chatbot Ready! Type 'exit' to quit.")
session_id = "user1"   # you can change per user

while True:
    user_query = input("\nðŸ§‘ You: ")
    if user_query.lower() in ["exit", "quit"]:
        console.print("\nðŸ¤– Bot: ðŸ‘‹ Chat ended.", style="bold green")
        break

    # Retrieve context
    
    retrieve_text = better_retrieve(user_query)
    knowledge_base = " ".join(d.page_content for d in retrieve_text)

    # Run with memory
    response = with_history.invoke(
        {"context": knowledge_base, "question": user_query},
        config={"configurable": {"session_id": session_id}}
    )

    # Show conversation in chat-like format
    console.print(f"\nðŸ§‘ You: {user_query}", style="bold cyan")
    console.print("\nðŸ¤– Bot:", style="bold green")
    console.print(Markdown(str(response)))

ðŸ’¬ Medical Chatbot Ready! Type 'exit' to quit.
