In [1]:
# going at root folder
import os
os.chdir('../')
# %pwd

## Langsmith setup

In [2]:
## Langsmith setup
from dotenv import load_dotenv

# Load variables from .env into environment
load_dotenv()

# Access them
api_key = os.getenv("LANGCHAIN_API_KEY")
endpoint = os.getenv("LANGCHAIN_ENDPOINT")
tracing = os.getenv("LANGSMITH_TRACING")

print("LangSmith endpoint:", endpoint)
print("Tracing enabled:", tracing)

LangSmith endpoint: https://api.smith.langchain.com
Tracing enabled: true


### Data Loading 

In [3]:
import mlflow
import time
import json
import dagshub
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size = 500
chunk_overlap = 100

#extract text from pdf files   #load all the pdf files from data folder
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    return loader.load()

In [4]:
#mlflow.set_tracking_uri("http://127.0.0.1:5000")
dagshub.init(repo_owner='264Gaurav', repo_name='medical-chatbot', mlflow=True)

# Set the experiment name
mlflow.set_experiment("RAG_ragas")

# Start an MLflow run to log this execution
with mlflow.start_run() as run:
    run_id = run.info.run_id
    print(f"MLflow Run ID: {run_id}")

    data_dir = 'data' # This is the path to your data directory
    mlflow.log_param("data_directory", data_dir)
    mlflow.log_param("pdf_loader_class", "PyPDFLoader")

    start_time = time.time()
    extracted_data = load_pdf_files(data_dir) # extracted_data will be a list of Document objects
    end_time = time.time()
    loading_duration = end_time - start_time

    mlflow.log_metric("num_documents_loaded", len(extracted_data))
    mlflow.log_metric("pdf_loading_duration_seconds", loading_duration)

    # Log the number of documents loaded - CORRECTED
    # 'extracted_data' is the list of loaded documents, so its length gives the number of documents
    num_documents = len(extracted_data)
    mlflow.log_param("num_documents", num_documents)



    print(f"MLflow Run ID: {run_id}")
    print(f"Loaded {num_documents} documents in {loading_duration:.2f} seconds.")
    print(f"MLflow run finished. View at {mlflow.get_tracking_uri()}")




MLflow Run ID: fc3ecbe0b980420c80d8c38f717e8717
MLflow Run ID: fc3ecbe0b980420c80d8c38f717e8717
Loaded 637 documents in 8.45 seconds.
MLflow run finished. View at https://dagshub.com/264Gaurav/medical-chatbot.mlflow
🏃 View run flawless-crow-691 at: https://dagshub.com/264Gaurav/medical-chatbot.mlflow/#/experiments/1/runs/fc3ecbe0b980420c80d8c38f717e8717
🧪 View experiment at: https://dagshub.com/264Gaurav/medical-chatbot.mlflow/#/experiments/1


### Filtering of Loaded DATA (i.e., extracted_data)

In [5]:
from typing import List
from langchain.schema import Document

##DATA cleaning and filtering
def filter_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of Document objects, filter out those with new list of Document objects
    containing only 'source' and 'page' in metadata and the original page_content.

    MLflow Tracking:
    - Logs 'input_documents_count' as a metric.
    - Logs 'output_documents_count' as a metric.
    - Logs 'documents_filtered_count' as a metric.
    - Logs 'filter_function_name' as a parameter.
    """



    # Log initial state
    input_documents_count = len(docs)
    mlflow.log_metric("input_documents_count_for_filtering", input_documents_count)
    mlflow.log_param("filter_function_name", "filter_docs")

    minimal_docs: List[Document] = []

    for doc in docs:
        src = doc.metadata.get('source')
        page = doc.metadata.get('page')
        # Here, you might add more complex filtering logic if needed
        # For this specific function, it always adds the document, just with minimal metadata
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src , "page":page}
            )
        )

    output_documents_count = len(minimal_docs)
    documents_filtered_count = input_documents_count - output_documents_count # Will be 0 if all are kept

    # Log metrics after processing
    mlflow.log_metric("output_documents_count_after_filtering", output_documents_count)
    mlflow.log_metric("documents_filtered_count", documents_filtered_count)

    # Optional: Log a sample of filtered documents as an artifact
    if minimal_docs:
        sample_docs_path = "filtered_docs_sample.json"
        # Log metadata from first few documents as a sample
        sample_data = []
        for i, doc in enumerate(minimal_docs[:5]): # Log first 5 documents
            sample_data.append({
                "page_content_preview": doc.page_content[:200] + "...",
                "metadata": doc.metadata
            })
        with open(sample_docs_path, "w") as f:
            json.dump(sample_data, f, indent=4)
        mlflow.log_artifact(sample_docs_path, artifact_path="data_filtering_artifacts")
        os.remove(sample_docs_path) # Clean up local file

    return minimal_docs


minimal_docs = filter_docs(extracted_data) ## function calling here for filtering of extracted_docs

### Split the Filtered Docs (minimal_docs) into smaller chunks

In [6]:
## split the documents into smaller chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(minimal_docs)


chunks = text_split(minimal_docs)
print(f"Number of text chunks: {len(chunks)}")


# Log the number of chunks created
num_chunks = len(chunks)
mlflow.log_metric("num_chunks_created", num_chunks) # Changed to metric as it's an output count

# Log text splitter parameters
mlflow.log_param("chunk_size", chunk_size)
mlflow.log_param("chunk_overlap", chunk_overlap)
mlflow.log_param("text_splitter_class", "RecursiveCharacterTextSplitter")

# Optional: Log a sample of the first few chunks as an artifact
if chunks:
    chunks_sample_path = "chunks_sample.json"
    with open(chunks_sample_path, "w") as f:
        json.dump([chunk.dict() for chunk in chunks[:5]], f, indent=4) # Log first 5 chunks
    mlflow.log_artifact(chunks_sample_path, artifact_path="data_processing_artifacts")
    os.remove(chunks_sample_path) # Clean up local file


Number of text chunks: 6600


/var/folders/38/7jcjyd5s1cd09qhrzns_rny80000gn/T/ipykernel_28663/4157002724.py:27: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  json.dump([chunk.dict() for chunk in chunks[:5]], f, indent=4) # Log first 5 chunks


### Embedding model setup

In [7]:
import time
import mlflow

# Use the LangChain Ollama embedding class
# If your LangChain installation exposes it under a different module, see the note below.
from langchain_ollama.embeddings import OllamaEmbeddings


def download_embeddings(
    model_name: str = "nomic-embed-text:latest",
    base_url: str = "http://localhost:11434",
):
    """
    Connects to the locally-running Ollama instance and returns an OllamaEmbeddings object
    using the specified embedding model (nomic-embed-text:latest by default).
    """

    # Log embedding model parameters
    mlflow.log_param("embedding_model_name", model_name)
    mlflow.log_param("embedding_provider", "Ollama")
    mlflow.log_param("embedding_class", "OllamaEmbeddings")
    mlflow.log_param("ollama_base_url", base_url)

    start_time = time.time()
    # Create the LangChain wrapper for Ollama embeddings
    embeddings = OllamaEmbeddings(model=model_name, base_url=base_url)
    end_time = time.time()
    loading_duration = end_time - start_time

    # Log metrics
    mlflow.log_metric("embedding_loading_duration_seconds", loading_duration)

    print(f"Connected to Ollama embedding model '{model_name}' at {base_url} (init {loading_duration:.2f}s).")

    return embeddings



embedding = download_embeddings()
## example: embedding.embed_documents(["hello world", "another doc"])


Connected to Ollama embedding model 'nomic-embed-text:latest' at http://localhost:11434 (init 0.03s).


In [8]:
vector = embedding.embed_query("Hello to medical chatbot.")

vector_dim = len(vector);
print(f"Vector length: {vector_dim}")  # Check the length of the vector  ## Dimentions of the vector

mlflow.log_param("vector_embedding_size", vector_dim)


Vector length: 768


768

### Vector DB setup

In [9]:
## Pinecone is the leading vector database for building accurate and performant AI applications
from pinecone import Pinecone


PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
pinecone_api = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api)
pc

<pinecone.pinecone.Pinecone at 0x171d772f0>

In [10]:
from pinecone import Pinecone, ServerlessSpec


# Log stage name
mlflow.log_param("stage", "pinecone_index_setup")

index_name = 'medical-chatbot'
dimension = vector_dim  # Dimension of the embeddings (should match your embedding model output)
metric = 'cosine' # Similarity metric

# Log Pinecone index parameters
mlflow.log_param("pinecone_index_name", index_name)
mlflow.log_param("pinecone_index_dimension", dimension)
mlflow.log_param("pinecone_index_metric", metric)

# Define the serverless spec parameters
cloud_provider = "aws"
region = "us-east-1"
mlflow.log_param("pinecone_cloud_provider", cloud_provider)
mlflow.log_param("pinecone_region", region)


# Check if index exists and create if not
start_check_time = time.time()
index_exists = pc.has_index(index_name)
end_check_time = time.time()
check_duration = end_check_time - start_check_time

mlflow.log_metric("pinecone_index_exists_check_duration_seconds", check_duration)
mlflow.log_param("pinecone_index_existed_before_run", index_exists)

if not index_exists:
    mlflow.log_param("pinecone_index_action", "created_new_index")
    print(f"Pinecone index '{index_name}' does not exist. Creating...")
    start_create_time = time.time()
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(cloud=cloud_provider, region=region)
    )
    end_create_time = time.time()
    creation_duration = end_create_time - start_create_time
    mlflow.log_metric("pinecone_index_creation_duration_seconds", creation_duration)
    print(f"Pinecone index '{index_name}' created in {creation_duration:.2f} seconds.")
else:
    mlflow.log_param("pinecone_index_action", "connected_to_existing_index")
    print(f"Pinecone index '{index_name}' already exists. Connecting...")
    mlflow.log_metric("pinecone_index_creation_duration_seconds", 0) # Log 0 if not created

# Connect to the index
start_connect_time = time.time()
index = pc.Index(index_name)
end_connect_time = time.time()
connect_duration = end_connect_time - start_connect_time

mlflow.log_metric("pinecone_index_connection_duration_seconds", connect_duration)
print(f"Connected to Pinecone index '{index_name}' in {connect_duration:.4f} seconds.")

# Optional: Log some basic info about the connected index (e.g., number of vectors)
# This might require querying the index, which adds time, so consider if needed for every run.
try:
    index_info = index.describe_index_stats()
    mlflow.log_metric("pinecone_total_vector_count", index_info.dimension) # This gives dimension, not vector count directly
    # To get actual vector count:
    if index_info.namespaces:
        total_vectors_in_index = sum(ns.vector_count for ns_name, ns in index_info.namespaces.items())
        mlflow.log_metric("pinecone_total_vectors_in_index", total_vectors_in_index)
        mlflow.log_param("pinecone_namespaces", list(index_info.namespaces.keys()))
except Exception as e:
    print(f"Could not get Pinecone index stats: {e}")
    mlflow.log_param("pinecone_index_stats_error", str(e))


Pinecone index 'medical-chatbot' already exists. Connecting...


  from .autonotebook import tqdm as notebook_tqdm


Connected to Pinecone index 'medical-chatbot' in 0.8157 seconds.


### Injection of filtered and chunked data (i.e., chunks) into Pinecone vector DB

In [11]:
from langchain_pinecone import PineconeVectorStore

def upsert_to_pinecone(chunks, embedding, index_name, batch_size: int = 50):
    """
    Upserts documents to a Pinecone index in safe batches (<4MB per request).
    Logs metadata and timing with MLflow.
    """
    # Log input parameters
    mlflow.log_param("vector_store_type", "PineconeVectorStore")
    mlflow.log_param("index_name_for_upsertion", index_name)
    mlflow.log_param("num_text_chunks_for_upsertion", len(chunks))
    mlflow.log_param("batch_size for pinecone data injection", batch_size)
    if hasattr(embedding, "model"):
        mlflow.log_param("embedding_model_used_for_upsertion", getattr(embedding, "model", "unknown"))

    print(f"Starting batched upsertion to Pinecone index '{index_name}' with {len(chunks)} chunks...")

    start_time = time.time()

    # Upsert in batches to avoid Pinecone 4MB limit
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i : i + batch_size]
        PineconeVectorStore.from_documents(
            documents=batch,
            embedding=embedding,
            index_name=index_name
        )
        print(f"  Upserted batch {i // batch_size + 1} ({len(batch)} chunks).")

    end_time = time.time()
    upsertion_duration = end_time - start_time

    # Log metrics
    mlflow.log_metric("pinecone_upsertion_duration_seconds", upsertion_duration)

    print(f"✅ Upsertion to Pinecone completed in {upsertion_duration:.2f} seconds.")



# upsert_to_pinecone(chunks=chunks,  embedding=embedding, index_name=index_name, batch_size= 500)  ## uncomment only when you want to insert the data in pinecode DB (i.e., if data in DB not injected before)



### Retriever

In [None]:
# Load Existing index

from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

# Define parameters for retrieval
retrieval_search_type = "similarity"
retrieval_k = 8
test_query = "What is Cancer?"

# 1. Log Retrieval Parameters
mlflow.log_param("retriever_search_type", retrieval_search_type)
mlflow.log_param("retriever_k_value", retrieval_k)
mlflow.log_param("retrieval_query", test_query) # Log the specific query used

# Configure the retriever
retriever = docsearch.as_retriever(search_type=retrieval_search_type, search_kwargs={"k": retrieval_k})

# 2. Measure and Log Retrieval Time on Test Query
start_time = time.time()
retrieved_docs = retriever.invoke(test_query) ## Retriever invoked
end_time = time.time()
retrieval_duration = end_time - start_time
mlflow.log_metric("retrieval_duration_seconds", retrieval_duration)

# 3. Log Retrieved Document Count
num_retrieved = len(retrieved_docs)
mlflow.log_metric("num_retrieved_documents", num_retrieved)


# 4. Log a Sample of Retrieved Documents as an Artifact
if retrieved_docs:
    sample_docs = []
    # Log details of the first 3 documents as a sample
    for i, doc in enumerate(retrieved_docs[:3]):
        sample_docs.append({
            "index": i + 1,
            "source": doc.metadata.get('source', 'N/A'),
            "page": doc.metadata.get('page', 'N/A'),
            "content_preview": doc.page_content[:200] + "..." # Log first 200 chars
        })

    temp_file_path = "retrieved_docs_sample.json"
    with open(temp_file_path, "w") as f:
        json.dump(sample_docs, f, indent=4)

    mlflow.log_artifact(temp_file_path, artifact_path="retrieval_output_samples")
    os.remove(temp_file_path) # Clean up the local temporary file

print(f"Retrieved {num_retrieved} documents for '{test_query}' in {retrieval_duration:.4f} seconds.")


Retrieved 8 documents for 'What is Cancer?' in 4.0153 seconds.


Failed to send compressed multipart ingest: Connection error caused failure to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. Please confirm your internet connection. ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'api.smith.langchain.com\', port=443): Max retries exceeded with url: /runs/multipart (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x1508c20f0>: Failed to resolve \'api.smith.langchain.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))
Content-Length: 52230
API Key: lsv2_********************************************b5trace=cd8ff1d9-86f3-49b8-be8f-5ba4d2643923,id=08352d89-8c95-4151-8c2f-1d35d307271d; trace=cd8ff1d9-86f3-49b8-be8f-5ba4d2643923,id=9e8c1fa2-1ad7-4050-a464-6b2556490e81; trace=cd8ff1d9-86f3-49b8-be8f-5ba4d2643923,id=1480ba43-803b-4dbd-88a4-a76b6cade4b7; trace=cd8ff1d9-86f3-49b8-be8f-5ba4d2643923,id=ac72c974-56d4-4c7f-8fc8-3620d8535aaa; trace=cd8ff1d9-86f3-49b8-be8f-5ba4d2643923,id=8e

## LLM Model setup 

In [13]:
from langchain_community.llms import Ollama  # wrapper for local Ollama models


# Define LLM parameters
llm_model_name = "llama3.1:latest"
llm_temperature = 0
llm_timeout = None
llm_max_retries = 2

# Log LLM parameters
mlflow.log_param("llm_provider", "Ollama")
mlflow.log_param("llm_class", "Ollama")
mlflow.log_param("llm_model_name", llm_model_name)
mlflow.log_param("llm_temperature", llm_temperature)
mlflow.log_param("llm_timeout", llm_timeout if llm_timeout is not None else "None")
mlflow.log_param("llm_max_retries", llm_max_retries)


print(f"Initializing Ollama model: {llm_model_name}...")
start_time = time.time()
chatModel = Ollama(
    model=llm_model_name,
    temperature=llm_temperature,
    # Ollama wrapper doesn’t use `max_tokens` or `timeout` exactly the same way as Google GenAI,
    # but you can still pass client kwargs if needed.
)
end_time = time.time()
initialization_duration = end_time - start_time

# Log metrics
mlflow.log_metric("llm_initialization_duration_seconds", initialization_duration)

print(f"Ollama model '{llm_model_name}' initialized in {initialization_duration:.4f} seconds.")


Initializing Ollama model: llama3.1:latest...


  chatModel = Ollama(


Ollama model 'llama3.1:latest' initialized in 0.0015 seconds.


In [14]:
ans = chatModel.invoke("who is MS Dhoni")
ans

"MS Dhoni, also known as Mahendra Singh Dhoni, is a former Indian international cricketer who played for the Indian national team from 2004 to 2019. He was one of the most successful and popular cricketers in the world during his playing career.\n\nHere are some key facts about MS Dhoni:\n\n**Early Life**\n\nDhoni was born on July 7, 1981, in Ranchi, Jharkhand (then part of Bihar), India. His father, Pan Singh Dhoni, was a railway employee, and his mother, Devki Devi, was a homemaker.\n\n**Cricket Career**\n\nDhoni started playing cricket at the age of eight and quickly rose through the ranks to become one of the most successful wicket-keepers in Indian cricket history. He made his international debut in 2004 against Bangladesh and went on to play for India in all formats of the game (Test, ODI, T20).\n\n**Achievements**\n\nDhoni's achievements are numerous:\n\n1. **Captaincy**: Dhoni was a successful captain of the Indian team, leading them to several victories, including the 2011 Cri

In [15]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

## for ready-made RAG prompts
from langchain import hub
# Pull a pre-made RAG prompt from LangChain Hub
prompt = hub.pull("rlm/rag-prompt")



## System Prompt defining
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Keep the answer concise and understandable."
    "\n\n"
    "{context}"
)

# Create the ChatPromptTemplate
# prompt = ChatPromptTemplate.from_messages(
#     [
#         ("system", system_prompt),
#         ("human", "{input}"),
#     ]
# )


mlflow.log_param("system prompt for llm :", system_prompt)
print(prompt)
print("\n\n Prompt template defined and logged to MLflow.")

input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


 Prompt template defined and logged to MLflow.


### Defining RAG chain

In [40]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter

# Helper function to format retrieved documents
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Define the full RAG chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | chatModel
    | StrOutputParser()
)

## another RAG chain - ragas compatible
retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": prompt | chatModel, "context": itemgetter("context")}
)

In [41]:
# Ask a question using the RAG chain
response = rag_chain.invoke("What dose of vitamin D should we take? What diseases can occur due to a lack of vitamin D?")
print(response)

I don't have the information on the dose of vitamin D that should be taken.

A lack of vitamin D can lead to diseases such as rickets in children and osteomalacia in adults, which are characterized by softening of bones.


## RAGAS Setup:

### GroundTruth dataset creation

In [18]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

question_schema = ResponseSchema(
    name="question",
    description="a question about the context."
)

question_response_schemas = [
    question_schema,
]


In [19]:
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()

### Question generator llm model setup

In [20]:
from langchain_community.llms import Ollama  # wrapper for local Ollama models


# Define LLM parameters
llm_model_name = "llama3.1:latest"
llm_temperature = 0.5
llm_timeout = None
llm_max_retries = 2

# Log LLM parameters
mlflow.log_param("question generator llm_provider", "Ollama")
mlflow.log_param("question generator llm_model_name", llm_model_name)
mlflow.log_param("question generator llm_temperature", llm_temperature)
mlflow.log_param("question generator llm_timeout", llm_timeout if llm_timeout is not None else "None")
mlflow.log_param("question generator llm_max_retries", llm_max_retries)


print(f"Initializing Ollama model: {llm_model_name}...")
start_time = time.time()
question_generation_llm = Ollama(
    model=llm_model_name,
    temperature=llm_temperature,
    # Ollama wrapper doesn’t use `max_tokens` or `timeout` exactly the same way as Google GenAI,
    # but you can still pass client kwargs if needed.
)
end_time = time.time()
initialization_duration = end_time - start_time

# Log metrics
mlflow.log_metric("llm_initialization_duration_seconds", initialization_duration)

print(f"Ollama model '{llm_model_name}' initialized in {initialization_duration:.4f} seconds.")


bare_prompt_template = "{content}"
bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)

Initializing Ollama model: llama3.1:latest...
Ollama model 'llama3.1:latest' initialized in 0.0006 seconds.


In [21]:
from langchain.prompts import ChatPromptTemplate

qa_template = """\
You are a Professor of Medical University creating a test for advanced students.
For each context, create a question that is specific to the context. Avoid generic questions.

question: a question about the context.

Format the output as JSON with the following keys:
question

context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=chunks[0],
    format_instructions=format_instructions
)

question_generation_chain = bare_template | question_generation_llm

response = question_generation_chain.invoke({"content" : messages})
response


output_dict = question_output_parser.parse(response)

In [22]:
for k, v in output_dict.items():
  print(k)
  print(v)

question
What is the primary function of the hypothalamus in regulating body temperature?
context
{'page_content': 'The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION', 'metadata': {'source': 'data/Medical_book.pdf', 'page': 1}}


### create question set from initial few chunks

In [23]:
from tqdm import tqdm ## tqdm to show the progress bar of iterative or loops operation

qac_triples = []

for text in tqdm(chunks[:30]):
  messages = prompt_template.format_messages(
      context=text,
      format_instructions=format_instructions
  )
  response = question_generation_chain.invoke({"content" : messages})
  try:
    output_dict = question_output_parser.parse(response)
  except Exception as e:
    continue
  output_dict["context"] = text
  qac_triples.append(output_dict)

100%|██████████| 30/30 [03:19<00:00,  6.64s/it]


In [24]:
qac_triples[20]

{'question': 'What is the primary goal of holistic medicine, as described in the context?',
 'context': Document(metadata={'source': 'data/Medical_book.pdf', 'page': 6}, page_content='Diagnosis Description\nTreatment Preparation\nAlternative treatment Aftercare\nPrognosis Risks\nPrevention Normal/Abnormal results\nResources Resources\nKey terms Key terms\nIn recent years there has been a resurgence of interest\nin holistic medicine that emphasizes the connection\nbetween mind and body. Aimed at achieving and main-\ntaining good health rather than just eliminating disease,\nthis approach has come to be known as alternative medi-\ncine. The Gale Encyclopedia of Medicine 2 includes a')}

### Answer generator llm model setup

In [25]:
# Define LLM parameters
llm_model_name = "llama3.1:latest"
llm_temperature = 0.5
llm_timeout = None
llm_max_retries = 2

# Log LLM parameters
mlflow.log_param("answer generator llm_provider", "Ollama")
mlflow.log_param("answer generator llm_model_name", llm_model_name)
mlflow.log_param("answer generator llm_temperature", llm_temperature)
mlflow.log_param("answer generator llm_timeout", llm_timeout if llm_timeout is not None else "None")
mlflow.log_param("answer generator llm_max_retries", llm_max_retries)


print(f"Initializing Ollama model: {llm_model_name}...")
start_time = time.time()
answer_generation_llm = Ollama(
    model=llm_model_name,
    temperature=llm_temperature,
)
end_time = time.time()
initialization_duration = end_time - start_time

# Log metrics
mlflow.log_metric("llm_initialization_duration_seconds", initialization_duration)

print(f"Ollama model '{llm_model_name}' initialized in {initialization_duration:.4f} seconds.")


Initializing Ollama model: llama3.1:latest...
Ollama model 'llama3.1:latest' initialized in 0.0006 seconds.


In [26]:
answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)

answer_response_schemas = [
    answer_schema,
]


answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
format_instructions = answer_output_parser.get_format_instructions()



qa_template = """\
You are a Professor of Medical university creating a test for advanced students. For each question and context, create an answer.

answer: a answer about the context.

Format the output as JSON with the following keys:
answer

question: {question}
context: {context}
"""


prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=qac_triples[20]["context"],
    question=qac_triples[20]["question"],
    format_instructions=format_instructions
)


answer_generation_chain = bare_template | answer_generation_llm


In [27]:
response = answer_generation_chain.invoke({"content" : messages})
output_dict = answer_output_parser.parse(response)

In [28]:
for k, v in output_dict.items():
  print(k)
  print(v)

answer
The primary goal of holistic medicine is to achieve and maintain good health, rather than just eliminating disease.
question
What is the primary goal of holistic medicine, as described in the context?


### Generating answer for all the question generated earlier (in earlier section)

In [29]:
for triple in tqdm(qac_triples):
  messages = prompt_template.format_messages(
      context=triple["context"],
      question=triple["question"],
      format_instructions=format_instructions
  )
  response = answer_generation_chain.invoke({"content" : messages})
  try:
    output_dict = answer_output_parser.parse(response)
  except Exception as e:
    continue
  triple["answer"] = output_dict["answer"]

100%|██████████| 30/30 [02:41<00:00,  5.37s/it]


### Now, process the qus-ans-context -> filtering and manipulating

In [30]:
qac_triples

[{'question': 'What is the primary function of the GALE ENCYCLOPEDIA OF MEDICINE, SECOND EDITION?',
  'context': Document(metadata={'source': 'data/Medical_book.pdf', 'page': 1}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
  'answer': 'The GALE ENCYCLOPEDIA OF MEDICINE, SECOND EDITION is a comprehensive reference work that provides in-depth information on various medical topics, including diseases, treatments, and health-related issues. Its primary function is to serve as a reliable source of medical knowledge for healthcare professionals, students, and the general public.'},
 {'question': 'What is the role of an Editor in a medical encyclopedia, as evident from the provided context?',
  'context': Document(metadata={'source': 'data/Medical_book.pdf', 'page': 2}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
  'additional_kwargs': {},
  'response_metada

In [31]:
import pandas as pd
from datasets import Dataset

ground_truth_qac_set = pd.DataFrame(qac_triples)
ground_truth_qac_set["context"] = ground_truth_qac_set["context"].map(lambda x: str(x))
ground_truth_qac_set = ground_truth_qac_set.rename(columns={"answer" : "ground_truth"})
ground_truth_qac_set = ground_truth_qac_set.drop(columns=["additional_kwargs", "response_metadata"]) ##droping irrelavant column



eval_dataset = Dataset.from_pandas(ground_truth_qac_set)
ground_truth_qac_set


Unnamed: 0,question,context,ground_truth
0,What is the primary function of the GALE ENCYC...,page_content='The GALE\nENCYCLOPEDIA\nof MEDIC...,"The GALE ENCYCLOPEDIA OF MEDICINE, SECOND EDIT..."
1,What is the role of an Editor in a medical enc...,page_content='The GALE\nENCYCLOPEDIA\nof MEDIC...,The Editor is responsible for overseeing the c...
2,What is the role of a Project Manager in the c...,"page_content='STAFF\nJacqueline L. Longe, Proj...",The Project Manager is responsible for oversee...
3,Who is the Senior Editor of Imaging and Multim...,page_content='Multimedia Content\nDean Dauphin...,Dean Dauphinais
4,What is the role of a Permissions Manager in a...,"page_content='Maria Franklin, Permissions Mana...",The role of a Permissions Manager in a publish...
5,What is the purpose of including an extension ...,page_content='of MEDICINE\nSECOND EDITION\nSin...,The purpose of including an extension of the c...
6,"What type of paper does the book use, as menti...","page_content='ty for errors, omissions or disc...",recycled
7,What is the specific standard for paper perman...,page_content='This book is printed on recycled...,ANSI Z39.48-1984
8,What is the purpose of adding value to the und...,"page_content='petition, and other applicable l...",The purpose of adding value to the underlying ...
9,What is the name of the publisher mentioned in...,page_content='Copyright © 2002\nGale Group\n27...,Gale Group


In [32]:
eval_dataset

Dataset({
    features: ['question', 'context', 'ground_truth'],
    num_rows: 30
})

In [33]:
eval_dataset[0]

{'question': 'What is the primary function of the GALE ENCYCLOPEDIA OF MEDICINE, SECOND EDITION?',
 'context': "page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION' metadata={'source': 'data/Medical_book.pdf', 'page': 1}",
 'ground_truth': 'The GALE ENCYCLOPEDIA OF MEDICINE, SECOND EDITION is a comprehensive reference work that provides in-depth information on various medical topics, including diseases, treatments, and health-related issues. Its primary function is to serve as a reliable source of medical knowledge for healthcare professionals, students, and the general public.'}

### save the qus-ans-context dataset in a file

In [34]:
eval_dataset.to_csv("groundtruth_eval_dataset.csv")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 97.67ba/s]


25092

## Let's Evaluate our RAG Pipelines:

### Now we can evaluate using RAGAS!

###### The set-up is fairly straightforward - we simply need to create a dataset with our generated answers and our contexts, and then evaluate using the framework.

In [80]:
def create_ragas_dataset(rag_pipeline, eval_dataset):
  rag_dataset = []
  for row in tqdm(eval_dataset):
    answer = rag_pipeline.invoke({"question" : row["question"]})
    # print('answer_res', answer["response"])
    rag_dataset.append(
        {"question" : row["question"],
         "answer" : answer["response"],
         "contexts" : [context.page_content for context in answer["context"]],
         "reference" : row["ground_truth"]
         }
    )
  rag_df = pd.DataFrame(rag_dataset)
  rag_eval_dataset = Dataset.from_pandas(rag_df)
  return rag_eval_dataset

In [81]:
basic_qa_ragas_dataset = create_ragas_dataset(retrieval_augmented_qa_chain, eval_dataset)

100%|██████████| 30/30 [04:33<00:00,  9.13s/it]


### Save it for later : (qus,context,groundTruth and answer)

In [83]:
basic_qa_ragas_dataset.to_csv("basic_qa_ragas_dataset.csv")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 225.73ba/s]


128666

In [84]:
basic_qa_ragas_dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'reference'],
    num_rows: 30
})

In [85]:
basic_qa_ragas_dataset[0]

{'question': 'What is the primary function of the GALE ENCYCLOPEDIA OF MEDICINE, SECOND EDITION?',
 'answer': 'The primary function of the GALE ENCYCLOPEDIA OF MEDICINE, SECOND EDITION is to inform and educate readers about a wide variety of medical disorders, conditions, treatments, and diagnostic tests. It provides comprehensive information on nearly 1,700 common medical topics in an easy-to-understand format. The encyclopedia is designed to supplement consultation with a physician or other healthcare practitioner.',
 'contexts': ['The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION',
  'The Gale Encyclopedia of Medicine 2is a medical ref-\nerence product designed to inform and educate readers\nabout a wide variety of disorders, conditions, treatments,\nand diagnostic tests. The Gale Group believes the product\nto be comprehensive, but not necessarily definitive. It is\nintended to supplement, not replace, consultation with a\nphysician or other healthcare practitioner. While the Gal

In [90]:
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_correctness,
    answer_similarity,
)
from langchain_community.llms import Ollama
from langchain_ollama.embeddings import OllamaEmbeddings
import mlflow
import time


# ---------------------------
# ✅ Evaluation LLM (Ollama)
# ---------------------------
eval_llm_model_name = "llama3.1:latest"
eval_llm_temperature = 0.0   # keep deterministic for evaluation
eval_llm_max_retries = 2

mlflow.log_param("evaluation llm_provider", "Ollama")
mlflow.log_param("evaluation llm_model_name", eval_llm_model_name)
mlflow.log_param("evaluation llm_temperature", eval_llm_temperature)
mlflow.log_param("evaluation llm_max_retries", eval_llm_max_retries)

print(f"Initializing Ollama model for evaluation: {eval_llm_model_name}...")
start_time = time.time()
evaluation_llm = Ollama(
    model=eval_llm_model_name,
    temperature=eval_llm_temperature,
)
end_time = time.time()
mlflow.log_metric("evaluation_llm_initialization_duration_seconds", end_time - start_time)
print(f"Ollama eval model '{eval_llm_model_name}' initialized in {end_time-start_time:.4f} seconds.")


# ---------------------------
# ✅ Evaluation Function 1
# ---------------------------
def evaluate_ragas_dataset1(ragas_dataset):
    result = evaluate(
        ragas_dataset,
        metrics=[
            context_precision,
            context_recall
        ],
        llm=evaluation_llm,                 # 👈 Ollama LLM
        embeddings=embedding,   # 👈 Ollama embeddings (using the same , used earlier in the pipeline)
    )
    return result


# ---------------------------
# ✅ Evaluation Function 2
# ---------------------------
def evaluate_ragas_dataset2(ragas_dataset):
    result = evaluate(
        ragas_dataset,
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_correctness,
            answer_similarity,
        ],
        llm=evaluation_llm,                 # 👈 Ollama LLM
        embeddings=embedding,   # 👈 Ollama embeddings (using the same , used earlier in the pipeline)
    )
    return result


Initializing Ollama model for evaluation: llama3.1:latest...
Ollama eval model 'llama3.1:latest' initialized in 0.0009 seconds.


### Run evaluation

#### For Retrieval Evaluation (evaluation part-1)

In [91]:
basic_qa_result = evaluate_ragas_dataset1(basic_qa_ragas_dataset)

Evaluating:   8%|▊         | 5/60 [02:17<23:55, 26.10s/it]Exception raised in Job[0]: TimeoutError()
Evaluating:  10%|█         | 6/60 [03:00<28:22, 31.54s/it]Exception raised in Job[2]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[6]: TimeoutError()
Exception raised in Job[7]: TimeoutError()
Exception raised in Job[8]: TimeoutError()
Exception raised in Job[9]: TimeoutError()
Exception raised in Job[10]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[14]: TimeoutError()
Exception raised in Job[15]: TimeoutError()
Exception raised in Job[16]: TimeoutError()
Evaluating:  30%|███       | 18/60 [03:18<04:14,  6.05s/it]Exception raised in Job[18]: TimeoutError()
Evaluating:  35%|███▌      | 21/60 [05:11<10:40, 16.42s/it]Exception raised in Job[20]: TimeoutError()
Evaluating:  42%|████▏     | 25/60 [05:51<07:38, 13.09s/it]Exception raised in Job[22]: TimeoutError()
Evaluating:  43%|████▎     | 26/60 [06:00<06:41, 11.80

In [92]:
basic_qa_result

{'context_precision': 0.0000, 'context_recall': 0.7675}

#### For Generation Evaluation (evaluation part-2)

In [None]:
basic_qa_result2 = evaluate_ragas_dataset2(basic_qa_ragas_dataset)