*********Leveraging RAGAS and Advanced Retrieval Techniques with LangChain**********

*****Load Docs****

In [1]:
!pip install -U -q langchain openai ragas pymupdf chromadb wandb tiktoken



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
!pip install -q beautifulsoup4
!pip install -q lxml

In [None]:
import os
import openai


openai.api_key = "" 
os.environ["OPENAI_API_KEY"] = openai.api_key


In [2]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Source URLs
urls = [
    "https://www.pinecone.io/learn/retrieval-augmented-generation/",
    "https://docs.llamaindex.ai/en/stable/getting_started/concepts.html",
    "https://www.anyscale.com/blog/retrieval-augmented-generation-with-ray-and-hugging-face",
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"
]

# Load documents
docs = []
for url in urls:
    docs.extend(WebBaseLoader(url).load())




USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
for doc in docs:
    print(doc.metadata)

{'source': 'https://www.pinecone.io/learn/retrieval-augmented-generation/', 'title': 'Retrieval-Augmented Generation (RAG) | Pinecone', 'description': 'Explore the limitations of foundation models and how retrieval-augmented generation (RAG) can address these limitations so chat, search, and agentic workflows can all benefit.', 'language': 'en'}
{'source': 'https://docs.llamaindex.ai/en/stable/getting_started/concepts.html', 'title': 'Redirecting...', 'language': 'en'}
{'source': 'https://www.anyscale.com/blog/retrieval-augmented-generation-with-ray-and-hugging-face', 'title': 'Blog | Anyscale', 'description': 'Powered by Ray, Anyscale empowers AI builders to run and scale all ML and AI workloads on any cloud and on-prem.', 'language': 'en-US'}
{'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-con

In [6]:
!pip install -q pysqlite3-binary


In [9]:
!pip install -q -U "langchain-community>=0.2.10" "langchain-openai>=0.1.7" "chromadb>=0.4.22,<0.5" pysqlite3-binary


In [15]:
!pip uninstall -y chromadb



Found existing installation: chromadb 0.4.24
Uninstalling chromadb-0.4.24:
  Successfully uninstalled chromadb-0.4.24


In [17]:
!pip install -q chromadb==0.4.24


In [18]:
!pip show chromadb | grep Version


Version: 0.4.24


In [3]:
import sys
try:
    import pysqlite3
    sys.modules["sqlite3"] = pysqlite3
except Exception:
    pass

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Assume `base_docs` is a list[Document]
splitter = RecursiveCharacterTextSplitter(chunk_size=500)
split_docs = splitter.split_documents(docs)

vector_store = Chroma.from_documents(
    split_docs,
    OpenAIEmbeddings(),
    persist_directory="./chroma_store"
)


In [4]:
print(len(split_docs))

462


In [5]:
print(max(len(chunk.page_content) for chunk in split_docs))


499


****Basic QA Chain***

In [7]:
!pip install -q -U langchain-openai


In [9]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Initialize the primary LLM for Q&A
qa_model = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)

# Create a RetrievalQA chain using the vector store retriever
qa_pipeline = RetrievalQA.from_chain_type(
    llm=qa_model,
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True
)


In [10]:
# Example user question
user_query = "What is RAG?"

# Run the question through the QA pipeline
answer = qa_pipeline({"query": user_query})

# Display only the answer text
print(answer["result"])


  answer = qa_pipeline({"query": user_query})


Retrieval-Augmented Generation (RAG) is a technique that uses authoritative, external data to enhance the accuracy, relevancy, and usefulness of a model's output. It involves ingesting authoritative data into a data source, retrieving relevant data from external sources, and using this data to improve the generation process.


***RAG***

In [11]:
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

# Define the schema for extracting a question from the context
schema_question = ResponseSchema(
    name="question",
    description="Formulated question based on the given context."
)

# List of response schemas to be used by the output parser
response_schemas = [schema_question]


In [12]:
# Create an output parser from the defined response schemas
question_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Retrieve the formatting instructions for the parser
format_instructions = question_parser.get_format_instructions()


In [14]:
texts = split_docs

In [16]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI  # updated import
# Define a prompt template for generating questions
question_prompt_text = """
You are a specialist in preparing challenging questions for advanced learners.
Given a context, formulate a relevant question.

{format_instructions}

Context:
{context}
"""

prompt_builder = ChatPromptTemplate.from_template(question_prompt_text)

message_batch = prompt_builder.format_messages(
    context=texts[0],
    format_instructions=format_instructions,  # from question_parser.get_format_instructions()
)


# Initialize the LLM
llm_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Generate the response
model_output = llm_model(message_batch)

# Parse the structured response
parsed_question = question_parser.parse(model_output.content)


  model_output = llm_model(message_batch)


In [18]:
for key, value in parsed_question.items():
    print(f"{key}:\n{value}\n")



question:
What are the limitations of foundation models and how can retrieval-augmented generation (RAG) address these limitations to benefit chat, search, and agentic workflows?



In [13]:
!pip install -q -U tqdm


In [19]:
print(len(texts))

462


In [21]:
from tqdm import tqdm

qa_context_pairs = []

for chunk in tqdm(texts, desc="Generating questions"):
    # Prepare the LLM input
    prompt_msgs = prompt_builder.format_messages(
        context=chunk,
        format_instructions=format_instructions
    )
    
    # Get LLM output
    llm_reply = llm_model(prompt_msgs)
    
    try:
        parsed_output = question_parser.parse(llm_reply.content)
    except Exception:
        # Ignore invalid responses
        continue
    
    # Attach the original context
    parsed_output["context"] = chunk
    qa_context_pairs.append(parsed_output)


Generating questions: 100%|██████████| 462/462 [06:20<00:00,  1.21it/s]


In [22]:
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_openai import ChatOpenAI  # updated import for latest LangChain

# Initialize the LLM for answer generation
answer_llm = ChatOpenAI(model="gpt-4", temperature=0)

# Create the schema for extracting the answer field
schema_answer = ResponseSchema(
    name="answer",
    description="The generated answer for the given question."
)

# Bundle schemas in a list (can add more if needed)
response_schemas = [schema_answer]

# Build the parser from the schema(s)
answer_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Get parser-specific formatting instructions
format_instructions = answer_parser.get_format_instructions()


In [23]:
from langchain.prompts import ChatPromptTemplate

# Template for generating answers from questions and context
answer_prompt_text = """
You are a domain expert. Given a question and its related context, provide an accurate answer.

{format_instructions}

Question: {question}
Context: {context}
"""

# Build the prompt template
answer_prompt = ChatPromptTemplate.from_template(template=answer_prompt_text)

# Prepare the prompt for the first QAC triple
answer_messages = answer_prompt.format_messages(
    context=qa_context_pairs[0]["context"],
    question=qa_context_pairs[0]["question"],
    format_instructions=format_instructions
)

# Get the model's response
answer_response = answer_llm(answer_messages)

# Parse the structured output
parsed_answer = answer_parser.parse(answer_response.content)


In [24]:
for k, v in parsed_answer.items():
    print(k)
    print(v)


answer
The context does not provide specific details on the limitations of foundation models or how Retrieval-Augmented Generation (RAG) can address these limitations.


In [25]:
from tqdm import tqdm

# Loop through each question-context pair and generate an answer
for entry in tqdm(qa_context_pairs, desc="Generating answers"):
    prompt_msgs = answer_prompt.format_messages(
        context=entry["context"],
        question=entry["question"],
        format_instructions=format_instructions
    )

    model_reply = answer_llm(prompt_msgs)

    try:
        parsed_result = answer_parser.parse(model_reply.content)
    except Exception:
        continue  # Skip if parsing fails

    # Store the generated answer back into the dictionary
    entry["answer"] = parsed_result["answer"]



Generating answers: 100%|██████████| 449/449 [23:11<00:00,  3.10s/it]


In [31]:
!pip install -q -U datasets pandas


In [27]:
import pandas as pd
from datasets import Dataset

# Create a DataFrame from the question-context-answer records
df_ground_truth = pd.DataFrame(qa_context_pairs)

# Make sure context values are stored as strings
df_ground_truth["context"] = df_ground_truth["context"].astype(str)

# Rename the answer column to match evaluation naming
df_ground_truth = df_ground_truth.rename(columns={"answer": "ground_truth"})

# Convert to a Hugging Face Dataset
eval_dataset = Dataset.from_pandas(df_ground_truth)

# Save to CSV for later evaluation
eval_dataset.to_csv("groundtruth_eval_dataset.csv")


Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 76.95ba/s]


925866

**************RAG Evaluation****************

In [43]:
import pandas as pd
from datasets import Dataset

# Load from CSV into pandas
df_eval = pd.read_csv("groundtruth_eval_dataset.csv")

# Convert back to Hugging Face Dataset
eval_dataset = Dataset.from_pandas(df_eval)

# (Optional) Keep only first 10 rows while preserving Dataset type
eval_dataset = eval_dataset.select(range(10))


In [35]:
!pip install -q -U Pillow


In [38]:
!pip uninstall -y ragas
!pip install git+https://github.com/explodinggradients/ragas.git


Found existing installation: ragas 0.3.0
Uninstalling ragas-0.3.0:
  Successfully uninstalled ragas-0.3.0
Collecting git+https://github.com/explodinggradients/ragas.git
  Cloning https://github.com/explodinggradients/ragas.git to /tmp/pip-req-build-nksvgahl
  Running command git clone --filter=blob:none --quiet https://github.com/explodinggradients/ragas.git /tmp/pip-req-build-nksvgahl
  Resolved https://github.com/explodinggradients/ragas.git to commit 701d66c54031e41a443bf9046c43d296ceefda27
[31mERROR: git+https://github.com/explodinggradients/ragas.git does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [40]:
!pip install -U ragas==0.3.0


Collecting ragas==0.3.0
  Using cached ragas-0.3.0-py3-none-any.whl.metadata (2.6 kB)
Using cached ragas-0.3.0-py3-none-any.whl (190 kB)
Installing collected packages: ragas
Successfully installed ragas-0.3.0


In [56]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
# from ragas.metrics.critique import harmfulness
from ragas import evaluate
import pandas as pd
from datasets import Dataset
from tqdm import tqdm

# def build_ragas_dataset(rag_pipeline, eval_data):
#     """Generate a RAGAS-compatible dataset from the given evaluation records."""
#     generated_entries = []
    
#     for record in tqdm(eval_data, desc="Running RAG pipeline"):
#         # Get the pipeline output for the given question
#         pipeline_result = rag_pipeline({"query": record["question"]})
        
#         # Collect structured data for RAGAS evaluation
#         generated_entries.append({
#             "question": record["question"],
#             "answer": pipeline_result["result"],
#             "contexts": [doc.page_content for doc in pipeline_result["source_documents"]],
#             "ground_truths": [record["ground_truth"]]
#         })
    
#     # Convert to Hugging Face Dataset
#     df_ragas = pd.DataFrame(generated_entries)
#     return Dataset.from_pandas(df_ragas)


def build_ragas_dataset(rag_pipeline, eval_data):
    from collections.abc import Mapping
    import pandas as pd
    from datasets import Dataset
    from tqdm import tqdm

    # Normalize input → list[dict]
    if isinstance(eval_data, pd.DataFrame):
        rows = eval_data.to_dict(orient="records")
    elif isinstance(eval_data, Dataset):
        rows = list(eval_data)
    elif isinstance(eval_data, list) and (len(eval_data) == 0 or isinstance(eval_data[0], Mapping)):
        rows = eval_data
    else:
        raise TypeError("eval_data must be DataFrame, Dataset, or list of dicts.")

    entries = []
    for record in tqdm(rows, desc="Running RAG pipeline"):
        q = record["question"]
        gt = record["ground_truth"]

        out = rag_pipeline({"query": q})
        entries.append({
            "question": q,
            "answer": out.get("result", ""),
            "contexts": [doc.page_content for doc in out.get("source_documents", [])],  # list[str]
            "reference": str(gt),  # <-- plain string, not list
        })

    return Dataset.from_pandas(pd.DataFrame(entries))



def run_ragas_evaluation(ragas_dataset):
    """Evaluate a RAGAS dataset with selected metrics."""
    return evaluate(
        ragas_dataset,
        metrics=[
            context_precision,
            faithfulness,
            answer_relevancy,
            context_recall,
        ],
    )


In [57]:
from tqdm import tqdm
import pandas as pd

# Build the evaluation dataset for RAGAS
basic_qa_ragas_dataset = build_ragas_dataset(qa_pipeline, eval_dataset)


Running RAG pipeline: 100%|██████████| 10/10 [00:16<00:00,  1.65s/it]


In [58]:
basic_qa_ragas_dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'reference'],
    num_rows: 10
})

In [54]:
basic_qa_ragas_dataset.to_pandas().to_csv("basic_qa_ragas_dataset.csv", index=False)


In [59]:
basic_qa_result = run_ragas_evaluation(basic_qa_ragas_dataset)


Evaluating: 100%|██████████| 40/40 [00:33<00:00,  1.21it/s]


In [60]:
basic_qa_result 

{'context_precision': 0.8000, 'faithfulness': 0.7802, 'answer_relevancy': 0.9521, 'context_recall': 0.9083}

In [61]:
# Option A: strip outputs in-place (safe + simple)
!pip install nbconvert
find . -name "*.ipynb" -type f -print0 | xargs -0 -I{} jupyter nbconvert --to notebook --ClearOutputPreprocessor.enabled=True --inplace "{}"

# (Optional) enforce stripping on every commit
!pip install nbstripout
nbstripout --install


SyntaxError: invalid syntax (1003492158.py, line 3)