In [1]:
import os
import getpass
from langchain_community.document_loaders import PyMuPDFLoader
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from ragas.testset import TestsetGenerator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

_set_env("AZURE_OPENAI_API_KEY")

We will use RAGAS to generate a diverse set of questions for evaluating RAG.

In [36]:
loader = PyMuPDFLoader("wildfire.pdf")
data = loader.load()

In [40]:
generator_llm = LangchainLLMWrapper(
    AzureChatOpenAI(
        azure_deployment="gpt-4o",
        api_version="2024-08-01-preview",
        azure_endpoint="https://pdf-converter.openai.azure.com/",
    )
)
generator_embeddings = LangchainEmbeddingsWrapper(
    AzureOpenAIEmbeddings(
        azure_deployment="text-embedding-3-large",
        api_version="2024-08-01-preview",
        azure_endpoint="https://pdf-converter.openai.azure.com/",
    )
)

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(data, testset_size=10)

Applying HeadlineSplitter:   0%|          | 0/3 [00:00<?, ?it/s]          unable to apply transformation: 'headlines' property not found in this node
Generating personas: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s]                                          
Generating Scenarios: 100%|██████████| 3/3 [00:06<00:00,  2.19s/it]
Generating Samples: 100%|██████████| 12/12 [00:04<00:00,  2.52it/s]


In [47]:
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,how many acres burned in 1990s?,[https://crsreports.congress.gov Updated June ...,"In the 1990s, an average of 3.3 million acres ...",single_hop_specifc_query_synthesizer
1,What percentage of wildfires were human-caused...,[Most wildfires are human-caused (89% of the a...,89% of the average number of wildfires from 20...,single_hop_specifc_query_synthesizer
2,What happen with wildfires in 2022?,[Wildfire Statistics https://crsreports.congre...,"In 2022, 52% of the nationwide acreage burned ...",single_hop_specifc_query_synthesizer
3,Given the significant economic impacts on comm...,[Conflagrations Of the 1.6 million wildfires t...,Predicting which wildfires will escalate into ...,single_hop_specifc_query_synthesizer
4,How does the Department of the Interior (DOI) ...,[<1-hop>\n\nWildfire Statistics https://crsrep...,The Department of the Interior (DOI) plays a s...,multi_hop_abstract_query_synthesizer
5,How does the Department of the Interior (DOI) ...,[<1-hop>\n\nWildfire Statistics https://crsrep...,"In 2022, 52% of the federal acreage burned by ...",multi_hop_abstract_query_synthesizer
6,What are the responsibilities of federal agenc...,[<1-hop>\n\nWildfire Statistics https://crsrep...,Federal agencies are responsible for respondin...,multi_hop_abstract_query_synthesizer
7,"How do the statistics of wildfires on state, l...",[<1-hop>\n\nWildfire Statistics https://crsrep...,"In 2022, 52% of the nationwide acreage burned ...",multi_hop_abstract_query_synthesizer
8,How much of Alaska's land was affected by wild...,[<1-hop>\n\nhttps://crsreports.congress.gov Up...,"In 2022, wildfires burned 3.1 million acres in...",multi_hop_specific_query_synthesizer
9,How did the wildfires in Alaska in 2022 compar...,[<1-hop>\n\nhttps://crsreports.congress.gov Up...,"In 2022, wildfires in Alaska burned over 3.1 m...",multi_hop_specific_query_synthesizer


In [52]:
test_df = dataset.to_pandas()
queries = []
ragas_responses = []
for index, row in test_df[test_df['reference'] != 'nan'].iterrows():
    queries.append(row['user_input'])
    ragas_responses.append(row['reference'])

In [5]:
queries = [
    "how many acres burned in 1990s?",
    "What percentage of wildfires were human-caused from 2018 to 2022?",
    "What happen with wildfires in 2022?",
    "Given the significant economic impacts on communities affected by wildfires, what challenges are associated with predicting which wildfires will escalate into conflagrations?",
    "How does the Department of the Interior (DOI) contribute to post-wildfire recovery and what are the economic impacts on communities?",
    "How does the Department of the Interior (DOI) contribute to post-wildfire recovery, considering the acreage burned on DOI lands in 2022?",
    "What are the responsibilities of federal agencies in wildfire management on federal lands, and how do these responsibilities impact the acreage burned in different regions?",
    "How do the statistics of wildfires on state, local, or privately owned lands compare to those on federal lands in 2022, and what challenges are associated with predicting which wildfires will become conflagrations?",
    "How much of Alaska's land was affected by wildfires in 2022 and what percentage of the total U.S. wildfire acreage does this represent?",
    "How did the wildfires in Alaska in 2022 compare to the overall wildfire activity in the United States in terms of acreage burned and the distribution of federal versus nonfederal land?",
    "What was the impact of wildfires in Alaska in 2022, and how did it compare to the overall wildfire activity in the United States?",
    "How has the number of wildfires and the acreage burned changed over the last 30 years, and what role does the NICC play in compiling this data?"
]

To make the experimentation fair, we will not use the response generated by Ragas. Instead, we will feed these questions to ChatGPT (using the file upload functionalities) to generate the actual test dataset.

In [6]:
expected_responses = [
    "According to the PDF, in the 1990s, the average annual acreage burned was 3.3 million acres. This is significantly lower than the average annual acreage burned since 2000, which has more than doubled to 7.0 million acres per year.",
    "From 2018 to 2022, 89%% of wildfires were human-caused​",
    "In 2022, 52%% of the nationwide acreage burned by wildfires was on federal lands (4.0 million acres), which was lower than the 10-year average of 64%. The other 48%% of the acreage burned was on state, local, or privately owned lands, though these fires accounted for 83%% of total fires. In the West, just over 20,000 wildfires burned approximately 5.8 million acres, while in the East, over 48,000 fires burned just over 1.8 million acres. Additionally, over 2,700 structures were burned in wildfires, with the majority of the damage occurring in California.",
    "Predicting which wildfires will escalate into conflagrations is challenging due to multiple factors, including weather conditions, terrain, vegetation type, and fire behavior dynamics; only about 1%% of wildfires become large, destructive events, but identifying them in advance remains difficult.",
    "The Department of the Interior (DOI) contributes to post-wildfire recovery through land restoration, erosion control, and habitat rehabilitation, while the economic impacts on communities include property damage, loss of infrastructure, business disruptions, and long-term environmental degradation.",
    "In 2022, 2.1 million acres burned on DOI lands, and the department contributed to post-wildfire recovery through land restoration, erosion control, habitat rehabilitation, and reforestation efforts to mitigate environmental and economic impacts on affected areas.",
    "Federal agencies manage wildfire response on federal lands, with the Forest Service (FS) overseeing 193 million acres of National Forest System land and the Department of the Interior (DOI) managing over 400 million acres of national parks, wildlife refuges, and other public lands. In 2022, 52%% of the total burned acreage (4.0 million acres) was on federal lands, with FS accounting for 1.9 million acres and DOI for 2.1 million acres. Regional differences show that in the West, 64%% of burned acreage was on federal lands, whereas in the East, 85%% of burned acreage was on nonfederal lands, reflecting federal agencies’ larger role in wildfire response in western states",
    "In 2022, 48%% of burned acreage (3.6 million acres) was on state, local, or privately owned lands, while 52% (4.0 million acres) was on federal lands, despite 83%% of total wildfires occurring on nonfederal lands. Predicting which wildfires will become conflagrations is challenging due to factors like weather conditions, terrain, fuel availability, and fire behavior dynamics, with only 1%% of wildfires escalating into large, destructive events",
    "In 2022, wildfires burned 3.1 million acres in Alaska, accounting for over 40%% of the total 7.6 million acres burned in the U.S",
    "In 2022, Alaska wildfires burned 3.1 million acres, representing over 40%% of the 7.6 million acres burned nationwide. In Alaska, just over half (1.6 million acres) burned on nonfederal lands, while 1.5 million acres burned on DOI lands, reflecting a more balanced distribution between federal and nonfederal land compared to the overall U.S., where 52%% of burned acreage was on federal lands",
    "In 2022, wildfires in Alaska burned 3.1 million acres, accounting for over 40%% of the 7.6 million acres burned nationwide. In contrast to the overall U.S., where 52%% of burned acreage was on federal lands, Alaska saw a more balanced distribution, with 1.6 million acres burning on nonfederal lands and 1.5 million acres on DOI lands, highlighting its significant share of the nation's wildfire impact",
    "Over the last 30 years, the number of wildfires has slightly decreased, but the acreage burned has generally increased, with annual averages rising from 3.3 million acres in the 1990s to 7.0 million acres since 2000. The National Interagency Coordination Center (NICC) compiles and analyzes wildfire data nationwide, providing reports on wildfire frequency, burned acreage, and resource deployment, which help guide wildfire management strategies"
]

In [94]:
from abc import ABC, abstractmethod

class Rag(ABC):
    @abstractmethod
    def load_documents(self, doc_path):
        pass

    @abstractmethod
    def get_most_relevant_docs(self, query):
        """Stop the vehicle"""
        pass

    @abstractmethod
    def generate_answer(self, query, relevant_docs):
        pass

In [95]:
from langchain_chroma.vectorstores import Chroma

simple_store = Chroma(
    collection_name="simple",
    embedding_function=AzureOpenAIEmbeddings(
            azure_deployment="text-embedding-3-large",
            api_version="2024-08-01-preview",
            azure_endpoint="https://pdf-converter.openai.azure.com/",
        ),  
)
structured_store = Chroma(
    collection_name="structured",
     embedding_function=AzureOpenAIEmbeddings(
            azure_deployment="text-embedding-3-large",
            api_version="2024-08-01-preview",
            azure_endpoint="https://pdf-converter.openai.azure.com/",
        ),  
)

In [116]:
from pypdf import PdfReader
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter

class SimpleRag(Rag):
    def __init__(self, chunk_size=200, chunk_overlap=50):
        self.llm = AzureChatOpenAI(
            azure_deployment="gpt-4o",
            api_version="2024-08-01-preview",
            azure_endpoint="https://pdf-converter.openai.azure.com/",
        )
        self.embeddings = AzureOpenAIEmbeddings(
            azure_deployment="text-embedding-3-large",
            api_version="2024-08-01-preview",
            azure_endpoint="https://pdf-converter.openai.azure.com/",
        )

        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size = chunk_size,
            chunk_overlap = chunk_overlap
        )

    def load_documents(self, doc_path):
        """Load documents and compute their embeddings."""
        self.docs = []
        simple_store.reset_collection()
        reader = PdfReader(Path(doc_path))
        pages = [page.extract_text() for page in reader.pages]
        self.docs = self.splitter.create_documents(["\n\n".join(pages)])
        simple_store.add_documents(self.docs)

    def get_most_relevant_docs(self, query):
        """Find the most relevant document for a given query."""
        retriever = simple_store.as_retriever(
            search_kwargs={"k": 2}
        )
        if not self.docs:
            raise ValueError("Documents and their embeddings are not loaded.")

        return [doc.page_content for doc in retriever.invoke(query)]

    def generate_answer(self, query, relevant_docs):
        """Generate an answer for a given query based on the most relevant document."""
        prompt = f"question: {query}\n\nDocuments: {"\n\n".join(relevant_docs)}"
        messages = [
            ("system", "You are a helpful assistant that answers questions based on given documents only."),
            ("human", prompt),
        ]
        ai_msg = self.llm.invoke(messages)
        return ai_msg.content 

In [117]:
from pathlib import Path
from docling.document_converter import DocumentConverter

def analyze_doc(input_doc_path: Path):
    converter = DocumentConverter()
    result = converter.convert(input_doc_path)
    return result.document.export_to_markdown()

In [118]:
import tiktoken

class TokenEstimator():
    tokenizer = tiktoken.encoding_for_model("gpt-4o")
    def estimate_tokens(self, text):
        return len(self.tokenizer.encode(text, allowed_special="all"))

def merge_chunk_serially(chunks, max_size):
    token_estimator = TokenEstimator()
    total_size = 0
    current_chunk = ""
    for chunk in chunks:
        chunk_size = token_estimator.estimate_tokens(chunk)
        if total_size > 0:
            new_size = total_size + chunk_size
            if new_size > max_size:
                yield current_chunk
                current_chunk = ""
                total_size = 0

        total_size += chunk_size
        current_chunk += chunk
    
    if total_size > 0:
        yield current_chunk

In [126]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_experimental.text_splitter import SemanticChunker

class StructuredRag(Rag):
    def __init__(self, chunk_size=200, chunk_overlap=50, is_semantic=False):
        self.llm = AzureChatOpenAI(
            azure_deployment="gpt-4o",
            api_version="2024-08-01-preview",
            azure_endpoint="https://pdf-converter.openai.azure.com/",
        )
        self.embeddings = AzureOpenAIEmbeddings(
            azure_deployment="text-embedding-3-large",
            api_version="2024-08-01-preview",
            azure_endpoint="https://pdf-converter.openai.azure.com/",
        )
        self.doc_embeddings = None
        if is_semantic:
            self.splitter = SemanticChunker(self.embeddings)
        else:
            self.splitter = RecursiveCharacterTextSplitter(
                chunk_size = chunk_size,
                chunk_overlap = chunk_overlap
            ).from_tiktoken_encoder(
                model_name="gpt-4o",
                allowed_special="all"
            )
        self.chunk_size = chunk_size

    def load_documents(self, doc_path):
        """Load documents and compute their embeddings."""
        self.docs = []
        structured_store.reset_collection()
        md_text = analyze_doc(Path(doc_path))
        self.docs = self.splitter.create_documents([md_text])

        estimator = TokenEstimator()
        total_tokens = 0
        for doc in self.docs:
            total_tokens += estimator.estimate_tokens(doc.page_content)
        print(len(self.docs))
        print(f"Average chunk size: {total_tokens / len(self.docs)}")

        structured_store.add_documents(self.docs)

    def get_most_relevant_docs(self, query):
        """Find the most relevant document for a given query."""
        retriever = structured_store.as_retriever(
            search_kwargs={"k": 2}
        )
        if not self.docs:
            raise ValueError("Documents and their embeddings are not loaded.")

        return [doc.page_content for doc in retriever.invoke(query)]

    def generate_answer(self, query, relevant_docs):
        """Generate an answer for a given query based on the most relevant document."""
        prompt = f"question: {query}\n\nDocuments: {"\n\n".join(relevant_docs)}"
        messages = [
            ("system", "You are a helpful assistant that answers questions based on given documents only."),
            ("human", prompt),
        ]
        ai_msg = self.llm.invoke(messages)
        return ai_msg.content 

In [44]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, LLMContextPrecisionWithReference
from ragas import EvaluationDataset

def evaluate_rag(rag: Rag):
    dataset = []
    rag.load_documents("./wildfire.pdf")

    for query,reference in zip(queries,expected_responses):

        relevant_docs = rag.get_most_relevant_docs(query)
        response = rag.generate_answer(query, relevant_docs)
        dataset.append(
            {
                "user_input":query,
                "retrieved_contexts":relevant_docs,
                "response":response,
                "reference":reference,
            }
        )
    
    evaluator_llm = LangchainLLMWrapper(
        AzureChatOpenAI(
            azure_deployment="gpt-4o",
            api_version="2024-08-01-preview",
            azure_endpoint="https://pdf-converter.openai.azure.com/",
        )
    )
    evaluator_embeddings = LangchainEmbeddingsWrapper(
        AzureOpenAIEmbeddings(
            azure_deployment="text-embedding-3-large",
            api_version="2024-08-01-preview",
            azure_endpoint="https://pdf-converter.openai.azure.com/",
        )
    )

    evaluation_dataset = EvaluationDataset.from_list(dataset)
    result = evaluate(dataset=evaluation_dataset,metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), LLMContextPrecisionWithReference()],llm=evaluator_llm, embeddings=evaluator_embeddings)
    return result

In [25]:
evaluate_rag(SimpleRag())

Evaluating: 100%|██████████| 48/48 [01:27<00:00,  1.82s/it]


{'context_recall': 0.4653, 'faithfulness': 0.7573, 'factual_correctness': 0.6058, 'answer_relevancy': 0.8159}

In [26]:
evaluate_rag(StructuredRag())

Evaluating: 100%|██████████| 48/48 [01:25<00:00,  1.77s/it]


{'context_recall': 0.4653, 'faithfulness': 0.7440, 'factual_correctness': 0.6600, 'answer_relevancy': 0.8041}

In [121]:
evaluate_rag(SimpleRag(chunk_size=400, chunk_overlap=100))

Evaluating: 100%|██████████| 60/60 [02:10<00:00,  2.17s/it]


{'context_recall': 0.3819, 'faithfulness': 0.7780, 'factual_correctness': 0.6125, 'answer_relevancy': 0.8520, 'llm_context_precision_with_reference': 0.4167}

In [120]:
evaluate_rag(StructuredRag(chunk_size=400, chunk_overlap=100))

8
6
6


Evaluating: 100%|██████████| 60/60 [02:26<00:00,  2.45s/it]


{'context_recall': 0.8125, 'faithfulness': 0.7607, 'factual_correctness': 0.6350, 'answer_relevancy': 0.8357, 'llm_context_precision_with_reference': 0.8333}

In [125]:
evaluate_rag(StructuredRag(is_semantic=True))

5


Evaluating: 100%|██████████| 60/60 [02:27<00:00,  2.46s/it]


{'context_recall': 0.7917, 'faithfulness': 0.8428, 'factual_correctness': 0.5808, 'answer_relevancy': 0.7701, 'llm_context_precision_with_reference': 0.7083}

In [127]:
rag =  StructuredRag(is_semantic=True)
rag.load_documents("./wildfire.pdf")

5
Average chunk size: 517.8


In [128]:
relevant_docs = rag.get_most_relevant_docs("How has the number of wildfires and the acreage burned changed over the last 30 years, and what role does the NICC play in compiling this data?")

In [132]:
for doc in relevant_docs:
    print(doc)

wildfire activity. Nationwide data compiled by the National Interagency Coordination Center (NICC) indicate that the number of annual wildfires is variable but has decreased slightly over the last 30 years. The number of acres affected annually, while also variable, generally has increased (see Figure 1 ). Since 2000, an annual average of 70,025 wildfires have burned an annual average of 7.0 million acres. The acreage figure is more than double the average annual acreage burned in the 1990s (3.3 million acres), although a greater number of fires occurred annually in the 1990s on average (78,600). Table 1. Annual Wildfires and Acres Burned

|                             | 2018                        | 2019                        | 2020                        | 2021                        | 2022                        |
|-----------------------------|-----------------------------|-----------------------------|-----------------------------|-----------------------------|-------------------

In [90]:
retriever = structured_store.as_retriever(
    search_kwargs={"k": 2}
)
retriever.invoke("How has the number of wildfires and the acreage burned changed over the last 30 years, and what role does the NICC play in compiling this data?", )

[Document(id='486c1b6c-0a8e-4070-a214-66fe1e94eb2b', metadata={}, page_content="## Wildfire Statistics  \nWildfires are unplanned fires, including lightning-caused fires, unauthorized human-caused fires, and escaped fires from prescribed burn projects. States are responsible for responding to wildfires that begin on nonfederal (state, local, and private) lands, except for lands protected by federal agencies under cooperative agreements. The federal government is responsible for responding to wildfires that begin on federal lands. The Forest Service (FS)-within the U.S. Department of Agriculture-carries out wildfire management and response across the 193 million acres of the National Forest System (NFS). The Department of the Interior (DOI) manages wildfire response for more than 400 million acres of national parks, wildlife refuges and preserves, other public lands, and Indian reservations.  \nWildfire statistics help illustrate past U.S. wildfire activity. Nationwide data compiled by 