#  **HyDE RAG**

In [5]:
import os
from dotenv import load_dotenv
from utils.Evaluate_Rag import *
from utils.helper_functions import *

# Load environment variables from a .env file
load_dotenv()

# Fail fast with a readable message when the key is missing: assigning None
# into os.environ raises an opaque TypeError instead.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

# Set the OpenAI API key environment variable
os.environ["OPENAI_API_KEY"] = openai_api_key


In [6]:
# Location of the source PDF. The default is the original workspace path; set
# the HYDE_PDF_PATH environment variable to point the notebook at another copy
# without editing this cell.
path = os.getenv(
    "HYDE_PDF_PATH",
    "/workspaces/RAG-ADVANCED/Data/Understanding_Climate_Change.pdf",
)

Define the HyDE retriever class: it builds the vector store, generates a hypothetical document for a query, and retrieves the chunks most similar to that document.

In [14]:
class HyDERetriever:
    """Retrieve PDF chunks using a hypothetical document as the search text.

    Rather than searching the vector store with the raw question, an LLM first
    writes a hypothetical passage that answers it, and that passage is used as
    the similarity-search query.

    Attributes:
        llm: Chat model that writes the hypothetical document.
        embeddings: An ``OpenAIEmbeddings`` instance. It is not passed to
            ``encode_pdf`` in this class; it is kept as a public attribute.
        chunk_size: Forwarded to ``encode_pdf`` and also used as the character
            length requested in the prompt.
        chunk_overlap: Forwarded to ``encode_pdf``.
        vectorstore: The store returned by ``encode_pdf``.
        hyde_prompt: Prompt template built from ``HYDE_TEMPLATE``.
        hyde_chain: ``hyde_prompt`` piped into ``llm``.
    """

    # Single-line prompt: keeping it out of an indented triple-quoted string
    # avoids sending source-code indentation and a stray newline to the model.
    HYDE_TEMPLATE = (
        "Given the question '{query}', generate a hypothetical document that "
        "directly answers this question. The document size has to be exactly "
        "{chunk_size} characters."
    )

    def __init__(self, file_path: str, chunk_size: int = 500, chunk_overlap: int = 100):
        """Index the PDF and prepare the hypothetical-document chain.

        Args:
            file_path: Path of the PDF handed to ``encode_pdf``.
            chunk_size: Chunk size forwarded to ``encode_pdf``; also the
                length requested for the hypothetical document.
            chunk_overlap: Chunk overlap forwarded to ``encode_pdf``.
        """
        # Defining the llm
        self.llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini", max_tokens=4000)

        # Defining the parameters
        self.embeddings = OpenAIEmbeddings()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.vectorstore = encode_pdf(
            file_path,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

        self.hyde_prompt = PromptTemplate(
            input_variables=["query", "chunk_size"],
            template=self.HYDE_TEMPLATE,
        )

        self.hyde_chain = self.hyde_prompt | self.llm

    def generate_hypothetical_document(self, query: str) -> str:
        """Ask the LLM for a passage that would answer ``query``.

        Args:
            query: The user's question.

        Returns:
            The ``content`` of the chain's response.

        Raises:
            ValueError: If ``query`` is not a non-blank string; this avoids
                spending an LLM call on a prompt with no question in it.
        """
        if not isinstance(query, str) or not query.strip():
            raise ValueError("query must be a non-empty string")

        input_variables = {"query": query, "chunk_size": self.chunk_size}
        return self.hyde_chain.invoke(input_variables).content

    def retrieve(self, query: str, k: int = 3) -> tuple:
        """Find the ``k`` stored chunks most similar to a hypothetical answer.

        Args:
            query: The user's question.
            k: Number of results requested from the vector store.

        Returns:
            A ``(similar_docs, hypothetical_doc)`` tuple: the result of the
            vector store's ``similarity_search`` and the generated passage
            that was used as the search text.

        Raises:
            ValueError: If ``query`` is not a non-blank string.
        """
        hypothetical_doc = self.generate_hypothetical_document(query)
        # Search with the generated passage, not the raw question.
        similar_docs = self.vectorstore.similarity_search(hypothetical_doc, k=k)
        return similar_docs, hypothetical_doc
                    

Create a HyDE retriever instance

In [15]:
# Build the retriever over the PDF at `path`; chunk size and overlap stay at the class defaults.
retriever = HyDERetriever(file_path=path)

Demonstrate on a use case

In [17]:
# Run HyDE retrieval for one sample question, keeping both the matched chunks
# and the hypothetical document that served as the search text.
test_query = "What is the main cause of climate change?"
results, hypothetical_doc = retriever.retrieve(query=test_query)

In [18]:
# Display the retrieved Document objects (page content plus PDF metadata).
results

[Document(id='6899e6f8-23f7-4ee1-839e-503b9c05c6e6', metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+03:00', 'source': '/workspaces/RAG-ADVANCED/Data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'page': 0, 'page_label': '1'}, page_content='predict future trends. The evidence overwhelmingly shows that recent changes are primarily \ndriven by human activities, particularly the emission of greenhouse gases. \nChapter 2: Causes of Climate Change \nGreenhouse Gases \nThe primary cause of recent climate change is the increase in greenhouse gases in the \natmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous \noxide (N2O), trap heat from the sun, creating a "greenhouse effect." This effect is essential'),
 Document(id='877f2395-25c9-4492-a78d-4faaf7bbcffb', metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® W

Print the hypothetical document and the retrieved documents

In [19]:
# Pull the raw text out of each retrieved document.
docs_content = list(doc.page_content for doc in results)

# Print the LLM-written hypothetical passage, wrapped for readability...
print("hypothetical_doc:\n")
wrapped_hypothetical = text_wrap(hypothetical_doc)
print(wrapped_hypothetical + "\n")

# ...followed by the chunks it retrieved.
show_context(docs_content)

hypothetical_doc:

**Title: The Main Cause of Climate Change**  Climate change is primarily driven by human activities, particularly the
burning of fossil fuels such as coal, oil, and natural gas. This process releases significant amounts of carbon dioxide
(CO2) and other greenhouse gases into the atmosphere, trapping heat and leading to global warming. Deforestation also
contributes by reducing the number of trees that can absorb CO2. Additionally, industrial processes, agriculture, and
waste management practices further exacerbate the problem. Addressing these causes is crucial for mitigating climate
change and protecting our planet.

Context 1:
predict future trends. The evidence overwhelmingly shows that recent changes are primarily 
driven by human activities, particularly the emission of greenhouse gases. 
Chapter 2: Causes of Climate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in greenhouse gases in the 
atmosphere. Greenhouse gases, such

LangChain HyDE Implementation

In [25]:
!pip install chromadb
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_classic.chains import HypotheticalDocumentEmbedder

#Load the pdf
loader = PyPDFLoader("/workspaces/RAG-ADVANCED/Data/Understanding_Climate_Change.pdf")
documents = loader.load()

#split
splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
chunks = splitter.split_documents(documents)

#create Hyde embeddings
base_embeddings = OpenAIEmbeddings()
llm = OpenAI(temperature=0)
hyde_embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm,base_embeddings,prompt_key="web_search"
)

#create vector store
vector_store = Chroma.from_documents(chunks,hyde_embeddings)

#Query
retriever = vector_store.as_retriever(search_kwargs={"k":5})
docs = retriever.invoke("What are the roles of technology in climate change mitigation")

print(docs[0].page_content)

Collecting chromadb
  Obtaining dependency information for chromadb from https://files.pythonhosted.org/packages/96/da/048ea86c7cb04a873aaab912be62d90b403a8b15a98ae7781ea777371373/chromadb-1.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading chromadb-1.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Obtaining dependency information for build>=1.0.3 from https://files.pythonhosted.org/packages/c5/0d/84a4380f930db0010168e0aa7b7a8fed9ba1835a8fbb1472bc6d0201d529/build-1.4.0-py3-none-any.whl.metadata
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Obtaining dependency information for pybase64>=1.4.1 from https://files.pythonhosted.org/packages/62/f7/965b79ff391ad208b50e412b5d3205ccce372a2d27b7218ae86d5295b105/pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata


Minimal Usage

In [27]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

# Load PDF (reuse the path defined earlier in the notebook) and split it into chunks
docs = PyPDFLoader(path).load()
chunks = RecursiveCharacterTextSplitter(chunk_size=1000).split_documents(docs)

# Create vector store and retriever.
# The collection is named explicitly: Chroma stores created without a name all
# use the same default collection, so this store could otherwise share (and
# duplicate) chunks with another in-memory Chroma store created in this kernel.
retriever = Chroma.from_documents(
    chunks, OpenAIEmbeddings(), collection_name="hyde_minimal"
).as_retriever()
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# HYDE prompt
hyde_prompt = ChatPromptTemplate.from_template("Write a passage answering: {question}")

# Helper function to format documents
def format_docs(docs):
    """Concatenate each document's ``page_content``, separated by a blank line."""
    separator = "\n\n"
    contents = (document.page_content for document in docs)
    return separator.join(contents)

# HyDE RAG chain:
#   1. fan the incoming question out into the question itself and an
#      LLM-written hypothetical passage,
#   2. retrieve context using the hypothetical passage (not the raw question),
#   3. answer the original question from that context.
def _attach_context(inputs):
    """Swap the hypothetical passage for the formatted chunks it retrieves."""
    retrieved = retriever.invoke(inputs["hypothetical"])
    return {"context": format_docs(retrieved), "question": inputs["question"]}

hypothetical_passage = hyde_prompt | llm | StrOutputParser()
answer_prompt = ChatPromptTemplate.from_template("Context: {context}\n\nQuestion: {question}\n\nAnswer:")

hyde_chain = (
    {"question": RunnablePassthrough(), "hypothetical": hypothetical_passage}
    | RunnableLambda(_attach_context)  # plain callables must be wrapped to be piped
    | answer_prompt
    | llm
    | StrOutputParser()
)

# Query
answer = hyde_chain.invoke("What is this about?")
print(answer)

This text is about the importance of informing the public about climate change through various means such as journalism, public engagement initiatives, integrating climate education into school curricula, public awareness campaigns, and lifelong learning initiatives. It emphasizes the role of media organizations, education, and community engagement in raising awareness, promoting sustainable behaviors, and preparing future generations to address climate challenges.
