# Hypothetical Document Embedding (HyDE) in Document Retrieval
This code implements a Hypothetical Document Embedding (HyDE) system for document retrieval. HyDE is an innovative approach that transforms query questions into hypothetical documents containing the answer, aiming to bridge the gap between query and document distributions in vector space.

In [None]:
# langchain-community is imported directly below, so install it explicitly.
!pip install langchain langchain-community langchain-openai python-dotenv chromadb pypdf

In [None]:
import os
import sys
from dotenv import load_dotenv

# Load environment variables from a .env file (if one exists) into os.environ
load_dotenv()

# Fail fast with a clear message when the key is missing. Assigning
# os.getenv(...) back into os.environ would raise an opaque
# "TypeError: str expected, not NoneType" for an unset key and is a no-op
# when the key is already set.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to your .env file."
    )

In [15]:

# Location of the PDF to index; it is passed to HyDERetriver further down.
# NOTE(review): "/content/..." looks like a Colab-style path — adjust it for your environment.
path = "/content/climate-change-evidence-causes.pdf"

## Define the HyDE retriever class: creating the vector store, generating the hypothetical document, and retrieving

In [16]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI


def encode_pdf(file_path, chunk_size, chunk_overlap, persist_directory="db"):
    """Load a PDF, split it into chunks, and index the chunks in a Chroma vector store.

    Args:
        file_path: Path of the PDF file to load.
        chunk_size: Chunk size forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap: Chunk overlap forwarded to RecursiveCharacterTextSplitter.
        persist_directory: Directory handed to Chroma for local storage.
            Defaults to "db", the location that used to be hard-coded.
            Pass None to skip the explicit persist step.

    Returns:
        The Chroma vector store built from the PDF's chunks.
    """
    pages = PyPDFLoader(file_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(pages)

    # Create the vector store and store it locally.
    # NOTE(review): reusing the same directory across runs presumably adds the
    # chunks again to the existing collection — confirm before re-running.
    db = Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=persist_directory
    )
    # Only flush explicitly when a storage directory was requested.
    if persist_directory is not None:
        db.persist()
    return db



In [28]:

class HyDERetriver:
    """Retrieve document chunks via Hypothetical Document Embedding (HyDE).

    The query is first turned into an LLM-written hypothetical document that
    answers it; that text, rather than the raw query, is then used for the
    similarity search against the vector store built from the PDF.
    """

    def __init__(self, files_path, chunk_size=500, chunk_overlap=0):
        """Build the vector store for ``files_path`` and the HyDE prompt chain.

        Args:
            files_path: Path of the PDF to index (forwarded to ``encode_pdf``).
            chunk_size: Chunk size used when splitting the PDF; also given to
                the prompt as the requested length of the hypothetical document.
            chunk_overlap: Chunk overlap used when splitting the PDF.
        """
        self.llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini", max_tokens=4000)
        # Not read by any method of this class; kept because it is a public attribute.
        self.embeddings = OpenAIEmbeddings()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.vectorestore = encode_pdf(
            files_path,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        self.hyde_prompt = PromptTemplate(
            input_variables=["query", "chunk_size"],
            # The prompt text is kept byte-for-byte (including the leading
            # spaces on its second line) so the text sent to the model is
            # unchanged.
            template=(
                "Given the question '{query}', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth.\n"
                "            the document size has be exactly {chunk_size} characters."
            ),
        )
        # Piping the prompt into the LLM yields a runnable chain.
        self.hyde_chain = self.hyde_prompt | self.llm

    def generate_hypothetical_doc(self, query):
        """Return the text of an LLM-generated hypothetical document answering ``query``."""
        prompt_values = {"query": query, "chunk_size": self.chunk_size}
        return self.hyde_chain.invoke(prompt_values).content

    def retrieve(self, query, k=3):
        """Run HyDE retrieval for ``query``.

        Args:
            query: The user question.
            k: Number of results requested from the similarity search.

        Returns:
            A ``(similar_docs, hypothetical_doc)`` tuple: the search results and
            the generated hypothetical document that was used as the search text.
        """
        hypothetical_doc = self.generate_hypothetical_doc(query)
        similar_docs = self.vectorestore.similarity_search(hypothetical_doc, k=k)
        return similar_docs, hypothetical_doc

    def retrivel(self, query, k=3):
        """Backward-compatible (misspelled) name for :meth:`retrieve`."""
        return self.retrieve(query, k=k)


# Correctly spelled alias; the original class name remains valid.
HyDERetriever = HyDERetriver

In [29]:
# Build the retriever for the PDF at `path` with the default chunk settings
# (chunk_size=500, chunk_overlap=0); construction indexes the PDF via encode_pdf.
retriver = HyDERetriver(path)

In [30]:
# Sample question used to exercise the HyDE pipeline.
test_query = "what is main cause of climate change"
# Returns the similarity-search results (default k=3) together with the
# LLM-generated hypothetical document that was used as the search text.
results, hypothetical_doc = retriver.retrivel(test_query)

In [39]:
import textwrap
# page_content of each retrieved document (collected here, not printed in this cell).
docs_content = [doc.page_content for doc in results]
print("hypothetical_doc:\n")
# NOTE: textwrap.wrap returns a list of lines, so this prints the list's repr
# (as in the recorded output below); textwrap.fill(...) would print plain
# wrapped text instead.
print(textwrap.wrap(hypothetical_doc, width=80))


hypothetical_doc:

['**Climate Change Report**  **Main Cause of Climate Change: Human Activities**', 'The primary cause of climate change is the result of human activities,', 'primarily:  1. **Greenhouse Gas Emissions**: 65% of CO2 emissions come from', 'fossil fuel combustion (coal, oil, gas). 2. **Deforestation**: 20% of emissions', 'result from land use changes, mainly deforestation. 3. **Agriculture**: 10% of', 'emissions come from agriculture, specifically beef, rice, and wheat production.', '4. **Industrial Processes**: 5% of emissions are from cement production and', 'other industrial processes.  Human activities release massive amounts of CO2 and', 'other greenhouse gases, leading to a global average temperature increase of 1°C', 'since the late 1800s.']
