In [1]:
import chromadb
from dotenv import load_dotenv
from langchain_community.document_loaders import WikipediaLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load gemini api key form .env file
import os
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

In [3]:
llm = ChatGoogleGenerativeAI(model="gemini-pro" , api_key=api_key)

In [4]:
embedding_function = HuggingFaceBgeEmbeddings(
    model_name = "BAAI/bge-large-en-v1.5",
    model_kwargs = {'device':'cpu'},
    encode_kwargs = {'normalize_embeddings':True}
)

  embedding_function = HuggingFaceBgeEmbeddings(


In [5]:
# chunk_size = 600
# chunk_overlap = 100

chat = llm

In [6]:
# loader = WikipediaLoader(query="Pneumonia", load_max_docs=5)
# documents = loader.load()
# documents
# docs = [TextLoader("../DB/scraped_faqs.txt").load()]

In [7]:
# documents = docs[0]
# documents

In [8]:
# text splitting
# text_splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size, chunk_overlap = chunk_overlap)
# docs = text_splitter.split_documents(documents=documents)
# i need the text splitter such that it splits the text with "================================================================================" this comes
# in the text and i need to split the text at this point
faqs = []
with open("../DB/scraped_faqs.txt", "r", encoding="utf-8") as file:
    content = file.read().split("\n================================================================================\n\n")
    for entry in content:
        if entry.strip():
            lines = entry.split("\n")
            question = lines[0].replace("Question: ", "").strip()
            answer = " ".join(line.strip() for line in lines[1:] if line).replace("Answer: ", "").strip()
            faqs.append({"question": question, "answer": answer})


In [9]:
# faqs[:100]

In [10]:
from langchain.schema import Document

docs = [
    Document(
        page_content=f"{faq['question']} {faq['answer']}",
        metadata={"question": faq['question'], "answer": faq['answer']}
    )
    for faq in faqs
]

In [11]:
db = Chroma.from_documents(docs, embedding_function, persist_directory="../output/Airpott.db")

In [12]:
from langchain.retrievers.multi_query import MultiQueryRetriever
mq_retriever = MultiQueryRetriever.from_llm(retriever = db.as_retriever(), llm = chat)

In [19]:
query = "Can i drink and smoke in the plane?"
retrieved_docs = mq_retriever.get_relevant_documents(query=query)
retrieved_docs

[Document(metadata={'answer': 'Passenger can purchase Liquor from duty paid shops at the airport. It should be sealed packed and should be STEB (Security Tamper Evident Bag) packaging as per ICAO regulations.', 'question': 'Can a passenger travelling on International flight carry liquor?'}, page_content='Can a passenger travelling on International flight carry liquor? Passenger can purchase Liquor from duty paid shops at the airport. It should be sealed packed and should be STEB (Security Tamper Evident Bag) packaging as per ICAO regulations.'),
 Document(metadata={'answer': 'Passenger can buy Liquor from duty paid shops available at the airport and carry them, but according to the guidelines of the countries.', 'question': 'Can a passenger buy Liquor and carry them on board?'}, page_content='Can a passenger buy Liquor and carry them on board? Passenger can buy Liquor from duty paid shops available at the airport and carry them, but according to the guidelines of the countries.')]

In [20]:
retrieved_docs[0].metadata

{'answer': 'Passenger can purchase Liquor from duty paid shops at the airport. It should be sealed packed and should be STEB (Security Tamper Evident Bag) packaging as per ICAO regulations.',
 'question': 'Can a passenger travelling on International flight carry liquor?'}

In [21]:
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever

retriever = db.as_retriever()
# chat = ChatOpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(chat)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
compressed_docs = compression_retriever.get_relevant_documents(query = query)
compressed_docs

[]

In [22]:
from langchain.retrievers.document_compressors import LLMChainFilter

compressor = LLMChainFilter.from_llm(chat)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
compressed_docs = compression_retriever.get_relevant_documents(query = query)
compressed_docs

[]

In [23]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

# using similarity threshold of 0.6
embeddings_filter  = EmbeddingsFilter(embeddings=embedding_function, similarity_threshold=0.6)
compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)
compressed_docs = compression_retriever.get_relevant_documents(query = query)
print(compressed_docs)


[_DocumentWithState(metadata={'answer': 'Passenger can buy Liquor from duty paid shops available at the airport and carry them, but according to the guidelines of the countries.', 'question': 'Can a passenger buy Liquor and carry them on board?'}, page_content='Can a passenger buy Liquor and carry them on board? Passenger can buy Liquor from duty paid shops available at the airport and carry them, but according to the guidelines of the countries.', state={'embedded_doc': [-0.006638279650360346, 0.010497055947780609, 0.014490588568150997, 0.029446303844451904, -0.006939567159861326, -0.018801555037498474, -0.0073143476620316505, 0.025307977572083473, 0.005377341527491808, 0.03290307894349098, 0.0043898290023207664, 0.02217515930533409, 0.009090950712561607, -0.004596198443323374, 0.031682778149843216, 0.005671568214893341, 0.01791234128177166, 0.016533760353922844, -0.0029430475551635027, 0.001106217852793634, 0.01754920743405819, 0.03398576006293297, -0.0685366615653038, -0.04833265393

In [24]:
compressed_docs[0].metadata

{'answer': 'Passenger can buy Liquor from duty paid shops available at the airport and carry them, but according to the guidelines of the countries.',
 'question': 'Can a passenger buy Liquor and carry them on board?'}