In [168]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [169]:
%pip install -U --quiet langchain-google-genai langchain tiktoken pypdf sentence_transformers chromadb langchain_community

Final Chat Bot

*   Split text into sentences
*   Duplicate first and last page texts
*   Use Parent-Child Document Retriever

In [None]:
import os
import re
import PyPDF2
import pandas as pd
from google.colab import drive
from markdown import Markdown
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableMap
from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Define a Document class to store page content and metadata
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

    def __repr__(self):
        return f"Document(page_content={self.page_content!r}, metadata={self.metadata})"

# Define a DataLoader class to handle the loading and chunking of PDF data
class DataLoaderParentChildChunks:
    def __init__(self, input_file, parent_chunk_size, child_chunk_size, chunk_overlap):
        self.input_file = input_file
        self.parent_chunk_size = parent_chunk_size
        self.child_chunk_size = child_chunk_size
        self.chunk_overlap = chunk_overlap

    # Get the total number of pages in the PDF
    def get_total_pages(self):
        with open(self.input_file, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            total_pages = len(reader.pages)
        return total_pages

    # Load a specific page from the PDF
    def load_pdf_page(self, page_num):
        with open(self.input_file, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            page = reader.pages[page_num]
            page_text = page.extract_text()
        return page_text, page_num + 1

    # Split text into sentences using regular expressions
    def split_into_sentences(self, text):
        sentence_endings = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
        return sentence_endings.split(text)

    # Create chunks from sentences with specified chunk size and overlap
    def create_chunks(self, sentences, page_numbers, chunk_size):
        chunks = []
        num_sentences = len(sentences)
        step = chunk_size - self.chunk_overlap
        for i in range(0, num_sentences, step):
            chunk_sentences = sentences[i:i + chunk_size]
            chunk_pages = page_numbers[i:i + chunk_size]
            chunk = ' '.join(chunk_sentences)
            if chunk:
                chunks.append({
                    'Text': chunk,
                    'Source': self.input_file,
                    'Page': ', '.join(map(str, sorted(set(chunk_pages))))
                })
        return chunks

    # Main function to load PDF, split into sentences, and create chunks
    def run(self, num_pages=None):
        total_pages = self.get_total_pages()
        if num_pages is None:
            num_pages = total_pages

        combined_text = ""
        page_texts = []

        # Use ThreadPoolExecutor to load pages in parallel
        with ThreadPoolExecutor() as executor:
            future_to_page = {executor.submit(self.load_pdf_page, page_num): page_num for page_num in range(min(num_pages, total_pages))}
            for future in as_completed(future_to_page):
                page_text, page_num = future.result()
                page_texts.append((page_text, page_num))
                combined_text += page_text + " "

        # Add first and last page texts again for context
        if total_pages > 0:
            first_page_text = self.load_pdf_page(0)[0]
            last_page_text = self.load_pdf_page(total_pages - 1)[0]
            page_texts.insert(1, (first_page_text, 1))
            page_texts.append((last_page_text, total_pages))
            combined_text = first_page_text + " " + combined_text + " " + last_page_text

        # Split combined text into sentences and track page numbers
        sentences = []
        page_numbers = []
        for page_text, page_num in page_texts:
            page_sentences = self.split_into_sentences(page_text)
            sentences.extend(page_sentences)
            page_numbers.extend([page_num] * len(page_sentences))

        # Create parentchunks and child chunks
        parent_chunks = self.create_chunks(sentences, page_numbers, self.parent_chunk_size)
        child_chunks = self.create_chunks(sentences, page_numbers, self.child_chunk_size)

        # Create document texts from the child chunks
        child_documents = []
        for idx, row in pd.DataFrame(child_chunks).iterrows():
            page_content = row['Text']
            metadata = {'source': row['Source'], 'page': row['Page']}
            child_documents.append(Document(page_content=page_content, metadata=metadata))

        # Create document texts from the parent chunks
        parent_documents = []
        for idx, row in pd.DataFrame(parent_chunks).iterrows():
            page_content = row['Text']
            metadata = {'source': row['Source'], 'page': row['Page']}
            parent_documents.append(Document(page_content=page_content, metadata=metadata))

        return child_documents, parent_documents

In [None]:
def question_answering(quest):
    # Mount Google Drive
    drive.mount('/content/drive')

    os.environ["GOOGLE_API_KEY"] = "Your Google API"

    # Load and split PDF using DataLoaderParentChildChunks
    pdf_path = "/content/drive/MyDrive/Colab Notebooks/GenAI_Handbook.pdf"
    loader = DataLoaderParentChildChunks(pdf_path, parent_chunk_size=500, child_chunk_size=200, chunk_overlap=100)
    child_documents, parent_documents = loader.run()

    # Initialize HuggingFace Embeddings
    model_name = "all-mpnet-base-v2"
    hf = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )

    # Create a document search index and save embeddings in vector DB
    child_docsearch = Chroma.from_documents(child_documents, hf)
    parent_docsearch = Chroma.from_documents(parent_documents, hf)

    # Configure the retrieve
    retriever = child_docsearch.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 5, "fetch_k": 152}
    )

    template = """Answer the question based only on the following context:
    {context}

    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)

    gemini = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0)

    chain = RunnableMap({
        "context": lambda x: retriever.get_relevant_documents(x['question']),
        "question": lambda x: x['question']
    }) | prompt | gemini

    response = chain.invoke({'question': quest})

    # Find the parent chunks related to the retrieved child chunks
    relevant_child_chunks = retriever.get_relevant_documents(quest)
    relevant_parent_chunks = []
    for child_chunk in relevant_child_chunks:
        parent_docs = parent_docsearch.as_retriever(
            search_type="mmr",
            search_kwargs={"k": 5}
        ).get_relevant_documents(child_chunk.page_content)
        relevant_parent_chunks.extend(parent_docs)

    # Ensure relevant_parent_chunks is unique
    unique_parent_chunks = {doc.page_content: doc for doc in relevant_parent_chunks}.values()

    # Update the context in the template with the unique parent chunks
    parent_context = ' '.join([doc.page_content for doc in unique_parent_chunks])
    template = template.replace("{context}", parent_context)

    response = chain.invoke({'question': quest})
    return response.content

In [None]:
quest = "What are some chunking strategies??"
question_answering(quest)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




'- Fixed-size chunking\n- Recursive chunking\n- Parent-Child Document Retriever'

In [None]:
quest = "what is Parent-Child Document Retriever?"
question_answering(quest)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




'The Parent-Child Document Retriever is a technique used to address the challenge of creating chunks of data that are both small enough to reduce noise and large enough to provide sufficient context for a language model (LLM). It involves creating two chunk sizes and two chunk overlaps: one for large chunks and one for small chunks. The original data is first split into large chunks, and then the large chunks are split into small chunks. The small chunks contain a reference to the large chunks they were derived from. The small chunks are used to create vector embeddings, which are used during similarity search. The large chunks are used to provide the necessary context to the LLM for it to generate text.'

In [None]:
quest = "What are the two common chunking parameters?"
question_answering(quest)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




'Chunk size and chunk overlap'

In [None]:
quest = "What is prompt management?"
question_answering(quest)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




'Prompt management is essential to ensure developers can evaluate multiple prompts.'

In [None]:
quest = "How should users develop with LangChain?"
question_answering(quest)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




'When developing with the LangChain framework, it is recommended to pull down and extend the larger, more complex implementations. This is most notably the core classes (ex: AzureOpenAI, Agent). After pulling down the code from the relevant API version, developers can modify it for their use case. This simplifies extending LangChain’s existing classes as well as the debugging process. This method can also be used to incorporate logging and extra error handling.'

In [None]:
quest = "What are evaluation framework?"
question_answering(quest)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




'An evaluation framework is essential to efficiently compare LLM architectures, specifically architecture approaches and all the parameters involved. The framework defines specific key evaluation metrics and a process for calculating these metrics using variable prompts.'