# PDF_Pipeline:

## Setup and Installations

In [1]:
# Setup directory
%cd E:/Github_Repo/Info-Retrieve-AI/

E:\Github_Repo\Info-Retrieve-AI


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [4]:
%pip install -r requirements.txt




In [5]:
import os
import fitz
import time
import warnings
import numpy as np
import pandas as pd
from __init__ import cfg
from typing import List
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate
from langchain.vectorstores.deeplake import DeepLake
from sentence_transformers import SentenceTransformer
from langchain.document_loaders.pdf import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import StuffDocumentsChain, LLMChain
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

In [6]:
# Load DeepLake configuration
DEEPLAKE_API_TOKEN = cfg.DEEPLAKE_API_TOKEN

## PDFReader Class

In [8]:
class PDFReader:
    """Custom PDF Loader to embed metadata with the pdfs."""
    def __init__(self) -> None:
        self.file_name = ""
        self.total_pages = 0
        self.total_chunks = 0 # Counter for total chunks
        self.total_pages_chunked = 0 # Counter for total pages chunked
        self.model = SentenceTransformer('bert-base-nli-mean-tokens')

    def load_pdf(self, file_path, chunk_size=1000, progress_interval=100):
        self.file_name = os.path.basename(file_path)

        # Open PDF File
        pdf_document = fitz.open(file_path)
        self.total_pages = pdf_document.page_count

        chunks = []

        # Iterate through pages
        for page_number in range(self.total_pages):
            # Extract text content from the page
            page = pdf_document.load_page(page_number)
            page_text = page.get_text()

            # Split the text into chunks
            text_chunks = [page_text[i:i + chunk_size] for i in range(0, len(page_text), chunk_size)]

            # Encode the entire page text to get text_embedding_page
            text_embedding_page = self.model.encode(page_text)

            # Process each chunk
            start_time = time.time()
            for chunk_number, chunk in enumerate(text_chunks, start=1):
                chunk_embedding = self.model.encode(chunk)
                chunks.append({
                    'text': chunk,
                    'text_embedding_page': text_embedding_page,
                    'chunk_number': chunk_number,
                    'chunk_text': chunk,
                    'text_embedding_chunk': chunk_embedding,
                    'metadata': {
                        "file_name": self.file_name,
                        "page_no": str(page_number + 1),
                        "total_pages": str(self.total_pages),
                    }
                })

                # Print progress
                if chunk_number % progress_interval == 0:
                    elapsed_time = time.time() - start_time
                    print(f"Processed {chunk_number}/{len(text_chunks)} chunks in page {page_number+1}. "
                          f"Time elapsed: {elapsed_time:.2f} seconds.")
                    start_time = time.time()

            # Increment total pages chunked
            self.total_pages_chunked += 1

        print(f"Total number of chunks: {self.total_chunks}")
        print(f"Total number of pages chunked: {self.total_pages_chunked}")

        return chunks

In [10]:
# Instantiate PDFReader
# reader = PDFReader()

## Semantic Cache

In [11]:
# Semantic Cache Class
class SemanticCache:
    def __init__(self) -> None:
        # Initialize the embeddings model and cache vector store
        self.embeddings = SentenceTransformerEmbeddings(
            model_name="all-MiniLM-L12-v2"
        )
        self.cache_vectorstore = DeepLake(
            dataset_path="database/cache_vectorstore",
            token=DEEPLAKE_API_TOKEN, # Add Deeplake api
            embedding=self.embeddings,
            read_only=False,
            num_workers=4,
            verbose=False,
        )

    def cache_query_response(self, query: str, response: str):
        # Create a Document object using query as the content and its response as metadata
        doc = Document(
            page_content=query,
            metadata={"response": response},
        )

        # Insert the Document object into cache vectorstore
        _ = self.cache_vectorstore.add_documents(documents=[doc])

    def find_similar_query_response(self, query: str, threshold: int):
        try:
            # Find similar query based on the input query
            sim_response = self.cache_vectorstore.similarity_search_with_score(
                query=query, k=5
            )

            # If sim_response is empty, return an empty list
            if not sim_response:
                return []

            # Return the response from the fetched entry if its score is more than threshold
            return [
                {
                    "response": res[0].metadata["response"],
                }
                for res in sim_response
                if res[1] > threshold
            ]
        except Exception as e:
            raise Exception(e)


In [12]:
# Instantiate SemanticCache
# cache_service = SemanticCache()

## Ingestion Class

In [13]:
# Define the Ingestion class
class Ingestion:
    """Ingestion class for ingesting documents to vectorstore."""

    def __init__(self, semantic_cache: SemanticCache):
        self.text_vectorstore = None
        self.image_vectorstore = None
        self.text_retriever = None
        # Define and initialize the embeddings attribute
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=cfg.GOOGLE_API_KEY,
        )
        self.semantic_cache = semantic_cache

    def ingest_documents(
        self,
        file: str,
    ):
        # Initialize the vector store
        vstore = DeepLake(
            dataset_path="database/text_vectorstore",
            token=DEEPLAKE_API_TOKEN, # Add Deeplake api
            embedding=self.embeddings,
            overwrite=True,
            num_workers=4,
            verbose=True,
        )

        # Load PDF and process chunks
        chunks = reader.load_pdf(file_path, progress_interval=100) # added progress interval

        # Ingest the chunks
        ids = vstore.add_texts(
            texts=[chunk['text'] for chunk in chunks],
            metadatas=[{
                'chunk_number': chunk['chunk_number'],
                'chunk_text': chunk['chunk_text'],
                'text_embedding_page': chunk['text_embedding_page'],
                'text_embedding_chunk': chunk['text_embedding_chunk'],
                'file_name': chunk['metadata']['file_name'],
                'page_no': chunk['metadata']['page_no'],
                'total_pages': chunk['metadata']['total_pages'],

            } for chunk in chunks]
        )

        # Cache responses in the semantic cache
        for chunk in chunks:
            query = chunk['text']
            response = chunk['chunk_text']  # Assuming this is the response
            self.semantic_cache.cache_query_response(query, response)

        return ids

## QA System

In [16]:
class QASystem:
    def __init__(self, ingestion_pipeline, cache_service) -> None:
        # Initialize Google Generative AI Embeddings
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=cfg.GOOGLE_API_KEY,
            task_type="retrieval_query",
        )

        # Initialize Gemini Chat model
        self.model = ChatGoogleGenerativeAI(
            model="gemini-pro",
            temperature=0.3,
            google_api_key=cfg.GOOGLE_API_KEY,
            convert_system_message_to_human=True,
        )

        # Initialize GPT Cache
        self.cache_service = cache_service

        # Set up ingestion pipeline
        self.ingestion_pipeline = ingestion_pipeline

    def ask_question(self, query: str):
        try:
            # Search for similar query response in cache
            cached_response = self.cache_service.find_similar_query_response(
                query=query, threshold=cfg.CACHE_THRESHOLD
            )

            # If similar query response is present, return it
            if len(cached_response) > 0:
                print("Using cache")
                result = cached_response[0]["response"]
            # Else generate response for the query
            else:
                print("Generating response")
                result = self.generate_response(query=query)
        except Exception as e:
            print("Exception raised. Generating response.")
            result = self.generate_response(query=query)

        return result

    def generate_response(self, query: str):
        try:
            # Initialize the vectorstore and retriever object
            vstore = DeepLake(
                dataset_path="database/text_vectorstore",
                token=DEEPLAKE_API_TOKEN,  # Add Deeplake api
                embedding=self.embeddings,
                read_only=True,
                num_workers=4,
                verbose=False,
            )
            retriever = vstore.as_retriever(search_type="similarity")
            retriever.search_kwargs["distance_metric"] = "cos"
            retriever.search_kwargs["fetch_k"] = 20
            retriever.search_kwargs["k"] = 15

            # Write prompt to guide the LLM to generate response
            prompt_template = """
            Provide the response along with the source of the text from which your response is derived.
            Tasked with information retrieval-augmented generation, maintain a non-conversational tone.
            If uncertain, respond with "I don't know" instead of providing speculative answers. Keep responses concise, within five sentences.
            Always conclude with "Thanks for asking!".
            Use only the following context pieces to formulate your response: {context}.
            Context: {context}
            Question: {question}

            Answer:
            """

            PROMPT = PromptTemplate(
                template=prompt_template, input_variables=["context", "question"]
            )

            chain_type_kwargs = {"prompt": PROMPT}

            # Create Retrieval QA chain
            qa = RetrievalQA.from_chain_type(
                llm=self.model,
                retriever=retriever,
                verbose=False,
                chain_type_kwargs=chain_type_kwargs,
            )

            # Run the QA chain and store the response in cache
            result = qa({"query": query})["result"]
            self.cache_service.cache_query_response(query=query, response=result)
            print("Response generated")

            return result
        except Exception as e:
            print("Exception raised. Generating response.")
            result = self.generate_response(query=query)
            return result

## Q&A Testing for Sample 1.pdf

In [18]:
# Instantiate PDFReader
reader = PDFReader()

# Instantiate the SemanticCache
cache_service = SemanticCache()

# Instantiate the Ingestion class with the SemanticCache instance
ingestion = Ingestion(semantic_cache=cache_service)

# Ingest the documents
file_path = r"E:\Github_Repo\Info-Retrieve-AI\data_source\NLP.pdf"
document_ids = ingestion.ingest_documents(file_path)

# Check for any exceptions during the ingestion process
if document_ids is None:
    print("Error occurred during document ingestion.")
else:
    print("Document ingestion successful. Document IDs:", document_ids)

# Instantiate the QASystem class
qa_system = QASystem(ingestion_pipeline=ingestion, cache_service=cache_service)

Deep Lake Dataset in database/cache_vectorstore already exists, loading from the storage




Total number of chunks: 0
Total number of pages chunked: 52


Creating 52 embeddings in 1 batches of size 52:: 100%|██████████| 1/1 [00:03<00:00,  3.54s/it]


Dataset(path='database/text_vectorstore', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (52, 1)     str     None   
 metadata     json      (52, 1)     str     None   
 embedding  embedding  (52, 768)  float32   None   
    id        text      (52, 1)     str     None   


Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00,  5.10it/s]
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00, 10.71it/s]
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00,  3.07it/s]
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00, 10.60it/s]
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00, 11.55it/s]
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00,  8.37it/s]
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00, 12.47it/s]
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00, 12.68it/s]
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00,  9.98it/s]
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00,  9.57it/s]
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [0

Document ingestion successful. Document IDs: ['771553c8-347e-11ef-a68b-04d3b00d2969', '771553c9-347e-11ef-8624-04d3b00d2969', '771553ca-347e-11ef-bfaf-04d3b00d2969', '771553cb-347e-11ef-8100-04d3b00d2969', '771553cc-347e-11ef-8c6c-04d3b00d2969', '771553cd-347e-11ef-a1dc-04d3b00d2969', '771553ce-347e-11ef-8fa7-04d3b00d2969', '771553cf-347e-11ef-8d2a-04d3b00d2969', '771553d0-347e-11ef-a482-04d3b00d2969', '771553d1-347e-11ef-8a9c-04d3b00d2969', '771553d2-347e-11ef-9064-04d3b00d2969', '771553d3-347e-11ef-aad4-04d3b00d2969', '771553d4-347e-11ef-a3c9-04d3b00d2969', '771553d5-347e-11ef-9f5a-04d3b00d2969', '771553d6-347e-11ef-904c-04d3b00d2969', '771553d7-347e-11ef-8810-04d3b00d2969', '771553d8-347e-11ef-acfa-04d3b00d2969', '771553d9-347e-11ef-859b-04d3b00d2969', '771553da-347e-11ef-a9b3-04d3b00d2969', '771553db-347e-11ef-aee6-04d3b00d2969', '771553dc-347e-11ef-ad18-04d3b00d2969', '771553dd-347e-11ef-9727-04d3b00d2969', '771553de-347e-11ef-be30-04d3b00d2969', '771553df-347e-11ef-95ec-04d3b00d2




## NLP PDF Test

In [19]:
# Test Case:
query = "Is low perplexity better or bad?"
response = qa_system.generate_response(query)
print(response)

Deep Lake Dataset in database/text_vectorstore already exists, loading from the storage


  warn_deprecated(
Creating 1 embeddings in 1 batches of size 1:: 100%|██████████| 1/1 [00:00<00:00, 13.87it/s]

Response generated
            Lower perplexity scores are better. A language model with a lower perplexity score is better at predicting the next word in a sequence of text, and therefore, it is considered a more effective and accurate language model.
            Thanks for asking!





## Sample Test - 2 page of Annual Report

In [None]:
# Test Case:
query = "What factors does NCGC take into consideration during the selection procedures?"
response = qa_system.generate_response(query)
print(response)

In [None]:
# Test Case
query = "Can you give me the COMPARISON OF 5 YEAR CUMULATIVE TOTALRETURN Among NVIDIACorporation, the S&P500Index, and the Nasdaq100Index ?"
response = qa_system.generate_response(query)
print(response)

In [None]:
# Test Case
query = "Who is Narendra Modi"
response = qa_system.generate_response(query)
print(response)

In [None]:
# Test Case
query = "Who is John O. Dabiri in NVIDIA"
response = qa_system.generate_response(query)
print(response)

In [None]:
# Test Case
query = "What are the skills, competencies and attributes that the Board considers important for directors to have considering current business and future market opportunities:"
response = qa_system.generate_response(query)
print(response)

In [None]:
# Test Case
query = "How many male and female directors are present in the Board of Directors"
response = qa_system.generate_response(query)
print(response)

## Q&A Testing for 2023-Annual-Report-1.pdf

In [None]:
# Instantiate the SemanticCache
cache_service = SemanticCache()

# Instantiate the Ingestion class with the SemanticCache instance
ingestion = Ingestion(semantic_cache=cache_service)

# Ingest the documents
file_path = "/content/drive/MyDrive/Colab_Notebooks/Capstone_Project/PDF/2023-Annual-Report-1.pdf"
document_ids = ingestion.ingest_documents(file_path)

# Check for any exceptions during the ingestion process
if document_ids is None:
    print("Error occurred during document ingestion.")
else:
    print("Document ingestion successful. Document IDs:", document_ids)

# Instantiate the QASystem class
qa_system = QASystem(ingestion_pipeline=ingestion, cache_service=cache_service)

In [None]:
# Test Case
query = "Can you summarise the business overview like Fiscal 2023 Results of NVIDIA"
response = qa_system.generate_response(query)
print(response)

In [None]:
# Test Case
query = "What was the Fees Billed by the Independent Registered Public Accounting Firm in the Fiscal Year 2023?"
response = qa_system.generate_response(query)
print(response)