#Extracting Answers from PDF Documents Based on the Submitted Question

In [None]:
%pip install -U langchain-community pymupdf langchain sentence-transformers chromadb requests sentence-splitter pypdf

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0


In [None]:
#import fitz  # PyMuPDF
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from transformers import pipeline
import requests as re
import re as regex
from pypdf import PdfReader

import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

#from sentence_splitter import SentenceSplitter


# Hugging Face API token.  It is read from the HF_TOKEN environment variable
# so the secret does not have to be pasted into the notebook source; the
# empty-string fallback keeps the previous value when the variable is unset.
import os

api_key = os.environ.get("HF_TOKEN", '')

# Headers for Hugging Face API (passed to is_relevant for every request)
headers = {"Authorization": f"Bearer {api_key}"}

# Define utility functions
def truncate_text(text, max_length=500):
    """Return the leading ``max_length`` characters of ``text``.

    This is a plain slice from the start of the string, so text shorter than
    the limit is returned whole.
    """
    head = text[0:max_length]
    return head

# Paraphrasing model: load the Pegasus checkpoint and its tokenizer, and place
# the model on the GPU when CUDA is available (CPU otherwise).
model_name = 'tuner007/pegasus_paraphrase'
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)


def paraphrase_text(text):
    """Generate a paraphrase of ``text`` with the module-level Pegasus model.

    The input is tokenized with truncation at 60 tokens and generation is
    capped at the same length, so only the beginning of a long passage is
    used.  Decoding samples (``do_sample=True``) over 10 beams, so repeated
    calls may return different strings.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    token_limit = 60  # shared cap for the encoded input and the generated output

    # Encode the text as PyTorch tensors and move them to the model's device.
    encoded = tokenizer(
        text,
        truncation=True,
        padding='longest',
        max_length=token_limit,
        return_tensors="pt",
    )
    encoded = encoded.to(torch_device)

    # Beam search combined with sampling.
    generation_options = {
        "max_length": token_limit,
        "num_beams": 10,
        "temperature": 1.5,
        "do_sample": True,
    }
    generated_ids = model.generate(**encoded, **generation_options)

    # Turn the generated token ids back into text and keep the first sequence.
    candidates = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return candidates[0]

# Function to check if a paragraph is relevant to a query
def is_relevant(query, paragraph, headers, threshold=0.6, timeout=30):
    """Decide whether ``paragraph`` is relevant to ``query``.

    Both texts are shortened with ``truncate_text`` and posted to the Hugging
    Face Inference API endpoint for ``sentence-transformers/all-MiniLM-L6-v2``.
    The paragraph counts as relevant when the first score in the response is
    strictly greater than ``threshold``.

    Args:
        query: The question text (the ``source_sentence`` of the request).
        paragraph: The candidate passage to compare against the query.
        headers: HTTP headers sent with the request (carries the API token).
        threshold: Score the response must exceed for a ``True`` result.
        timeout: Seconds to wait for the API before the request is aborted.

    Returns:
        ``True`` when a numeric score above ``threshold`` comes back;
        ``False`` for any other response (non-JSON body, a non-list payload
        such as an error object, an empty list, or a non-numeric first item).

    Raises:
        requests.exceptions.RequestException: Connection failures and
            timeouts are not swallowed and propagate to the caller.
    """
    API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
    query = truncate_text(query)
    paragraph = truncate_text(paragraph)
    payload = {
        "inputs": {
            "source_sentence": query,
            "sentences": [paragraph]
        }
    }

    # NOTE: in this file ``re`` is the ``requests`` library (``import requests as re``).
    # Without an explicit timeout a stalled server would block this call forever.
    response = re.post(API_URL, headers=headers, json=payload, timeout=timeout)

    try:
        response_json = response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error page); treat it like any
        # other unusable response instead of crashing the whole run.
        return False

    # Anything that is not a non-empty list is handled as "no score".
    if not (response_json and isinstance(response_json, list)):
        return False

    similarity = response_json[0]
    # Only compare real numbers; a string or nested object here would
    # otherwise raise TypeError on the comparison below.
    if isinstance(similarity, bool) or not isinstance(similarity, (int, float)):
        return False
    return similarity > threshold

# Function to clean the extracted text
def clean_text(text):
    """Remove course boilerplate from extracted page text.

    Every occurrence of each pattern below is deleted, one pattern after the
    other in the listed order, and the result is returned with leading and
    trailing whitespace stripped.
    """
    boilerplate_patterns = (
        r"Unit \d+",            # unit section labels
        r"© Copyright.*",       # copyright statements (to end of line)
        r"Figure \d+-\d+.*",    # figure labels (to end of line)
        r"Course materials.*",  # course material notices (to end of line)
        r"1-\d+",               # page numbers
        r"Uempty",              # placeholder text
        r"IBM Training",        # training headers
    )

    cleaned = text
    for expr in boilerplate_patterns:
        cleaned = regex.compile(expr).sub('', cleaned)
    return cleaned.strip()

# Open the course PDF with pypdf.
pdf_path = "/content/Course_exercices - SABSQ3-Big Data Engineer 2021-BigSQL.pdf"
pdf_document = PdfReader(pdf_path)

# Build one Document per page: the cleaned page text, plus the source path and
# a 1-based page number as metadata.
documents = []
for page_number, pdf_page in enumerate(pdf_document.pages, start=1):
    cleaned_page_text = clean_text(pdf_page.extract_text())
    page_metadata = {"source": pdf_path, "page": page_number}
    documents.append(Document(page_content=cleaned_page_text, metadata=page_metadata))

# Embedding model used to index the chunks.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Break every page Document into chunks (chunk_size=1000, no overlap).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Index the chunks in a Chroma vector store.
db1 = Chroma.from_documents(documents=docs, embedding=embedding_function)



# Example questions to run against the indexed PDF.
queries = [
    "What is Db2 Big SQL?",
    "What is Data poisoning?",
    "Why Hadoop?"
]

separator = "-"*100

# Answer each question from the first search hit only.
for query in queries:
    results = db1.similarity_search(query)

    if not results:
        print(f"No results found for question: {query}")
        continue

    # Only the first hit is ever examined: it is either accepted as the
    # answer or the question is reported as unanswered.  (The previous inner
    # loop broke out unconditionally on its first iteration, so this is the
    # same behaviour written explicitly.)
    result = results[0]
    extracted_paragraph = result.page_content
    source = result.metadata.get("source")
    page = result.metadata.get("page")

    if not is_relevant(query, extracted_paragraph, headers):
        print(separator)
        # Same header format as the success path below (the old message
        # printed a stray comma: "Submitted Question: , <query>").
        print("Submitted Question: ", query)
        print(separator)
        print(f"Answer not found for question: {query}")
        continue

    # Paraphrase the matching paragraph to produce the answer text.
    summarized = paraphrase_text(extracted_paragraph)

    print(separator)
    print("Submitted Question: ", query)
    print(separator)
    print(f"Answer: {summarized}\n")
    print(separator)
    print(f"Extracted Paragraph: {extracted_paragraph}\n")
    print(separator)
    print(f"Source: {source}\n")
    print(f"Page: {page}\n")
    print(separator)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


----------------------------------------------------------------------------------------------------
Submitted Question:  What is Db2 Big SQL?
----------------------------------------------------------------------------------------------------
Answer: IBM Db2 Bi g is a hybrid database.

----------------------------------------------------------------------------------------------------
Extracted Paragraph: IBM Db2 Big SQL is a hybrid SQL on Hadoop engine. It delivers a dvanced, scalable, and 
security-rich data querying for the enterprise business. Db2 Bi g SQL delivers massive parallel 
processing (MPP) and advanced data query. Db2 Big SQL offers a single database connection or 
query for disparate sources such as Hadoop HDFS and WebHDFS, RD MS, NoSQL databases and 
object stores. It provides low l atency, high performance, secur ity, SQL compatibility and federation 
capabilities to do ad hoc and complex queries.
Data scientists and analysts can  reuse their SQL skills, saving  the t