In [9]:
# Import necessary libraries
import fitz  # PyMuPDF
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# Load SentenceTransformer model for embedding
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load BERT model for question answering
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
bert_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

def extract_text_from_pdf(pdf_path):
    """
    Extracts all text from a PDF file.
    """
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()  # Extract text from each page
    return text

def preprocess_pdf_text(pdf_text, chunk_size=512):
    """
    Splits the PDF text into chunks of specified size.
    """
    chunks = []
    words = pdf_text.split()
    chunk = []
    for word in words:
        chunk.append(word)
        if len(" ".join(chunk)) > chunk_size:
            chunks.append(" ".join(chunk))
            chunk = []
    if chunk:
        chunks.append(" ".join(chunk))  # Add any remaining text
    return chunks

def generate_embeddings(document_chunks):
    """
    Generates embeddings for each document chunk.
    """
    return embedding_model.encode(document_chunks)

def create_faiss_index(doc_embeddings):
    """
    Creates a FAISS index for similarity search.
    """
    dimension = doc_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # L2 distance index
    index.add(np.array(doc_embeddings))   # Add embeddings to the index
    return index

def retrieve_documents(query, index, document_chunks, top_k=5):
    """
    Retrieves the most relevant document chunks based on a query.
    """
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding), top_k)
    selected_chunks = [document_chunks[i] for i in indices[0]]

    # Combine the chunks into a single context
    context = " ".join(selected_chunks)

    return context

def split_long_context(context, max_length=512):
    """
    Splits a long context into smaller chunks that fit within the max token length.
    """
    tokens = bert_tokenizer.encode(context)
    chunks = []

    for i in range(0, len(tokens), max_length - 2):  # -2 for special tokens
        chunk = tokens[i:i + max_length - 2]
        chunks.append(chunk)

    return chunks

def prepare_input_for_bert(question, context, max_length=512):
    """
    Prepares input for the BERT model, truncating the context if necessary.
    """
    encoded = bert_tokenizer.encode_plus(
        question, context,
        add_special_tokens=True,
        return_tensors="pt",
        max_length=max_length,  # Truncate to the max token limit
        truncation=True,
        padding='max_length'
    )

    return encoded

def generate_answer_with_bert(question, context, max_length=512):
    """
    Generates an answer using BERT based on the question and context.
    """
    encoded = prepare_input_for_bert(question, context, max_length)
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    outputs = bert_model(input_ids, attention_mask=attention_mask)

    # Extract start and end logits
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # Get the start and end positions for the answer (Top 3 instead of just the highest)
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    # Extract the answer tokens and decode them
    answer_tokens = input_ids[0][start_index:end_index + 1]
    answer = bert_tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer

def generate_answer_with_bert_multiple_chunks(question, context, max_length=512):
    """
    Generate an answer by using multiple chunks if the context is too long.
    """
    chunks = split_long_context(context, max_length)
    answers = []

    for chunk in chunks:
        chunk_text = bert_tokenizer.decode(chunk, skip_special_tokens=True)
        answer = generate_answer_with_bert(question, chunk_text, max_length)
        answers.append(answer)

    # Combine all answers, prioritizing the most relevant one
    return max(answers, key=len)  # You can adjust this logic to rank answers by relevance

def generate_answer(query, index, document_chunks):
    relevant_docs = retrieve_documents(query, index, document_chunks, top_k=5)
    print("Context for answering the question:")
    print(relevant_docs)  # Check the combined context

    # Handle long context by splitting if needed
    answer = generate_answer_with_bert_multiple_chunks(query, relevant_docs)

    return answer

def chatbot(pdf_path):
    # Extract text from the PDF
    pdf_text = extract_text_from_pdf(pdf_path)
    document_chunks = preprocess_pdf_text(pdf_text)

    # Generate embeddings for the document chunks
    doc_embeddings = generate_embeddings(document_chunks)

    # Create the FAISS index for similarity search
    index = create_faiss_index(doc_embeddings)

    while True:
        user_input = input("Ask me anything: ")
        if user_input.lower() == "exit":
            print("Goodbye!")
            break
        answer = generate_answer(user_input, index, document_chunks)
        print(f"Answer: {answer}")

# Run the chatbot with a PDF document
chatbot("/content/drive/MyDrive/R-CNN PPT.pdf")


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Ask me anything: what is RCNN?


Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


Context for answering the question:
REGION BASED CONVOLUTIONAL NEURAL NETWORKS(R-CNN) PRESENTED BY: DHANUSHSHRUTHI S T CONVOLUTIONAL NEURAL NETWORKS: A Convolutional Neural Network (CNN) is a type of artificial neural network specifically designed for processing structured grid data, such as images. CNNs are particularly effective in tasks like image recognition, classification, object detection, and segmentation. CONVOLUTIONAL LAYER: ➢Imagine you have a flashlight and you're shining it over a wall covered in stickers. Each sticker has a number or prediction based on the information processed by the neural network. ➢In regular Convolutional Neural Networks (CNNs), the network looks at the whole picture to detect objects. It's like trying to find a specific item in a messy room by scanning the entire room at once. ➢Now, in Region-based Convolutional Neural Networks (R-CNNs), the network is smarter. Instead of looking at the whole picture all at once, it focuses on specific regions where

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Context for answering the question:
parts. So, you take a small window and move it across the picture. In each window, you pick the biggest number (max pooling) or calculate the average of all the numbers (average pooling). Then, you keep only the biggest number or the average and throw away the rest. ➢For example, if you're doing max pooling and your window covers four squares with values 3, 6, 2, and 8, you'd keep only the 8 and throw away the others. This process helps reduce the size of the picture while preserving the most important features. ROI pooling does this by taking the maximum value in each resized region. For example, if you're resizing a 5x5 region, you'd pick the biggest number in that area. 4.Output: Finally, you end up with a set of fixed-size regions, each summarized by a single value. These values represent the most important features of the original regions. ➢So, ROI pooling is like standardizing and summarizing different-sized regions in images, making them easie