In [2]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer, util
import torch
import pdfplumber

# Step 1: Extract corpus from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined by single spaces.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The page texts concatenated in page order, separated by one space,
        with leading and trailing whitespace stripped. A page that yields no
        text contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Some pdfplumber versions return None (not "") for a page without
        # extractable text; coerce so the join below cannot raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return " ".join(page_texts).strip()

# Step 2: Dense Retriever using Sentence Transformers
retriever_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Load the PDF and process the content
pdf_path = "/content/drive/MyDrive/R-CNN PPT.pdf"  # Replace with your PDF file path
pdf_text = extract_text_from_pdf(pdf_path)
# Split into sentence-like passages on ". " (adjust if necessary). Fragments that
# are empty or whitespace-only are dropped so they are never embedded or returned
# as a retrieved "context".
corpus = [passage for passage in pdf_text.split(". ") if passage.strip()]
if not corpus:
    # Fail here with a clear message instead of later inside the retriever.
    raise ValueError(f"No extractable text found in {pdf_path!r}")

# Step 3: Generate embeddings for the corpus
# One embedding per passage, in the same order as `corpus`, so an index into
# `corpus_embeddings` is also an index into `corpus`.
corpus_embeddings = retriever_model.encode(corpus, convert_to_tensor=True)

# Step 4: Define a function to retrieve relevant passages
def retrieve_passages(question, top_k=2):
    """Return the corpus passages most similar to ``question``.

    The question is embedded with the module-level ``retriever_model`` and
    compared against ``corpus_embeddings`` by cosine similarity.

    Args:
        question: The user's question as a string.
        top_k: Maximum number of passages to return. Values larger than the
            corpus are clamped to the corpus size; values <= 0 yield ``[]``.

    Returns:
        list[str]: Up to ``top_k`` entries of ``corpus``, best match first.
    """
    # torch.topk raises if k exceeds the size of the dimension, so clamp it.
    k = min(top_k, len(corpus))
    if k <= 0:
        return []

    question_embedding = retriever_model.encode(question, convert_to_tensor=True)
    # cos_sim returns a (1, len(corpus)) matrix for a single question.
    scores = util.cos_sim(question_embedding, corpus_embeddings)
    top_results = torch.topk(scores, k=k, dim=1)

    top_indices = top_results.indices[0].tolist()
    return [corpus[idx] for idx in top_indices]

# Step 5: BERT Reader for Question Answering
# Load the extractive QA checkpoint and its matching tokenizer once at module level;
# `tokenizer` and `qa_model` are the globals that get_answer() reads.
# NOTE: loading this checkpoint logs that 'bert.pooler.dense.*' weights are not used
# by BertForQuestionAnswering (see the cell output).
qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)

# Define a function to get answers
def get_answer(question, passage):
    """Extract the answer span for ``question`` from ``passage``.

    The pair is encoded with the module-level ``tokenizer`` and scored by
    ``qa_model``. The returned span is the (start, end) token pair with the
    highest combined start + end logit, subject to start <= end.

    Args:
        question: The question text.
        passage: The context text the answer is extracted from.

    Returns:
        str: The decoded answer tokens with special tokens removed. This can be
        empty if the best span covers only special tokens.
    """
    inputs = tokenizer(question, passage, return_tensors="pt", truncation=True)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    # span_scores[i, j] = score of the span starting at token i and ending at j.
    span_scores = start_logits.unsqueeze(1) + end_logits.unsqueeze(0)
    # Taking argmax of start and end independently can put the end before the
    # start (an empty slice); only the upper triangle holds valid spans.
    valid_spans = torch.triu(torch.ones_like(span_scores)).bool()
    span_scores = span_scores.masked_fill(~valid_spans, float("-inf"))

    # argmax without `dim` indexes the flattened matrix; unravel to (row, col).
    best_flat = int(torch.argmax(span_scores))
    start_idx, end_idx = divmod(best_flat, span_scores.size(1))

    answer_ids = inputs["input_ids"][0][start_idx:end_idx + 1]
    return tokenizer.decode(answer_ids, skip_special_tokens=True)

# Step 6: Integrate Retrieval + Reader
def question_answering_system(question):
    """Run the retrieve-then-read pipeline for one question.

    Passages are fetched with ``retrieve_passages`` and each one is handed to
    ``get_answer`` together with the question.

    Args:
        question: The user's question as a string.

    Returns:
        list[tuple[str, str]]: One ``(answer, passage)`` pair per retrieved
        passage, in retrieval order.
    """
    return [
        (get_answer(question, context), context)
        for context in retrieve_passages(question)
    ]

# Chatbot Loop
# Reads questions until the user types 'exit' (any case, surrounding whitespace
# ignored), the input stream closes, or the kernel is interrupted.
print("Welcome to the Question Answering Chatbot! Type 'exit' to end the chat.")
while True:
    try:
        question = input("\nAsk a question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the chat cleanly, not with a traceback.
        print("\nGoodbye!")
        break

    if not question:
        # Nothing to answer; don't run the retriever and reader on an empty string.
        continue
    if question.lower() == 'exit':
        print("Goodbye!")
        break

    answers = question_answering_system(question)
    print("\nTop Answers:")
    for rank, (answer, passage) in enumerate(answers, start=1):
        print(f"{rank}. Answer: {answer}")
        print(f"   Context: {passage}")


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Welcome to the Question Answering Chatbot! Type 'exit' to end the chat.

Ask a question: what is RCNN?

Top Answers:
1. Answer: region based convolutional neural networks
   Context: REGION BASED CONVOLUTIONAL NEURAL
NETWORKS(R-CNN)
PRESENTED BY:
DHANUSHSHRUTHI S T CONVOLUTIONAL NEURAL NETWORKS:
A Convolutional Neural Network (CNN) is a type of artificial neural network specifically
designed for processing structured grid data, such as images
2. Answer: 
   Context: FASTER R-CNN
➢ Imagine you're playing a game where you need to find hidden objects in a large picture.
Faster R-CNN is like having a super-fast teammate who helps you find the objects quickly
and accurately.
➢ Here's how it works:
1.Scanning Quickly: Your teammate quickly scans the entire picture to spot areas where
objects might be hidden

Ask a question: what is Pooling layer?

Top Answers:
1. Answer: you ' re trying to shrink down the picture while keeping the important parts
   Context: Each
square represents a pixel in