In [None]:
import PyPDF2
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import numpy as np

# Step 1: Extract text from PDF files
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Location of the PDF; passed straight to
            ``PyPDF2.PdfReader``.

    Returns:
        One string holding the text extracted from each page. No separator
        is inserted between pages.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # Join once instead of growing a string with += per page. The `or ""`
    # keeps a page that yields no text from breaking the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Load the source PDF; its full text is the single knowledge-base entry
pdf1_text = extract_text_from_pdf("/content/QA_System_Dense_Retrieval_BERT.pdf")
knowledge_base = [pdf1_text]

# Step 2: Dense Retrieval using Sentence-BERT
# Every knowledge-base entry is embedded once up front, so answering a
# question later only requires encoding the question itself.
model = SentenceTransformer('all-MiniLM-L6-v2')
document_embeddings = model.encode(knowledge_base, convert_to_tensor=True)

# Step 3: BERT for Question Answering
qa_pipeline = pipeline(
    'question-answering',
    model='bert-large-uncased-whole-word-masking-finetuned-squad',
)

# Step 4: Answer a Question
def get_answer(question):
    """Answer *question* using the best-matching knowledge-base document.

    The question is embedded with the module-level Sentence-BERT ``model``
    and scored by cosine similarity against ``document_embeddings``. The
    highest-scoring entry of ``knowledge_base`` is then handed to
    ``qa_pipeline`` as the context to extract the answer from.

    Args:
        question: The natural-language question to answer.

    Returns:
        A ``(answer, context)`` tuple: the ``'answer'`` field of the QA
        pipeline result, and the full text of the document that was used
        as context.
    """
    # Encode the query into the same embedding space as the documents
    query_embedding = model.encode(question, convert_to_tensor=True)

    # cos_sim yields a 2-D score matrix; row 0 holds the scores for this
    # (single) query against every document.
    scores = util.cos_sim(query_embedding, document_embeddings)[0]

    # Use the tensor's own argmax rather than np.argmax: the NumPy route
    # depends on converting the tensor to an array, which is not possible
    # for tensors living on a GPU. int() gives a plain list index.
    top_result_idx = int(scores.argmax())
    context = knowledge_base[top_result_idx]

    # Use BERT QA pipeline to get the answer (documented keyword form)
    result = qa_pipeline(question=question, context=context)
    return result['answer'], context

# Example query: run the retrieve-then-read pipeline once
question = "What is dense retrieval?"
answer, context = get_answer(question)

# Report the result; only the first 800 characters of the context are shown
print(
    f"Question: {question}",
    f"Answer: {answer}",
    f"Context: {context[:800]}...",
    sep="\n",
)
