In [25]:
!pip install torch transformers faiss-cpu PyPDF2



In [26]:
import PyPDF2
from transformers import BertTokenizer, BertModel, BertForQuestionAnswering
import torch
import faiss

# Load BERT Tokenizer and Model for Dense Retrieval (Embedding)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Define function to get embeddings for a document (or corpus)
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean Pooling to get a single vector per document
    return embeddings

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Function to get the answer from a context using QA model
def get_answer(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_scores, end_scores = outputs.start_logits, outputs.end_logits
    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores)
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs.input_ids[0][start_idx:end_idx+1]))
    return answer

# Load BERT QA Model (Pre-trained)
qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Function to process PDF and answer questions
def query_pdf(pdf_path, question):
    # Extract text from PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    # Split text into chunks for retrieval (if needed, can split by sentences or paragraphs)
    corpus = [pdf_text]  # If your PDF is small, you can keep it as a single document

    # Get embeddings for the corpus
    corpus_embeddings = get_embeddings(corpus)

    # Build FAISS index for fast retrieval
    faiss_index = faiss.IndexFlatL2(corpus_embeddings.shape[1])  # Use L2 distance for cosine similarity
    faiss_index.add(corpus_embeddings.numpy())  # Add embeddings to FAISS index

    # Query the system (perform dense retrieval)
    query_embedding = get_embeddings([question])
    D, I = faiss_index.search(query_embedding.numpy(), k=1)  # k=1 retrieves the most relevant document

    # Extract the answer using the most relevant document
    retrieved_doc = corpus[I[0][0]]
    answer = get_answer(question, retrieved_doc)

    return retrieved_doc, answer


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
# Interactive Chatbot
print("Chatbot is ready! Ask a question related to the PDF. Type 'exit' to quit.")
while True:
    question = input("You: ")
    if question.lower() == 'exit':
        print("Exiting the chatbot.")
        break

    # Set the path to your PDF file
    pdf_path = "/content/Title_ Robotics and Future Advancements in AI Subtitle_ Exploring the Role of Robotics in AI Evolution (1).pdf"  # Replace with your PDF path
    retrieved_doc, answer = query_pdf(pdf_path, question)

    print(f"Retrieved Document: {retrieved_doc[:500]}...")  # Print first 500 chars of the document for context
    print(f"Answer: {answer}")

Chatbot is ready! Ask a question related to the PDF. Type 'exit' to quit.
You: what are the types of robots?
Retrieved Document: Title: Robotics and Future Advancements in AI 
Subtitle : Exploring the Role of Robotics in AI Evolution 
Introduction to Robotics 
Definition : Robots are machines that can perform tasks autonomously or semi-autonomously. 
Applications : Manufacturing, healthcare, logistics, military, and space exploration. 
Types of Robots :
Industrial Robots : Used in assembly lines and manufacturing. 
Service Robots : Used for personal assistance, cleaning, etc. 
Autonomous Vehicles : Self-driving cars, dron...
Answer: industrial robots
You: what are the current trends of industries and robots?
Retrieved Document: Title: Robotics and Future Advancements in AI 
Subtitle : Exploring the Role of Robotics in AI Evolution 
Introduction to Robotics 
Definition : Robots are machines that can perform tasks autonomously or semi-autonomously. 
Applications : Manufacturing, healthca