In [2]:
# Install required libraries
!pip install -q PyPDF2 langchain sentence-transformers faiss-cpu transformers torch
!pip install -U langchain-community
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import pipeline
from google.colab import files
import io
import gc
import numpy as np
import torch



In [14]:
# Step 1: Ask user to upload a PDF
print("Please upload a PDF file.")
uploaded = files.upload()  # mapping of uploaded file name -> file content

# If nothing was uploaded (e.g. the dialog was cancelled) fail with a clear
# message instead of an IndexError/StopIteration on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a PDF file.")

# Get the uploaded PDF file (only the first entry is used if several were uploaded)
pdf_file = next(iter(uploaded))
pdf_content = uploaded[pdf_file]

# Step 2: Extract text from the PDF
def extract_text_from_pdf(pdf_content):
    """Extract the text of every page of a PDF held in memory.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        A single string with the text of each page, pages separated by a
        newline. Pages for which ``extract_text()`` returns nothing are
        skipped, so the result is ``""`` when no page yields any text.
    """
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    # Join with a newline so the last word of one page is not glued to the
    # first word of the next page (which would corrupt the text at every
    # page boundary before it is chunked).
    return "\n".join(page_text for page_text in page_texts if page_text)

pdf_text = extract_text_from_pdf(pdf_content)
# A PDF without a text layer (e.g. an image-only scan) yields no text. Stop
# here with a clear message rather than reporting success and failing later
# when an empty document produces no chunks and no vector store.
if not pdf_text.strip():
    raise ValueError("No extractable text was found in the uploaded PDF.")
print("PDF text extracted successfully.")
del pdf_content  # Free memory
gc.collect()

# Step 3: Prepare RAG - cut the extracted text into overlapping chunks.
# Lengths are measured with len(), i.e. in characters: chunks of at most 500
# with an overlap of 100 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=100,
    chunk_size=500,
)
chunks = text_splitter.split_text(pdf_text)

# The full text is not used again once it has been chunked; release it.
del pdf_text
gc.collect()

# Step 4: Create embeddings and vector store with batch processing
# Without chunks the loop below never runs and vector_store would stay None,
# which only surfaces later as an AttributeError when the retriever is built.
if not chunks:
    raise ValueError("No text chunks to index; the PDF produced no extractable text.")

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
batch_size = 100  # number of chunks embedded per call, to bound peak memory
vector_store = None
for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]
    if vector_store is None:
        # The first batch creates the index.
        vector_store = FAISS.from_texts(batch, embeddings)
    else:
        # Later batches are embedded and appended to the existing index
        # directly; no temporary index has to be built and merged.
        vector_store.add_texts(batch)
    gc.collect()
print("Vector store created successfully.")

# Step 5: Set up a smaller language model for answer generation.
# device is 0 when CUDA is available and -1 otherwise.
llm_pipeline = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
    max_length=150,
    device=-1 if not torch.cuda.is_available() else 0,
)
# Wrap the transformers pipeline so it can be passed to LangChain as an LLM.
llm = HuggingFacePipeline(pipeline=llm_pipeline)

# Step 6: Create the RAG chain.
# The retriever is asked for k=3 results per query; the "stuff" chain type is
# used and the source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    return_source_documents=True,
    retriever=vector_store.as_retriever(search_kwargs=dict(k=3)),
)


Please upload a PDF file.


Saving Brain and Cognitive Science Club Recruitment 2025.pdf to Brain and Cognitive Science Club Recruitment 2025 (14).pdf
PDF text extracted successfully.
Vector store created successfully.


Device set to use cpu


In [15]:
# Step 7: Interactive question-answering loop
print("PDF-Chatbot is ready! Ask questions about the PDF content (type 'exit' to stop).")
while True:
    # Strip once up front so surrounding whitespace affects neither the
    # empty-input check, the exit check (" exit " now exits), nor the query
    # that is sent to the chain.
    query = input("Your question: ").strip()
    if not query:
        print("Please enter a valid question.")
        continue
    if query.lower() == "exit":
        print("Exiting PDF-Chatbot.")
        break
    # Use invoke(): calling the chain object directly is deprecated in LangChain.
    result = qa_chain.invoke({"query": query})
    answer = result["result"]
    print(f"Answer: {answer}\n")

PDF-Chatbot is ready! Ask questions about the PDF content (type 'exit' to stop).
Your question: what is a corticon?
Answer: a digital virus capable of disrupting the brain’s natural balance between excitatory and inhibitory neurons

Your question: What is Dance of the Planets?
Answer: It’s the dawn of the Age of Space Exploration and in a distant solar system, two planetary bodies are Space Exploration and in a distant solar system, two planetary bodies are locked in a cosmic dance, bound by Newton’s law of gravitation. Your task is to learn of this two-body system using a Neural Network that not only predicts their motion over time, but does so while respecting the governing laws of physics — no numerical solvers allowed. Environment Setup and deep beneath the sands is the Archive Vault , containing 2,278 relic files :  2,244 high  res 

Your question: How to measure boredom in Mice?
Answer: By detecting behavioral signatures in trial via subjective interpretation, but by detecting be