In [1]:
import re
from pathlib import Path

from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFacePipeline

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline





In [2]:
PDF_FOLDER = Path("pdf_data")

documents = []
for pdf in PDF_FOLDER.glob("*.pdf"):
    loader = PyPDFLoader(str(pdf))
    documents.extend(loader.load())

print("PDFs loaded:", len(documents))
print(documents[0].page_content[:300])


PDFs loaded: 108
1  Battery Management Systems 
Table of Contents        Page Number 
 
1. Introduction          - 2 
2. Battery Management System       - 13 
3. Associated Components of BMS      - 17 
4. Functioning of BMS        - 25 
5. Types of BMS         - 30 
6. Wireless Distributed Battery Management System 


In [3]:
def clean_text(text: str) -> str:
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", text)
    return text.strip()

cleaned_documents = [
    Document(
        page_content=clean_text(doc.page_content),
        metadata=doc.metadata
    )
    for doc in documents
]

print("Cleaning done")
print(cleaned_documents[0].page_content[:300])


Cleaning done
1 Battery Management Systems Table of Contents Page Number 1. Introduction - 2 2. Battery Management System - 13 3. Associated Components of BMS - 17 4. Functioning of BMS - 25 5. Types of BMS - 30 6. Wireless Distributed Battery Management System (wBMS) - 37 7. Adoption of AI technologies in Batter


In [4]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

recursive_chunks = splitter.split_documents(cleaned_documents)
print("Total chunks:", len(recursive_chunks))


Total chunks: 321


In [5]:
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


  embeddings_model = HuggingFaceEmbeddings(


In [6]:
vectorstore = FAISS.from_documents(
    documents=recursive_chunks,
    embedding=embeddings_model
)

print("Indexed chunks:", vectorstore.index.ntotal)


Indexed chunks: 321


In [7]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


In [8]:
model_id = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300
)

llm = HuggingFacePipeline(pipeline=pipe)


Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)


In [9]:
def build_prompt(context: str, question: str) -> str:
    return f"""
You are a technical assistant.

Using ONLY the information in the context, answer the question.
Combine duplicate or similar points into ONE clear answer.
Do NOT repeat sentences.
Do NOT add extra explanations.
If the answer is not present at all, respond with exactly:
"Not found in the provided document."

Context:
{context}

Question:
{question}

Final Answer (one paragraph only):
"""


In [10]:
def clean_answer(answer: str) -> str:
    answer = answer.strip()

    # Remove duplicate lines
    lines = list(dict.fromkeys(answer.splitlines()))
    answer = " ".join(lines)

    # Normalize spaces
    answer = " ".join(answer.split())

    return answer
def ask_question(question: str) -> str:
    docs = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)

    prompt = build_prompt(context, question)
    raw_answer = llm.invoke(prompt)

    return clean_answer(raw_answer)


In [11]:
print(ask_question("What is sensor fusion?"))


Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors


Sensor fusion technology is just like a "coach" because it is capable of kneading sensors with different abilities into a united team of players that can work together and complement each other to win the game(7).


In [12]:
print(ask_question("What is distributed BMS?"))


A distributed BMS incorporates all the electronic hardware on a control bo ard placed directly on the cell or module that is being monitored. This alleviates the bulk of the cabling to a few sensor wires and communication wires between adjacent BMS modules. Consequently, each BMS is more self -contained, and handles computations and communications as required. However, despite this apparent simplicity, this integrated affect the cell, and balances them to ensure the same voltage across cells. It is an embedded system, that has a number of electron ic components on a circuit board. This system comprises of purpose built electronics along with purpose built software to enable a specific applications(13). BMS is responsible for thermal management of the battery and monitors its temperature continuously. If required, BMS can adjust cooling and trigger other safety mechanisms to cease operations and minimize the mechanisms to cease operations and minimize the risks. Importance of BMS: The m

In [13]:
def chatbot():
    """
    Command-line chatbot for document-based question answering.
    Type 'exit' to stop.
    """
    print("PDF Q&A Chatbot is ready.")
    print("Ask questions based on the document. Type 'exit' to quit.\n")

    while True:
        user_question = input("User: ").strip()

        if user_question.lower() in ["exit", "quit", "q"]:
            print("Chatbot: Goodbye!")
            break

        if not user_question:
            print("Chatbot: Please ask a valid question.\n")
            continue

        try:
            answer = ask_question(user_question)
            print(f"\nChatbot: {answer}\n")

        except Exception as e:
            print("Chatbot: An error occurred while processing your question.")
            print(e)


In [14]:
chatbot()


PDF Q&A Chatbot is ready.
Ask questions based on the document. Type 'exit' to quit.

Chatbot: Goodbye!


In [15]:
vectorstore.save_local("faiss_index")
