### PROJECT SETUP

In [3]:
!pip install langchain chromadb sentence-transformers openai fastapi uvicorn pymupdf


Defaulting to user installation because normal site-packages is not writeable




In [7]:
!pip install -U langchain-community

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl (2.5 MB)
     ---------------------------------------- 2.5/2.5 MB 2.4 MB/s eta 0:00:00
Collecting dataclasses-json<0.7,>=0.5.7
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0
  Downloading pydantic_settings-2.10.1-py3-none-any.whl (45 kB)
     ---------------------------------------- 45.2/45.2 kB 2.2 MB/s eta 0:00:00
Collecting httpx-sse<1.0.0,>=0.4.0
  Downloading httpx_sse-0.4.1-py3-none-any.whl (8.1 kB)
Collecting marshmallow<4.0.0,>=3.18.0
  Downloading marshmallow-3.26.1-py3-none-any.whl (50 kB)
     ---------------------------------------- 50.9/50.9 kB 2.7 MB/s eta 0:00:00
Collecting typing-inspect<1,>=0.4.0
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting typing-inspection>=0.4.0
  Downloading typing_inspection-0.4.1-py3-none-any.whl (1



In [3]:
!pip install pdfminer.six


Defaulting to user installation because normal site-packages is not writeable
Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
     ---------------------------------------- 5.6/5.6 MB 2.6 MB/s eta 0:00:00
Collecting cryptography>=36.0.0
  Downloading cryptography-45.0.5-cp311-abi3-win_amd64.whl (3.4 MB)
     ---------------------------------------- 3.4/3.4 MB 2.9 MB/s eta 0:00:00
Installing collected packages: cryptography, pdfminer.six
Successfully installed cryptography-45.0.5 pdfminer.six-20250506




### PDF TEXT EXTRACTION AND CLEANING

In [4]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One ``str`` made of each page's ``get_text()`` output, joined with
        no separator between pages.
    """
    # The context manager closes the document even when a page fails to
    # extract; previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-copying it per page.
        return "".join(page.get_text() for page in doc)

# Save raw text
# Extract before opening the output file: open(..., "w") truncates the file
# immediately, so extracting inside the `with` would leave an empty
# bangla_book.txt behind whenever extraction raises.
raw_pdf_text = extract_text_from_pdf("HSC26-Bangla1st-Paper.pdf")
with open("bangla_book.txt", "w", encoding="utf-8") as f:
    f.write(raw_pdf_text)


In [4]:
from pdfminer.high_level import extract_text

# Extract the text of the same PDF with pdfminer.six's high-level helper.
# NOTE(review): `text` is not read by any later cell visible here — the
# chunking step loads bangla_book.txt instead. Confirm whether this result
# was meant to replace the PyMuPDF output or whether the cell can be removed.
text = extract_text("HSC26-Bangla1st-Paper.pdf")


### TEXT CLEANING AND CHUNKING

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the raw text written by the extraction step.
with open("bangla_book.txt", encoding="utf-8", mode="r") as f:
    raw_text = f.read()

# Recursive splitter configured for chunk_size=500 with chunk_overlap=50
# (500 is the author's choice for Bengali text).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
chunks = text_splitter.split_text(raw_text)

# Write every chunk followed by a blank line.
# NOTE(review): a chunk may itself contain a blank line, so a reader that
# splits this file on "\n\n" is not guaranteed to recover the exact chunks.
with open("chunks.txt", mode="w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(f"{chunk}\n\n")


### VECTORIZE AND STORE IN CHROMADB

In [8]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# Multilingual sentence-transformers model used to embed every chunk.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

with open("chunks.txt", "r", encoding="utf-8") as f:
    # chunks.txt ends with a trailing "\n\n", so a plain split() yields an
    # empty last entry; drop blank / whitespace-only entries so that no empty
    # Document is embedded and stored.
    texts = [piece for piece in f.read().split("\n\n") if piece.strip()]

docs = [Document(page_content=text) for text in texts]

# NOTE(review): re-running this cell against an existing ./chroma_db
# presumably adds the same documents again — confirm, and clear the
# directory before rebuilding if so.
db = Chroma.from_documents(docs, embedding, persist_directory="./chroma_db")
# The recorded output shows a warning for persist(); it is kept so the cell
# still writes to disk on installs that require an explicit call.
db.persist()


  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  db.persist()


### HANDLE USER QUERY AND RETRIEVAL

In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Ollama
from langchain.chains import RetrievalQA

# Re-open the persisted vector store with the same embedding model that was
# used to build it.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
db = Chroma(persist_directory="./chroma_db", embedding_function=embedding)

# Retrieve the 3 most similar chunks for each question.
retriever = db.as_retriever(search_kwargs={"k": 3})

qa_chain = RetrievalQA.from_chain_type(
    llm=Ollama(model="mistral",temperature=0),  # use Ollama for local
    retriever=retriever
)

query = input("Enter your question (BN or EN): ")
# invoke() replaces the deprecated Chain.run() (the recorded output shows the
# deprecation warning). RetrievalQA takes its input under "query" and returns
# the answer text under "result".
result = qa_chain.invoke({"query": query})["result"]
print("Answer:", result)


  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
  db = Chroma(persist_directory="./chroma_db", embedding_function=embedding)
  llm=Ollama(model="mistral",temperature=0),  # use Ollama for local
  result = qa_chain.run(query)
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Answer:  In the given context, the word for a man in Sanskrit language is 'पुरुष' (Purush). So, in the language of Anupama, a man would be referred to as 'পুরুষ' (Purush).


In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Load the embedding model and re-open the vector store persisted in
# ./chroma_db; the retriever returns the 3 most similar chunks per query.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
db = Chroma(persist_directory="./chroma_db", embedding_function=embedding)
retriever = db.as_retriever(search_kwargs={"k": 3})

# Custom Bengali prompt that instructs the model to answer only from the
# retrieved context, in Bengali. English gloss of the template:
#   "You are an assistant who must answer using only the provided relevant
#    information. / Context: {context} / Question: {question} /
#    Write the answer in Bengali, based only on the context above:"
# NOTE(review): this is an instruction to the model, not a guarantee — the
# recorded answer further down is not a usable answer.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
তুমি একজন সহকারী যাকে শুধুমাত্র প্রদত্ত প্রাসঙ্গিক তথ্য ব্যবহার করে উত্তর দিতে হবে।

প্রসঙ্গ:
{context}

প্রশ্ন:
{question}

উত্তর শুধুমাত্র উপরের প্রসঙ্গের উপর ভিত্তি করে বাংলায় লিখ:
"""
)

# Local Mistral model served through Ollama; temperature=0 is passed to the
# model.
llm = Ollama(model="mistral", temperature=0)

# Build the retrieval QA chain with the custom prompt.
# return_source_documents=True makes the chain output include
# "source_documents" next to "result" (both are read by the code below).
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template}
)

# Ask question
query = input("Enter your question (BN or EN): ")
# invoke() replaces calling the chain object directly, which is deprecated
# (the recorded output shows the warning). The returned dict has the same
# "result" and "source_documents" keys used below.
result = qa_chain.invoke({"query": query})

# Print the answer, then the first 150 characters of each retrieved chunk so
# the grounding can be checked by eye.
print("\n🧠 Answer:", result["result"])
print("\n📄 Retrieved Context Chunks:")
for doc in result["source_documents"]:
    print("-", doc.page_content[:150], "...\n")


  result = qa_chain(query)



🧠 Answer:  সুপুরুষ অনপুমের ভাষায় 'Rishi' কাকে বলা হয়েছে।

📄 Retrieved Context Chunks:
- (ক) একটি 
 
(খ) দুইটি 
 
(গ) র্তনটি 
 
(ঘ) চািটি 
          উিি: খ 
৭। 'অপর্ির্চতা' গকে'কলযাণী' র্ব্ক কতযকানিকেিিার়্েপকিকিব্কলঅনুপমকেনাককি? [কু. দিা. ...

- যকানর্ব্কিষর্দককিকথাব্লাহক কি? [িা. দিা. ’২২] 
(ক) সািসজ্জা  
(খ) মাজিত সুরূর্চ 
(গ) যসৌন্দেি 
 
(ঘ) উদাসীনতা           উিি: খ
৪। 'অপর্ির্চতা' গকেগেব্ ...

- (গ) মতামকতি িনয 
(ঘ) কূটব্ুর্িি িনয
৮২। 'অপর্ির্চতা' গেটিপ্রথমগ্রন্থভুক্তহ যকানগ্রকন্থ?
(ক) গেগুে 
 
(খ) গেসংগ্রহ  
(গ) গেসিক 
 
(ঘ) গেস্বকে
37
৮৩। অপ ...



### REST API with FastAPI

In [12]:
from fastapi import FastAPI
from pydantic import BaseModel
from query import qa_chain

# FastAPI application object; routes are registered on it via decorators.
app = FastAPI()

# Request body schema for the /ask/ endpoint.
class Query(BaseModel):
    # The user's question text.
    question: str

@app.post("/ask/")
def ask_question(query: Query):
    """Answer a question with the retrieval QA chain.

    Args:
        query: Request body carrying the question text in ``query.question``.

    Returns:
        A dict of the form ``{"answer": <answer text>}``.
    """
    # invoke() replaces the deprecated run(). run() also refuses chains that
    # expose more than one output key (e.g. a chain built with
    # return_source_documents=True); selecting "result" works in both cases.
    result = qa_chain.invoke({"query": query.question})
    return {"answer": result["result"]}


ModuleNotFoundError: No module named 'query'