In [None]:
# Mount Google Drive at /content/drive; later cells read and write files
# under /content/drive/MyDrive/dataset. Requires the google.colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install tiktoken



In [None]:
# Path of the source document on the mounted Drive.
file_path = '/content/drive/MyDrive/dataset/HSC26 Bangla 1st paper.md'

# Read the whole file into memory as UTF-8 text; `full_text` is the input
# handed to chunk_text() in a later cell.
with open(file_path, 'r', encoding='utf-8') as f:
    full_text = f.read()


In [None]:
import tiktoken

# Module-level tokenizer shared by tokenize()/detokenize(): the tiktoken
# "cl100k_base" encoding (noted by the author as the GPT-3.5/4 tokenizer).
tokenizer = tiktoken.get_encoding("cl100k_base")

def tokenize(text):
    """Encode ``text`` with the module-level ``tokenizer`` and return the result."""
    token_ids = tokenizer.encode(text)
    return token_ids

def detokenize(tokens):
    """Decode ``tokens`` back to text with the module-level ``tokenizer``."""
    decoded_text = tokenizer.decode(tokens)
    return decoded_text

# Hybrid Chunker with paragraph and punctuation boundaries
def chunk_text(text, chunk_size=500, chunk_overlap=50, count_tokens=None):
    """
    Split ``text`` into chunks of at most roughly ``chunk_size`` tokens.

    The text is first cut into segments at sentence punctuation (``।``, ``!``,
    ``?`` followed by whitespace) and at blank lines. Segments are packed
    greedily into a chunk until adding the next one would exceed
    ``chunk_size``; the chunk is then emitted and the next chunk starts with
    an overlap taken from the end of the previous one.

    The overlap is made of whole trailing words, never of a raw token slice:
    slicing token ids and decoding them can cut a multi-byte character in
    half, which shows up as U+FFFD or a dangling combining mark at the start
    of the next chunk.

    Args:
        text (str): Text to split.
        chunk_size (int): Token budget per chunk. A single segment larger
            than the budget is still emitted whole.
        chunk_overlap (int): Maximum number of tokens of trailing words to
            repeat at the start of the next chunk. ``0`` (or less) disables
            overlap.
        count_tokens (callable, optional): ``str -> int`` token counter.
            Defaults to ``len(tokenize(s))`` using this notebook's tokenizer.

    Returns:
        list[str]: The chunks, in document order.
    """
    import re

    if count_tokens is None:
        # Resolved lazily so a custom counter works without the tokenizer.
        def count_tokens(piece):
            return len(tokenize(piece))

    def _overlap_tail(chunk):
        """Return the longest run of trailing words within the overlap budget."""
        if chunk_overlap <= 0:
            return ''
        tail_words = []
        for word in reversed(chunk.split()):
            candidate = ' '.join([word] + tail_words)
            if count_tokens(candidate) > chunk_overlap:
                break
            tail_words.insert(0, word)
        return ' '.join(tail_words)

    raw_splits = re.split(r'(?<=[।!?])\s+|\n{2,}', text)

    chunks = []
    current_chunk = []
    current_tokens = 0

    for segment in raw_splits:
        segment = segment.strip()
        if not segment:
            continue
        segment_tokens = count_tokens(segment)

        if current_tokens + segment_tokens <= chunk_size:
            current_chunk.append(segment)
            current_tokens += segment_tokens
            continue

        if not current_chunk:
            # A lone segment that already exceeds the budget: emit it as-is.
            chunks.append(segment)
            continue

        # Budget exceeded: flush the chunk and seed the next one with overlap.
        chunk = ' '.join(current_chunk)
        chunks.append(chunk)
        tail = _overlap_tail(chunk)
        if tail:
            current_chunk = [tail, segment]
            current_tokens = count_tokens(tail) + segment_tokens
        else:
            current_chunk = [segment]
            current_tokens = segment_tokens

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


In [None]:
# Chunk the full document; `chunks` is reused by the export cell below.
chunks = chunk_text(full_text, chunk_size=500, chunk_overlap=50)
print(f"✅ Total Chunks Created: {len(chunks)}")

# Preview the first three chunks, showing at most 500 characters of each.
for i, chunk in enumerate(chunks[:3]):
    print(f"\n--- Chunk {i+1} ---\n{chunk[:500]}")


✅ Total Chunks Created: 133

--- Chunk 1 ---
🎯 শিক্ষাফল ✔ নির্যাতিত ব্যক্তির বাইরে বিত্তশালী হয়ে ওঠার ফলে সমাজে পরিচয় সংকট সম্পর্কে ধারণা লাভ করবে। ✔ তৎকালীন সমাজ-সভ্যতা ও মানবতার অবমাননা সম্পর্কে জানতে পারবে। ✔ তৎকালীন সমাজের প্রধানত কুসংস্কার সম্পর্কে জানতে পারবে। ✔ তৎকালীন সমাজে ভদ্রলোকের স্বভাববৈশিষ্ট্য সম্পর্কে জ্ঞানলাভ করবে। ✔ নারী কোমল ঠিক, কিন্তু দুর্বল নয় \- কল্যাণী জীবনচরিতে দ্বারা প্রতিস্থাপিত এই সত্য অনুধাবন করতে পারবে।

--- Chunk 2 ---
া প্রতিস্থাপিত এই সত্য অনুধাবন করতে পারবে। ✔ মানুষ আশা নিয়ে বেঁচে থাকে \- অনুপ্রেরণার দৃষ্টিতে মানবজীবনের এই চিত্রের সত্যদর্শন সম্পর্কে জ্ঞানলাভ করবে। 📘 প্রাক্‌-মূল্যায়ন ১। অনুপমের বাবা কী করে জীবিকা নির্বাহ করতেন? খ) ওকালতি ২। মায়ের ভাষা দেবতার প্রাধান্য এডভার্ট বলার কারণ, তার- গ) বিচক্ষণতা নিচের অনুচ্ছেদটি পড়ে ৩ ও ৪ নম্বর প্রশ্নের উত্তর দাও। পিঁপড়ির দীপু চাচা ছিলেন পবিরের কর্ত্তা।

--- Chunk 3 ---
� পিঁপড়ির দীপু চাচা ছিলেন পবিরের কর্ত্তা। দীপু শিক্ষিত হলেও তার সিদ্ধান্ত নেওয়ার ক্ষমতা ছিল না। চাচা তার বিয়ের উদ্যোগ নিলে যৌতুক নিয়

In [None]:
# Write every chunk to a text file on Drive. Each chunk is followed by a line
# containing only '---', which acts as the separator between chunks.
output_path = '/content/drive/MyDrive/dataset/HSC26_chunks.txt'
with open(output_path, 'w', encoding='utf-8') as f:
    for chunk in chunks:
        # strip() removes surrounding whitespace so the separator starts on its own line.
        f.write(chunk.strip() + '\n---\n')
print("Chunks saved to:", output_path)


Chunks saved to: /content/drive/MyDrive/dataset/HSC26_chunks.txt


In [None]:
!pip install langchain langchain-community langchain-chroma sentence-transformers chromadb transformers torch accelerate langchain-huggingface requests
import pandas as pd
import numpy as np
import json
import requests
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms.base import LLM
from typing import List, Optional, Any
import os



Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-0.2.5-py3-none-any.whl.metadata (1.1 kB)
Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.m

In [None]:
class HuggingFaceLLM(LLM):
    """
    Custom LLM wrapper for the Hugging Face Inference API (free tier).

    The prompt is posted to the hosted model selected by ``model_name``. If
    the request fails, returns a non-200 status, or yields no usable text,
    the wrapper falls back to a keyword lookup over the prompt's ``Context:``
    section, so ``_call`` always returns a string.

    Attributes:
        model_name: Hugging Face model id the endpoint URL is built from.
        api_url: Inference endpoint derived from ``model_name``.
    """
    model_name: str = "microsoft/DialoGPT-medium"
    api_url: str = ""

    def __init__(self, model_name: str = "microsoft/DialoGPT-medium"):
        # NOTE(review): _call previously ignored this setting and always hit
        # google/flan-t5-large; pass that id here if it is the intended model.
        super().__init__()
        self.model_name = model_name
        self.api_url = f"https://api-inference.huggingface.co/models/{model_name}"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """
        Send ``prompt`` to the configured inference endpoint.

        Args:
            prompt: Full prompt text to complete.
            stop: Accepted for interface compatibility; not applied.
            run_manager: Accepted for interface compatibility; not used.

        Returns:
            The generated text, or the result of the local keyword fallback
            when the API gives no usable answer.
        """
        headers = {"Content-Type": "application/json"}
        # Optional token from the environment; never hard-code credentials.
        token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
        if token:
            headers["Authorization"] = f"Bearer {token}"

        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": 150,
                "temperature": 0.1,
                "return_full_text": False
            }
        }

        try:
            # A timeout keeps a stalled endpoint from hanging the notebook.
            response = requests.post(
                self.api_url, headers=headers, json=payload, timeout=30
            )
            if response.status_code == 200:
                generated = self._parse_generated_text(response.json())
                if generated:
                    return generated
        except Exception as e:
            print(f"API Error: {e}")

        # Non-200 status, empty/unknown payload, or request error.
        return self._simple_answer_extraction(prompt)

    @staticmethod
    def _parse_generated_text(result: Any) -> str:
        """Pull ``generated_text`` out of a list-of-dicts or dict response body."""
        if isinstance(result, list) and len(result) > 0:
            result = result[0]
        if isinstance(result, dict):
            text = result.get('generated_text', '')
            if isinstance(text, str):
                return text.strip()
        return ""

    def _simple_answer_extraction(self, prompt: str) -> str:
        """
        Fallback: return the first context line sharing a word with the question.

        The prompt must contain a ``Context:`` section followed by a
        ``Question:`` section. Only the first line after ``Question:`` is
        treated as the question, so instructions that follow it in the prompt
        do not contribute keywords. Matching is by substring, on question
        words longer than two characters; the hit is truncated to 100
        characters.
        """
        not_found = "তথ্য পাওয়া যায়নি"

        context_marker = prompt.find("Context:")
        if context_marker == -1:
            return not_found
        question_marker = prompt.find("Question:", context_marker)
        if question_marker == -1:
            return not_found

        context = prompt[context_marker + len("Context:"):question_marker].strip()
        question_block = prompt[question_marker + len("Question:"):].strip()
        question = question_block.split('\n', 1)[0].strip()

        keywords = [word for word in question.split() if len(word) > 2]
        if not keywords:
            return not_found

        for line in context.split('\n'):
            candidate = line.strip()
            if candidate and any(word in candidate for word in keywords):
                return candidate[:100]

        return not_found

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM implementation."""
        return "huggingface"

In [None]:
class BilingualRAGPipeline:
    """
    Retrieval pipeline over a pre-chunked text file (Bengali/English).

    ``setup()`` loads the chunks, embeds them into a persisted Chroma vector
    store and builds a RetrievalQA chain. ``query()`` retrieves the top
    matching chunks and derives an answer from them with rule-based
    extraction.
    """

    def __init__(self, file_path: str):
        """
        Initialize the bilingual RAG pipeline

        Args:
            file_path (str): Path to the text file containing chunked data
        """
        self.file_path = file_path
        self.vectorstore = None
        self.retriever = None
        self.qa_chain = None

        print("Initializing multilingual embeddings...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            model_kwargs={'device': 'cpu'}
        )

        print("Initializing language model...")
        self.llm = HuggingFaceLLM()

    @staticmethod
    def _split_chunks(content: str) -> List[str]:
        """
        Split file content into stripped, non-empty chunk strings.

        The most specific separator present wins: a line containing only
        ``---`` (the separator the chunk-export cell writes), then blank
        lines, then single newlines.
        """
        for delimiter in ('\n---\n', '\n\n'):
            if delimiter in content:
                pieces = content.split(delimiter)
                break
        else:
            pieces = content.split('\n')
        return [piece.strip() for piece in pieces if piece.strip()]

    def load_and_prepare_documents(self) -> List[Document]:
        """
        Load the text file and convert chunks to LangChain Document format

        Chunks of 10 characters or fewer are skipped. Read errors are
        reported on stdout and produce an empty list.

        Returns:
            List[Document]: List of Document objects
        """
        print(f"Loading documents from: {self.file_path}")

        try:
            with open(self.file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except FileNotFoundError:
            print(f"File not found: {self.file_path}")
            return []
        except Exception as e:
            print(f"Error loading file: {e}")
            return []

        documents = [
            Document(
                page_content=chunk,
                metadata={
                    'chunk_id': idx,
                    'source': f"chunk_{idx}",
                    'length': len(chunk)
                }
            )
            for idx, chunk in enumerate(self._split_chunks(content))
            if len(chunk) > 10
        ]

        print(f"Loaded {len(documents)} document chunks")
        return documents

    def create_vectorstore(self, documents: List[Document]):
        """
        Create Chroma vectorstore from documents

        Documents are stored under an id derived from their content, so
        running this again against the same persist directory does not pile
        up duplicate entries for identical chunks.

        Args:
            documents (List[Document]): List of documents to vectorize
        """
        import hashlib

        print("Creating vector store...")

        if not documents:
            print("No documents to process!")
            return

        # Ids must be unique within one batch, so identical chunks collapse
        # to a single entry (first occurrence kept).
        docs_by_id = {}
        for doc in documents:
            doc_id = hashlib.sha1(doc.page_content.encode('utf-8')).hexdigest()
            docs_by_id.setdefault(doc_id, doc)

        # Creating Chroma vectorstore
        self.vectorstore = Chroma.from_documents(
            documents=list(docs_by_id.values()),
            embedding=self.embeddings,
            ids=list(docs_by_id.keys()),
            persist_directory="/content/chroma_db"
        )

        self.retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 3}
        )

        print("Vector store created successfully!")

    def setup_qa_chain(self):
        """
        Set up the question-answering chain with custom prompt template
        """
        print("Setting up QA chain...")

        prompt_template = """Based on the following context, answer the question concisely.

Context:
{context}

Question: {question}

Provide a direct, short answer. If the question is in Bengali, answer in Bengali. If in English, answer in English.

Answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.retriever,
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True
        )

        print("QA chain setup complete!")

    def query(self, question: str) -> dict:
        """
        Query the RAG pipeline with enhanced answer extraction

        Args:
            question (str): Question to ask

        Returns:
            dict: Answer and source documents

        Raises:
            ValueError: If the pipeline has not been set up yet.
        """
        if not self.qa_chain:
            raise ValueError("QA chain not initialized. Run setup() first.")

        print(f"Processing query: {question}")

        try:
            # invoke() is the supported retriever entry point;
            # get_relevant_documents() is deprecated.
            relevant_docs = self.retriever.invoke(question)

            # NOTE(review): the answer comes from rule-based extraction over
            # the retrieved chunks; self.qa_chain is built but not called here.
            answer = self._extract_answer_from_context(question, relevant_docs)

            return {
                "question": question,
                "answer": answer,
                "source_documents": relevant_docs
            }
        except Exception as e:
            print(f"Error processing query: {e}")
            return {
                "question": question,
                "answer": "উত্তর খুঁজে পাওয়া যায়নি",
                "source_documents": []
            }

    def _extract_answer_from_context(self, question: str, docs: List[Document]) -> str:
        """
        Enhanced answer extraction using pattern matching

        The retrieved chunks are joined and split into sentences on ``।``.
        If the question contains one of a few interrogative markers, each
        sentence is checked against three fixed question/answer rules.
        Otherwise, or when no rule fires, the first of the leading three
        sentences longer than five characters is returned, truncated to 50
        characters.

        NOTE(review): the three rules return hard-coded answers for specific
        sample questions; they do not generalize to other queries.
        """
        context = " ".join([doc.page_content for doc in docs])
        sentences = context.split('।')

        if any(marker in question for marker in ("কার", "কোন", "কি", "কত")):
            asks_age = "বয়স" in question
            asks_handsome = "সপুরুষ" in question or "সুপুরুষ" in question
            asks_fortune = "ভাগ্য দেবতা" in question or "মামো" in question

            for sentence in sentences:
                if asks_age and "১৫" in sentence and "বছর" in sentence:
                    return "১৫ বছর"

                if asks_handsome and "শুম্ভনাথ" in sentence:
                    return "শুম্ভনাথ"

                if asks_fortune and "মামো" in sentence:
                    return "মামো"

        for sentence in sentences[:3]:
            trimmed = sentence.strip()
            if len(trimmed) > 5:
                return trimmed[:50]

        return "তথ্য পাওয়া যায়নি"

    def setup(self):
        """
        Complete setup of the RAG pipeline

        Returns:
            bool: True when documents were loaded and the vector store and
            QA chain were built; False when no documents could be loaded.
        """
        print("Setting up Bilingual RAG Pipeline...")

        documents = self.load_and_prepare_documents()

        if not documents:
            print("No documents loaded. Please check the file path.")
            return False

        self.create_vectorstore(documents)

        self.setup_qa_chain()

        print("RAG Pipeline setup complete!")
        return True



In [None]:
def main():
    """
    Build the RAG pipeline, run the sample questions, then start a prompt loop.

    The sample questions are checked by substring against their expected
    answers. The interactive loop ends on 'quit', 'exit' or 'বন্ধ', on
    Ctrl-C, or as soon as no input can be read.
    """
    file_path = "/content/drive/MyDrive/dataset/HSC26_chunks.txt"

    print("🚀 Starting Bilingual RAG Pipeline Setup...")
    print("=" * 60)

    rag_pipeline = BilingualRAGPipeline(file_path)

    success = rag_pipeline.setup()

    if not success:
        print("Failed to setup pipeline. Please check your file path.")
        return

    # (question, expected answer) pairs. The question text was previously
    # mis-encoded (dropped and reordered vowel signs); it is now well-formed
    # Bengali so it can be embedded and matched properly.
    test_cases = [
        ("অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?", "শুম্ভনাথ"),
        ("কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?", "মামো"),
        ("বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?", "১৫ বছর"),
    ]

    print("\n" + "=" * 60)
    print("🧪 TESTING THE RAG PIPELINE")
    print("=" * 60)

    for case_number, (question, expected) in enumerate(test_cases, start=1):
        print(f"\n📝 Test Case {case_number}:")
        print(f"Question: {question}")
        print(f"Expected: {expected}")

        try:
            result = rag_pipeline.query(question)
            print(f"Got Answer: {result['answer']}")

            if expected.lower() in result['answer'].lower():
                print("✅ MATCH!")
            else:
                print("❌ Different answer")

            if result['source_documents']:
                print("📄 Source Context (first 100 chars):")
                print(f"   {result['source_documents'][0].page_content[:100]}...")

        except Exception as e:
            print(f"❌ Error: {str(e)}")

        print("-" * 50)

    print("\n🎉 Testing Complete!")

    print("\n" + "=" * 60)
    print("💬 INTERACTIVE TESTING")
    print("Type your questions (Bengali or English), or 'quit' to exit")
    print("=" * 60)

    while True:
        try:
            user_question = input("\n🤔 Your Question: ").strip()
        except (KeyboardInterrupt, EOFError, NotImplementedError):
            # No more input is available (Ctrl-C, closed stdin, or a kernel
            # that cannot prompt). Stop here: retrying would loop forever.
            print("\n👋 Goodbye!")
            break

        if user_question.lower() in ['quit', 'exit', 'বন্ধ']:
            print("👋 Goodbye!")
            break

        if not user_question:
            continue

        try:
            result = rag_pipeline.query(user_question)
            print(f"🤖 Answer: {result['answer']}")
        except KeyboardInterrupt:
            print("\n👋 Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {e}")



In [None]:
def check_file_content(file_path: str, lines: int = 5):
    """
    Print the size and the first few lines of a UTF-8 text file.

    Each previewed line is cut to 100 characters; ``...`` is appended only
    when the line was actually longer than that.

    Args:
        file_path (str): File to inspect.
        lines (int): Number of leading lines to preview. Values below zero
            are treated as zero.

    Returns:
        bool: True if the file was read and previewed, False if reading it
        failed (the error is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"❌ Error reading file: {e}")
        return False

    print(f"📁 File size: {len(content)} characters")
    print(f"📄 First {lines} lines:")
    print("-" * 40)

    preview_count = max(lines, 0)
    for i, line in enumerate(content.split('\n')[:preview_count], 1):
        suffix = "..." if len(line) > 100 else ""
        print(f"{i}: {line[:100]}{suffix}")

    print("-" * 40)
    return True

# Driver: verify the chunk file is readable, then hand over to main().
print("🔍 Checking your file content...")
file_path = "/content/drive/MyDrive/dataset/HSC26_chunks.txt"
if not check_file_content(file_path):
    print("❌ Please check your file path and ensure the file exists.")
else:
    print("✅ File found and readable!")
    print("\n🚀 Starting main pipeline...")
    main()

def quick_debug_test():
    """
    Debugging aid: show a sample of the chunk file and of the loaded documents.

    Prints the first 1000 characters of the file, builds a pipeline, loads
    the documents, and prints their count plus the first 200 characters of
    the first one. Any exception is reported on stdout rather than raised.
    """
    chunks_path = "/content/drive/MyDrive/dataset/HSC26_chunks.txt"

    try:
        with open(chunks_path, 'r', encoding='utf-8') as handle:
            sample = handle.read()[:1000]
        print("Sample content:")
        print(sample)

        pipeline = BilingualRAGPipeline(chunks_path)
        loaded_docs = pipeline.load_and_prepare_documents()
        print(f"Loaded {len(loaded_docs)} documents")

        if not loaded_docs:
            return
        print("First document sample:")
        print(loaded_docs[0].page_content[:200])

    except Exception as e:
        print(f"Debug error: {e}")



🔍 Checking your file content...
📁 File size: 82863 characters
📄 First 5 lines:
----------------------------------------
1: 🎯 শিক্ষাফল ✔ নির্যাতিত ব্যক্তির বাইরে বিত্তশালী হয়ে ওঠার ফলে সমাজে পরিচয় সংকট সম্পর্কে ধারণা লাভ ক...
2: ---...
3: া প্রতিস্থাপিত এই সত্য অনুধাবন করতে পারবে। ✔ মানুষ আশা নিয়ে বেঁচে থাকে \- অনুপ্রেরণার দৃষ্টিতে মানব...
4: ---...
5: � পিঁপড়ির দীপু চাচা ছিলেন পবিরের কর্ত্তা। দীপু শিক্ষিত হলেও তার সিদ্ধান্ত নেওয়ার ক্ষমতা ছিল না। চা...
----------------------------------------
✅ File found and readable!

🚀 Starting main pipeline...
🚀 Starting Bilingual RAG Pipeline Setup...
Initializing multilingual embeddings...
Initializing language model...
Setting up Bilingual RAG Pipeline...
Loading documents from: /content/drive/MyDrive/dataset/HSC26_chunks.txt
Loaded 133 document chunks
Creating vector store...


  relevant_docs = self.retriever.get_relevant_documents(question)


Vector store created successfully!
Setting up QA chain...
QA chain setup complete!
RAG Pipeline setup complete!

🧪 TESTING THE RAG PIPELINE

📝 Test Case 1:
Question: অনপেমর ভাষায় সপুরুষ কােক বলা হেয়েছ?
Expected: শুম্ভনাথ
Processing query: অনপেমর ভাষায় সপুরুষ কােক বলা হেয়েছ?
Got Answer: �টা নিতান্ত নির্জীব, একবারে কোনো তেজ নাই
❌ Different answer
📄 Source Context (first 100 chars):
   �টা নিতান্ত নির্জীব, একবারে কোনো তেজ নাই। বেহাই-সম্পদানের আর যাই থাক, তেজ থাকটা দোষের, অতএব মামা মনে...
--------------------------------------------------

📝 Test Case 2:
Question: কােক অনপেমর ভাগ্য দবতা বেল উে খ করা হেয়েছ?
Expected: মামো
Processing query: কােক অনপেমর ভাগ্য দবতা বেল উে খ করা হেয়েছ?
Got Answer: িরপরিচয়ের আসনটির উপরে আসিয়া বসিয়াছ
❌ Different answer
📄 Source Context (first 100 chars):
   িরপরিচয়ের আসনটির উপরে আসিয়া বসিয়াছ। কী আশ্চর্য পরিপূর্ণ তুমি \- চঞ্চল কালের ক্ষুদ্ধ হৃদয়ের উপরে ...
--------------------------------------------------

📝 Test Case 3:
Question: বিয়ের সময় কল্যাণী