In [None]:
import os
from dotenv import load_dotenv
import gradio as gr

from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from langchain.document_loaders import TextLoader
from langchain_core.documents import Document



In [None]:
# Name of the chat model passed to ChatOpenAI when answering questions.
MODEL = "gpt-4o-mini"
# Directory used as the Chroma persist_directory for the vector store.
db_name = "vector_db"

In [None]:
# Read variables from a .env file; override=True lets .env values replace ones already set in the environment.
load_dotenv(override=True)
# NOTE(review): if OPENAI_API_KEY is still unset, a placeholder string is stored instead of failing here,
# so a missing key only surfaces later when the OpenAI client is actually called.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
# NOTE(review): absolute, machine-specific path — it must be edited before the notebook runs anywhere else.
txt_path = "D:/Udemy - LLM Engineering Master AI, Large Language Models & Agents 2024-12/llm_engineering/week5/HSC26_Bangla1st_OCR_With_Tables.txt"  # Change to your actual txt filename
# Read the file as UTF-8 text.
loader = TextLoader(txt_path, encoding="utf-8")
# `documents` is what the later cells iterate over to clean and chunk the text.
documents = loader.load()

In [None]:
def clean_bengali_text(text):
    """Remove zero-width characters and collapse whitespace to single spaces.

    Args:
        text: The string to normalise.

    Returns:
        The text with every U+200C (zero-width non-joiner) and U+200B
        (zero-width space) removed, all runs of whitespace (newlines
        included) reduced to one space, and no leading or trailing
        whitespace.
    """
    # Mapping a code point to None deletes it during translate().
    zero_width_removed = text.translate({0x200C: None, 0x200B: None})
    # split() with no argument breaks on any whitespace run, so newlines
    # need no separate handling.
    words = zero_width_removed.split()
    return ' '.join(words)

# Cleaned copy of every loaded document; the originals in `documents` are left untouched.
# A LangChain Document keeps its text in `page_content` (it has no `text` attribute),
# and always carries a `metadata` dict, which is copied so the two lists don't share it.
cleaned_documents = [
    Document(page_content=clean_bengali_text(doc.page_content), metadata=dict(doc.metadata))
    for doc in documents
]


In [None]:
# Chunk the loaded text with a target chunk size of 500 characters and 100 characters of overlap.
# A LangChain Document exposes its text as `page_content`, not `text`.
# NOTE(review): this chunks the raw `documents`; `cleaned_documents` is not used here.
# CharacterTextSplitter splits on "\n\n" by default, and the cleaning step removes all
# newlines, so switching to the cleaned text would also need a different splitter or
# separator — confirm the intent before changing.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.create_documents([doc.page_content for doc in documents])

# Alternative splitter, left disabled:
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
# chunks = text_splitter.create_documents([doc.page_content for doc in documents])

In [None]:
# Display how many chunks the splitter produced.
len(chunks)

In [None]:
# Embedding model handed to the Chroma vector store in the cells below.
# normalize_embeddings=True is forwarded to the encoder via encode_kwargs.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True}
)

# Alternative model, left disabled (presumably Bengali-specific, going by its name):
# embeddings = HuggingFaceEmbeddings(
#     model_name="l3cube-pune/bengali-sentence-similarity-sbert",
#     encode_kwargs={"normalize_embeddings": True}
# )

In [None]:
# If a persisted store already exists at db_name, open it and delete its collection —
# presumably so the rebuild in the next cell does not add to stale entries.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [None]:
# Embed the chunks and store them in a Chroma vector store persisted under db_name.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# `_collection` is a private attribute of the Chroma wrapper; it is read here only to report the stored count.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
def rag_answer(query, history):
    """Answer a question with the chat model, using chunks retrieved from the vector store.

    Args:
        query: The user's question text.
        history: Sequence of (question, answer) pairs from earlier turns;
            only the last three are included in the prompt.

    Returns:
        The `content` of the chat model's response.
    """
    # Fetch the ten stored chunks most similar to the question.
    retrieved = vectorstore.similarity_search(query, k=10)

    # Echo the retrieved chunks to stdout so they can be inspected.
    print("🔍 Retrieved Chunks:")
    for number, chunk in enumerate(retrieved, start=1):
        print(f"\n--- Chunk {number} ---\n{chunk.page_content}")

    context = "\n".join(chunk.page_content for chunk in retrieved)

    # Only the three most recent turns are replayed to the model.
    recent_turns = history[-3:]
    short_term = "\n".join(
        f"Q: {question}\nA: {reply}" for question, reply in recent_turns
    )

    prompt = f"""You are a multilingual assistant capable of understanding and answering both Bengali and English queries. Your main purpose is to answer factual questions by retrieving information from a Bengali literature knowledge base, specifically the book "HSC26 Bangla 1st Paper". You must ground your answers in the retrieved content. However, you should also respond naturally to general conversation or small talk, even if it doesn't require retrieval. 
Context:
{context}

Recent Q&A:
{short_term}

User Question: {query}
Answer:"""

    # A new client is built on every call; temperature=0 is passed to the model.
    chat_model = ChatOpenAI(model=MODEL, temperature=0)
    return chat_model.invoke(prompt).content

In [None]:
def chat_interface(query, history=None):
    """Answer one question and return the updated conversation history.

    Args:
        query: The user's question text.
        history: List of (question, answer) pairs from earlier turns.
            None or an omitted argument is treated as an empty history.
            The list passed in is never modified.

    Returns:
        A 2-tuple holding the same updated history list twice: once for the
        chat display and once for the stored state.
    """
    # A None default avoids a shared mutable default list, and also makes an
    # explicit history=None safe (None + [...] would raise TypeError).
    previous_turns = list(history or [])
    answer = rag_answer(query, previous_turns)
    updated_history = previous_turns + [(query, answer)]
    return updated_history, updated_history

# Build the Gradio UI: a title, a chat display, a history state, and a question box with a submit button.
with gr.Blocks() as demo:
    gr.Markdown("# 📚 Multilingual RAG: Bangla & English")
    chatbot = gr.Chatbot()
    # Holds the list of (question, answer) pairs between clicks; starts empty.
    state = gr.State([])
    with gr.Row():
        txt = gr.Textbox(label="Ask a question (English or Bangla)")
        submit = gr.Button("Submit")
    # On click, call chat_interface with the textbox value and stored history,
    # and write its two return values to the chat display and the state.
    submit.click(chat_interface, [txt, state], [chatbot, state])

# Start the app.
demo.launch()