In [None]:
import os
from dotenv import load_dotenv
import gradio as gr

from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from langchain.document_loaders import TextLoader
from langchain_core.documents import Document



In [8]:
MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [9]:
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [11]:
# import pytesseract
# from pdf2image import convert_from_path
# import camelot
# import os

# # === File paths ===
# pdf_path = "HSC26-Bangla1st-Paper.pdf"      # Input PDF
# output_txt = "HSC26_Bangla1st_OCR_With_Tables.txt"  # Output text
# table_dir = "tables"                        # Save table CSVs here
# os.makedirs(table_dir, exist_ok=True)

# # === Poppler + Tesseract paths (adjust if needed) ===
# poppler_path = r"C:\poppler-24.08.0\Library\bin"
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# # === OCR Text Extraction ===
# print("ЁЯФН Starting OCR text extraction...")
# pages = convert_from_path(pdf_path, dpi=300, poppler_path=poppler_path)

# full_text = ""
# for i, page in enumerate(pages):
#     print(f"[{i+1}/{len(pages)}] Processing page...")
#     text = pytesseract.image_to_string(page, lang='ben')
#     full_text += f"\n\n--- Page {i+1} ---\n{text}"

# # === Table Extraction using Camelot ===
# print("ЁЯУД Extracting tables using Camelot...")
# try:
#     tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")
#     table_text = ""
#     for i, table in enumerate(tables):
#         table_path = os.path.join(table_dir, f"table_{i+1}.csv")
#         table.to_csv(table_path)
#         table_content = table.df.to_string(index=False, header=False)
#         table_text += f"\n\n--- Table {i+1} ---\n{table_content}"
#     print(f"тЬЕ Found and saved {tables.n} tables to CSVs.")
# except Exception as e:
#     table_text = "\n\n(No tables found or error extracting tables.)"
#     print(f" Camelot error: {e}")

# # === Write OCR text and tables to output file ===
# with open(output_txt, "w", encoding="utf-8") as f:
#     f.write(full_text)
#     f.write("\n\n========== Tables ==========\n")
#     f.write(table_text)

# print(f"\n All done. Saved full content to: {output_txt}")

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


ЁЯФН Starting OCR text extraction...
[1/49] Processing page...
[2/49] Processing page...
[3/49] Processing page...
[4/49] Processing page...
[5/49] Processing page...
[6/49] Processing page...
[7/49] Processing page...
[8/49] Processing page...
[9/49] Processing page...
[10/49] Processing page...
[11/49] Processing page...
[12/49] Processing page...
[13/49] Processing page...
[14/49] Processing page...
[15/49] Processing page...
[16/49] Processing page...
[17/49] Processing page...
[18/49] Processing page...
[19/49] Processing page...
[20/49] Processing page...
[21/49] Processing page...
[22/49] Processing page...
[23/49] Processing page...
[24/49] Processing page...
[25/49] Processing page...
[26/49] Processing page...
[27/49] Processing page...
[28/49] Processing page...
[29/49] Processing page...
[30/49] Processing page...
[31/49] Processing page...
[32/49] Processing page...
[33/49] Processing page...
[34/49] Processing page...
[35/49] Processing page...
[36/49] Processing page...


In [None]:
txt_path = "D:/Udemy - LLM Engineering Master AI, Large Language Models & Agents 2024-12/llm_engineering/week5/HSC26_Bangla1st_OCR_With_Tables.txt"  # Change to your actual txt filename
loader = TextLoader(txt_path, encoding="utf-8")
documents = loader.load()

In [None]:
def clean_bengali_text(text):
    return ' '.join(text.replace('\u200c', '').replace('\u200b', '').replace('\n', ' ').split())

cleaned_documents = [
    Document(page_content=clean_bengali_text(doc.text), metadata=getattr(doc, "metadata", {}))
    for doc in documents
]


In [31]:
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.create_documents([doc.text for doc in documents])

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
# chunks = text_splitter.create_documents([doc.text for doc in documents])

In [32]:
len(chunks)

1

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True}
)

# embeddings = HuggingFaceEmbeddings(
#     model_name="l3cube-pune/bengali-sentence-similarity-sbert",
#     encode_kwargs={"normalize_embeddings": True}
# )

In [34]:
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [35]:
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 1 documents


In [38]:
def rag_answer(query, history):
    
    docs = vectorstore.similarity_search(query, k=5)

    print("ЁЯФН Retrieved Chunks:")
    for i, doc in enumerate(docs):
        print(f"\n--- Chunk {i+1} ---\n{doc.page_content}")

    context = "\n".join([doc.page_content for doc in docs])
    
    short_term = "\n".join([f"Q: {q}\nA: {a}" for q, a in history[-3:]])
    prompt = f"""You are a multilingual assistant capable of understanding and answering both Bengali and English queries. Your main purpose is to answer factual questions by retrieving information from a Bengali literature knowledge base, specifically the book "HSC26 Bangla 1st Paper". You must ground your answers in the retrieved content. However, you should also respond naturally to general conversation or small talk, even if it doesn't require retrieval. 
Context:
{context}

Recent Q&A:
{short_term}

User Question: {query}
Answer:"""
    llm = ChatOpenAI(model=MODEL, temperature=0)
    response = llm.invoke(prompt)
    return response.content

In [39]:
def chat_interface(query, history=[]):
    answer = rag_answer(query, history)
    history = history + [(query, answer)]
    return history, history

with gr.Blocks() as demo:
    gr.Markdown("# ЁЯУЪ Multilingual RAG: Bangla & English")
    chatbot = gr.Chatbot()
    state = gr.State([])
    with gr.Row():
        txt = gr.Textbox(label="Ask a question (English or Bangla)")
        submit = gr.Button("Submit")
    submit.click(chat_interface, [txt, state], [chatbot, state])

demo.launch()

  chatbot = gr.Chatbot()


* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.




ЁЯФН Retrieved Chunks:

--- Chunk 1 ---
ЁЯФН Retrieved Chunks:

--- Chunk 1 ---
ЁЯФН Retrieved Chunks:

--- Chunk 1 ---
ЁЯФН Retrieved Chunks:

--- Chunk 1 ---
