In [1]:
# Standard library
import os

# Third-party: OCR (pytesseract), PDF rasterisation (pdf2image), tables (camelot)
import pytesseract
from pdf2image import convert_from_path
import camelot

# Top-level imports must start at column 0; an indented one makes the whole
# cell fail with "IndentationError: unexpected indent".
# NOTE(review): AES and ARC4 are not referenced by any cell visible in this
# notebook -- this looks like an accidental IDE auto-import; confirm and remove.
from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [None]:
# === Input and output locations ===

# Source document read by the OCR and table-extraction cells.
pdf_path = "HSC26-Bangla1st-Paper.pdf"

# Folder that receives the per-table CSV files; created here (no error if it
# already exists) so later writes do not hit a missing directory.
table_dir = "tables"
os.makedirs(table_dir, exist_ok=True)

# Combined text report written by the final cell.
output_txt = "HSC26_Bangla1st_OCR_With_Tables.txt"

In [3]:
# === External tool locations ===
# `poppler_path` is handed to pdf2image's convert_from_path, and
# `tesseract_cmd` tells pytesseract which executable to launch.
# The defaults are the original Windows install locations; set the
# POPPLER_PATH / TESSERACT_CMD environment variables to run the notebook on
# another machine without editing this cell.
poppler_path = os.environ.get("POPPLER_PATH", r"C:\poppler-24.08.0\Library\bin")
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

In [4]:
# === OCR Text Extraction ===
print("🔍 Starting OCR text extraction...")

# Rasterise every page of the PDF to an image at 300 DPI.
pages = convert_from_path(pdf_path, dpi=300, poppler_path=poppler_path)

# Collect one labelled chunk per page and join once at the end.
page_chunks = []
for page_no, page in enumerate(pages, start=1):
    print(f"[{page_no}/{len(pages)}] Processing page...")
    # 'ben' selects Tesseract's Bengali language data.
    text = pytesseract.image_to_string(page, lang='ben')
    page_chunks.append(f"\n\n--- Page {page_no} ---\n{text}")

full_text = "".join(page_chunks)

🔍 Starting OCR text extraction...
[1/49] Processing page...
[2/49] Processing page...
[3/49] Processing page...
[4/49] Processing page...
[5/49] Processing page...
[6/49] Processing page...
[7/49] Processing page...
[8/49] Processing page...
[9/49] Processing page...
[10/49] Processing page...
[11/49] Processing page...
[12/49] Processing page...
[13/49] Processing page...
[14/49] Processing page...
[15/49] Processing page...
[16/49] Processing page...
[17/49] Processing page...
[18/49] Processing page...
[19/49] Processing page...
[20/49] Processing page...
[21/49] Processing page...
[22/49] Processing page...
[23/49] Processing page...
[24/49] Processing page...
[25/49] Processing page...
[26/49] Processing page...
[27/49] Processing page...
[28/49] Processing page...
[29/49] Processing page...
[30/49] Processing page...
[31/49] Processing page...
[32/49] Processing page...
[33/49] Processing page...
[34/49] Processing page...
[35/49] Processing page...
[36/49] Processing page...
[37

In [5]:
# === Table Extraction using Camelot ===
# Runs Camelot (flavor="lattice") over all pages, saves each detected table as
# <table_dir>/table_<n>.csv and builds `table_text`, a plain-text rendering of
# every table, for the final report.
print("📄 Extracting tables using Camelot...")
no_tables_note = "\n\n(No tables found or error extracting tables.)"
try:
    tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")
    table_sections = []
    for table_no, table in enumerate(tables, start=1):
        table_path = os.path.join(table_dir, f"table_{table_no}.csv")
        table.to_csv(table_path)
        # Render without index/header so only the cell contents are shown.
        table_content = table.df.to_string(index=False, header=False)
        table_sections.append(f"\n\n--- Table {table_no} ---\n{table_content}")
    # With zero tables the report's Tables section would otherwise be blank;
    # use the same note as the failure path so the reader sees why.
    table_text = "".join(table_sections) if table_sections else no_tables_note
    print(f"✅ Found and saved {tables.n} tables to CSVs.")
except Exception as e:
    # Deliberately best-effort: a Camelot failure is reported but must not
    # prevent the OCR text from being written by the next cell.
    table_text = no_tables_note
    print(f" Camelot error: {e}")

📄 Extracting tables using Camelot...
✅ Found and saved 14 tables to CSVs.


In [6]:
# === Write OCR text and tables to output file ===
# Report layout: OCR text, then a separator heading, then the table dump.
with open(output_txt, "w", encoding="utf-8") as report:
    report.writelines(
        [
            full_text,
            "\n\n========== Tables ==========\n",
            table_text,
        ]
    )

print(f"\n All done. Saved full content to: {output_txt}")


 All done. Saved full content to: HSC26_Bangla1st_OCR_With_Tables_2.txt
