In [3]:
import asyncio
import contextlib
import os
import tempfile
import time
from asyncio import Semaphore

import httpx
import pdfplumber

# Size of the semaphore that bounds how many page batches are processed at once
# in extract_text_from_pdf.
MAX_CONCURRENT_TASKS = 30
# Number of consecutive pages grouped into one extraction batch.
BATCH_SIZE = 20

def _write_temp_pdf(payload: bytes) -> str:
    """Write *payload* to a new ``.pdf`` temporary file and return its path."""
    # delete=False: the file must outlive this handle so the caller can open it.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
        temp_file.write(payload)
        return temp_file.name


async def download_pdf(url: str) -> str:
    """Download the PDF at *url* into a temporary file and return its path.

    The response body is fetched with ``httpx`` and then written to disk in a
    worker thread so a large payload does not stall the event loop.

    Args:
        url: Location of the PDF to fetch.

    Returns:
        Path of the temporary ``.pdf`` file. The file is created with
        ``delete=False``; removing it is the caller's responsibility.

    Raises:
        httpx.HTTPStatusError: If ``raise_for_status()`` rejects the response
            status. No temporary file is created in that case.
    """
    print("📥 Starting PDF download...")
    start_time = time.perf_counter()

    async with httpx.AsyncClient() as client:
        # client.get() reads the whole body, so the client (and its
        # connection) can be released before anything is written to disk.
        response = await client.get(url)
        duration = time.perf_counter() - start_time

    print(f"📶 HTTP Status: {response.status_code} (Download took {duration:.2f} seconds)")
    response.raise_for_status()

    # Blocking file I/O is pushed to a thread to keep the event loop responsive.
    temp_path = await asyncio.to_thread(_write_temp_pdf, response.content)
    print(f"✅ PDF successfully downloaded and saved to: {temp_path}")
    return temp_path

async def extract_text_from_pdf(pdf_path: str) -> list[str]:
    """Extract the text of the PDF at *pdf_path* and return it as chunks.

    Pages are grouped into batches of ``BATCH_SIZE``. Each batch is extracted
    in a worker thread, and a semaphore of size ``MAX_CONCURRENT_TASKS`` bounds
    how many batches run at once. Every batch opens its own pdfplumber handle:
    pdfplumber/pdfminer objects are not documented as thread-safe, so worker
    threads must not share one open document.

    Side effects:
        Writes the combined text to ``extracted_text.txt`` in the current
        working directory and prints progress messages.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The non-empty, stripped pieces obtained by splitting the combined text
        on blank lines (``"\\n\\n"``). Empty if no text was extracted.
    """
    print(f"📂 Opening PDF for extraction: {pdf_path}")
    semaphore = Semaphore(MAX_CONCURRENT_TASKS)

    def count_pages() -> int:
        # Short-lived handle used only to learn the page count.
        with pdfplumber.open(pdf_path) as pdf:
            return len(pdf.pages)

    # Opening and enumerating the PDF is blocking work; keep it off the loop.
    total_pages = await asyncio.to_thread(count_pages)
    print(f"📄 Total pages in PDF: {total_pages}")

    def read_batch(start_idx: int, end_idx: int) -> str:
        """Extract pages [start_idx, end_idx) using a handle private to this thread."""
        # pdfplumber's `pages` argument takes 1-indexed page numbers.
        page_numbers = list(range(start_idx + 1, end_idx + 1))
        page_texts = []
        with pdfplumber.open(pdf_path, pages=page_numbers) as batch_pdf:
            for page_no, page in zip(page_numbers, batch_pdf.pages):
                # Best effort per page: one unreadable page must not discard
                # the rest of the batch.
                try:
                    print(f"🔍 Extracting text from page {page_no}...")
                    text = page.extract_text()
                    if text:
                        print(f"✅ Text found on page {page_no} ({len(text)} characters)")
                        page_texts.append(text + "\n")
                    else:
                        print(f"⚠️ No text found on page {page_no}")
                except Exception as e:
                    print(f"❌ Error processing page {page_no}: {e}")
        return "".join(page_texts)

    async def process_page_batch(start_idx: int, end_idx: int) -> str:
        print(f"🧩 Processing batch: pages {start_idx + 1} to {end_idx}")
        async with semaphore:
            return await asyncio.to_thread(read_batch, start_idx, end_idx)

    tasks = []
    for i in range(0, total_pages, BATCH_SIZE):
        batch_end = min(i + BATCH_SIZE, total_pages)
        print(f"📦 Queuing batch from page {i+1} to {batch_end}")
        tasks.append(process_page_batch(i, batch_end))

    print("🚀 Launching all batch tasks...")
    # gather() returns results in task order, so batches stay in page order.
    batch_texts = await asyncio.gather(*tasks)
    print("✅ All batch tasks completed.")

    full_text = "\n".join(batch_texts)
    print(f"📚 Total extracted text length: {len(full_text)} characters")

    # Save full text to file
    with open("extracted_text.txt", "w", encoding="utf-8") as f:
        f.write(full_text)
    print("💾 Extracted text saved to 'extracted_text.txt'")

    # Split into chunks
    # NOTE(review): each page ends with a single "\n" and batches are joined
    # with "\n", so "\n\n" appears at batch boundaries and wherever the page
    # text itself contains blank lines -- confirm this is the intended chunking.
    chunked = [chunk.strip() for chunk in full_text.split("\n\n") if chunk.strip()]
    print(f"📄 Extracted {len(chunked)} chunks from PDF using pdfplumber")
    if chunked:
        print(f"🔍 First chunk preview (100 chars): {repr(chunked[0][:100])}")
    else:
        print("❗ No chunks were extracted.")

    return chunked

# Run the test
pdf_url = "https://hackrx.blob.core.windows.net/assets/principia_newton.pdf?sv=2023-01-03&st=2025-07-28T07%3A20%3A32Z&se=2026-07-29T07%3A20%3A00Z&sr=b&sp=r&sig=V5I1QYyigoxeUMbnUKsdEaST99F5%2FDfo7wpKg9XXF5w%3D"

print("🧪 Starting test...")

chunks = await extract_text_from_pdf(await download_pdf(pdf_url))

print("🧾 Previewing extracted chunks:")
for i, chunk in enumerate(chunks[:3]):
    print(f"\n--- Chunk {i+1} (length: {len(chunk)} chars) ---\n{chunk[:300]}")

print("✅ Test complete.")


🧪 Starting test...
📥 Starting PDF download...
📶 HTTP Status: 200 (Download took 82.92 seconds)
✅ PDF successfully downloaded and saved to: C:\Users\debdi\AppData\Local\Temp\tmpyem5ypw1.pdf
📂 Opening PDF for extraction: C:\Users\debdi\AppData\Local\Temp\tmpyem5ypw1.pdf
📄 Total pages in PDF: 594
📦 Queuing batch from page 1 to 20
📦 Queuing batch from page 21 to 40
📦 Queuing batch from page 41 to 60
📦 Queuing batch from page 61 to 80
📦 Queuing batch from page 81 to 100
📦 Queuing batch from page 101 to 120
📦 Queuing batch from page 121 to 140
📦 Queuing batch from page 141 to 160
📦 Queuing batch from page 161 to 180
📦 Queuing batch from page 181 to 200
📦 Queuing batch from page 201 to 220
📦 Queuing batch from page 221 to 240
📦 Queuing batch from page 241 to 260
📦 Queuing batch from page 261 to 280
📦 Queuing batch from page 281 to 300
📦 Queuing batch from page 301 to 320
📦 Queuing batch from page 321 to 340
📦 Queuing batch from page 341 to 360
📦 Queuing batch from page 361 to 380
📦 Queuing ba

Data-loss while decompressing corrupted data


⚠️ No text found on page 38⚠️ No text found on page 315
🔍 Extracting text from page 316...

🔍 Extracting text from page 39...
⚠️ No text found on page 298
🔍 Extracting text from page 299...
⚠️ No text found on page 193
🔍 Extracting text from page 194...
⚠️ No text found on page 137
🔍 Extracting text from page 138...
⚠️ No text found on page 74
🔍 Extracting text from page 75...
⚠️ No text found on page 239
🔍 Extracting text from page 240...
⚠️ No text found on page 54
🔍 Extracting text from page 55...
⚠️ No text found on page 114
🔍 Extracting text from page 115...
⚠️ No text found on page 155
🔍 Extracting text from page 156...
⚠️ No text found on page 258
🔍 Extracting text from page 259...
⚠️ No text found on page 316
🔍 Extracting text from page 317...
⚠️ No text found on page 217
🔍 Extracting text from page 218...
⚠️ No text found on page 98
🔍 Extracting text from page 99...
⚠️ No text found on page 299
🔍 Extracting text from page 300...
⚠️ No text found on page 75
🔍 Extracting text fr

Data-loss while decompressing corrupted data


⚠️ No text found on page 503
🔍 Extracting text from page 504...
⚠️ No text found on page 385
🔍 Extracting text from page 386...
⚠️ No text found on page 543
🔍 Extracting text from page 544...
⚠️ No text found on page 581
🔍 Extracting text from page 582...
⚠️ No text found on page 366
🔍 Extracting text from page 367...
⚠️ No text found on page 464
🔍 Extracting text from page 465...
⚠️ No text found on page 386
🔍 Extracting text from page 387...
⚠️ No text found on page 563
🔍 Extracting text from page 564...
⚠️ No text found on page 523
🔍 Extracting text from page 524...
⚠️ No text found on page 426
🔍 Extracting text from page 427...
⚠️ No text found on page 484
🔍 Extracting text from page 485...
⚠️ No text found on page 544
🔍 Extracting text from page 545...
⚠️ No text found on page 200
⚠️ No text found on page 327
🔍 Extracting text from page 328...
⚠️ No text found on page 406
🔍 Extracting text from page 407...
⚠️ No text found on page 445
🔍 Extracting text from page 446...
⚠️ No text 

Data-loss while decompressing corrupted data


✅ Text found on page 429 (2748 characters)
🔍 Extracting text from page 430...
⚠️ No text found on page 584
🔍 Extracting text from page 585...
⚠️ No text found on page 547
🔍 Extracting text from page 548...
⚠️ No text found on page 566
🔍 Extracting text from page 567...
⚠️ No text found on page 349
🔍 Extracting text from page 350...
⚠️ No text found on page 390
🔍 Extracting text from page 391...
⚠️ No text found on page 409
🔍 Extracting text from page 410...
⚠️ No text found on page 486
🔍 Extracting text from page 487...
⚠️ No text found on page 467
🔍 Extracting text from page 468...
⚠️ No text found on page 506
🔍 Extracting text from page 507...
⚠️ No text found on page 330
🔍 Extracting text from page 331...
⚠️ No text found on page 448
🔍 Extracting text from page 449...
⚠️ No text found on page 430
🔍 Extracting text from page 431...
⚠️ No text found on page 525
🔍 Extracting text from page 526...
⚠️ No text found on page 391
🔍 Extracting text from page 392...
⚠️ No text found on page 5