# Tunneling to Kaggle, we have to install poppler-utils

In [1]:
# Install the poppler-utils system package non-interactively (-y skips the
# confirmation prompt); the PDF conversion cell below depends on it.
!apt-get install -y poppler-utils


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.8).
0 upgraded, 0 newly installed, 0 to remove and 84 not upgraded.


# Use asynchronous programming to convert the PDF pages to compressed images concurrently

In [2]:
import asyncio
import io
import os
import tempfile
from concurrent.futures import ThreadPoolExecutor

from pdf2image import convert_from_path
from PIL import Image

async def compress_and_save_image(image, filepath, quality=85, optimize=True):
    """Flatten ``image`` onto white if it can carry transparency, then save it as a progressive JPEG.

    The blocking Pillow work is handed to the event loop's default thread pool
    so that several pages can be encoded while the coroutine is awaited.

    Args:
        image: Pillow image to encode.
        filepath: Destination path of the JPEG file.
        quality: JPEG quality forwarded to ``Image.save``.
        optimize: Forwarded to ``Image.save`` as the ``optimize`` flag.

    Returns:
        Size in bytes of the written file, as reported by ``os.path.getsize``.
    """
    def _compress_save(img=image):  # bind the image at definition time
        if img.mode in ('RGBA', 'LA', 'P'):
            # Normalise to RGBA first so the last band is always a genuine
            # alpha channel. For palette ('P') images the last band is the
            # palette index, which Pillow rejects as a paste mask.
            rgba = img if img.mode == 'RGBA' else img.convert('RGBA')
            background = Image.new('RGB', rgba.size, (255, 255, 255))
            background.paste(rgba, mask=rgba.split()[-1])
            img = background
        else:
            img = img.convert('RGB')

        img.save(filepath, 'JPEG', quality=quality, optimize=optimize, progressive=True)
        return os.path.getsize(filepath)

    # Reuse the loop's shared default executor instead of building (and
    # tearing down) a fresh thread pool for every single image.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _compress_save)


async def convert_pdf_to_compressed_images(pdf_path, output_dir="economic_survey_images",
                                           dpi=150, quality=85):
    """Render every page of a PDF and save each one as a compressed JPEG.

    Pages are rendered by ``convert_from_path`` in a worker thread, then all
    pages are compressed concurrently via ``compress_and_save_image``. Files
    are written to ``output_dir`` as ``page_001.jpg``, ``page_002.jpg``, ...

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Directory that receives the JPEG files (created if missing).
        dpi: Rendering resolution forwarded to ``convert_from_path``.
        quality: JPEG quality forwarded to ``compress_and_save_image``.

    Returns:
        Tuple ``(page_count, total_size_in_bytes)``.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(f"Converting PDF: {pdf_path}")
    print(f"Output directory: {output_dir}")

    loop = asyncio.get_running_loop()

    # Render into a scratch directory rather than output_dir: pdf2image leaves
    # its intermediate page files in whatever folder it is given, which would
    # otherwise pile up next to the final JPEGs. The directory has to outlive
    # the compression step because the page images are read back from it.
    with tempfile.TemporaryDirectory() as scratch_dir:
        def _convert_pdf():
            return convert_from_path(pdf_path, dpi=dpi, output_folder=scratch_dir)

        images = await loop.run_in_executor(None, _convert_pdf)
        print(f"Converted {len(images)} pages")

        # One compression coroutine per page, numbered from 1.
        tasks = [
            compress_and_save_image(
                image,
                os.path.join(output_dir, f'page_{page_no:03d}.jpg'),
                quality=quality,
            )
            for page_no, image in enumerate(images, start=1)
        ]

        print("Compressing and saving images...")
        try:
            file_sizes = await asyncio.gather(*tasks)
        finally:
            # Release the handles on the scratch files before the directory
            # is removed.
            for image in images:
                image.close()

    # Report results
    total_size = sum(file_sizes)
    avg_size = total_size / len(file_sizes) if file_sizes else 0

    print("✅ Conversion complete!")
    print(f"📁 Total files: {len(file_sizes)}")
    print(f"💾 Total size: {total_size / (1024*1024):.2f} MB")
    print(f"📊 Average size per image: {avg_size / 1024:.2f} KB")

    return len(images), total_size

# Main execution
async def main():
    """Convert the KNBS real-estate survey PDF into JPEG pages under ``real_estate_images``."""
    source_pdf = "/kaggle/input/knbs-real-estate/2023-24-Real-Estate-Survey-Report_1.pdf"
    settings = {
        "output_dir": "real_estate_images",
        "dpi": 150,
        "quality": 85,
    }

    await convert_pdf_to_compressed_images(pdf_path=source_pdf, **settings)

# Run the async function.
# NOTE: this uses a bare top-level `await`, which the notebook kernel accepted
# (the cell produced the output shown below). A plain Python script has no
# running event loop at module level and would need asyncio.run(main()) instead.
if __name__ == "__main__":
    await main()

Converting PDF: /kaggle/input/knbs-real-estate/2023-24-Real-Estate-Survey-Report_1.pdf
Output directory: real_estate_images
Converted 60 pages
Compressing and saving images...
✅ Conversion complete!
📁 Total files: 60
💾 Total size: 11.87 MB
📊 Average size per image: 202.61 KB


# Set up Cohere

In [None]:
import requests
import os
import io
import base64
from PIL import Image
import tqdm
import time
import numpy as np
import cohere
import os
# Keep the API key out of the notebook: read it from the CO_API_KEY
# environment variable and fail loudly when it has not been provided.
cohere_api_key = os.environ.get("CO_API_KEY")
if not cohere_api_key:
    raise RuntimeError("Set the CO_API_KEY environment variable to your Cohere API key.")
co = cohere.ClientV2(api_key=cohere_api_key)

In [None]:
max_pixels = 1568*1568  # Pixel budget: larger images are scaled down


def resize_image(pil_image):
    """Shrink ``pil_image`` in place when its pixel count exceeds ``max_pixels``.

    The scale factor is chosen so that width * height lands at or just under
    the budget. Images already within the budget are left untouched. Nothing
    is returned; the image object itself is modified via ``thumbnail``.
    """
    width, height = pil_image.size
    pixel_count = width * height

    # Nothing to do for images that already fit.
    if pixel_count <= max_pixels:
        return

    shrink = (max_pixels / pixel_count) ** 0.5
    target_box = (int(width * shrink), int(height * shrink))
    pil_image.thumbnail(target_box)

# Convert an image to a base64 data URI string before sending it to the API
def base64_from_image(img_path):
    """Load an image file and return it as a base64 ``data:`` URI string.

    The image is passed through ``resize_image`` (which shrinks it in place if
    it exceeds the pixel budget) and re-encoded in its original format, or as
    PNG when the format is not reported.

    Args:
        img_path: Path of the image file to read.

    Returns:
        A string of the form ``data:image/<format>;base64,<payload>``.
    """
    # Open via a context manager so the file handle is released even if
    # resizing or encoding raises.
    with Image.open(img_path) as pil_image:
        img_format = pil_image.format or "PNG"

        resize_image(pil_image)

        with io.BytesIO() as img_buffer:
            pil_image.save(img_buffer, format=img_format)
            encoded = base64.b64encode(img_buffer.getvalue()).decode("utf-8")

    return f"data:image/{img_format.lower()};base64,{encoded}"

