In [2]:
import pytesseract
from pdf2image import convert_from_path
from reportlab.pdfgen import canvas
from PIL import Image

In [3]:
# Machine-specific absolute path to a Poppler `bin` directory.
# NOTE(review): not referenced by any cell visible in this notebook — presumably intended
# for pdf2image's `poppler_path` argument; confirm or remove.
POP_PATH = r'C:\Users\LENOVO\Documents\new_project\development\prod_tools\base\poppler-24.02.0\Library\bin'
# Machine-specific absolute path to the Tesseract executable.
# NOTE(review): never assigned to `pytesseract.pytesseract.tesseract_cmd` in the visible
# cells, so pytesseract presumably relies on Tesseract being on PATH — confirm.
tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe' 
# Configuration
# Image file that the conversion cells below read (resolved relative to the working directory).
INPUT = "2024-10-14-015-0009-002-News.jpg"
# NOTE(review): OUTPUT_PDF, DPI and FONT_NAME are not referenced in the visible cells;
# the conversion calls below pass their own output names and register their own fonts.
OUTPUT_PDF = "output.pdf"
DPI = 200  # Match the DPI used when converting PDF to images
FONT_NAME = "Helvetica"  # Use a standard font for crisp text

In [10]:
import pytesseract
from PIL import Image
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import numpy as np
from sklearn.cluster import DBSCAN

class ImageToPdfConverter:
    """Rebuild the text of an image as drawn (selectable) text on a PDF page.

    Pipeline implemented by :meth:`process_image`:

    1. OCR the image with Tesseract (word boxes + confidences).
    2. Merge words into text lines by vertical position.
    3. Snap each line's left edge to the left edge of its detected column.
    4. Derive one font size for the whole page.
    5. Draw every line on a PDF page whose size equals the image size in pixels.
    """

    # Words whose `top` differs from the line's first word by at most this
    # many pixels are considered part of the same text line.
    LINE_TOLERANCE_PX = 10
    # DBSCAN `eps`: maximum distance (pixels) between line starts that are
    # still treated as belonging to the same column.
    COLUMN_EPS_PX = 100

    def __init__(self):
        # Register the TrueType face used when drawing; reportlab has to be
        # able to locate 'times.ttf' or this raises.
        pdfmetrics.registerFont(TTFont('TimesNewRoman', 'times.ttf'))

    def process_image(self, image_path, output_pdf_path):
        """OCR ``image_path`` and write the recognised text to ``output_pdf_path``.

        Args:
            image_path: Path of the image to read.
            output_pdf_path: Path of the PDF file to create.
        """
        # The context manager closes the underlying file handle once OCR is
        # done; everything after the block works on plain Python data.
        with Image.open(image_path) as source:
            img = source if source.mode == 'RGB' else source.convert('RGB')
            width, height = img.size

            # Get OCR data with bounding boxes
            ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

        # Group text into lines, then align line starts to column edges
        lines = self._group_into_lines(ocr_data)
        normalized_lines = self._normalize_columns(lines)

        # Decide the page-wide font size (stored on self.base_font_size)
        self._analyze_text_sizes(normalized_lines, width, height)

        # Create a PDF page with the same dimensions as the image (1 px == 1 pt)
        c = canvas.Canvas(output_pdf_path)
        c.setPageSize((width, height))

        self._write_text_to_pdf(c, normalized_lines, height)
        c.save()

    @staticmethod
    def _confidence(value):
        """Return an OCR confidence as a float, or -1.0 if it is not numeric.

        Depending on the pytesseract version the 'conf' column may hold ints
        or strings such as '-1' or '96.5'; comparing a string with a number
        raises TypeError, so every value is normalised here first.
        """
        try:
            return float(value)
        except (TypeError, ValueError):
            return -1.0

    @staticmethod
    def _build_line(line_words):
        """Collapse the words of one text line into a single line record.

        Position fields ('left', 'top', 'original_left') come from the
        leftmost word; 'height' is the tallest word on the line.
        """
        ordered = sorted(line_words, key=lambda w: w['left'])
        first = ordered[0]
        return {
            'text': ' '.join(w['text'] for w in ordered),
            'left': first['left'],
            'top': first['top'],
            'height': max(w['height'] for w in ordered),
            'original_left': first['left'],  # Keep original position for clustering
        }

    def _group_into_lines(self, ocr_data):
        """Group OCR results into lines based on vertical position.

        Args:
            ocr_data: Dict of parallel lists with the keys 'text', 'conf',
                'left', 'top', 'width' and 'height'.

        Returns:
            List of line dicts with keys 'text', 'left', 'top', 'height' and
            'original_left', ordered top to bottom.
        """
        words = []
        for i, raw_text in enumerate(ocr_data['text']):
            # Keep only boxes with a positive confidence and non-blank text.
            if self._confidence(ocr_data['conf'][i]) > 0 and raw_text.strip():
                words.append({
                    'text': raw_text,
                    'left': ocr_data['left'][i],
                    'top': ocr_data['top'][i],
                    'width': ocr_data['width'][i],
                    'height': ocr_data['height'][i]
                })

        # Sort words by vertical position first, then horizontal
        words.sort(key=lambda x: (x['top'], x['left']))

        lines = []
        current_line = []
        current_top = None  # `top` of the word that opened the current line

        for word in words:
            if current_top is None:
                current_top = word['top']

            if abs(word['top'] - current_top) <= self.LINE_TOLERANCE_PX:
                current_line.append(word)
            else:
                lines.append(self._build_line(current_line))
                current_line = [word]
                current_top = word['top']

        # Flush the trailing line, if any words were kept at all
        if current_line:
            lines.append(self._build_line(current_line))

        return lines

    def _normalize_columns(self, lines):
        """Detect text columns and align every line to its column's left edge.

        Line start positions ('original_left') are clustered with DBSCAN;
        each line's 'left' is then replaced by the smallest start position
        found in its cluster.

        Args:
            lines: Line dicts as produced by :meth:`_group_into_lines`.

        Returns:
            New list of shallow-copied line dicts with 'left' updated; the
            input list itself is returned unchanged when it is empty.
        """
        if not lines:
            return lines

        # One-feature matrix of x-coordinates for clustering
        X = np.array([[line['original_left']] for line in lines])

        # min_samples=1 lets a single line form a column of its own
        clustering = DBSCAN(eps=self.COLUMN_EPS_PX, min_samples=1).fit(X)

        # Leftmost start position per cluster (column)
        cluster_positions = {}
        for cluster in np.unique(clustering.labels_):
            cluster_mask = clustering.labels_ == cluster
            cluster_positions[cluster] = np.min(X[cluster_mask])

        normalized_lines = []
        for line, cluster_label in zip(lines, clustering.labels_):
            normalized_line = line.copy()
            normalized_line['left'] = cluster_positions[cluster_label]
            normalized_lines.append(normalized_line)

        return normalized_lines

    def _analyze_text_sizes(self, lines, img_width, img_height):
        """Choose the page-wide font size and store it in ``self.base_font_size``.

        The size is ``median line height * (image diagonal / 1500) * 3``,
        clamped to the range 18..32. With no lines the size is 18.

        Args:
            lines: Line dicts carrying a 'height' entry.
            img_width: Image width in pixels.
            img_height: Image height in pixels.
        """
        if not lines:
            self.base_font_size = 18
            return

        heights = [line['height'] for line in lines]
        median_height = np.median(heights)

        # Scale relative to the image diagonal
        img_diagonal = np.sqrt(img_width**2 + img_height**2)
        size_factor = img_diagonal / 1500

        self.base_font_size = max(18, min(median_height * size_factor * 3, 32))

    def _write_text_to_pdf(self, canvas_obj, lines, img_height):
        """Draw every line on ``canvas_obj`` using the registered font.

        Requires ``self.base_font_size`` to have been set by
        :meth:`_analyze_text_sizes`.

        Args:
            canvas_obj: reportlab canvas to draw on.
            lines: Line dicts with 'text', 'left', 'top' and 'height'.
            img_height: Image height in pixels, used to flip the y axis.
        """
        canvas_obj.setFont('TimesNewRoman', self.base_font_size)

        for line in lines:
            # OCR boxes are measured from the top of the image, while
            # the drawing position is measured from the bottom of the page.
            y = img_height - (line['top'] + line['height'])
            canvas_obj.drawString(line['left'], y, line['text'])

def convert_image_to_pdf(image_path, output_pdf_path):
    """Convert one image to a text PDF using a fresh ImageToPdfConverter.

    Args:
        image_path: Path of the image to OCR.
        output_pdf_path: Path of the PDF file to write.
    """
    ImageToPdfConverter().process_image(image_path, output_pdf_path)

In [11]:
# Run the class-based converter on the configured input image; the result is written to 'output1.pdf'.
convert_image_to_pdf(INPUT, 'output1.pdf')

In [15]:
import pytesseract
from PIL import Image
import numpy as np
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import portrait
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

# Register Times New Roman fonts (ensure the .ttf files are available on your system).
# The names registered here are the ones image_to_editable_pdf passes to setFont:
# 'Times-Roman' for regular text and 'Times-Bold' for text judged to be bold.
# NOTE(review): these names match reportlab's built-in standard font names — presumably an
# intentional override with the TrueType faces; confirm.
pdfmetrics.registerFont(TTFont('Times-Roman', 'times.ttf'))  # Regular Times New Roman
pdfmetrics.registerFont(TTFont('Times-Bold', 'timesbd.ttf'))  # Bold Times New Roman

def image_to_editable_pdf(image_path, output_pdf):
    """OCR an image and write its text to a PDF page of matching physical size.

    Words recognised with a confidence above 60 are grouped into lines
    (``group_words_into_lines``) and each line is split into columns
    (``group_line_into_columns``). Every column is drawn as one string whose
    font size and position are converted from pixels to PDF points using the
    image's DPI. The first word of a column decides, via ``is_text_bold``,
    whether the bold or the regular face is used.

    Args:
        image_path: Path of the image to read.
        output_pdf: Path of the PDF file to create.
    """
    def _to_float(value, fallback):
        # OCR confidences and DPI entries may arrive as ints, floats,
        # numeric strings or rationals depending on library versions.
        try:
            return float(value)
        except (TypeError, ValueError, ZeroDivisionError):
            return fallback

    def _valid_dpi(value):
        # A missing, zero, negative or NaN resolution would break the
        # pixel -> point conversion; fall back to 72 (1 px == 1 pt).
        number = _to_float(value, 72.0)
        return number if number > 0 else 72.0

    # Load image, extract metadata and run OCR; the context manager closes
    # the image file once the OCR data has been collected.
    with Image.open(image_path) as img:
        raw_dpi = img.info.get('dpi') or (72, 72)
        img_width, img_height = img.size
        ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

    dpi = (_valid_dpi(raw_dpi[0]), _valid_dpi(raw_dpi[1]))

    # Calculate PDF page size in points
    pdf_width = (img_width * 72) / dpi[0]
    pdf_height = (img_height * 72) / dpi[1]
    pdf_size = (pdf_width, pdf_height)

    words = []
    n_boxes = len(ocr_data['level'])
    for i in range(n_boxes):
        text = ocr_data['text'][i].strip()
        # Filter out empty boxes and low-confidence detections
        if text and _to_float(ocr_data['conf'][i], -1.0) > 60:
            words.append({
                'text': text,
                'left': ocr_data['left'][i],
                'top': ocr_data['top'][i],
                'width': ocr_data['width'][i],
                'height': ocr_data['height'][i],
            })

    # Group words into lines and columns
    lines = group_words_into_lines(words)

    # Use the computed size as-is: forcing portrait orientation would swap
    # width and height for landscape images while the text coordinates below
    # are still computed for the landscape layout.
    c = canvas.Canvas(output_pdf, pagesize=pdf_size)
    for line in lines:
        columns = group_line_into_columns(line)
        for col in columns:
            if not col:
                continue
            # Font size: median word-box height of the column, in points
            median_height = np.median([w['height'] for w in col])
            font_size_pt = (median_height * 72) / dpi[1]

            # Baseline: bottom of the median word box, flipped from the
            # image's top-left origin to the PDF's bottom-left origin
            median_top = np.median([w['top'] for w in col])
            baseline_y_pdf = (img_height - (median_top + median_height)) * (72 / dpi[1])

            # Starting x position of the column
            first_word = col[0]
            start_x_pdf = (first_word['left'] * 72) / dpi[0]

            # Check if bold (decided from the first word only)
            is_bold = is_text_bold(first_word, font_size_pt, dpi[0])
            font_name = 'Times-Bold' if is_bold else 'Times-Roman'

            # Combine words in column into a single string
            line_text = ' '.join(word['text'] for word in col)

            # Set font and draw text
            c.setFont(font_name, font_size_pt)
            c.drawString(start_x_pdf, baseline_y_pdf, line_text)

    c.save()

def group_words_into_lines(words):
    """Partition word boxes into text lines, each ordered left to right.

    Words are visited from top to bottom. A word joins the line being built
    when its 'top' is within 0.6 x the line's mean word height of the line's
    mean 'top'; otherwise it opens a new line.

    Args:
        words: List of dicts with at least 'top', 'left' and 'height'.

    Returns:
        List of lines; each line is a list of the input word dicts sorted by
        'left'. An empty input yields an empty list.
    """
    if not words:
        return []

    by_top = sorted(words, key=lambda w: w['top'])
    head = by_top[0]

    grouped = []
    bucket = [head]
    ref_top, ref_height = head['top'], head['height']

    for candidate in by_top[1:]:
        same_line = abs(candidate['top'] - ref_top) <= ref_height * 0.6
        if not same_line:
            # Close the finished line and restart the running statistics
            grouped.append(bucket)
            bucket = [candidate]
            ref_top, ref_height = candidate['top'], candidate['height']
            continue

        bucket.append(candidate)
        # Refresh the running means over every word now in the line
        ref_top = np.mean([member['top'] for member in bucket])
        ref_height = np.mean([member['height'] for member in bucket])

    grouped.append(bucket)

    # Reading order within each line
    for bucket in grouped:
        bucket.sort(key=lambda w: w['left'])
    return grouped

def group_line_into_columns(line, threshold_factor=1.5):
    """Split one text line into columns wherever a wide horizontal gap occurs.

    A new column starts when the gap between a word's left edge and the
    previous word's right edge exceeds ``threshold_factor`` times the median
    word width of the line.

    Args:
        line: List of word dicts with 'left' and 'width'.
        threshold_factor: Multiplier applied to the median word width.

    Returns:
        List of columns, each a list of word dicts in left-to-right order;
        an empty list for an empty line.
    """
    if not line:
        return []

    ordered = sorted(line, key=lambda w: w['left'])
    max_gap = np.median([w['width'] for w in ordered]) * threshold_factor

    # The last inner list is always the column currently being filled
    result = [[ordered[0]]]
    for word in ordered[1:]:
        previous = result[-1][-1]
        right_edge = previous['left'] + previous['width']
        if word['left'] - right_edge > max_gap:
            result.append([word])
        else:
            result[-1].append(word)
    return result

def is_text_bold(word, font_size_pt, dpi_x):
    """Guess from a word's width whether it is set in bold.

    The word's box width (converted to points) is compared with a heuristic
    estimate for regular text: 0.6 x font size per character. The word counts
    as bold when it is more than 20% wider than that estimate.

    Args:
        word: Dict with 'width' (pixels) and 'text'.
        font_size_pt: Font size in points.
        dpi_x: Horizontal resolution used for the pixel -> point conversion.

    Returns:
        True if the measured width exceeds 1.2 x the expected width.
    """
    measured_pt = (word['width'] * 72) / dpi_x
    char_count = len(word['text'])
    # 0.6 is a heuristic average glyph width relative to the font size
    regular_estimate_pt = char_count * font_size_pt * 0.6
    return measured_pt > regular_estimate_pt * 1.2

# Example usage: convert the configured input image with the DPI-aware converter;
# the result is written to 'output2.pdf'.
image_to_editable_pdf(INPUT, 'output2.pdf')