In [60]:
import argparse
from enum import Enum
import io
import os
from pprint import pprint

from google.cloud import vision
from PIL import Image, ImageDraw


class FeatureType(Enum):
    """Levels of the OCR layout hierarchy, ordered from coarsest to finest.

    A member is passed to ``get_document_bounds`` to select which level's
    bounding boxes are collected.
    """

    PAGE = 1    # a whole page
    BLOCK = 2   # a block inside a page
    PARA = 3    # a paragraph inside a block
    WORD = 4    # a word inside a paragraph
    SYMBOL = 5  # a single symbol inside a word

# Grant credentials to the notebook: publish the path of the key file
# (relative to the working directory) through the environment variable.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='credentials.json')


def draw_boxes(image, bounds, color):
    """Outline every bounding polygon in ``bounds`` on ``image``.

    Args:
        image: Image to draw on; it is modified in place.
        bounds: Iterable of objects exposing ``vertices``; the first four
            vertices (each with ``x`` and ``y``) define the outline.
        color: Outline colour handed to ``ImageDraw``.

    Returns:
        The same ``image`` object that was passed in.
    """
    pen = ImageDraw.Draw(image)

    for box in bounds:
        # Exactly four corners are read, so a shorter vertex list raises IndexError.
        corners = [box.vertices[index] for index in range(4)]
        # Flatten to [x0, y0, x1, y1, ...] as expected by ``polygon``.
        outline = [value for corner in corners for value in (corner.x, corner.y)]
        # Second positional argument is the fill (none); third is the outline.
        pen.polygon(outline, None, color)

    return image


def get_document_bounds(image_file, feature):
    """Collect the bounding boxes of one layout level of an OCR'd image.

    The image is annotated through ``getAnnotations`` and the resulting
    document is walked page -> block -> paragraph -> word -> symbol.

    Args:
        image_file: Image path, forwarded to ``getAnnotations``.
        feature: ``FeatureType`` member naming the level to collect.

    Returns:
        List of ``bounding_box`` objects in document order.  The list is
        empty for ``FeatureType.PAGE`` and for any other value that is not
        BLOCK, PARA, WORD or SYMBOL, because no box is collected for those.
    """
    document = getAnnotations(image_file)

    # Flatten the hierarchy one level at a time and stop at the requested one.
    blocks = [blk for page in document.pages for blk in page.blocks]
    if feature == FeatureType.BLOCK:
        return [blk.bounding_box for blk in blocks]

    paragraphs = [par for blk in blocks for par in blk.paragraphs]
    if feature == FeatureType.PARA:
        return [par.bounding_box for par in paragraphs]

    words = [wrd for par in paragraphs for wrd in par.words]
    if feature == FeatureType.WORD:
        return [wrd.bounding_box for wrd in words]

    if feature == FeatureType.SYMBOL:
        return [sym.bounding_box for wrd in words for sym in wrd.symbols]

    return []

def getAnnotations(image_file):
    """Run Cloud Vision document text detection on an image file.

    Args:
        image_file: Path of the image whose bytes are sent to the API.

    Returns:
        The ``full_text_annotation`` of the API response (the document whose
        ``pages`` the other helpers iterate over).

    Raises:
        RuntimeError: If the response carries an error message, so a failed
            request is not mistaken for an image without text.
    """
    client = vision.ImageAnnotatorClient()

    # A separate name for the handle keeps the path parameter from being shadowed.
    with io.open(image_file, "rb") as image_handle:
        content = image_handle.read()

    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)

    # Failures for an individual image are reported inside the response
    # rather than raised by the client, so surface them explicitly.
    if response.error.message:
        raise RuntimeError(
            "Vision API error for {!r}: {}".format(image_file, response.error.message)
        )

    return response.full_text_annotation


def render_doc_text(filein, fileout):
    """Draw OCR block, paragraph and word outlines on an image.

    Blocks are outlined in blue, then paragraphs in red, then words in
    yellow, so finer levels are painted over coarser ones.

    Args:
        filein: Path of the image to annotate.
        fileout: Destination for the annotated image.  Any falsy value
            (``0``, ``None``, ``""``) displays the result with
            ``image.show()`` instead of saving it.
    """
    # The context manager releases the image's file handle when done.
    with Image.open(filein) as image:
        # One OCR request feeds all three layers; calling
        # get_document_bounds once per layer would send the same image to
        # the API three times.
        document = getAnnotations(filein)

        block_boxes, para_boxes, word_boxes = [], [], []
        for page in document.pages:
            for block in page.blocks:
                block_boxes.append(block.bounding_box)
                for paragraph in block.paragraphs:
                    para_boxes.append(paragraph.bounding_box)
                    word_boxes.extend(word.bounding_box for word in paragraph.words)

        draw_boxes(image, block_boxes, "blue")
        draw_boxes(image, para_boxes, "red")
        draw_boxes(image, word_boxes, "yellow")

        if fileout:
            image.save(fileout)
        else:
            image.show()

def get_doc_metadata(img):
    """Return the OCR annotation for ``img``.

    Thin alias for ``getAnnotations``: the argument is forwarded and the
    result is handed back unchanged.
    """
    return getAnnotations(img)

In [68]:
# Execution sandbox
# Create a pdf with the img as background
from PIL import Image, ImageDraw
from io import BytesIO
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
from PyPDF2 import PdfMerger

# render_doc_text("resources/ocrtest2.png", "out/doc_ocr_out.png")

# Set ReportLab's ``warnOnMissingFontGlyphs`` option to 0.
# NOTE(review): presumably this silences warnings for glyphs the font lacks;
# confirm against the ReportLab configuration docs.
import reportlab.rl_config
reportlab.rl_config.warnOnMissingFontGlyphs = 0

# Register the 'HeiseiMin-W3' Unicode CID font with pdfmetrics so that
# imgToPdf can select it by that name via ``can.setFont('HeiseiMin-W3', ...)``.
# NOTE(review): presumably chosen for CJK coverage; verify it covers every
# script the OCR is expected to return.
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.cidfonts import UnicodeCIDFont
pdfmetrics.registerFont(UnicodeCIDFont('HeiseiMin-W3'))

def _paragraph_text(paragraph):
    """Join the text of every symbol of ``paragraph`` in reading order."""
    return "".join(
        symbol.text for word in paragraph.words for symbol in word.symbols
    )


def _rotated_anchor(vertex, img_size, page_size):
    """Map an image-pixel vertex into the page-centred, -90 degree rotated frame."""
    img_w, img_h = img_size
    page_w, page_h = page_size

    # Image y grows downwards while PDF y grows upwards: flip, then scale
    # from pixels to page units.
    x = vertex.x * page_w / img_w
    y = (img_h - vertex.y) * page_h / img_h

    # Move the origin to the page centre (matches the canvas translate).
    x -= page_w / 2
    y -= page_h / 2

    # Express the point in axes rotated by -90 degrees.
    return -y, x


def imgToPdf(imgF, pdf_merger):
    """OCR an image and append it to ``pdf_merger`` as a one-page PDF.

    The page shows the image stretched to the canvas page size.  Every OCR
    paragraph is written on top of it as invisible text, anchored at the
    first vertex of the paragraph's bounding box and drawn in a coordinate
    system rotated by -90 degrees about the page centre.

    Symbols are concatenated without separators, so spaces or line breaks
    between words are not reproduced in the text layer.

    Args:
        imgF: Path of the source image; used for OCR and as page background.
        pdf_merger: Object whose ``append`` receives the in-memory PDF.

    Returns:
        None.  ``pdf_merger`` is updated in place.
    """
    annotations = get_doc_metadata(imgF)

    # Create the in-memory PDF
    pdf_buffer = BytesIO()
    can = canvas.Canvas(pdf_buffer)
    can.setFont('HeiseiMin-W3', 16)
    page_size = can._pagesize
    rot = -90

    # Keep the image open until the PDF has been written, then release the
    # file handle.
    with Image.open(imgF) as img:
        # Write img to the canvas (stretch to fill page)
        can.drawImage(ImageReader(img), 0, 0, page_size[0], page_size[1])

        can.translate(page_size[0] / 2, page_size[1] / 2)
        can.rotate(rot)

        paragraphs = (
            paragraph
            for page in annotations.pages
            for block in page.blocks
            for paragraph in block.paragraphs
        )
        for paragraph in paragraphs:
            parTxt = _paragraph_text(paragraph)
            vertices = paragraph.bounding_box.vertices
            if not parTxt or not vertices:
                # Nothing to write, or nowhere to anchor it.
                continue

            x, y = _rotated_anchor(vertices[0], img.size, page_size)
            textObj = can.beginText(x, y)

            # Styling must be emitted BEFORE the text: the text object
            # records operators in call order, so setting the colour after
            # textOut() leaves the text in the previously active colour.
            textObj.setFillColor('white', alpha=0.0)
            # PDF text render mode 3 = invisible (neither filled nor stroked).
            textObj.setTextRenderMode(3)
            textObj.textOut(parTxt)

            can.drawText(textObj)

        # Test draw string
        # NOTE(review): looks like debugging residue; it is embedded in every
        # generated page - confirm whether it should stay.
        can.drawString(100, 100, "你好")

        can.rotate(-rot)

        # Save PDF file
        can.save()

    pdf_buffer.seek(0)

    # Merge PDF files
    pdf_merger.append(pdf_buffer)

# Build one PDF containing a page per source image.
pdf_merger = PdfMerger()
try:
    for image_path in ("resources/ocrtest2.png", "resources/wakeupcat.jpg"):
        imgToPdf(image_path, pdf_merger)

    pdf_merger.write('out/doc_ocr_output.pdf')
finally:
    # Close the merger even when OCR or writing fails part-way through.
    pdf_merger.close()

In [62]:
# Dump annotations to JSON
# `annotations` is only ever a local variable inside the helper functions, so
# fetch it here explicitly; otherwise this cell raises NameError on a fresh kernel.
# NOTE(review): the first OCR test image is assumed - change the path if a
# different document was meant.
annotations = get_doc_metadata("resources/ocrtest2.png")

with open('out/doc_ocr_output.json', 'w', encoding="utf-8") as f:
    # str(message) produces protobuf text format, not JSON; use the message
    # class' own JSON serialiser so the file really contains JSON.
    f.write(type(annotations).to_json(annotations))

In [63]:
import sys

# Report the interpreter's default string encoding.
default_encoding = sys.getdefaultencoding()
print(default_encoding)

utf-8
