<a href="https://colab.research.google.com/github/Bergrebell/CarrierApp/blob/master/20220822_layoutparser_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!sudo apt-get install poppler-utils #pdf2image dependency -- restart runtime/kernel after installation
!sudo apt-get install tesseract-ocr-eng #install Tesseract OCR Engine --restart runtime/kernel after installation

In [None]:
%%capture
# Python dependencies: layoutparser and torchvision, detectron2 installed from
# its GitHub repository at tag v0.5, pdf2image, and layoutparser's "ocr" extra.
!pip install layoutparser torchvision && pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2"
!pip install pdf2image
!pip install "layoutparser[ocr]"

# Reinstall Pillow to fix a wrong-PIL-version issue.
# Restart the runtime/kernel after installation.
!pip uninstall -y Pillow
!pip install Pillow

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and store
# files there (Colab asks for access to the Drive account).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
import nltk
# NOTE(review): the only NLTK call visible in this notebook is
# nltk.tokenize.sent_tokenize; a smaller download than the whole 'book'
# collection may be sufficient -- confirm before changing.
nltk.download('book') # download the 'book' resource collection (resources for NLP with Python)

In [10]:
import layoutparser as lp
from pdf2image import convert_from_bytes
import numpy as np
import glob
from pathlib import Path
from matplotlib import pyplot as plt
import pandas

import warnings
warnings.filterwarnings("ignore") # used to hide UserWarning: __floordiv__ 

# Directory that is searched for '*.pdf' files to process.
INPUT_DIR = '/content/drive/MyDrive/bizres/good_bad_test_report_set'
# Directory the resulting CSV files are written to.
OUTPUT_DIR = '/content/drive/MyDrive/bizres/good_bad_test_report_set_output'
# Prefix put in front of every output CSV file name.
VERSION = 'v1'
# When True, the detected layout of every page is plotted while converting.
DEBUG_IMAGES = False
# Value passed to the layout model as MODEL.ROI_HEADS.SCORE_THRESH_TEST.
IMAGE_DETEC_TRESH = 0.5

# Layout detection model built from the PrimaLayout Mask R-CNN config.
# label_map translates the numeric class ids into region names; the conversion
# code below only runs OCR on blocks labelled "TextRegion".
MODEL1 = lp.Detectron2LayoutModel('lp://PrimaLayout/mask_rcnn_R_50_FPN_3x/config',
                                 extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", IMAGE_DETEC_TRESH],
                                 label_map={1:"TextRegion", 2:"ImageRegion", 3:"TableRegion", 4:"MathsRegion", 5:"SeparatorRegion", 6:"OtherRegion"})

def convert_pdf_to_text(input_path, model):
    """Run layout detection and OCR over every page of a PDF and return the text.

    The PDF is converted to one image per page. Each page is passed to
    ``model.detect``; every detected block of type ``'TextRegion'`` is cropped
    (with 3 pixels of padding on each side) and read by a Tesseract OCR agent.

    Args:
        input_path: Path of the PDF file to read.
        model: Layout model exposing ``detect(image)``. The blocks of the
            returned layout must carry a ``type`` attribute.

    Returns:
        str: The OCR text of all text blocks, page by page and in the order
        the blocks appear in the detected layout, each followed by a single
        newline.
    """
    # Read the file inside a context manager so the handle is always closed.
    with open(input_path, 'rb') as pdf_file:
        pdf_images = convert_from_bytes(pdf_file.read())

    # The agent is constructed without page-specific arguments, so it is
    # created once and reused for every page.
    ocr_agent = lp.TesseractAgent()

    collected = []
    # Loop through each page (1-based index for the status message).
    for index, image in enumerate(pdf_images, start=1):
        # Convert the page image to an array of pixel values.
        image = np.array(image)
        layout = model.detect(image)

        # Show the detected layout for debugging purposes.
        if DEBUG_IMAGES:
            display_debug_images(image, layout)

        # Keep only the text regions of the page.
        text_blocks = lp.Layout([b for b in layout if b.type == 'TextRegion'])

        # OCR every text block and store the result on the block itself.
        for block in text_blocks:
            segment_image = (block
                             .pad(left=3, right=3, top=3, bottom=3)
                             .crop_image(image))
            block.set(text=ocr_agent.detect(segment_image), inplace=True)

        # Every block's text is terminated by exactly one newline.
        collected.extend(txt + '\n' for txt in text_blocks.get_texts())
        print(f'STATUS: Page #{index} scanned.')
    return ''.join(collected)

def display_debug_images(image, layout):
    """Plot a page image with its detected layout boxes drawn on top.

    Args:
        image: The page image the layout was detected on.
        layout: The detected layout whose boxes are drawn (box width 4).

    Note:
        The figure size and autolayout settings are written to matplotlib's
        global ``rcParams``.
    """
    plt.rcParams.update({
        "figure.figsize": [25, 18],
        "figure.autolayout": True,
    })
    plt.imshow(lp.draw_box(image, layout, box_width=4))
    plt.show()

def store_csv_file_from_pdf_path(input_pdf_path, model, version):
    """Convert one PDF to cleaned sentences and store them as a CSV file.

    The file is written to ``OUTPUT_DIR`` as ``<version>_<pdf stem>.csv`` with
    one sentence per row and no index column.

    Args:
        input_pdf_path: Path of the PDF file to convert.
        model: Layout model forwarded to ``convert_pdf_to_text``.
        version: Prefix used in the output file name.
    """
    raw_text = convert_pdf_to_text(input_pdf_path, model)
    clean_text_list = cleanup_text(raw_text)
    df = pandas.DataFrame(data=clean_text_list)

    clean_filename = Path(input_pdf_path).stem
    output_path = f'{OUTPUT_DIR}/{version}_{clean_filename}.csv'
    # Create the target directory up front; otherwise to_csv fails on a
    # missing directory only after the expensive conversion has finished.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    print(f'STATUS: Storing {output_path}')
    df.to_csv(output_path, sep=',', index=False)

def cleanup_text(raw_text):
    """Split raw OCR text into sentences and drop the very short ones.

    Newlines are turned into spaces and every "- " sequence is replaced by a
    single space before the text is handed to NLTK's sentence tokenizer.

    Args:
        raw_text: The text to clean up.

    Returns:
        list: The sentences that are non-empty and yield at least four pieces
        when split on a single space.
    """
    # NOTE(review): replacing "- " with a space turns a line-break hyphenation
    # such as "exam-\nple" into "exam ple" (two words) -- confirm this is intended.
    flattened = raw_text.replace("\n", " ")
    flattened = flattened.replace("- ", " ")
    sentences = nltk.tokenize.sent_tokenize(flattened)

    # Splitting each sentence on newlines mirrors joining all sentences with
    # newlines and splitting the result again.
    pieces = (piece for sentence in sentences for piece in sentence.split('\n'))
    return [piece for piece in pieces if piece and len(piece.split(' ')) >= 4]

def iterate_over_input_pdfs(input_dir, model, version):
    """Convert every ``*.pdf`` file directly inside ``input_dir`` to a CSV file.

    Args:
        input_dir: Directory that is searched (non-recursively) for PDF files.
        model: Layout model forwarded to ``store_csv_file_from_pdf_path``.
        version: Output file name prefix forwarded to
            ``store_csv_file_from_pdf_path``.
    """
    print('Start iterating...')
    # glob returns matches in arbitrary order; sort them so that every run
    # processes the files in the same sequence.
    input_pdf_paths = sorted(glob.glob(f'{input_dir}/*.pdf'))
    if not input_pdf_paths:
        # Make a wrong or empty input directory visible instead of silently
        # doing nothing.
        print(f'STATUS: No PDF files found in {input_dir}')
        return
    for input_pdf_path in input_pdf_paths:
        store_csv_file_from_pdf_path(input_pdf_path, model, version)

# Run the pipeline: convert every PDF found in INPUT_DIR using MODEL1 and
# store one CSV per PDF, prefixed with VERSION.
iterate_over_input_pdfs(INPUT_DIR, MODEL1, VERSION)


  [35mpixel_mean[0m
  [35mpixel_std[0m
  [35mproposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}[0m


Start iterating...
STATUS: Page #1 scanned.
STATUS: Page #2 scanned.
STATUS: Page #3 scanned.
STATUS: Page #4 scanned.
STATUS: Page #5 scanned.
STATUS: Page #6 scanned.
STATUS: Page #7 scanned.
STATUS: Page #8 scanned.
STATUS: Page #9 scanned.
STATUS: Page #10 scanned.
STATUS: Page #11 scanned.
STATUS: Page #12 scanned.
STATUS: Page #13 scanned.
STATUS: Page #14 scanned.
STATUS: Page #15 scanned.
STATUS: Page #16 scanned.
STATUS: Page #17 scanned.
STATUS: Page #18 scanned.
STATUS: Page #19 scanned.
STATUS: Page #20 scanned.
STATUS: Page #21 scanned.
STATUS: Page #22 scanned.
STATUS: Page #23 scanned.
STATUS: Page #24 scanned.
STATUS: Page #25 scanned.
STATUS: Page #26 scanned.


KeyboardInterrupt: ignored