pip install ocrmypdf

pip install pytesseract

pip install opencv-python

pip install pdf2image

In [1]:
# import libs
try:
    from PIL import Image
except ImportError:
    import Image
import cv2
import pytesseract
import os
import numpy as np
import pandas as pd
import re
from pdf2image import convert_from_bytes

In [2]:
# Some help functions 
def get_conf(page_gray):
    '''return a average confidence value of OCR result '''
    df = pytesseract.image_to_data(page_gray,output_type='data.frame')
    df.drop(df[df.conf==-1].index.values,inplace=True)
    df.reset_index()
    return df.conf.mean()
  
def deskew(image):
    '''deskew the image'''
    gray = cv2.bitwise_not(image)
    temp_arr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    coords = np.column_stack(np.where(temp_arr > 0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

In [3]:
'''
Main part of OCR:
pages_df: save eextracted text for each pdf file, index by page
OCR_dic : dict for saving df of each pdf, filename is the key
'''
PATH = your_direcotry #DEFINE YOU PATH HERE
file_list = ["FILE1.pdf","FILE2.pdf"] #LIST OF YOUR PDF FILE NAMES
OCR_dic={} 
for file in file_list:
    # convert pdf into image
    pdf_file = convert_from_bytes(open(os.path.join(PATH,file), 'rb').read())
    # create a df to save each pdf's text
    pages_df = pd.DataFrame(columns=['conf','text'])
    for (i,page) in enumerate(pdf_file) :
        try:
            # transfer image of pdf_file into array
            page_arr = np.asarray(page)
            # transfer into grayscale
            page_arr_gray = cv2.cvtColor(page_arr,cv2.COLOR_BGR2GRAY)
            # deskew the page
            page_deskew = deskew(page_arr_gray)
            # cal confidence value
            page_conf = get_conf(page_deskew)
            # extract string 
            pages_df = pages_df.append({'conf': page_conf,'text': pytesseract.image_to_string(page_deskew)}, ignore_index=True)
        except:
            # if can't extract then give some notes into df
            pages_df = pages_df.append({'conf': -1,'text': 'N/A'}, ignore_index=True)
            continue
    # save df into a dict with filename as key        
    OCR_dic[file]=pages_df
    print('{} is done'.format(file))

11312470_BCB Group annual 2021.pdf is done
08458210_Elliptic annual 2022.pdf is done


In [6]:
OCR_dic

{'11312470_BCB Group annual 2021.pdf':    conf text
 0    -1  N/A
 1    -1  N/A
 2    -1  N/A
 3    -1  N/A
 4    -1  N/A
 5    -1  N/A
 6    -1  N/A
 7    -1  N/A
 8    -1  N/A
 9    -1  N/A
 10   -1  N/A
 11   -1  N/A,
 '08458210_Elliptic annual 2022.pdf':    conf text
 0    -1  N/A
 1    -1  N/A
 2    -1  N/A
 3    -1  N/A
 4    -1  N/A
 5    -1  N/A
 6    -1  N/A
 7    -1  N/A
 8    -1  N/A
 9    -1  N/A
 10   -1  N/A
 11   -1  N/A
 12   -1  N/A
 13   -1  N/A
 14   -1  N/A
 15   -1  N/A
 16   -1  N/A
 17   -1  N/A
 18   -1  N/A
 19   -1  N/A
 20   -1  N/A
 21   -1  N/A
 22   -1  N/A
 23   -1  N/A
 24   -1  N/A
 25   -1  N/A
 26   -1  N/A
 27   -1  N/A
 28   -1  N/A
 29   -1  N/A
 30   -1  N/A
 31   -1  N/A
 32   -1  N/A
 33   -1  N/A
 34   -1  N/A
 35   -1  N/A
 36   -1  N/A
 37   -1  N/A
 38   -1  N/A
 39   -1  N/A
 40   -1  N/A
 41   -1  N/A
 42   -1  N/A
 43   -1  N/A}