# OCR demo for Coins and Medals registers

This Jupyter notebook contains code and data files for a demonstration of Optical Character Recognition (OCR) on scanned coin acquisition registers from the Fitzwilliam Museum, covering the years 1975 to 1989.

In [6]:
#!/usr/bin/python
## Split pdf files into pages
## Daniel Pett 11/2/2021
__author__ = 'portableant'
## Tested on Python 2.7.16 & Python 3.9.1 on OSX Big Sur
## Usage example
## python3 splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed -o ocr
## On Mac OSX you may need to install poppler and add it to your PATH
## brew install poppler and echo 'export PATH="/usr/local/opt/qt/bin:$PATH"' >> ~/.zshrc
## NOTE(review): the PATH entry above points at a qt directory rather than poppler -- confirm it is the intended one

To begin with, import the libraries needed for this script to run. 

For this you will need os, sys, argparse, Pillow, pytesseract, PyPDF2 (which provides PdfFileWriter and PdfFileReader) and pdf2image.

In [32]:
import argparse
import os
import sys
from PIL import Image
import pytesseract
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdf2image import convert_from_path, pdfinfo_from_path


If you don't have tesseract executable in your PATH, include the following:

```pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'```

Example (Windows): `tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'`

As I built this on OSX, I used this path.

In [8]:
# Location of the tesseract executable on the machine this demo was built on
# (OSX). Change this single value if tesseract lives elsewhere on your system.
TESSERACT_CMD = r'/usr/local/bin/tesseract'
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD


Next we'll be setting up our arguments for the command line script to run

In [26]:
# Command line interface for the script: five options, all of them mandatory,
# each storing a single string value.
parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')

# (short flag, long flag, help text) for every required option, listed in the
# order they are registered with the parser.
option_specs = (
    ('-p', '--path', 'The path to the folder to process'),
    ('-f', '--file', 'The file to process'),
    ('-n', '--name', 'The new file name'),
    ('-d', '--destination', 'The processed folder'),
    ('-o', '--ocr', 'The ocr folder'),
)

for short_flag, long_flag, help_text in option_specs:
    parser.add_argument(short_flag, long_flag, help=help_text, required=True)


_StoreAction(option_strings=['-o', '--ocr'], dest='ocr', nargs=None, const=None, default=None, type=None, choices=None, help='The ocr folder', metavar=None)

Next we'll parse the arguments sent to the script and build the file paths used in the rest of the notebook. A notebook has no command line, so in this demo the arguments are passed in explicitly, as shown below:

In [33]:
### Parse arguments and create paths

# A notebook has no command line, so the argument list is passed in explicitly.
# The full option names are spelled out: abbreviated forms such as '--p' are
# only accepted because argparse matches unambiguous prefixes of long options,
# and would stop working if another option starting with the same letter were
# added or prefix matching were disabled.
args = parser.parse_args(args=[
    '--path', '.',
    '--file', '1975_1989.pdf',
    '--name', '1975_1989_processed',
    '--destination', 'processed',
    '--ocr', 'ocr',
])

# Base folder; every other path below is built relative to it
path = args.path

# Folder given by --destination (described as "The processed folder")
destination = os.path.join(path, args.destination)

# Folder given by --ocr (described as "The ocr folder")
ocrfolder = os.path.join(path, args.ocr)

# printf-style template inside the destination folder: <name><value>.pdf,
# where the '%s' placeholder is filled in when the template is used
pageName = os.path.join(destination, args.name) + '%s.pdf'

# Full path of the file given by --file
fileName = os.path.join(path, args.file)


Now we will make the folders if they do not exist

In [28]:
# Create each working folder unless it is already there: the destination
# folder, the ocr folder, and an 'images' folder relative to the current
# working directory.
for folder in (destination, ocrfolder, 'images'):
    if not os.path.exists(folder):
        os.makedirs(folder)

Next we open the input file:

In [29]:
# Open the source pdf in binary mode and hand the stream to PyPDF2.
# NOTE(review): the handle is never closed; presumably the reader pulls page
# data from it on demand, so it has to stay open while inputpdf is in use --
# confirm before adding a close().
pdf_stream = open(fileName, "rb")
inputpdf = PdfFileReader(pdf_stream)

Next we go through the pdf and break into pages

In [30]:
# Write every page of the source pdf to its own single-page pdf file.
# Output files are numbered from 1, while the page index passed to getPage
# starts at 0.
page_total = inputpdf.numPages
for page_index in range(page_total):
    single_page = PdfFileWriter()
    single_page.addPage(inputpdf.getPage(page_index))
    page_path = pageName % (page_index + 1)
    with open(page_path, "wb") as page_stream:
        single_page.write(page_stream)

Next we convert the pdfs to images and then run the tesseract OCR

In [31]:
# Convert every pdf in the destination folder to a JPEG, run tesseract OCR on
# the JPEG and save the recognised text next to it as a .txt file.
for pdf_file in os.listdir(destination):
    # Skip anything in the folder that is not a pdf
    if not pdf_file.endswith(".pdf"):
        continue

    # convert_from_path returns one image per page of the pdf
    pages = convert_from_path(os.path.join(destination, pdf_file))
    base_name = os.path.splitext(pdf_file)[0]

    for page_number, page in enumerate(pages, start=1):
        # A single-page pdf keeps the plain name. If a pdf holds several
        # pages, the page number is appended so that later pages do not
        # overwrite the image and text of the first one.
        if len(pages) == 1:
            stem = base_name
        else:
            stem = '%s_%d' % (base_name, page_number)

        jpgName = os.path.join('./images/', stem + '.jpg')
        page.save(jpgName, 'JPEG')

        # Page segmentation mode 4 treats the page as a single column of text
        # of variable sizes. The option must be written '--psm 4': without the
        # leading dashes tesseract does not read it as the psm option.
        # The saved JPEG is re-opened in a with-block so the file handle is
        # released as soon as OCR has finished.
        with Image.open(jpgName) as jpg:
            text = pytesseract.image_to_string(jpg, config='--psm 4')

        # Write into the folder built from the --ocr argument instead of a
        # fixed './ocr/', so the text lands in the folder that was created.
        ocrName = os.path.join(ocrfolder, stem + '.txt')
        with open(ocrName, mode='w') as f:
            f.write(text)

This has now run OCR on each image file in a loop and saved the output to a text file.