In [1]:
# System package: the Tesseract OCR engine, installed with apt-get.
!apt-get install -y tesseract-ocr
# Python packages. easyocr, passporteye, fastapi, uvicorn and pyngrok are
# imported directly by the cells below; pytesseract and python-multipart are
# not imported here and are presumably runtime requirements of passporteye
# and of FastAPI file uploads respectively — TODO confirm.
# NOTE(review): none of these installs is version-pinned, so a re-run may
# pick up different releases.
!pip install pytesseract
!pip install easyocr
!pip install passporteye
!pip install fastapi uvicorn pyngrok
!pip install python-multipart

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 3s (1,893 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123623 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

<h3 style='font-weight: bold'>Import necessary packages</h3>

In [2]:
import json
import os
import string as st
import tempfile
import warnings
from getpass import getpass
from threading import Thread
from typing import Dict

import cv2
import easyocr
import matplotlib.image as mpimg
import uvicorn
from dateutil import parser
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from passporteye import read_mrz
from pyngrok import ngrok

warnings.filterwarnings('ignore')

<h3 style='font-weight: bold'>Load easyOCR engine</h3>

In [3]:
# Load the OCR engine (easyOCR) with the English language model.
# `reader` is a module-level object reused by get_data() for every image.
reader=easyocr.Reader(lang_list=['en'], gpu=True)  # Enable gpu if available



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

<h3 style='font-weight: bold'>Load country codes from <strong>country_codes.json</strong> file</h3>

In [4]:
# Load the country lookup table from the working directory.
# get_country_name() iterates over it and reads each entry's 'alpha-3' and
# 'name' keys, so it is expected to be a list of such mappings.
with open('country_codes.json') as f:
    country_codes = json.load(f)

<h3 style='font-weight: bold'>Define functions</h3>

In [5]:
def parse_date(string, iob=True):
    """Parse a year-first date string and format it as DD/MM/YYYY.

    Args:
        string (str): date text handed to ``dateutil.parser.parse`` with
            ``yearfirst=True`` (get_data passes 6-character MRZ slices).
        iob (bool): accepted for compatibility with existing callers; the
            function does not read it.

    Returns:
        str: the parsed date rendered with the ``'%d/%m/%Y'`` format.
    """
    parsed = parser.parse(string, yearfirst=True)
    return parsed.date().strftime('%d/%m/%Y')

def clean(string):
    """Drop every non-alphanumeric character and upper-case what remains."""
    kept = filter(str.isalnum, string)
    return ''.join(kept).upper()

def get_country_name(country_code):
    """Translate a three-letter country code into an upper-cased country name.

    Looks the code up in the module-level ``country_codes`` table (entries
    are read via their ``'alpha-3'`` and ``'name'`` keys). The first matching
    entry wins; when nothing matches, the code itself is returned unchanged.
    """
    matches = (
        entry['name'].upper()
        for entry in country_codes
        if entry['alpha-3'] == country_code
    )
    return next(matches, country_code)

def get_sex(code):
    """Normalise the MRZ sex character to 'M' or 'F'.

    'M'/'m'/'F'/'f' are returned upper-cased, the digit '0' is mapped to
    'M', and every other value falls back to 'F'.
    """
    if code in ('M', 'm', 'F', 'f'):
        return code.upper()
    return 'M' if code == '0' else 'F'

def print_data(data):
    """Print each entry of ``data`` as a tab-separated "Label : value" line.

    The label is the key with underscores turned into spaces and only its
    first letter capitalised. Returns None.
    """
    for field in data.keys():
        label = field.replace('_', ' ').capitalize()
        print(f'{label}\t:\t{data[field]}')

def get_data(img_name):
    """ Extract personal info from the Machine Readable Zone of img_name

    The MRZ is located with passporteye, the cropped region is written to a
    temporary PNG, re-read with OpenCV, resized and OCR'd with easyOCR. The
    first two OCR lines are padded to 44 characters with '<' and sliced at
    fixed positions to obtain the individual fields.

    Args:
        img_name (str or fp): name or path of the passport image

    Returns:
        dict or None: None (after printing a message) when no MRZ is found or
            OCR yields fewer than two lines; otherwise a dictionary of
            extracted data with keys and corresponding values as follows:
                name            : name
                surname         : surname
                sex             : sex
                date_of_birth   : date of birth (DD/MM/YYYY)
                nationality     : nationality
                passport_type   : passport type
                passport_number : passport number
                issuing_country : issuing country
                expiration_date : expiration date (DD/MM/YYYY)
                personal_number : personal number

    Raises:
        Whatever parse_date raises when the OCR'd date slices cannot be
        parsed; this is not caught here.
    """
    new_im_path = 'tmp.png'

    # Crop image to Machine Readable Zone(MRZ)
    mrz = read_mrz(img_name, save_roi=True)
    if not mrz:
        print(f'Machine cannot read image {img_name}.')
        return None

    # Round-trip the cropped region through a PNG so OpenCV loads it the same
    # way as before. The image is fully in memory after imread, so the
    # temporary file is removed right away — even if saving/reading fails.
    try:
        mpimg.imsave(new_im_path, mrz.aux['roi'], cmap='gray')
        img = cv2.imread(new_im_path)
    finally:
        if os.path.exists(new_im_path):
            os.remove(new_im_path)

    img = cv2.resize(img, (1110, 140))

    allowlist = st.ascii_letters + st.digits + '< '
    code = reader.readtext(img, paragraph=False, detail=0, allowlist=allowlist)

    # The field slicing below needs two text lines; treat anything less as an
    # unreadable image instead of failing with an IndexError.
    if len(code) < 2:
        print(f'Machine cannot read image {img_name}.')
        return None

    # Upper-case both lines and pad short ones to 44 characters with '<'.
    a, b = (line.upper().ljust(44, '<') for line in code[:2])

    # Line 1, positions 5-43: "SURNAME<<GIVEN<NAMES". A missing '<<'
    # separator leaves the given names empty.
    surname, _, names = a[5:44].partition('<<')

    # NOTE(review): the date slices are 6 characters wide, i.e. presumably
    # two-digit years — confirm parse_date resolves the century as intended
    # for both birth and expiry dates.
    user_info = {}
    user_info['name'] = names.replace('<', ' ').strip().upper()
    user_info['surname'] = surname.replace('<', ' ').strip().upper()
    user_info['sex'] = get_sex(clean(b[20]))
    user_info['date_of_birth'] = parse_date(b[13:19])
    user_info['nationality'] = get_country_name(clean(b[10:13]))
    user_info['passport_type'] = clean(a[0:2])
    user_info['passport_number'] = clean(b[0:9])
    user_info['issuing_country'] = get_country_name(clean(a[2:5]))
    user_info['expiration_date'] = parse_date(b[21:27])
    user_info['personal_number'] = clean(b[28:42])

    return user_info

<h3 style='font-weight: bold'>Example: serve the extractor through a FastAPI endpoint</h3>

In [6]:
# Create the FastAPI application that exposes the passport extractor.
app = FastAPI()

# Configure CORS settings
# NOTE(review): every origin, method and header is allowed while credentials
# are enabled — confirm this is intended before exposing the server publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.post("/extract_passport_data")
async def extract_passport_data(file: UploadFile = File(...)) -> Dict:
    """Extract passport fields from an uploaded image.

    Args:
        file: the uploaded passport image.

    Returns:
        Dict: the dictionary produced by get_data(), serialised as JSON.

    Raises:
        HTTPException: 400 when the upload is empty; 422 when get_data()
            cannot extract anything from the image.
    """
    # Read the whole upload before touching the disk so no file handle is
    # held open across the await.
    content = await file.read()
    if not content:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    # Save the uploaded file temporarily under a unique name so concurrent
    # requests never write to the same path.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as image_file:
        image_file.write(content)
        img_name = image_file.name

    try:
        # Extract data using the pre-defined function
        data = get_data(img_name)
    finally:
        # Always remove the temporary upload, even when extraction raises.
        os.remove(img_name)

    # get_data() returns None when the image cannot be read; report that to
    # the client instead of crashing in print_data().
    if not data:
        raise HTTPException(
            status_code=422,
            detail="Could not read the machine readable zone of the uploaded image.",
        )

    # Print data for logging (optional)
    print_data(data)

    # Return extracted data as JSON
    return data

# Run the server in a separate thread
def run():
    """Serve `app` with uvicorn on host 0.0.0.0, port 8001."""
    uvicorn.run(app, host="0.0.0.0", port=8001)

# Start the FastAPI app on a background thread so that later cells
# (the ngrok tunnel below connects to the same port, 8001) can still run.
server = Thread(target=run)
server.start()

In [7]:
# Set the authtoken
# Authentication Token is available at https://dashboard.ngrok.com/get-started/your-authtoken
# The token is read from the NGROK_AUTHTOKEN environment variable, falling
# back to an interactive prompt, so the secret is never stored in the notebook.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_auth_token)

# Connect to ngrok: open a tunnel to the local port the FastAPI server uses
ngrok_tunnel = ngrok.connect(8001)

# Print the public URL
print('Public URL:', ngrok_tunnel.public_url)

Downloading ngrok ...

INFO:     Started server process [568]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)


Public URL: https://f0ad-34-81-225-169.ngrok-free.app
