## Importing Relevant Libraries


In [1]:
import json
import time
import requests
import os
from azure.storage.blob import BlockBlobService
import pprint
from os import listdir
from os.path import isfile, join
import shutil
import pickle

## Create Local Folders


In [2]:
# Make sure the local working folders exist before anything is downloaded or written
# *incidentreport* receives the pdf files to be converted to json
# *formrecogoutput* receives the converted json files
for folder_name in ("incidentreport", "formrecogoutput"):
    folder_path = os.getcwd() + "/" + folder_name
    if os.path.isdir(folder_path):
        # Folder is already there from a previous run
        continue
    os.makedirs(folder_path)

## Downloading the PDF forms from a container

Downloads all PDF files from the container.
Skips PDF files that have already been downloaded.

In [3]:
# %%time
# Download new pdf files from the *incidentreport* container to the local *incidentreport* folder.
# Blobs already recorded in the *merged_log* file are skipped.
# Set up configs for blob storage
STORAGE_ACCOUNT_NAME = "#STORAGE_ACCOUNT_NAME#"
STORAGE_ACCOUNT_ACCESS_KEY = "#STORAGE_ACCOUNT_KEY#"
STORAGE_CONTAINER_NAME = "incidentreport"
STORAGE_LOG_CONTAINER = "formrecoglog"

# Instantiating a blob service object
blob_service = BlockBlobService(STORAGE_ACCOUNT_NAME, STORAGE_ACCOUNT_ACCESS_KEY)

# Download the log files used for tracking the processing of forms
log_blobs = blob_service.list_blobs(STORAGE_LOG_CONTAINER)
for blob in log_blobs:
    blob_service.get_blob_to_path(STORAGE_LOG_CONTAINER, blob.name, os.path.join(os.getcwd(), blob.name))

# Load the set of already-downloaded blob names once, before the loop. The set
# is written back after every download, so re-reading it per blob adds nothing.
# NOTE(review): pickle.load can execute arbitrary code from the file it reads and
# merged_log may have just been downloaded from the log container - confirm that
# container is write-restricted, or switch the log to a plain-text/JSON format.
try:
    with open('merged_log', 'rb') as f:
        merged_files = pickle.load(f)
except FileNotFoundError:
    # First run: nothing has been downloaded yet
    merged_files = set()

blobs = blob_service.list_blobs(STORAGE_CONTAINER_NAME)
for blob in blobs:
    # Only pdf blobs are forms
    if not blob.name.rsplit('.', 1)[-1] == 'pdf':
        continue
    # If file is already processed then continue to next file
    if blob.name in merged_files:
        continue
    download_file_path = os.path.join(os.getcwd(), "incidentreport", blob.name)
    blob_service.get_blob_to_path(STORAGE_CONTAINER_NAME, blob.name, download_file_path)
    merged_files.add(blob.name)
    # Persist the log after every download so an interrupted run keeps its progress
    with open('merged_log', 'wb') as f:
        pickle.dump(merged_files, f)

<azure.storage.blob.models.Blob object at 0x7ff81cfd6b00>

In [4]:
# Count the regular files sitting in the local *incidentreport* folder,
# i.e. the forms waiting to be converted to JSON
incident_dir = os.getcwd()+"/incidentreport"
files = []
for entry_name in listdir(incident_dir):
    # Keep regular files only
    if isfile(join(incident_dir, entry_name)):
        files.append(entry_name)
len(files)

1

## Querying the custom Form Recognizer model (PDF -> JSON)

- Converts each PDF to JSON by querying the trained custom model.
- Cleans the JSON output.
- Skips any file that has already been converted to JSON.


In [5]:
# %%time
# Send every local pdf to the custom trained form-recognizer model, poll until the
# analysis finishes, and save one CLEAN (flattened) JSON file per form.
# Forms already recorded in the *json_log* file are skipped.
endpoint = r"https://#LOCATION#.api.cognitive.microsoft.com/"
# Change if api key is expired
apim_key = "#APIM_KEY#"
# This model is the one trained on 5 forms
model_id = "#MODEL_ID#"
# endpoint already ends with "/"; strip it so the URL has no "//" before the path
post_url = endpoint.rstrip("/") + r"/formrecognizer/v2.0/custom/models/%s/analyze" % model_id
params = {"includeTextDetails": True}
headers = {'Content-Type': 'application/pdf', 'Ocp-Apim-Subscription-Key': apim_key}

# NOTE: the trailing "//" is kept as-is in case other cells build paths by concatenation
local_path = os.path.join(os.getcwd(), "incidentreport//")
output_path = os.path.join(os.getcwd(), "formrecogoutput//")
files = [f for f in listdir(local_path) if isfile(join(local_path, f))]


def _flatten_fields(fields):
    """Flatten the model's ``fields`` mapping into ``{label: value}``.

    Spaces in labels are replaced by underscores. A field the service returned
    as ``None`` stays ``None``. Otherwise the value is ``valueString``; when that
    key is absent the raw ``text`` is used instead (``None`` if both are missing)
    so that one unexpected field cannot abort the whole form.
    """
    flat = {}
    for label, field in fields.items():
        key = label.replace(" ", "_")
        if field is None:
            flat[key] = None
        else:
            flat[key] = field.get('valueString', field.get('text'))
    return flat


# Load the set of already-converted forms once. It is written back after every
# successful conversion, so re-reading it for each file adds nothing.
# NOTE(review): pickle.load can execute arbitrary code from the file it reads -
# confirm json_log only ever comes from a trusted location.
try:
    with open('json_log', 'rb') as l:
        json_files = pickle.load(l)
except FileNotFoundError:
    # First run: nothing has been converted yet
    json_files = set()

# Polling budget per form
n_tries = 15
max_wait_sec = 60

for file in files:
    if not file.rsplit('.', 1)[-1] == 'pdf':
        continue
    if file in json_files:
        continue
    with open(join(local_path, file), "rb") as f:
        data_bytes = f.read()

    # --- Submit the form for analysis -------------------------------------
    # Reset per file so a failed POST can never reuse the previous form's URL
    get_url = None
    try:
        resp = requests.post(url=post_url, data=data_bytes, headers=headers, params=params)
        if resp.status_code != 202:
            print("POST analyze failed:\n%s" % json.dumps(resp.json()))
        else:
            print("POST analyze succeeded:\n%s" % resp.headers)
            get_url = resp.headers["operation-location"]
    except Exception as e:
        print("POST analyze failed:\n%s" % str(e))
    if get_url is None:
        # Nothing to poll; leave the form unlogged so a later run retries it
        continue

    # --- Poll for the result ----------------------------------------------
    n_try = 0
    wait_sec = 5
    analysis = None
    while n_try < n_tries:
        # Count every attempt, including ones that raise, so the loop always ends
        n_try += 1
        try:
            resp = requests.get(url=get_url, headers={"Ocp-Apim-Subscription-Key": apim_key})
            resp_json = resp.json()
            if resp.status_code != 200:
                print("GET analyze results failed:\n%s" % json.dumps(resp_json))
            # .get: an error payload may carry no "status" key
            status = resp_json.get("status")
            if status == "succeeded":
                analysis = resp_json
                break
            if status == "failed":
                # Terminal state; polling again cannot change it
                print("Analysis failed:\n%s" % json.dumps(resp_json))
                break
        except Exception as e:
            msg = "GET analyze results failed:\n%s" % str(e)
            print(msg)
        # Analysis still running (or the request failed). Wait and retry.
        time.sleep(wait_sec)
        wait_sec = min(2*wait_sec, max_wait_sec)
    else:
        # Loop ended without a break: the polling budget is used up
        print("Analysis did not finish after %d attempts:\n%s" % (n_tries, file))
    if analysis is None:
        continue

    # --- Clean and save the result ----------------------------------------
    try:
        print("Analysis succeeded:\n%s" % file[:-4])
        new_dict = _flatten_fields(analysis['analyzeResult']['documentResults'][0]['fields'])
        # Appending form url to json
        new_dict['form_url'] = 'https://#STORAGE_ACCOUNT_NAME#.blob.core.windows.net/incidentreport/' + file
        # Write UTF-8 without ASCII escaping so non-English (e.g. spanish) forms stay readable
        with open(join(output_path, file[:-4]+".json"), 'w', encoding='utf-8') as outfile:
            json.dump(new_dict, outfile, ensure_ascii=False)
        # Once JSON is saved log it otherwise don't log it.
        json_files.add(file)
        with open('json_log', 'wb') as f:
            pickle.dump(json_files, f)
    except Exception as e:
        print("Saving analysis result failed:\n%s" % str(e))

POST analyze succeeded:
{'Content-Length': '0', 'Operation-Location': 'https://westus2.api.cognitive.microsoft.com/formrecognizer/v2.0/custom/models/a002139f-7f8f-403b-b864-6ecfc692c16f/analyzeresults/18d84eee-e167-4644-929d-1da0728287f9', 'x-envoy-upstream-service-time': '62', 'apim-request-id': '88f06ed2-2edb-482f-b6f8-bdbd6cd2e302', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'x-content-type-options': 'nosniff', 'Date': 'Fri, 25 Sep 2020 23:29:47 GMT'}
Analysis succeeded:
212045000

## Upload the JSON files to a container


In [6]:
# Count the regular files in the local output folder, i.e. the converted JSON files
files = []
for entry_name in listdir(output_path):
    # Keep regular files only
    if isfile(join(output_path, entry_name)):
        files.append(entry_name)
len(files)

1

In [7]:
# %%time
# Point the blob client at the container that receives the JSON files
# Blob storage configuration
STORAGE_ACCOUNT_NAME = "#STORAGE_ACCOUNT_NAME#"
STORAGE_ACCOUNT_ACCESS_KEY = "#STORAGE_ACCOUNT_KEY#"
# Destination container for the converted JSON files
STORAGE_CONTAINER_NAME = "formrecogoutput"
# Blob service client built from the account name and key above
blob_service = BlockBlobService(
    account_name=STORAGE_ACCOUNT_NAME,
    account_key=STORAGE_ACCOUNT_ACCESS_KEY,
)

In [8]:
# %%time
# Upload JSON files from local folder *formrecogoutput* to the container *formrecogoutput*
local_path = os.path.join(os.getcwd(), "formrecogoutput")
print(local_path)
# A dedicated loop name keeps the *files* list from earlier cells intact
for json_name in os.listdir(local_path):
    json_path = os.path.join(local_path, json_name)
    # Skip sub-folders: only regular files can be uploaded as a blob
    if not os.path.isfile(json_path):
        continue
    print(json_path)
    # The blob gets the same name as the local file
    blob_service.create_blob_from_path(STORAGE_CONTAINER_NAME, json_name, json_path)

/mnt/var/hadoop/tmp/nm-local-dir/usercache/trusted-service-user/appcache/application_1601072998832_0005/container_1601072998832_0005_01_000001/formrecogoutput
/mnt/var/hadoop/tmp/nm-local-dir/usercache/trusted-service-user/appcache/application_1601072998832_0005/container_1601072998832_0005_01_000001/formrecogoutput/212045000.json
<azure.storage.blob.models.ResourceProperties object at 0x7ff81c8c7828>

In [9]:
# Upload the log files used for tracking form processing to blob container
for log_name in ("merged_log", "json_log"):
    log_path = os.path.join(os.getcwd(), log_name)
    # A log only exists once at least one form was downloaded / converted;
    # skip it instead of failing the cell when it is absent
    if not os.path.isfile(log_path):
        print("Log file not found, skipping upload:\n%s" % log_path)
        continue
    blob_service.create_blob_from_path(STORAGE_LOG_CONTAINER, log_name, log_path)

<azure.storage.blob.models.ResourceProperties object at 0x7ff81c8c7588>