## Importing Relevant Libraries


In [None]:
!pip install --use-feature=2020-resolver azure-storage-blob==2.1.0
import json
import time
import requests
import os
from azure.storage.blob import BlockBlobService
import pprint
from os import listdir
from os.path import isfile, join
import shutil
import pickle

## Create Local Folders


In [None]:
# Create the local working directories next to the notebook if they are missing.
# exist_ok=True makes the cell safe to re-run without an explicit isdir() check.
#   *form-datasets*   holds the PDF files to be converted to JSON
#   *formrecogoutput* receives the converted JSON files
for folder_name in ("form-datasets", "formrecogoutput"):
    os.makedirs(os.path.join(os.getcwd(), folder_name), exist_ok=True)

## Downloading the PDF forms from a container

Downloads all PDFs from a container.
Does not download PDFs that have already been downloaded.

In [None]:
%%time
# Download the PDF files from the container *form-datasets* into the local
# folder *form-datasets*, skipping every blob already recorded in 'merged_log'.

# Set up configs for blob storage
# NOTE(review): the "#...#" values look like template placeholders that are
# substituted before the notebook is run -- confirm with the deployment step.
STORAGE_ACCOUNT_NAME = "#STORAGE_ACCOUNT_NAME#"
STORAGE_ACCOUNT_ACCESS_KEY = "#STORAGE_ACCOUNT_KEY#"
STORAGE_CONTAINER_NAME = "form-datasets"

# Instantiating a blob service object
blob_service = BlockBlobService(STORAGE_ACCOUNT_NAME, STORAGE_ACCOUNT_ACCESS_KEY)

# Load the set of already-downloaded blob names once, before the loop; it is
# only ever modified by this cell, so re-reading it per blob is wasted work.
# NOTE(review): pickle.load can execute arbitrary code from a tampered file;
# only run this against a 'merged_log' written by this notebook.
try:
    with open('merged_log', 'rb') as log_file:
        merged_files = pickle.load(log_file)
except FileNotFoundError:
    merged_files = set()

blobs = blob_service.list_blobs(STORAGE_CONTAINER_NAME)
for blob in blobs:
    # If the blob was already downloaded in an earlier run then continue to the next one
    if blob.name in merged_files:
        continue
    download_file_path = os.path.join(os.getcwd(), "form-datasets", blob.name)
    # Blob names may contain "/" (virtual folders); make sure the local parent
    # folder exists, otherwise the download fails when opening the target file.
    os.makedirs(os.path.dirname(download_file_path), exist_ok=True)
    blob_service.get_blob_to_path(STORAGE_CONTAINER_NAME, blob.name, download_file_path)
    merged_files.add(blob.name)
    # Persist the log after every download so an interrupted run can resume
    with open('merged_log', 'wb') as log_file:
        pickle.dump(merged_files, log_file)

In [None]:
# How many forms are waiting to be converted to JSON (regular files only)
pdf_dir = os.getcwd() + "/form-datasets"
files = list(filter(lambda name: isfile(join(pdf_dir, name)), listdir(pdf_dir)))
len(files)

## Querying the custom form recognizer model (PDF -> JSON)

Converts each PDF to JSON by querying the trained custom model.
Cleans the resulting JSON file.
If a file has already been converted to JSON, it is skipped.


In [None]:
%%time
# Query the custom-trained Form Recognizer model for every PDF in the local
# *form-datasets* folder and write one cleaned JSON file per form into the
# local *formrecogoutput* folder. Forms recorded in 'json_log' are skipped.
#
# Failures raise RuntimeError instead of calling quit(): inside a Jupyter
# kernel quit() requests a kernel shutdown (losing all notebook state) rather
# than simply stopping this cell.
endpoint = r"https://westus2.api.cognitive.microsoft.com/"
# SECURITY(review): a subscription key is committed in this notebook. Set the
# FORM_RECOGNIZER_KEY environment variable to override it; the literal is kept
# only as a backward-compatible fallback and should be rotated and removed.
apim_key = os.environ.get("FORM_RECOGNIZER_KEY", "1d6a7980b0a84bc9b040a1196436675e")
# This model is the one trained on 5 forms
model_id = "0986b966-c6ae-4b9c-8eec-4db9bb846eac"
# Strip the endpoint's trailing "/" so the URL does not contain "//" before the path
post_url = endpoint.rstrip("/") + r"/formrecognizer/v2.0/custom/models/%s/analyze" % model_id
files = [f for f in listdir(os.getcwd()+"/form-datasets") if isfile(join(os.getcwd()+"/form-datasets", f))]
params = {"includeTextDetails": True}
headers = {'Content-Type': 'application/pdf', 'Ocp-Apim-Subscription-Key': apim_key}
# Per-request timeout (seconds) so a stalled connection cannot hang the cell forever
request_timeout_sec = 60

local_path = os.path.join(os.getcwd(), "form-datasets//")
output_path = os.path.join(os.getcwd(), "formrecogoutput//")

# Load the set of already-converted forms once; only this cell modifies it.
# NOTE(review): pickle.load can execute arbitrary code from a tampered file;
# only run this against a 'json_log' written by this notebook.
try:
    with open('json_log', 'rb') as log_file:
        json_files = pickle.load(log_file)
except FileNotFoundError:
    json_files = set()

for file in files:
    if file in json_files:
        continue
    with open(os.path.join(local_path, file), "rb") as f:
        data_bytes = f.read()

    # Submit the PDF for analysis. A 202 response carries the polling URL in
    # the "operation-location" header.
    try:
        resp = requests.post(url=post_url, data=data_bytes, headers=headers,
                             params=params, timeout=request_timeout_sec)
    except requests.RequestException as e:
        raise RuntimeError("POST analyze failed:\n%s" % str(e)) from e
    print('resp', resp)
    if resp.status_code != 202:
        # resp.text rather than resp.json(): an error body is not guaranteed to be JSON
        raise RuntimeError("POST analyze failed:\n%s" % resp.text)
    print("POST analyze succeeded:\n%s" % resp.headers)
    get_url = resp.headers["operation-location"]

    # Poll for the result with exponential back-off (5s, 10s, ... capped at 60s)
    n_tries = 15
    wait_sec = 5
    max_wait_sec = 60
    # Output file stem; splitext handles extensions of any length, not just ".pdf"
    file_stem = os.path.splitext(file)[0]
    for n_try in range(n_tries):
        try:
            resp = requests.get(url=get_url,
                                headers={"Ocp-Apim-Subscription-Key": apim_key},
                                timeout=request_timeout_sec)
            resp_json = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON
            raise RuntimeError("GET analyze results failed:\n%s" % str(e)) from e
        if resp.status_code != 200:
            raise RuntimeError("GET analyze results failed:\n%s" % json.dumps(resp_json))
        status = resp_json["status"]
        if status == "succeeded":
            print("Analysis succeeded:\n%s" % file_stem)
            fields = resp_json['analyzeResult']['documentResults'][0]['fields']
            # Flatten to {field_name_with_underscores: extracted string or None}
            new_dict = {}
            for field_name, field_value in fields.items():
                key = field_name.replace(" ", "_")
                new_dict[key] = field_value['valueString'] if field_value is not None else None
            # Appending form url to json
            new_dict['form_url'] = 'https://dreamdemostrggen2pocrs11.blob.core.windows.net/formupload/' + file
            # Write once, as UTF-8 with ensure_ascii=False, so accented
            # characters (e.g. in Spanish forms) are stored as-is.
            with open(os.path.join(output_path, file_stem + ".json"), 'w', encoding='utf-8') as outfile:
                json.dump(new_dict, outfile, ensure_ascii=False)
            # Log the form only after its JSON has been saved
            json_files.add(file)
            with open('json_log', 'wb') as log_file:
                pickle.dump(json_files, log_file)
            break
        if status == "failed":
            raise RuntimeError("Analysis failed:\n%s" % json.dumps(resp_json))
        # Analysis still running. Wait and retry.
        time.sleep(wait_sec)
        wait_sec = min(2*wait_sec, max_wait_sec)
    else:
        # All polls used up without a final status: the form is not logged, so
        # it will be retried on the next run. Say so instead of skipping silently.
        print("Analysis did not finish after %d polls, skipping for now:\n%s" % (n_tries, file))

## Upload the JSON files to a container


In [None]:
# How many JSON files have been produced so far (regular files only)
json_dir = os.getcwd() + "/formrecogoutput"
files = list(filter(lambda name: isfile(join(json_dir, name)), listdir(json_dir)))
len(files)

In [None]:
%%time
# Blob-storage settings and client for the JSON upload step
STORAGE_ACCOUNT_NAME = "#STORAGE_ACCOUNT_NAME#"
STORAGE_ACCOUNT_ACCESS_KEY = "#STORAGE_ACCOUNT_KEY#"
# Destination container for the converted JSON files
STORAGE_CONTAINER_NAME = "formrecogoutput"
# Client object bound to the storage account above
blob_service = BlockBlobService(
    account_name=STORAGE_ACCOUNT_NAME,
    account_key=STORAGE_ACCOUNT_ACCESS_KEY,
)

In [None]:
%%time
# Upload JSON files from local folder *formrecogoutput* to the container
# named by STORAGE_CONTAINER_NAME, using each file name as the blob name.
local_path = os.path.join(os.getcwd(), "formrecogoutput")
print(local_path)
# The loop variable has its own name so the module-level `files` list is not
# overwritten with a single file-name string.
for json_name in os.listdir(local_path):
    json_path = os.path.join(local_path, json_name)
    # Skip sub-directories (e.g. .ipynb_checkpoints): only regular files can be
    # uploaded, matching the isfile() filter used when counting the JSON files.
    if not os.path.isfile(json_path):
        continue
    print(json_path)
    blob_service.create_blob_from_path(STORAGE_CONTAINER_NAME, json_name, json_path)