In [None]:
!pip install -U veryfi

In [39]:
import veryfi
import json
import re
import os

client_id = 'your_client_id'
client_secret = 'your_client_secret'
username = 'your_username'
api_key = 'your_password'

veryfi_client = veryfi.Client(client_id, client_secret, username, api_key)

def is_valid_document(response):
    # DEFINE THE PATTERNS. THESE 4 PARAMETERS WILL DETERMINE IF THE FILE HAS THE CORRECT FORMAT OR NOT AND ARE MANDATORY

    # COULD HAVE REQUESTED THE TOTAL VALUE OF THE INVOICE SO THAT THE DATA WAS A LITTLE MORE COMPLETE
    patterns = {
    'vendor_name': r'\w+',  # VENDOR NAME SHOULD BE AT LEAST ONE WORD. THIS IS ASSUMING THAT OTHER COMPANIES MAY HAVE THE SAME FORMAT; OTHERWISE, THE PATTERN SHOULD BE CHANGED TO r'^switch' AND APPLY re.IGNORECASE.
    'vendor_address': r'\w+',  # VENDOR ADDRESS SHOULD BE AT LEAST ONE WORD
    'invoice_number': r'\d+',  # INVOICE NUMBER SHOULD BE NUMERIC
    'date': r'\d{4}-\d{2}-\d{2}',  # DATE SHOULD BE IN YYYY-MM-DD FORMAT
    }

    # VERIFY THE PATTERNS
    vendor_name = response.get('vendor', {}).get('name', '')
    vendor_address = response.get('vendor', {}).get('address', '')
    invoice_number = response.get('invoice_number', '')
    date = response.get('date', '')

    if not isinstance(vendor_name, str) or not re.match(patterns['vendor_name'], vendor_name):
        return False
    if not isinstance(vendor_address, str) or not re.match(patterns['vendor_address'], vendor_address):
        return False
    if not isinstance(invoice_number, str) or not re.match(patterns['invoice_number'], invoice_number):
        return False
    if not isinstance(date, str) or not re.match(patterns['date'], date):
        return False

    return True

# EXTRACT THE DATA
def extract_information(response):

    ocr_text = response.get('ocr_text', '')

    # DEFINE THE FILE NAME FOR SAVING THE OCR TEXT
    base_name = os.path.splitext(file_name)[0]  # GET THE NAME WITHOUT EXTENSION
    txt_file_name = f'{base_name}.txt'  # ADD THE .txt EXTENSION
    txt_file_path = os.path.join(ocr_texts_folder, txt_file_name)  # FULL PATH TO SAVE

    # SAVE THE OCR TEXT TO THE FILE
    with open(txt_file_path, 'w', encoding='utf-8') as file:
        file.write(ocr_text)


    sku_pattern = r'\(([a-zA-Z0-9]{8})\)'
    extracted_data = {
        'vendor_name': response.get('vendor', {}).get('name'),
        'vendor_address': response.get('vendor', {}).get('address'),
        'bill_to_name': response.get('bill_to', {}).get('name'),
        'invoice_number': response.get('invoice_number'),
        'date': response.get('date'),
        'line_items': []
    }

    for item in response.get('line_items', []):
        description = item.get('description', '')

        #I ASSUMED THAT THE SKU WAS THE 8-CHARACTER ALPHANUMERIC DATA THAT ALWAYS COMES BETWEEN PARENTHESES IN THE DESCRIPTION. IF I WAS WRONG, I WOULD HAVE TO REPEAT THE PROCESS WITHOUT THE EXTRACTION STEP.
        result = re.search(sku_pattern, description)
        sku = result.group(1) if result else item.get('sku')

        line_item = {
            'sku': sku,
            'description': item.get('description'),
            'quantity': item.get('quantity'),
            'tax_rate': item.get('tax_rate'), # I DID NOT IDENTIFY ANY DATA THAT CAN BE TAKEN AS A TAX_RATE. THE MAIN REASON IS THAT I DID NOT FIND ANY DATA MARKED AS PERCENTAGE.
            'price': item.get('price'),
            'total': item.get('total')
        }
        extracted_data['line_items'].append(line_item)

    return extracted_data


folder = 'folder_with_the_files_to_extract_data_from'
ocr_texts_folder = 'folder_for_ocr_texts'
json_files_folder = 'folder_for_json_files'

# ENSURE THE FOLDERS EXIST
os.makedirs(ocr_texts_folder, exist_ok=True)
os.makedirs(json_files_folder, exist_ok=True)

for file_name in os.listdir(folder):
    full_path = os.path.join(folder, file_name)

    # EXTRACT THE OCR TEXT
    try:
        response = veryfi_client.process_document(full_path)

        # USE THE VALIDATION FUNCTION
        if is_valid_document(response):
            extracted_data = extract_information(response)

            # PRINT THE EXTRACTED DATA
            print(json.dumps(extracted_data, indent=4))

            # DEFINE THE FILE NAME FOR SAVING EXTRACTED DATA
            json_file_name = f'{base_name}_extracted.json'
            json_file_path = os.path.join(json_files_folder, json_file_name)

            # SAVE THE EXTRACTED DATA TO A JSON FILE
            with open(json_file_path, 'w', encoding='utf-8') as f:
                json.dump(extracted_data, f, indent=4)
        else:
            print(f"INVALID FORMAT FOR FILE: {file_name}")

    except Exception as e:
        print(f"ERROR PROCESSING FILE {file_name}: {e}")


ERROR PROCESSING FILE .ipynb_checkpoints: [Errno 21] Is a directory: 'pdfs/.ipynb_checkpoints'
{
    "vendor_name": "Switch, Ltd.",
    "vendor_address": "PO Box 674592\nDallas, TX 75267-4592",
    "bill_to_name": "Nu Life Health",
    "invoice_number": "16005913",
    "date": "2023-09-11 00:00:00",
    "line_items": [
        {
            "sku": "U4M7QOSJ",
            "description": "Carrier Taxes for Transport | 58 Gbps Wavelength Diverse between Sparks, OR 98765 and\nLasVegas, OR 56789 (ymEw7J) (U4M7QOSJ) (10/2023 Taxes) (10/2023)",
            "quantity": 6016.09,
            "tax_rate": null,
            "price": -1781.87,
            "total": -10719890.29
        },
        {
            "sku": "X6HCHK1C",
            "description": "Transport | 971 Gbps Fiber to wXv21fam (X6HCHK1C) (10/2023)",
            "quantity": 3372.59,
            "tax_rate": null,
            "price": 3868.31,
            "total": 13046223.62
        },
        {
            "sku": "JTUKEGVE",
        