In [None]:
# 1. Digitize the invoices – Invoices are in the form of PDFs that need to be digitized.
#    Depending on the quality of the input, we need to add an image preprocessing pipeline for best results.
# 2. Extract data – Data extraction is done using AI algorithms.
#    The text in the detected regions is read using Optical Character Recognition (OCR).
#    Here, it is important to identify which piece of text corresponds to which field.
# 3. Create database – After the data has been extracted, we need to create a database based on a unique identifier.

In [None]:
!pip install pytesseract
!sudo apt-get install tesseract-ocr-ind

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind tesseract-ocr-osd
0 upgraded, 4 newly installed, 0 to remove and 23 not upgraded.
Need to get 5,387 kB of archives.
After this operation, 17.4 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1 [1,598 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1 [2,990 kB]
Get:3 http://archiv

In [None]:
#libraries
import os
import cv2
import numpy as np
import pandas as pd
import pytesseract
import imutils
import re


In [None]:
# Mount Google Drive at /content/drive; later cells read the YOLO config/weights
# and the invoice images from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
#processing the invoice image
def preprocess_image(image):
  """Turn a BGR invoice image into a Canny edge map.

  Args:
    image: image array in BGR channel order (the order cv2.imread produces).

  Returns:
    The edge image produced by cv2.Canny on the blurred grayscale input.
  """
  gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  # Smooth with a 5x5 Gaussian first so noise is less likely to be detected as edges.
  blurred = cv2.GaussianBlur(gray, (5, 5), 0)
  # Hysteresis thresholds are 50 and 200. The former fourth positional argument
  # (255) was removed: in the Python binding that slot is the optional `edges`
  # output array, not a threshold, so it had no meaningful effect.
  edged = cv2.Canny(blurred, 50, 200)
  return edged


In [None]:
#Object Detection using YOLO algorithm
# Networks already loaded, keyed by (cfg_path, weights_path), so the weights
# file is parsed once instead of on every detect_objects call.
_YOLO_NET_CACHE = {}


def _load_yolo_net(cfg_path, weights_path):
  """Return the Darknet network for the given files, loading it at most once."""
  key = (cfg_path, weights_path)
  if key not in _YOLO_NET_CACHE:
    _YOLO_NET_CACHE[key] = cv2.dnn.readNetFromDarknet(cfg_path, weights_path)
  return _YOLO_NET_CACHE[key]


def detect_objects(image,
                   cfg_path="/content/drive/MyDrive/yolov3.cfg",
                   weights_path="/content/drive/MyDrive/yolov3.weights",
                   conf_threshold=0.5,
                   nms_threshold=0.3):
  """Run the YOLO network on an image and return the surviving detections.

  Args:
    image: image array, either 3-channel BGR or single-channel (e.g. an edge map).
    cfg_path: path to the Darknet .cfg file.
    weights_path: path to the Darknet .weights file.
    conf_threshold: minimum class score for a detection to be kept.
    nms_threshold: overlap threshold passed to non-maximum suppression.

  Returns:
    (idxs, boxes, classIDs) where boxes is a list of [x, y, width, height] in
    pixel coordinates of `image`, classIDs holds the best class per box, and
    idxs is a flat integer array of the indices into boxes/classIDs kept by
    non-maximum suppression (empty when nothing was detected).
  """
  # NOTE(review): a stock yolov3.cfg declares 3 input channels, so a
  # single-channel input is expanded to 3 channels - confirm for custom configs.
  if image.ndim == 2:
    image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
  # Box coordinates are scaled by the size of the image actually passed in,
  # rather than by whatever globals named w/h happen to exist in the notebook.
  h, w = image.shape[:2]

  net = _load_yolo_net(cfg_path, weights_path)
  layer_names = net.getLayerNames()
  # getUnconnectedOutLayers() returns an Nx1 array on older OpenCV builds and a
  # flat array on newer ones; flattening handles both (the old `i[0]` indexing
  # raises IndexError on the flat form).
  out_layer_ids = np.array(net.getUnconnectedOutLayers()).flatten()
  ln = [layer_names[i - 1] for i in out_layer_ids]

  blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=True, crop=False)
  net.setInput(blob)
  layerOutputs = net.forward(ln)

  boxes = []
  confidences = []
  classIDs = []
  for output in layerOutputs:
    for detection in output:
      # detection[0:4] is the box (center x, center y, width, height) relative
      # to the image size; detection[5:] holds the per-class scores.
      scores = detection[5:]
      classID = np.argmax(scores)
      confidence = scores[classID]
      if confidence > conf_threshold:
        box = detection[0:4] * np.array([w, h, w, h])
        (centerX, centerY, width, height) = box.astype("int")
        # Convert from center-based to top-left-based coordinates.
        x = int(centerX - (width / 2))
        y = int(centerY - (height / 2))
        boxes.append([x, y, int(width), int(height)])
        confidences.append(float(confidence))
        classIDs.append(int(classID))

  idxs = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)
  # NMSBoxes returns an empty tuple when there are no boxes; always hand back a
  # flat integer array so callers can iterate (or call .flatten()) safely.
  idxs = np.array(idxs, dtype=int).flatten()
  return idxs, boxes, classIDs


In [None]:
#Optical Character Recognition
def extract_text(image, roi):
  """Run Tesseract OCR (English) on a rectangular region of an image.

  Args:
    image: image array indexed as image[row, column, ...].
    roi: (x, y, w, h) - top-left corner, width and height of the region.

  Returns:
    The recognized text, or an empty string when the region does not overlap
    the image at all.
  """
  x, y, w, h = (int(v) for v in roi)
  img_h, img_w = image.shape[:2]
  # Clamp the box to the image. Without this, a negative x or y (a detection
  # hanging over the left/top edge) would be treated by numpy as an index
  # counted from the end and select the wrong pixels.
  x0, y0 = max(x, 0), max(y, 0)
  x1, y1 = min(x + w, img_w), min(y + h, img_h)
  if x1 <= x0 or y1 <= y0:
    # Nothing left to read; avoid handing an empty array to Tesseract.
    return ""
  roi_image = image[y0:y1, x0:x1]
  text = pytesseract.image_to_string(roi_image, lang='eng')
  return text


In [None]:
#list
# Field labels to look for in the OCR text. Each label is listed once, in the
# order it first appeared in the original list; repeated labels only caused the
# same search to be run several times.
field_identifiers = [
    "Invoice number",
    "Invoice date",
    "Payment terms",
    "Due date",
    "Buyer's name and address",
    "Seller's name and address",
    "Item description",
    "Quantity",
    "Price",
    "Total amount due",
    "Taxes",
    "Proforma invoice number",
    "Date of the proforma invoice",
    "Shipping terms and charges",
    "Country of origin",
    "Harmonized System (HS) code for each item",
    "Client name and address",
    "Contractor name and address",
    "Work description",
    "Number of hours worked",
    "Hourly rate",
    "Service provider name and address",
    "Service description",
    "Hourly rate or fixed fee",
]



In [None]:
#Mapping Fields with Data
def map_fields_with_data(text, field_identifiers):
  """Pick "<field>:<value>" pairs out of OCR text.

  Args:
    text: the text to search; None or an empty string yields an empty result.
    field_identifiers: iterable of field labels to look for.

  Returns:
    A dict mapping each label found in `text` to the stripped remainder of the
    line after the first "<label>:" occurrence.
  """
  data = {}
  if not text:
    return data
  for identifier in field_identifiers:
    # Escape the label so characters such as "(" and ")" are matched literally
    # instead of being interpreted as regular-expression syntax.
    pattern = re.escape(identifier) + r':(.*)'
    match = re.search(pattern, text)
    if match:
      data[identifier] = match.group(1).strip()
  return data


In [None]:
#printing the data
# Printing the bare name only shows the function object, so call the mapper on
# a small sample text to display the kind of dictionary it produces.
print(map_fields_with_data("Invoice number: INV-001\nInvoice date: 2023-03-18", field_identifiers))

<function map_fields_with_data at 0x7f4e8e2750d0>


In [None]:
# Fields whose values together form the unique key of an invoice. They must be
# labels from field_identifiers, because only those labels can appear as keys
# in the dictionaries returned by map_fields_with_data.
unique_identifiers = ["Invoice number", "Invoice date"]

# Load the invoice images from Google Drive
image_path = '/content/drive/MyDrive/'
image_name = [
    # The first name previously contained a double space before "at", unlike the
    # other entries and the single-image cell that reads the same file.
    'WhatsApp Image 2023-03-18 at 14.56.45.jpg',
    'WhatsApp Image 2023-03-18 at 15.01.20.jpg',
    'WhatsApp Image 2023-03-18 at 15.26.59.jpg',
    'WhatsApp Image 2023-03-18 at 15.33.20.jpg',
]

# Extracted fields for each invoice, keyed by the invoice's unique identifier
invoice_data = {}

# Loop through each invoice image and extract the data
for name in image_name:
  image = cv2.imread(os.path.join(image_path, name))
  if image is None:
    # cv2.imread returns None (it does not raise) when a file cannot be read.
    print(f"Skipping {name}: image could not be read")
    continue

  # h and w stay module-level names in case detect_objects resolves w/h as globals.
  h, w = image.shape[:2]
  processed_image = preprocess_image(image)
  idxs, boxes, classIDs = detect_objects(processed_image)

  # Gather the fields from every detected region of this invoice before
  # building its key, so a region without the key fields cannot raise KeyError.
  fields = {}
  # np.array(...).flatten() also copes with the empty tuple NMSBoxes returns
  # when nothing was detected.
  for i in np.array(idxs, dtype=int).flatten():
    # NOTE(review): class 0 is assumed to be the "text" class of the model in
    # use - confirm against the classes the weights were trained on.
    if classIDs[i] == 0: # Only extract text from objects classified as text
      text = extract_text(image, boxes[i])
      fields.update(map_fields_with_data(text, field_identifiers))

  if not fields:
    print(f"No fields extracted from {name}")
    continue

  # Build the unique identifier; fall back to the file name when any of the
  # key fields was not found so the extracted data is still kept.
  if all(key in fields for key in unique_identifiers):
    identifier = "-".join(fields[key] for key in unique_identifiers)
  else:
    identifier = name

  # Merge into any data already stored under the same identifier
  invoice_data.setdefault(identifier, {}).update(fields)

# Print the extracted data for each invoice
for identifier, data in invoice_data.items():
  print(f"Data for invoice {identifier}:")
  for field, value in data.items():
    print(f"{field}: {value}")
  print()


IndexError: ignored

In [None]:
image_path = '/content/drive/MyDrive/WhatsApp Image 2023-03-18 at 14.56.45.jpg' # set the full path to the image file
image_name = os.path.basename(image_path) # extract the filename from the path

# Load the image and extract the data
image = cv2.imread(image_path)
if image is None:
  # cv2.imread returns None (it does not raise) when a file cannot be read.
  raise FileNotFoundError(f"Could not read image: {image_path}")
# h and w stay module-level names in case detect_objects resolves w/h as globals.
h, w = image.shape[:2]
processed_image = preprocess_image(image)
# Show a summary rather than dumping the whole pixel array.
print(processed_image.shape, processed_image.dtype)
idxs, boxes, classIDs = detect_objects(processed_image)

# Accumulate the fields of all regions here; starting from an empty dict also
# keeps the printing loop below valid when nothing is detected.
data = {}

# Loop through each object detected by YOLO and extract the text using OCR
# (np.array(...).flatten() also copes with the empty tuple NMSBoxes returns
# when there are no detections).
for i in np.array(idxs, dtype=int).flatten():
  if classIDs[i] == 0: # Only extract text from objects classified as text
    box = boxes[i]
    text = extract_text(image, box)

    # Map the extracted text to its corresponding field using regular expressions
    field_data = map_fields_with_data(text, field_identifiers)
    if len(field_data) > 0:
      # If a matching field is found, add the text to the data dictionary
      data.update(field_data)

# Print the extracted data
print("Extracted data:")
for field, value in data.items():
  print(f"{field}: {value}")


In [None]:
# Re-run detection on the `processed_image` / `image` loaded by the previous cell.
idxs, boxes, classIDs = detect_objects(processed_image)

# Accumulate the fields of all regions here; starting from an empty dict also
# keeps the printing loop below valid when nothing is detected.
data = {}

# Loop through each object detected by YOLO and extract the text using OCR
# (np.array(...).flatten() also copes with the empty tuple NMSBoxes returns
# when there are no detections).
for i in np.array(idxs, dtype=int).flatten():
  if classIDs[i] == 0: # Only extract text from objects classified as text
    box = boxes[i]
    text = extract_text(image, box)

    # Map the extracted text to its corresponding field using regular expressions
    field_data = map_fields_with_data(text, field_identifiers)
    if len(field_data) > 0:
      # If a matching field is found, add the text to the data dictionary
      data.update(field_data)

# Print the extracted data
print("Extracted data:")
for field, value in data.items():
  print(f"{field}: {value}")
