In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pytesseract
import pandas as pd
import json
import os

In [None]:
def pytesseract_image_to_string(image_path):
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    return [item for item in pytesseract.image_to_string(image_path).split('\n\n') if item != '']


In [None]:
folder_path = r'notebooks\invoices_dataset_final\Annotations\Original_Format'

X_train = []
y_train = []

def extract_data_from_json(json_file):
    with open(json_file, 'r') as f:
        data = json.load(f)
        
    for label, content in data.items():
        if isinstance(content, dict) and 'text' in content:
            text = content['text'].split('\n')[0]
            X_train.append(text)
            y_train.append(label)
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and 'text' in item:
                    text = item['text'].split('\n')[0]
                    X_train.append(text)
                    y_train.append(label)

# Iterate over all the JSON files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.json'):
        json_file = os.path.join(folder_path, filename)
        extract_data_from_json(json_file)

train_df = pd.DataFrame({
    'Text': X_train,
    'Label': y_train
})

train_df.to_csv('train_data.csv', index=False)

print(train_df.head())

                         Text       Label
0      Bill to:Michael Sparks       BUYER
1        Terms and Conditions  CONDITIONS
2           Date: 03-Jan-1994        DATE
3  DISCOUNT(2.14%): (-)  9.39    DISCOUNT
4      Due Date : 06-Feb-2007    DUE_DATE


In [6]:
df = pd.read_csv('train_data.csv')
X_train = df['Text']
y_train = df['Label']

In [7]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_vec, y_train, test_size=0.2, random_state=42)

In [None]:
svm = SVC(kernel='linear') 
svm.fit(X_train_split, y_train_split)

y_pred = svm.predict(X_val_split)

print("Accuracy:", accuracy_score(y_val_split, y_pred))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8844626728914874
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      1.00      0.98      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.91      0.54      0.68      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       0.54      0.95      0.69      1188
        GST(1%)       0.89      0.91      0.90        45
       GST(12%)       0.85      1.00      0.92        33
       GST(18%)       0.95      1.00      0.98       121
       GST(20%)       0.94      0.98      0.96        48
        GST(5%)       0.38      0.07      0.12        43
        GST(7%)       0.00      0.00      0.00        34
        GST(9%)       0.46      0.76      0.57        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [8]:
svm = SVC(kernel='linear', class_weight='balanced') 
svm.fit(X_train_split, y_train_split)

y_pred = svm.predict(X_val_split)

print("Accuracy:", accuracy_score(y_val_split, y_pred))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8829791875736289
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      1.00      0.98      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.96      0.50      0.66      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       0.53      1.00      0.70      1188
        GST(1%)       0.85      0.89      0.87        45
       GST(12%)       0.87      1.00      0.93        33
       GST(18%)       0.96      0.98      0.97       121
       GST(20%)       0.96      0.98      0.97        48
        GST(5%)       0.29      0.37      0.33        43
        GST(7%)       0.17      0.21      0.19        34
        GST(9%)       0.47      0.26      0.34        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [4]:
svm = SVC(kernel='poly', degree=3, C=5) 
svm.fit(X_train_split, y_train_split)

y_pred = svm.predict(X_val_split)

print("Accuracy:", accuracy_score(y_val_split, y_pred))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8720275753741438
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      0.97      0.97      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.71      0.70      0.71      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       0.55      0.59      0.57      1188
        GST(1%)       0.87      0.87      0.87        45
       GST(12%)       0.86      0.94      0.90        33
       GST(18%)       0.95      0.96      0.95       121
       GST(20%)       0.94      0.92      0.93        48
        GST(5%)       0.22      0.14      0.17        43
        GST(7%)       0.14      0.12      0.13        34
        GST(9%)       0.41      0.49      0.44        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [10]:
svm = SVC(kernel='rbf', gamma=5, C=5) 
svm.fit(X_train_split, y_train_split)

y_pred = svm.predict(X_val_split)

print("Accuracy:", accuracy_score(y_val_split, y_pred))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8506479340285352
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       1.00      0.01      0.01       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      0.96      0.97      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.67      0.73      0.70      1993
       DISCOUNT       1.00      0.96      0.98       495
       DUE_DATE       0.52      0.47      0.49      1188
        GST(1%)       0.81      0.76      0.78        45
       GST(12%)       0.67      0.12      0.21        33
       GST(18%)       0.96      0.61      0.75       121
       GST(20%)       0.60      0.06      0.11        48
        GST(5%)       0.20      0.02      0.04        43
        GST(7%)       0.11      0.03      0.05        34
        GST(9%)       0.29      0.06      0.10        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [11]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_split, y_train_split)

y_pred = rf.predict(X_val_split)

In [12]:
print("Accuracy:", accuracy_score(y_val_split, y_pred))

print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8741219075875911
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      0.97      0.97      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.71      0.69      0.70      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       0.54      0.59      0.56      1188
        GST(1%)       0.84      0.84      0.84        45
       GST(12%)       1.00      0.97      0.98        33
       GST(18%)       0.98      0.99      0.99       121
       GST(20%)       0.98      0.98      0.98        48
        GST(5%)       0.45      0.21      0.29        43
        GST(7%)       0.17      0.15      0.16        34
        GST(9%)       0.46      0.65      0.54        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


logreg = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)

logreg.fit(X_train_split, y_train_split)

y_pred = logreg.predict(X_val_split)

print("Accuracy:", accuracy_score(y_val_split, y_pred))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8818447576246782
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      1.00      0.98      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.81      0.62      0.70      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       0.54      0.80      0.65      1188
        GST(1%)       0.81      0.47      0.59        45
       GST(12%)       0.85      1.00      0.92        33
       GST(18%)       0.95      1.00      0.98       121
       GST(20%)       0.94      0.98      0.96        48
        GST(5%)       0.30      0.07      0.11        43
        GST(7%)       0.15      0.06      0.09        34
        GST(9%)       0.41      0.76      0.53        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [37]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv('train_data.csv')
X_train = df['Text']  # Raw text
y_train = df['Label']  # Labels

# Split data into train and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Print original class distribution
print(f"Original class distribution: {Counter(y_train_split)}")

# Define the sampling strategy for selected classes
sampling_strategy = {
    'GST(5%)': 1000,
    'GST(7%)': 1000,
    'GST(1%)': 1000,
    'GST(9%)': 1000,
}

# Reshape X_train_split to a 2D array for RandomOverSampler
X_train_split_reshaped = np.array(X_train_split).reshape(-1, 1)

# Apply oversampling
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_split_reshaped, y_train_split)

# Reshape X_resampled back to 1D array for vectorization
X_resampled = X_resampled.ravel()

# Print resampled class distribution
print(f"Resampled class distribution: {Counter(y_resampled)}")

# Vectorize the resampled data
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_resampled)

# Vectorize the validation data
X_val_vec = vectorizer.transform(X_val_split)

# Now X_train_vec and y_resampled are ready for training your classifier


Original class distribution: Counter({'OTHER': 8006, 'DATE': 7807, 'NUMBER': 7031, 'SELLER_ADDRESS': 6569, 'TOTAL': 6444, 'TITLE': 5879, 'SELLER_NAME': 5467, 'SUB_TOTAL': 5462, 'BUYER': 4901, 'DUE_DATE': 4610, 'NOTE': 4142, 'SELLER_EMAIL': 3525, 'TOTAL_WORDS': 3223, 'TAX': 3030, 'PAYMENT_DETAILS': 2074, 'CONDITIONS': 1964, 'DISCOUNT': 1905, 'SEND_TO': 1434, 'BILL_TO': 1265, 'GSTIN': 1147, 'SELLER_SITE': 1139, 'PO_NUMBER': 1112, 'GSTIN_SELLER': 801, 'AMOUNT_DUE': 641, 'GSTIN_BUYER': 489, 'GST(18%)': 479, 'GST(9%)': 332, 'GST(12%)': 167, 'GST(7%)': 166, 'GST(5%)': 157, 'GST(1%)': 155, 'GST(20%)': 152})
Resampled class distribution: Counter({'OTHER': 8006, 'DATE': 7807, 'NUMBER': 7031, 'SELLER_ADDRESS': 6569, 'TOTAL': 6444, 'TITLE': 5879, 'SELLER_NAME': 5467, 'SUB_TOTAL': 5462, 'BUYER': 4901, 'DUE_DATE': 4610, 'NOTE': 4142, 'SELLER_EMAIL': 3525, 'TOTAL_WORDS': 3223, 'TAX': 3030, 'PAYMENT_DETAILS': 2074, 'CONDITIONS': 1964, 'DISCOUNT': 1905, 'SEND_TO': 1434, 'BILL_TO': 1265, 'GSTIN': 1147,

In [33]:
logreg = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)

logreg.fit(X_train_vec, y_resampled)

y_pred = logreg.predict(X_val_vec)

print("Accuracy:", accuracy_score(y_val_split, y_pred))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8811466468868624
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      0.99      0.98      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.81      0.62      0.70      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       0.54      0.80      0.65      1188
        GST(1%)       0.87      0.87      0.87        45
       GST(12%)       0.85      0.88      0.87        33
       GST(18%)       0.95      0.99      0.97       121
       GST(20%)       0.94      0.92      0.93        48
        GST(5%)       0.21      0.23      0.22        43
        GST(7%)       0.17      0.24      0.20        34
        GST(9%)       0.39      0.26      0.32        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [39]:
image = pytesseract_image_to_string('notebooks\invoice-template-us-neat-750px.png')

image_vec = vectorizer.transform(image)

print(image)
print(logreg.predict(image_vec))

['INVOICE', 'East Repair Inc.\n1912 Harvest Lane\nNew York, NY 12210', 'BILLTO SHIPTO INVOICE # us-001\nJohn Smith John Smith INVOICE DATE 1102/2019\n2 Court Square 3787 Pineview Drive poe', 'New York, NY 12210 Cambridge, MA 12210 i 2312/2019\nDUE DATE 26102/2019', 'ay DESCRIPTION UNIT PRICE AMOUNT', '1 Front and rear brake cables 100.00 100.00', '2 Newset of pedal arms 15.00 30.00', '3 Labor Shrs 5.00 15.00', 'Subtotal 145.00', 'Sales Tax 6.25% 9.06', 'TOTAL $154.06', 'Shank you', 'Smith', 'TERMS & CONDITIONS', 'Payment is due within 15 days.', 'Please make checks payable to: East Repair Inc.\n']
['TITLE' 'BUYER' 'OTHER' 'DUE_DATE' 'BUYER' 'BUYER' 'BUYER' 'BUYER'
 'BUYER' 'TAX' 'TOTAL' 'BUYER' 'BUYER' 'CONDITIONS' 'BUYER' 'BUYER']


In [41]:
import os
import shutil

source_folder = 'notebooks\invoices_dataset_final\Annotations\Original_Format'
destination_folder = 'invoices_dataset_final/400'
destination_folder_train = 'invoices_dataset_final/train'

if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    
if not os.path.exists(destination_folder_train):
    os.makedirs(destination_folder_train)

files = [f for f in os.listdir(source_folder) if f.endswith('.json')]

files_to_move = files[-400:]
files_to_train = files[:-400]

for file in files_to_move:
    source_path = os.path.join(source_folder, file)
    destination_path = os.path.join(destination_folder, file)
    shutil.copy(source_path, destination_path)
    
for file in files_to_train:
    source_path = os.path.join(source_folder, file)
    destination_path = os.path.join(destination_folder_train, file)
    shutil.copy(source_path, destination_path)

print(f"Coped {len(files_to_move)} files to {destination_folder}")
print(f"Coped {len(files_to_train)} files to {destination_folder_train}")


Coped 400 files to invoices_dataset_final/400
Coped 9600 files to invoices_dataset_final/train


In [42]:
import os

def extract_json_data_from_dir(json_path):
    X_train = []
    y_train = []

    def extract_data_from_json(json_file):
        with open(json_file, 'r') as f:
            data = json.load(f)
            
        for label, content in data.items():
            if isinstance(content, dict) and 'text' in content:
                text = content['text'].split('\n')[0]
                X_train.append(text)
                y_train.append(label)
            elif isinstance(content, list):
                for item in content:
                    if isinstance(item, dict) and 'text' in item:
                        text = item['text'].split('\n')[0]
                        X_train.append(text)
                        y_train.append(label)

    # Iterate over all the JSON files in the folder
    for filename in os.listdir(json_path):
        if filename.endswith('.json'):
            json_file = os.path.join(json_path, filename)
            extract_data_from_json(json_file)

    return X_train, y_train

In [43]:
X_train_1, y_train_1 = extract_json_data_from_dir('notebooks\\invoices_dataset_final\\train')
X_val_1, y_val_1 = extract_json_data_from_dir('notebooks\\invoices_dataset_final\\400')

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_1_vec = vectorizer.fit_transform(X_train_1)
X_val_1_vec = vectorizer.transform(X_val_1)

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

logreg_1 = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)

logreg_1.fit(X_train_1_vec, y_train_1)

y_pred = logreg_1.predict(X_val_1_vec)


print("Accuracy:", accuracy_score(y_val_1, y_pred))
print("Classification Report:")
print(classification_report(y_val_1, y_pred))

Accuracy: 0.7673076923076924
Classification Report:
                 precision    recall  f1-score   support

          BUYER       0.34      0.99      0.51       400
     CONDITIONS       1.00      1.00      1.00       200
           DATE       0.91      0.65      0.76       400
       DUE_DATE       0.55      0.87      0.68       200
          GSTIN       0.50      1.00      0.67       200
   GSTIN_SELLER       0.00      0.00      0.00       200
           NOTE       1.00      1.00      1.00       200
         NUMBER       0.92      0.90      0.91       400
          OTHER       0.00      0.00      0.00       400
PAYMENT_DETAILS       1.00      1.00      1.00       200
 SELLER_ADDRESS       1.00      1.00      1.00       200
   SELLER_EMAIL       1.00      1.00      1.00       400
    SELLER_NAME       0.00      0.00      0.00       400
    SELLER_SITE       1.00      1.00      1.00       200
        SEND_TO       1.00      1.00      1.00       200
      SUB_TOTAL       1.00      1.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [46]:
from sklearn.svm import SVC

svm = SVC(kernel='linear') 
svm.fit(X_train_1_vec, y_train_1)

y_pred = svm.predict(X_val_1_vec)

print("Accuracy:", accuracy_score(y_val_1, y_pred))
print("Classification Report:")
print(classification_report(y_val_1, y_pred))

Accuracy: 0.7648076923076923
Classification Report:
                 precision    recall  f1-score   support

          BUYER       0.34      0.99      0.51       400
     CONDITIONS       1.00      1.00      1.00       200
           DATE       0.95      0.58      0.72       400
       DUE_DATE       0.53      0.94      0.67       200
          GSTIN       0.50      1.00      0.67       200
   GSTIN_SELLER       0.00      0.00      0.00       200
           NOTE       1.00      1.00      1.00       200
         NUMBER       0.93      0.90      0.91       400
          OTHER       0.00      0.00      0.00       400
PAYMENT_DETAILS       1.00      1.00      1.00       200
 SELLER_ADDRESS       1.00      1.00      1.00       200
   SELLER_EMAIL       1.00      1.00      1.00       400
    SELLER_NAME       0.00      0.00      0.00       400
    SELLER_SITE       1.00      1.00      1.00       200
        SEND_TO       1.00      1.00      1.00       200
      SUB_TOTAL       1.00      1.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [47]:
import pytesseract
import cv2

def pytesseract_image_to_string_with_bboxes(image_path):
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    image = cv2.imread(image_path)

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)

    result = []

    block_text = []
    block_bboxes = []

    last_top = None
    last_bottom = None
    last_left = None
    last_right = None
    
    # Iterate through the data and group words into blocks
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        if text:  # Only process non-empty text
            # Extract the bounding box coordinates
            x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
            bbox = [[x, y], [x + w, y + h]]

            # Add the word and its bbox to the temporary block lists
            block_text.append(text)
            block_bboxes.append(bbox)

            # Check if the current word belongs to the same line (check vertical distance)
            if last_top is None or abs(last_top - y) < 10:
                last_top = y
                last_bottom = y + h
                last_left = min(last_left or x, x)
                last_right = max(last_right or (x + w), x + w)
            else:
                # If the current word is far from the last one, treat it as a new block
                if block_text:
                    # Create the combined block string and its bbox
                    combined_text = ' '.join(block_text)
                    combined_bbox = [[last_left, last_top], [last_right, last_bottom]]
                    result.append({"text": combined_text, "bbox": combined_bbox})
                # Start a new block
                block_text = [text]
                block_bboxes = [bbox]
                last_top = y
                last_bottom = y + h
                last_left = x
                last_right = x + w

    if block_text:
        combined_text = ' '.join(block_text)
        combined_bbox = [[last_left, last_top], [last_right, last_bottom]]
        result.append({"text": combined_text, "bbox": combined_bbox})

    for entry in result:
        bbox = entry['bbox']
        top_left = tuple(bbox[0])
        bottom_right = tuple(bbox[1])
        
        cv2.rectangle(gray, top_left, bottom_right, (0, 255, 0), 2) 

        cv2.putText(gray, entry['text'], (top_left[0], top_left[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

    cv2.imshow('Image with Bounding Boxes', gray)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    return result

print(pytesseract_image_to_string_with_bboxes(r'notebooks\invoices_dataset_final\images\Template19_Instance10.jpg'))

[{'text': 'INVOICE # 8287-236 Date:', 'bbox': [[41, 105], [153, 114]]}, {'text': 'Date: 19-Jun-2012 Due', 'bbox': [[41, 132], [139, 141]]}, {'text': 'Due Date :', 'bbox': [[41, 155], [98, 164]]}, {'text': ': 13-Dec-2000', 'bbox': [[95, 145], [98, 173]]}, {'text': '13-Dec-2000 Buyer', 'bbox': [[103, 155], [172, 164]]}, {'text': 'Buyer :Cynthia Phillips 49716', 'bbox': [[37, 194], [167, 205]]}, {'text': '49716 Vega Gardens Suite 257 Jamesberg,', 'bbox': [[36, 208], [209, 217]]}, {'text': 'Jamesberg, IN 27386 US Tel:+(433)412-1127', 'bbox': [[36, 222], [174, 232]]}, {'text': 'Tel:+(433)412-1127 Email:saragray@example.org', 'bbox': [[36, 237], [142, 248]]}, {'text': 'Email:saragray@example.org Site:https:/iwest.biz/', 'bbox': [[37, 251], [204, 263]]}, {'text': 'Site:https:/iwest.biz/ Item', 'bbox': [[37, 265], [152, 277]]}, {'text': 'Item Stock', 'bbox': [[47, 459], [76, 471]]}, {'text': 'Stock site. Family', 'bbox': [[46, 477], [106, 486]]}, {'text': 'Family plan. Opportunity', 'bbox': [[

In [54]:
def extract_json_data_from_dir_bbox(json_path):
    X_train = []  # List to store the text
    y_train = []  # List to store the labels
    bbox_train = []  # List to store bounding boxes

    def extract_data_from_json(json_file):
        with open(json_file, 'r') as f:
            data = json.load(f)
            
        for label, content in data.items():
            if isinstance(content, dict) and 'text' in content:
                text = content['text'].split('\n')[0]  # Extract the first line of text
                bbox = content.get('bbox', None)  # Get the bounding box for the item
                X_train.append(text)
                y_train.append(label)
                bbox_train.append(bbox)
            elif isinstance(content, list):
                for item in content:
                    if isinstance(item, dict) and 'text' in item:
                        text = item['text'].split('\n')[0]  # Extract the first line of text
                        bbox = item.get('bbox', None)  # Get the bounding box for the item
                        X_train.append(text)
                        y_train.append(label)
                        bbox_train.append(bbox)

    # Iterate over all the JSON files in the folder
    for filename in os.listdir(json_path):
        if filename.endswith('.json'):
            json_file = os.path.join(json_path, filename)
            extract_data_from_json(json_file)


    # Create DataFrame
    train_df = pd.DataFrame({
        'Text': X_train,
        'Label': y_train,
        'Bounding Box': bbox_train
    })

    # Save to CSV
    train_df.to_csv('train_data_with_bboxes.csv', index=False)

    return X_train, y_train, bbox_train

In [55]:
X_train_bbox, y_train_bbox, bbox_train = extract_json_data_from_dir_bbox(r'notebooks\invoices_dataset_final\Annotations\Original_Format')


print(X_train_bbox[:5], y_train_bbox[:5], bbox_train[:5])

['Bill to:Michael Sparks', 'Terms and Conditions', 'Date: 03-Jan-1994', 'DISCOUNT(2.14%): (-)  9.39', 'Due Date : 06-Feb-2007'] ['BUYER', 'CONDITIONS', 'DATE', 'DISCOUNT', 'DUE_DATE'] [[[324.0, 591.4058000000001], [512.712, 675.4058]], [[26.0, 192.81980000000001], [305.52000000000004, 214.81980000000001]], [[4.0, 786.4058], [103.38399999999999, 798.4058]], [[250.0, 324.4058], [407.356, 336.4058]], [[3.0, 733.4058], [135.03599999999997, 745.4058]]]


In [60]:
import json
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Function to extract bounding box features
def extract_bbox_features(bboxes):
    features = []
    for bbox in bboxes:
        if isinstance(bbox, str):  # Check if bbox is a valid string
            try:
                coords = json.loads(bbox)  # Convert string to list
                x_min, y_min = coords[0]
                x_max, y_max = coords[1]
                width = x_max - x_min
                height = y_max - y_min
                area = width * height
                aspect_ratio = height / width if width > 0 else 0
                features.append([x_min, y_min, x_max, y_max, width, height, area, aspect_ratio])
            except (json.JSONDecodeError, ValueError):
                # Handle invalid JSON
                features.append([0, 0, 0, 0, 0, 0, 0, 0])
        else:
            # Handle non-string (e.g., NaN or float)
            features.append([0, 0, 0, 0, 0, 0, 0, 0])
    return np.array(features)

# Extract bbox features from the 'Bounding Box' column
bbox_features = extract_bbox_features(data['Bounding Box'])

# Normalize the bbox features
scaler = MinMaxScaler()
normalized_bbox_features = scaler.fit_transform(bbox_features)

# Display normalized features
print(normalized_bbox_features)


[[0.69083156 0.71390833 0.80236117 ... 0.93126386 0.7574996  0.64124814]
 [0.0554371  0.23276008 0.47811907 ... 0.24390244 0.29385912 0.11338514]
 [0.00852878 0.94930021 0.16178928 ... 0.13303769 0.05699035 0.17394467]
 ...
 [0.00639659 0.32351995 0.34796026 ... 0.29268293 0.27672042 0.17338703]
 [0.19402985 0.8853219  0.42312098 ... 0.13303769 0.10286063 0.09637475]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Generate text embeddings using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
text_embeddings = vectorizer.fit_transform(data['Text']).toarray()

# Combine text embeddings and bounding box features
combined_embeddings = np.hstack([text_embeddings, normalized_bbox_features])

print(f"Combined embeddings shape: {combined_embeddings.shape}")


Combined embeddings shape: (114594, 5008)


In [62]:
from sklearn.model_selection import train_test_split

# Extract labels (y)
y = data['Label']

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(combined_embeddings, y, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")


Training data shape: (91675, 5008)
Validation data shape: (22919, 5008)


In [63]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Train the SVM classifier
classifier = SVC(kernel='linear', probability=True, random_state=42)
classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = classifier.predict(X_val)

# Evaluate the model
print(classification_report(y_val, y_pred))


                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       1.00      1.00      1.00       335
          BUYER       1.00      1.00      1.00      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       1.00      1.00      1.00      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       1.00      1.00      1.00      1188
        GST(1%)       0.98      0.93      0.95        45
       GST(12%)       0.85      1.00      0.92        33
       GST(18%)       0.95      1.00      0.98       121
       GST(20%)       0.94      0.98      0.96        48
        GST(5%)       1.00      0.74      0.85        43
        GST(7%)       0.82      0.82      0.82        34
        GST(9%)       0.85      0.84      0.84        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.00      1.00       111
   GSTIN_SELLER       1.00    