In [1]:
import pytesseract

def pytesseract_image_to_string(image_path):
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    return [item for item in pytesseract.image_to_string(image_path).split('\n\n') if item != '']


In [2]:
print(pytesseract_image_to_string('invoices_dataset_final/images/Template1_Instance0.jpg'))

['Address:16424 Timothy Mission\nMarkville, AK 58294 US.', 'Email:melvinso@example.net\n‘www.ThompsonandSons.org\n(GSTIN: 12345670 00070007', 'TAX INVOICE', 'Date: 20-Mar-2008\nDue Date : 16-Oct-2016', 'Bill to:Denise Perez\n16424 Timothy Mission\nMarkville, AK 58294 US\nTol:+(952)259-8443\nEmail:melvin40@example.net\nSite:http:/ismith.org/', 'GSTIN: OG@AAMFCO376K124', 'TEMS QUANTITY PRICE\nData score fre. 6.00 $57.80\nDetermine ha, 2.00 $2470', 'Model rea. 100 $86.14', 'Mather consider 1.00 seta', 'Tv fous 4.00 $40.28', 'Total in words: seven hundred and thirt-', 'y-four point three three\nBank Name State Bank of California', 'Branch Name Raf CAMP.\nBank Account Number 11695435\nBank Swift Code SBININBB250', 'Note:\nThis order is shipped through blue dart couri', '‘SUB_TOTAL : 725.30 EUR\nDISCOUNT(1.85%): (-) 13.42\nTAX:VAT (3.88%): 28.18 EUR', 'TOTAL : 734.33 EUR\n']


In [3]:
import json
import os
import pandas as pd

folder_path = 'invoices_dataset_final/Annotations/Original_Format'

X_train = []
y_train = []

# Function to extract data from a single JSON file
def extract_data_from_json(json_file):
    with open(json_file, 'r') as f:
        data = json.load(f)
        
    for label, content in data.items():
        if isinstance(content, dict) and 'text' in content:
            text = content['text'].split('\n')[0]
            X_train.append(text)
            y_train.append(label)
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and 'text' in item:
                    text = item['text'].split('\n')[0]
                    X_train.append(text)
                    y_train.append(label)

# Iterate over all the JSON files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.json'):
        json_file = os.path.join(folder_path, filename)
        extract_data_from_json(json_file)

train_df = pd.DataFrame({
    'Text': X_train,
    'Label': y_train
})

train_df.to_csv('train_data.csv', index=False)

print(train_df.head())

                         Text       Label
0      Bill to:Michael Sparks       BUYER
1        Terms and Conditions  CONDITIONS
2           Date: 03-Jan-1994        DATE
3  DISCOUNT(2.14%): (-)  9.39    DISCOUNT
4      Due Date : 06-Feb-2007    DUE_DATE


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_vec, y_train, test_size=0.2, random_state=42)

svm = SVC(kernel='linear') 
svm.fit(X_train_split, y_train_split)

y_pred = svm.predict(X_val_split)

print("Accuracy:", accuracy_score(y_val_split, y_pred))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8844626728914874
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      1.00      0.98      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.91      0.54      0.68      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       0.54      0.95      0.69      1188
        GST(1%)       0.89      0.91      0.90        45
       GST(12%)       0.85      1.00      0.92        33
       GST(18%)       0.95      1.00      0.98       121
       GST(20%)       0.94      0.98      0.96        48
        GST(5%)       0.38      0.07      0.12        43
        GST(7%)       0.00      0.00      0.00        34
        GST(9%)       0.46      0.76      0.57        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [5]:
svm = SVC(kernel='poly', degree=3, C=5) 
svm.fit(X_train_split, y_train_split)

y_pred = svm.predict(X_val_split)

print("Accuracy:", accuracy_score(y_val_split, y_pred))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8720275753741438
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      0.97      0.97      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.71      0.70      0.71      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       0.55      0.59      0.57      1188
        GST(1%)       0.87      0.87      0.87        45
       GST(12%)       0.86      0.94      0.90        33
       GST(18%)       0.95      0.96      0.95       121
       GST(20%)       0.94      0.92      0.93        48
        GST(5%)       0.22      0.14      0.17        43
        GST(7%)       0.14      0.12      0.13        34
        GST(9%)       0.41      0.49      0.44        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [7]:
svm = SVC(kernel='rbf', gamma=5, C=5) 
svm.fit(X_train_split, y_train_split)

y_pred = svm.predict(X_val_split)

print("Accuracy:", accuracy_score(y_val_split, y_pred))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8506479340285352
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       1.00      0.01      0.01       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      0.96      0.97      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.67      0.73      0.70      1993
       DISCOUNT       1.00      0.96      0.98       495
       DUE_DATE       0.52      0.47      0.49      1188
        GST(1%)       0.81      0.76      0.78        45
       GST(12%)       0.67      0.12      0.21        33
       GST(18%)       0.96      0.61      0.75       121
       GST(20%)       0.60      0.06      0.11        48
        GST(5%)       0.20      0.02      0.04        43
        GST(7%)       0.11      0.03      0.05        34
        GST(9%)       0.29      0.06      0.10        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [8]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_split, y_train_split)

y_pred = rf.predict(X_val_split)

print("Accuracy:", accuracy_score(y_val_split))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

TypeError: missing a required argument: 'y_pred'

In [12]:
print("Accuracy:", accuracy_score(y_val_split, y_pred))

print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8741219075875911
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      0.97      0.97      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.71      0.69      0.70      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       0.54      0.59      0.56      1188
        GST(1%)       0.84      0.84      0.84        45
       GST(12%)       1.00      0.97      0.98        33
       GST(18%)       0.98      0.99      0.99       121
       GST(20%)       0.98      0.98      0.98        48
        GST(5%)       0.45      0.21      0.29        43
        GST(7%)       0.17      0.15      0.16        34
        GST(9%)       0.46      0.65      0.54        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


logreg = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)

logreg.fit(X_train_split, y_train_split)

y_pred = logreg.predict(X_val_split)

print("Accuracy:", accuracy_score(y_val_split, y_pred))
print("Classification Report:")
print(classification_report(y_val_split, y_pred))

Accuracy: 0.8818447576246782
Classification Report:
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       0.91      1.00      0.95       335
          BUYER       0.97      1.00      0.98      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       0.81      0.62      0.70      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       0.54      0.80      0.65      1188
        GST(1%)       0.81      0.47      0.59        45
       GST(12%)       0.85      1.00      0.92        33
       GST(18%)       0.95      1.00      0.98       121
       GST(20%)       0.94      0.98      0.96        48
        GST(5%)       0.30      0.07      0.11        43
        GST(7%)       0.15      0.06      0.09        34
        GST(9%)       0.41      0.76      0.53        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.0

In [32]:
image = pytesseract_image_to_string('invoice-template-us-neat-750px.png')

image_vec = vectorizer.transform(image)

print(image)
print(logreg.predict(image_vec))

['INVOICE', 'East Repair Inc.\n1912 Harvest Lane\nNew York, NY 12210', 'BILLTO SHIPTO INVOICE # us-001\nJohn Smith John Smith INVOICE DATE 1102/2019\n2 Court Square 3787 Pineview Drive poe', 'New York, NY 12210 Cambridge, MA 12210 i 2312/2019\nDUE DATE 26102/2019', 'ay DESCRIPTION UNIT PRICE AMOUNT', '1 Front and rear brake cables 100.00 100.00', '2 Newset of pedal arms 15.00 30.00', '3 Labor Shrs 5.00 15.00', 'Subtotal 145.00', 'Sales Tax 6.25% 9.06', 'TOTAL $154.06', 'Shank you', 'Smith', 'TERMS & CONDITIONS', 'Payment is due within 15 days.', 'Please make checks payable to: East Repair Inc.\n']
['TITLE' 'BUYER' 'OTHER' 'DUE_DATE' 'BUYER' 'BUYER' 'BUYER' 'BUYER'
 'BUYER' 'TAX' 'TOTAL' 'BUYER' 'BUYER' 'CONDITIONS' 'BUYER' 'BUYER']


In [41]:
import os
import shutil

source_folder = 'invoices_dataset_final/Annotations/Original_Format'
destination_folder = 'invoices_dataset_final/400'
destination_folder_train = 'invoices_dataset_final/train'

if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    
if not os.path.exists(destination_folder_train):
    os.makedirs(destination_folder_train)

files = [f for f in os.listdir(source_folder) if f.endswith('.json')]

files_to_move = files[-400:]
files_to_train = files[:-400]

for file in files_to_move:
    source_path = os.path.join(source_folder, file)
    destination_path = os.path.join(destination_folder, file)
    shutil.copy(source_path, destination_path)
    
for file in files_to_train:
    source_path = os.path.join(source_folder, file)
    destination_path = os.path.join(destination_folder_train, file)
    shutil.copy(source_path, destination_path)

print(f"Coped {len(files_to_move)} files to {destination_folder}")
print(f"Coped {len(files_to_train)} files to {destination_folder_train}")


Coped 400 files to invoices_dataset_final/400
Coped 9600 files to invoices_dataset_final/train
