In [1]:
from cuml.linear_model import LogisticRegression
from sklearn.svm import SVC 
import numpy as np
from sklearn.utils import shuffle
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import json
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

In [2]:
def extract_bbox_features(bboxes):
    features = []
    for bbox in bboxes:
        if isinstance(bbox, str):
            try:
                coords = json.loads(bbox)
                x_min, y_min = coords[0]
                x_max, y_max = coords[1]
                width = x_max - x_min
                height = y_max - y_min
                area = width * height
                aspect_ratio = height / width if width > 0 else 0
                features.append([x_min, y_min, x_max, y_max, width, height, area, aspect_ratio])
            except (json.JSONDecodeError, ValueError):
                features.append([0, 0, 0, 0, 0, 0, 0, 0])
        else:
            features.append([0, 0, 0, 0, 0, 0, 0, 0])
    return np.array(features)

In [3]:
df = pd.read_csv(r'train_data_with_bboxes.csv')

In [13]:
bbox_features = extract_bbox_features(df['Bounding Box'])

scaler = MinMaxScaler()
normalized_bbox_features = scaler.fit_transform(bbox_features)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
text_embeddings = vectorizer.fit_transform(df['Text']).toarray()

text_embeddings_sparse = csr_matrix(text_embeddings)
bbox_features_sparse = csr_matrix(normalized_bbox_features)

combined_embeddings_sparse = csr_matrix(np.hstack([text_embeddings_sparse.toarray(), bbox_features_sparse.toarray()]))

print(f"Combined embeddings shape: {combined_embeddings_sparse.shape}")

y = df['Label']

X_train, X_val, y_train, y_val = train_test_split(combined_embeddings_sparse, y, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

Combined embeddings shape: (114594, 5008)
Training data shape: (91675, 5008)
Validation data shape: (22919, 5008)


In [15]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

print(y_train.dtype)

lg_model = LogisticRegression()
lg_model.fit(X_train, y_train)

y_pred = lg_model.predict(X_val)

y_pred_original = label_encoder.inverse_transform(y_pred)


print(accuracy_score(y_val, y_pred_original))
print(classification_report(y_val, y_pred_original))

int64
0.9968148697587155
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       1.00      1.00      1.00       335
          BUYER       1.00      1.00      1.00      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       1.00      1.00      1.00      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       1.00      1.00      1.00      1188
        GST(1%)       0.91      0.89      0.90        45
       GST(12%)       0.85      1.00      0.92        33
       GST(18%)       0.95      1.00      0.98       121
       GST(20%)       0.94      0.98      0.96        48
        GST(5%)       0.80      0.19      0.30        43
        GST(7%)       0.69      0.74      0.71        34
        GST(9%)       0.55      0.69      0.61        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.00      1.00       111
   GS

In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_val)

y_pred_original = label_encoder.inverse_transform(y_pred)

print(accuracy_score(y_val, y_pred_original))
print(classification_report(y_val, y_pred_original))

Parameters: { "use_label_encoder" } are not used.



0.9996945765522056
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       1.00      1.00      1.00       335
          BUYER       1.00      1.00      1.00      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       1.00      1.00      1.00      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       1.00      1.00      1.00      1188
        GST(1%)       1.00      0.98      0.99        45
       GST(12%)       1.00      1.00      1.00        33
       GST(18%)       1.00      1.00      1.00       121
       GST(20%)       0.98      1.00      0.99        48
        GST(5%)       0.95      0.95      0.95        43
        GST(7%)       1.00      0.97      0.99        34
        GST(9%)       0.99      1.00      0.99        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.00      1.00       111
   GSTIN_SE

In [8]:
svm = SVC(kernel='linear', random_state=42, class_weight='balanced')

svm.fit(X_train, y_train)

y_pred = svm.predict(X_val)

y_pred_original = label_encoder.inverse_transform(y_pred)

print(accuracy_score(y_val, y_pred_original))
print(classification_report(y_val, y_pred_original))

0.9992582573410708
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       1.00      1.00      1.00       335
          BUYER       1.00      1.00      1.00      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       1.00      1.00      1.00      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       1.00      1.00      1.00      1188
        GST(1%)       0.98      1.00      0.99        45
       GST(12%)       0.92      1.00      0.96        33
       GST(18%)       0.97      0.97      0.97       121
       GST(20%)       0.96      0.94      0.95        48
        GST(5%)       0.93      0.93      0.93        43
        GST(7%)       0.97      1.00      0.99        34
        GST(9%)       0.97      0.93      0.95        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.00      1.00       111
   GSTIN_SE

In [9]:
from imblearn.over_sampling import RandomOverSampler

sampling_strategy = {
    'GST(5%)': 1000,
    'GST(7%)': 1000,
    'GST(1%)': 1000,
    'GST(9%)': 1000,
}

print(X_train.shape)

ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

print(X_resampled.shape)

(91675, 5008)
(94865, 5008)


In [12]:
label_encoder = LabelEncoder()
y_resampled = label_encoder.fit_transform(y_resampled)

svm = SVC(kernel='linear', random_state=42, class_weight='balanced')

svm.fit(X_resampled, y_resampled)

y_pred = svm.predict(X_val)

y_pred_original = label_encoder.inverse_transform(y_pred)

print(accuracy_score(y_val, y_pred_original))
print(classification_report(y_val, y_pred_original))

0.9993018892621842
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       1.00      1.00      1.00       335
          BUYER       1.00      1.00      1.00      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       1.00      1.00      1.00      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       1.00      1.00      1.00      1188
        GST(1%)       0.98      1.00      0.99        45
       GST(12%)       0.92      1.00      0.96        33
       GST(18%)       0.98      0.97      0.97       121
       GST(20%)       0.96      0.94      0.95        48
        GST(5%)       0.93      0.95      0.94        43
        GST(7%)       0.97      1.00      0.99        34
        GST(9%)       0.97      0.93      0.95        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.00      1.00       111
   GSTIN_SE

In [17]:
xgb_model.fit(X_resampled, y_resampled)

y_pred = xgb_model.predict(X_val)

y_pred_original = label_encoder.inverse_transform(y_pred)

print(accuracy_score(y_val, y_pred_original))
print(classification_report(y_val, y_pred_original))

Parameters: { "use_label_encoder" } are not used.



0.9997382084733191
                 precision    recall  f1-score   support

     AMOUNT_DUE       0.99      1.00      1.00       158
        BILL_TO       1.00      1.00      1.00       335
          BUYER       1.00      1.00      1.00      1299
     CONDITIONS       1.00      1.00      1.00       436
           DATE       1.00      1.00      1.00      1993
       DISCOUNT       1.00      1.00      1.00       495
       DUE_DATE       1.00      1.00      1.00      1188
        GST(1%)       1.00      0.98      0.99        45
       GST(12%)       1.00      1.00      1.00        33
       GST(18%)       1.00      1.00      1.00       121
       GST(20%)       0.98      1.00      0.99        48
        GST(5%)       0.95      0.98      0.97        43
        GST(7%)       1.00      0.97      0.99        34
        GST(9%)       1.00      1.00      1.00        68
          GSTIN       1.00      1.00      1.00       253
    GSTIN_BUYER       1.00      1.00      1.00       111
   GSTIN_SE

In [19]:
df_400 = pd.read_csv(r'400.csv')
df_train = pd.read_csv(r'train.csv')

bbox_features_400 = extract_bbox_features(df_400['Bounding Box'])
bbox_features_train = extract_bbox_features(df_train['Bounding Box'])

scaler = MinMaxScaler()
normalized_bbox_features_400 = scaler.fit_transform(bbox_features_400)
normalized_bbox_features_train = scaler.fit_transform(bbox_features_train)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
text_embeddings_train = vectorizer.fit_transform(df_train['Text'])
text_embeddings_400 = vectorizer.transform(df_400['Text'])

text_embeddings_sparse_400 = csr_matrix(text_embeddings_400)
bbox_features_sparse_400 = csr_matrix(normalized_bbox_features_400)

text_embeddings_sparse_train = csr_matrix(text_embeddings_train)
bbox_features_sparse_train = csr_matrix(normalized_bbox_features_train)

combined_embeddings_sparse_400 = csr_matrix(np.hstack([text_embeddings_sparse_400.toarray(), bbox_features_sparse_400.toarray()]))
combined_embeddings_sparse_train = csr_matrix(np.hstack([text_embeddings_sparse_train.toarray(), bbox_features_sparse_train.toarray()]))

print(f"Combined embeddings shape: {combined_embeddings_sparse_400.shape}")
print(f"Combined embeddings shape: {combined_embeddings_sparse_train.shape}")

y_train = df_train['Label']
y_val = df_400['Label']

Combined embeddings shape: (5200, 5008)
Combined embeddings shape: (109394, 5008)


In [16]:
y_train = label_encoder.transform(y_train)

svm_inter = SVC(kernel='linear', probability=True, random_state=42, class_weight='balanced')
svm_inter.fit(combined_embeddings_sparse_train, y_train)

y_pred = svm_inter.predict(combined_embeddings_sparse_400)

y_pred_original = label_encoder.inverse_transform(y_pred)

print(accuracy_score(y_val, y_pred_original))
print(classification_report(y_val, y_pred_original))

0.9230769230769231
                 precision    recall  f1-score   support

          BUYER       1.00      1.00      1.00       400
     CONDITIONS       1.00      1.00      1.00       200
           DATE       1.00      1.00      1.00       400
       DUE_DATE       1.00      1.00      1.00       200
          GSTIN       0.50      1.00      0.67       200
   GSTIN_SELLER       0.00      0.00      0.00       200
           NOTE       1.00      1.00      1.00       200
         NUMBER       0.67      1.00      0.80       400
          OTHER       1.00      1.00      1.00       400
PAYMENT_DETAILS       1.00      1.00      1.00       200
 SELLER_ADDRESS       1.00      1.00      1.00       200
   SELLER_EMAIL       1.00      1.00      1.00       400
    SELLER_NAME       1.00      0.50      0.67       400
    SELLER_SITE       1.00      1.00      1.00       200
        SEND_TO       1.00      1.00      1.00       200
      SUB_TOTAL       1.00      1.00      1.00       200
          T

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
y_train = label_encoder.transform(y_train)

xgb_model.fit(combined_embeddings_sparse_train, y_train)

y_pred = xgb_model.predict(combined_embeddings_sparse_400)

y_pred_original = label_encoder.inverse_transform(y_pred)

print(accuracy_score(y_val, y_pred_original))
print(classification_report(y_val, y_pred_original))

Parameters: { "use_label_encoder" } are not used.



0.9615384615384616
                 precision    recall  f1-score   support

          BUYER       1.00      0.50      0.67       400
     CONDITIONS       1.00      1.00      1.00       200
           DATE       1.00      1.00      1.00       400
       DUE_DATE       1.00      1.00      1.00       200
          GSTIN       1.00      1.00      1.00       200
   GSTIN_SELLER       1.00      1.00      1.00       200
           NOTE       1.00      1.00      1.00       200
         NUMBER       1.00      1.00      1.00       400
          OTHER       0.67      1.00      0.80       400
PAYMENT_DETAILS       1.00      1.00      1.00       200
 SELLER_ADDRESS       1.00      1.00      1.00       200
   SELLER_EMAIL       1.00      1.00      1.00       400
    SELLER_NAME       1.00      1.00      1.00       400
    SELLER_SITE       1.00      1.00      1.00       200
        SEND_TO       1.00      1.00      1.00       200
      SUB_TOTAL       1.00      1.00      1.00       200
          T