In [1]:
import pandas as pd
import pandas_profiling
import numpy as np
import re
import os

In [2]:
df = pd.read_csv("invoice_nlp_data/gen_invoices_labels_100k.csv")

In [3]:
df.head()

Unnamed: 0,date,price,str_input,str_label
0,29.11.2022,0.99,"address : Perla Muniz 7000 Meadow Run Circle, ...",0000000000000000000000000000000000000000000000...
1,16.3.2028,0.21,Invoice\nAdventure Direct Internet\nDee Hacket...,0000000\n0000000000000000000000000\n0000000000...
2,28.4.2027,4.12,Invoice\nPower Provider\nJeremy Beers 1068 Sum...,0000000\n00000000000000\n000000000000000000000...
3,2026-23-7,4626.83,Electronic Interactive\nJeromy Mcghee 8226 Pel...,0000000000000000000000\n0000000000000000000000...
4,23.7.2020,0.37,address : Rolland Hutson 3023 Saint George Str...,0000000000000000000000000000000000000000000000...


In [4]:
# automatic EDA
# https://nbviewer.jupyter.org/github/JosPolfliet/pandas-profiling/blob/master/examples/meteorites.ipynb
# pandas_profiling.ProfileReport(df)

In [5]:
inpu = []
targ = []
i = 0

for i in range(len(df['str_input'])):
    inp = df['str_input'][i].split("\\n")
    tar = df['str_label'][i].split("\\n")

    inp = [x for x in inp if len(x)>0]
    tar = [x for x in tar if len(x)>0]

    inpu.append(inp)
    targ.append(tar)

# for safety
inpu = [x for x in inpu if len(x)>0]
targ = [x for x in targ if len(x)>0]

inpu = np.concatenate(inpu)
targ = np.concatenate(targ)

In [6]:
inpu[103], targ[103], inpu[-19], targ[-19]

('price EUR 0.24', '00000000001111', 'price EUR 16.15', '000000000011111')

# Simple Bi-LSTM CRF

In [7]:
# https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html

# Viterbi algorithm - calculates maximum probability path for a hidden markov model

In [8]:
import torch
import torch.optim as optim

from bi_lstm_crf import BiLSTM_CRF

from sklearn.model_selection import train_test_split

torch.manual_seed(1)

<torch._C.Generator at 0x13fa4d3f0>

In [9]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [10]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

char_to_ix = {}
for sentence in inpu:
    for char in sentence:
        if char not in char_to_ix:
            char_to_ix[char] = len(char_to_ix)

tag_to_ix = {}
for sentence in targ:
    for char in sentence:
        if char not in tag_to_ix:
            tag_to_ix[char] = len(tag_to_ix)

tag_to_ix[START_TAG] = len(tag_to_ix)
tag_to_ix[STOP_TAG] = len(tag_to_ix)

all_data = np.transpose(np.stack([np.array(inpu), np.array(targ)]))
training_data, test_data = train_test_split(all_data, test_size=0.2)

In [11]:
model = BiLSTM_CRF(len(char_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, START_TAG, STOP_TAG)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [12]:
# Check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], char_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
    print(model(precheck_sent))

(tensor(61.1821), [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])


In [13]:
# Make sure prepare_sequence from earlier in the LSTM section is loaded
for epoch in range(1):
    for sentence, tags in training_data[:1000]:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, char_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)
        
        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

In [14]:
# Check predictions after training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], char_to_ix)
    print(training_data[0][0])
    print(model(precheck_sent))
# We got it!

relish sale slash sidewalk: 367.75
(tensor(90.1176), [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


# Save Model

In [15]:
model_folder = "models"

# create data folder
try:
    os.mkdir(model_folder)
except:
    print("%s folder already created!" % model_folder)

torch.save(model, "%s/seq_labeling_BiLSTM_CRF.pt" % model_folder)

models folder already created!


# Prediction

In [16]:
from bi_lstm_crf import BiLSTM_CRF

In [17]:
m = torch.load("%s/seq_labeling_BiLSTM_CRF.pt" % model_folder)

In [30]:
test_data.shape

(338043, 2)

In [53]:
preds = []
test_sentences = np.transpose(test_data)
for sentence in test_sentences[0][:100]:
    sentence = prepare_sequence(sentence, char_to_ix)
    preds.append(m.forward(sentence)[1])

In [61]:
# for i in range(len(test_sentences[0])):
for i in range(100):
    print(test_sentences[0][i])
    print("".join([str(elem) for elem in preds[i]]))

Customer Number 28674408118
000000000000000000000000000
Praesent dolore facilisi accumsan enim diam illum veniam Utwisi aliquip nulla hendrerit.
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
Total EPS USD 6400.43
000000000000000000000
Invoice ID : 4392306400
00000000000000000000000
http://speed_software.at
000000000000000000000000
router industry air 4.88
000000000000000000000000
https://west_architecture_adventure.uk
00000000000000000000000000000000000000
Daniela@illumnibh.com
000000000000000000000
Ut velit enim delenit velit qui nisl augue et hendrerit lobortis te delenitaugue erat.
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000
Bill
0000
http://advanced_star.jp
00000000000000000000000
Suscipit dolore ut dolore tationullamcorper iriure qui Utwisi ad.
00000000000000000000000000000000000000000000000000000000000000000
L.Dallas@facilisisnulla.org
000000000000000000000000000
Vero diam qui illum ad.
00000000