In [0]:
import pandas as pd
import pandas_profiling
import numpy as np
import re
import os

In [0]:
# https://mikulskibartosz.name/how-to-load-data-from-google-drive-to-pandas-running-in-google-colaboratory-a7f6a033c997
# nali.org/load-google-drive-csv-panda-dataframe-google-colab/

!pip install -U -q PyDrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
file_list = drive.ListFile({'q': "'1MI8PBiqNT-o6m0xYV8JYHUkqaqfTKkz2' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s' % (file1['title'], file1['id']))
  
file_id = "1727tj-xC0LJUZ77N2AdIyQWNJX7jKj1x"

title: gen_invoices_labels_100k.csv, id: 1727tj-xC0LJUZ77N2AdIyQWNJX7jKj1x
title: invoice_data, id: 1q7d9ckpy5o6bhRh0KKV0TJrzFMdf4pWN


In [0]:
# Copy data from Google Drive to Colaboratory
download_path = os.path.expanduser('~/data')
try:
  os.makedirs(download_path)
except FileExistsError:
  pass

output_file = os.path.join(download_path, 'test.csv')

temp_file = drive.CreateFile({'id': file_id})
temp_file.GetContentFile(output_file)

# open in pandas
df = pd.read_csv(output_file)

In [5]:
df.head()

Unnamed: 0,date,price,str_input,str_label
0,29.11.2022,0.99,"address : Perla Muniz 7000 Meadow Run Circle, ...",0000000000000000000000000000000000000000000000...
1,16.3.2028,0.21,Invoice\nAdventure Direct Internet\nDee Hacket...,0000000\n0000000000000000000000000\n0000000000...
2,28.4.2027,4.12,Invoice\nPower Provider\nJeremy Beers 1068 Sum...,0000000\n00000000000000\n000000000000000000000...
3,2026-23-7,4626.83,Electronic Interactive\nJeromy Mcghee 8226 Pel...,0000000000000000000000\n0000000000000000000000...
4,23.7.2020,0.37,address : Rolland Hutson 3023 Saint George Str...,0000000000000000000000000000000000000000000000...


In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# test

with open('/content/gdrive/My Drive/Colab Notebooks/models/file_2.txt', 'w') as f:
  f.write('content')

In [0]:
# automatic EDA
# https://nbviewer.jupyter.org/github/JosPolfliet/pandas-profiling/blob/master/examples/meteorites.ipynb
# pandas_profiling.ProfileReport(df)

In [0]:
inpu = []
targ = []
i = 0

for i in range(len(df['str_input'])):
    inp = df['str_input'][i].split("\\n")
    tar = df['str_label'][i].split("\\n")

    inp = [x for x in inp if len(x)>0]
    tar = [x for x in tar if len(x)>0]

    inpu.append(inp)
    targ.append(tar)

# for safety
inpu = [x for x in inpu if len(x)>0]
targ = [x for x in targ if len(x)>0]

inpu = np.concatenate(inpu)
targ = np.concatenate(targ)

In [10]:
inpu[103], targ[103], inpu[-19], targ[-19]

('price EUR 0.24', '00000000001111', 'price EUR 16.15', '000000000011111')

# Simple Bi-LSTM CRF

In [0]:
# https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html

# Viterbi algorithm - calculates maximum probability path for a hidden markov model

In [0]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.0-{platform}-linux_x86_64.whl torchvision

In [13]:
import torch
import torch.optim as optim

from sklearn.model_selection import train_test_split

torch.manual_seed(1)

<torch._C.Generator at 0x7f4bc1c31f50>

In [0]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))



In [0]:
import torch.autograd as autograd
import torch.nn as nn

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, START_TAG, STOP_TAG):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.START_TAG = START_TAG
        self.STOP_TAG = STOP_TAG

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[self.START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[self.STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[self.START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + \
            self.transitions[self.tag_to_ix[self.STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat(
            [torch.tensor([self.tag_to_ix[self.START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + \
            self.transitions[self.tag_to_ix[self.STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[self.START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + \
            self.transitions[self.tag_to_ix[self.STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[self.START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq


In [0]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

char_to_ix = {}
for sentence in inpu:
    for char in sentence:
        if char not in char_to_ix:
            char_to_ix[char] = len(char_to_ix)

tag_to_ix = {}
for sentence in targ:
    for char in sentence:
        if char not in tag_to_ix:
            tag_to_ix[char] = len(tag_to_ix)

tag_to_ix[START_TAG] = len(tag_to_ix)
tag_to_ix[STOP_TAG] = len(tag_to_ix)

all_data = np.transpose(np.stack([np.array(inpu), np.array(targ)]))
training_data, test_data = train_test_split(all_data, test_size=0.2)

In [0]:
model = BiLSTM_CRF(len(char_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, START_TAG, STOP_TAG)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [18]:
# Check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], char_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
    print(model(precheck_sent))

(tensor(45.1594), [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])


In [19]:
import time
start_time = time.time()

# Make sure prepare_sequence from earlier in the LSTM section is loaded
i = 0
for epoch in range(1):
    print("epoch: %i" % epoch)
    for sentence, tags in training_data[:100000]:
        if(i % 10000 == 0):
          print("iteration: %i" % i)
        if(i == 1000):
          print("total estimated runtime %f hours" % ((time.time() - start_time)*len(training_data)/(1000*60*60)))
        i += 1
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, char_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)
        
        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()
        
print("--- %s seconds ---" % (time.time() - start_time))

epoch: 0
iteration: 0
iteration: 100
iteration: 200
iteration: 300
iteration: 400
iteration: 500
iteration: 600
iteration: 700
iteration: 800
iteration: 900
iteration: 1000
total estimated runtime 14.036476 hours
iteration: 1100
iteration: 1200
iteration: 1300
iteration: 1400
iteration: 1500
iteration: 1600
iteration: 1700
iteration: 1800
iteration: 1900
iteration: 2000
iteration: 2100
iteration: 2200
iteration: 2300
iteration: 2400
iteration: 2500
iteration: 2600
iteration: 2700
iteration: 2800
iteration: 2900
iteration: 3000
iteration: 3100
iteration: 3200
iteration: 3300
iteration: 3400
iteration: 3500
iteration: 3600
iteration: 3700
iteration: 3800
iteration: 3900
iteration: 4000
iteration: 4100
iteration: 4200
iteration: 4300
iteration: 4400
iteration: 4500
iteration: 4600
iteration: 4700
iteration: 4800
iteration: 4900
iteration: 5000
iteration: 5100
iteration: 5200
iteration: 5300
iteration: 5400
iteration: 5500
iteration: 5600
iteration: 5700
iteration: 5800
iteration: 5900
ite

In [20]:
# Check predictions after training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], char_to_ix)
    print(training_data[0][0])
    print(model(precheck_sent))
# We got it!

Customer ID. 507217796253
(tensor(164.7033), [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


# Save Model

In [21]:
model_folder = "/content/gdrive/My Drive/Colab Notebooks/models"

# create data folder
try:
    os.mkdir(model_folder)
except:
    print("%s folder already created!" % model_folder)

torch.save(model, "%s/seq_labeling_BiLSTM_CRF.pt" % model_folder)

/content/gdrive/My Drive/Colab Notebooks/models folder already created!


  "type " + obj.__name__ + ". It won't be checked "


# Prediction

In [0]:
from bi_lstm_crf import BiLSTM_CRF

In [0]:
m = torch.load("%s/seq_labeling_BiLSTM_CRF.pt" % model_folder)

In [0]:
preds = []
test_sentences = np.transpose(test_data)
for sentence in test_sentences[0][-100:]:
    sentence = prepare_sequence(sentence, char_to_ix)
    preds.append(model.forward(sentence)[1])

In [29]:
# for i in range(len(test_sentences[0])):
for i in range(100):
    print(test_sentences[0][-i])
    print("".join([str(elem) for elem in preds[-i]]))

  EUR 51.0
000000000000
Y.Crain@luptatumzzrildignissim.us
000000000000000000000000000000000
Omega Max Galaxy
0000000000000000
outrigger holiday switch: 2.13
000000000000000000000000000000
2026-18-7
111111111
Utwisi et enim accumsan ut et lorem nulla duis aliquam feugait iusto eufeugiat.
0000000000000000000000000000000000000000000000000000000000000000000000000000000
flower diploma wind: 71.35
00000000000000000000000000
M.Teague@facilisispraesent.com
000000000000000000000000000000
Elouise@eavulputate.org
00000000000000000000000
Sub t. € 44.72
00000000000000
http://universal_power_alpha.co
0000000000000000000000000000000
Invoice # 8532499605
00000000000000000000
Invoice no. : 627325701814
00000000000000000000000000
address : Josef Mclaughlin 902 Sunrise Drive, 45840 Graceville
00000000000000000000000000000000000000000000000000000000000000
address : Antionette Denney 5014 Oak Glen Lane, 70039 Washington
0000000000000000000000000000000000000000000000000000000000000000
Feugiat consequat null