# Bi-LSTM Conditional Random Field

In [1]:
import random
from datetime import datetime

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

from torch.utils.tensorboard import SummaryWriter

from faker import Faker

torch.manual_seed(1)

<torch._C.Generator at 0x22715d545f0>

## Helper function

In [2]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] if w in to_ix.keys() else len(to_ix) for w in seq ]
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

## Model

In [3]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size+1, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

## Generate data

In [4]:
LOCS = ['fr_FR', 'fr_FR', 'fr_FR', 'de_DE', 'it_IT','en_US', 'fr_CH', 'nl_BE', 'ro_RO', 'ru_RU', 'zh_CN']
fake = {loc:Faker(loc) for loc in LOCS}
Faker.seed(411)

DATASET_SIZE = 5000

In [5]:
adrs = []

for i in range(DATASET_SIZE):
    words, tags = [],[]
    loc = random.sample(LOCS,1)[0]
    
    for f in fake[loc].iban().split():
        if random.random()>0.2:
            words.append(f)
            tags.append('IBAN')
        
    if random.random() > 0.5:
        for f in fake[loc].name().split():
            words.append(f)
            tags.append('NAME')
    else:
        for f in fake[loc].company().split():
            words.append(f)
            tags.append('ORG')
        
    for f in fake[loc].address().split():
        if random.random()>0.1:
            words.append(f)
            tags.append('ADDRESS')
    
    if random.random()>0.1:
        words.append(loc[-2:])
        tags.append('COUNTRY')
        
    adrs.append((words,tags))

## training

In [6]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 64
HIDDEN_DIM = 256
EPOCHS = 20

In [7]:
split = int(len(adrs) * 0.8)
training_data = adrs[:split]
valid_data = adrs[split:]

In [8]:
word_to_ix = {}
for adr, tags in training_data:
    for word in adr:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {"IBAN": 0, "NAME": 1, "ORG": 2, "ADDRESS": 3, "COUNTRY": 4, START_TAG: 5, STOP_TAG: 6}

In [9]:
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [10]:
print(model)

BiLSTM_CRF(
  (word_embeds): Embedding(16588, 64)
  (lstm): LSTM(64, 128, bidirectional=True)
  (hidden2tag): Linear(in_features=256, out_features=7, bias=True)
)


In [11]:
# Check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
    print(model(precheck_sent))

(tensor(6.4928), [1, 4, 1, 4, 1, 0, 2])


In [12]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.
    
    for i, data in enumerate(training_data):
        sentence, tags = data
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        disp_step = len(training_data) / 5
        if i % disp_step == disp_step - 1:
            last_loss = running_loss / disp_step # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_data) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
            
    return last_loss
    

In [13]:
# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

best_vloss = 1_000_000. 

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer)

    # We don't need gradients on to do reporting
    model.train(False)

    running_vloss = 0.0
    
    
    for i, vdata in enumerate(valid_data):
        sentence, tags = vdata

        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        vloss = model.neg_log_likelihood(sentence_in, targets)
        running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
  batch 800 loss: 5.184063190817833
  batch 1600 loss: 2.673265378475189
  batch 2400 loss: 2.3263999181985855
  batch 3200 loss: 2.1676559525728227
  batch 4000 loss: 2.0790936303138734
LOSS train 2.0790936303138734 valid tensor([2.1796], grad_fn=<DivBackward0>)
EPOCH 2:
  batch 800 loss: 1.9767471635341645
  batch 1600 loss: 1.814931811094284
  batch 2400 loss: 1.6406215167045592
  batch 3200 loss: 1.5203109323978423
  batch 4000 loss: 1.517509219646454
LOSS train 1.517509219646454 valid tensor([1.8025], grad_fn=<DivBackward0>)
EPOCH 3:
  batch 800 loss: 1.5359493625164031
  batch 1600 loss: 1.362930200099945
  batch 2400 loss: 1.2150639700889587
  batch 3200 loss: 1.2055588138103486
  batch 4000 loss: 1.2074001348018646
LOSS train 1.2074001348018646 valid tensor([1.3450], grad_fn=<DivBackward0>)
EPOCH 4:
  batch 800 loss: 1.2223499071598054
  batch 1600 loss: 1.0354771995544434
  batch 2400 loss: 1.020994462966919
  batch 3200 loss: 0.9704701733589173
  batch 4000 loss: 0.9

KeyboardInterrupt: 

In [15]:
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
model.load_state_dict(torch.load("model_20230401_104345_4"))

<All keys matched successfully>

In [17]:
ix_to_tag = {v:k for k,v in tag_to_ix.items()}

# Check predictions after training
with torch.no_grad():
    for i in random.sample(range(len(valid_data)),5):
        precheck_sent = prepare_sequence(valid_data[i][0], word_to_ix)
        pred = model(precheck_sent)

        print(" ".join(valid_data[i][0]))
        print("target: " ," ".join(valid_data[i][1]))
        print("predic: " ," ".join([ix_to_tag[p] for p in pred[1]]))
        print("******")

BE87303905806726 Verlinden CommV Ivanweg 082 5259 Goutroux
target:  IBAN ORG ORG ADDRESS ADDRESS ADDRESS ADDRESS
predic:  IBAN ORG ORG ADDRESS ADDRESS ADDRESS ADDRESS
******
RU19AOPW5554876779062 Игнатьева Виктория Матвеевна ст. Русса, пр. д. 197 65, 324872 RU
target:  IBAN NAME NAME NAME ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS COUNTRY
predic:  IBAN NAME NAME NAME ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS COUNTRY
******
RO78LBKY0572183817677456 Ababei Dumitrescu INC Soseaua Oprea Nr. 593 Sancraiu de Mures, 765599 RO
target:  IBAN ORG ORG ORG ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS COUNTRY
predic:  IBAN ORG ORG ORG ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS ADDRESS COUNTRY
******
RO27INNV1083166516774525 Adriana Popescu Strada Stancu Buhusi, 525239 RO
target:  IBAN NAME NAME ADDRESS ADDRESS ADDRESS ADDRESS COUNTRY
predic:  IBAN ORG ORG ADDRESS ADDRESS ADDRESS ADDRESS COUNTRY
******
FR9287520701483200950568562 Faure 750, 

In [27]:
targets = []
preds = []
with torch.no_grad():
    for data in valid_data[:100]:
        precheck_sent = prepare_sequence(data[0], word_to_ix)
        preds += [ix_to_tag[p] for p in model(precheck_sent)[1]]
        targets += data[1]

In [39]:
import pandas as pd
df = pd.DataFrame({'preds':preds, 'targets':targets})
df["success"] = pd.to_numeric(df.preds == df.targets)
df.groupby('preds').mean()

  df.groupby('preds').mean()


Unnamed: 0_level_0,success
preds,Unnamed: 1_level_1
ADDRESS,0.966608
COUNTRY,1.0
IBAN,0.871795
NAME,0.858824
ORG,0.81295
