# Entity extraction using Fasttext and LSTM

## Import everything important

In [1]:
import joblib
import torch
import torch.nn as nn
import transformers
import nltk

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import model_selection

from gensim.models import fasttext as ft
# from gensim.models import FastText as ft
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from nltk.tokenize import word_tokenize

from tqdm import tqdm
from sklearn.model_selection import train_test_split

## Some config

In [2]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 128
VALID_BATCH_SIZE = 128
EPOCHS = 5
EMBED_DIM = 300

MODEL_PATH = "./state_dict.pt"
# TRAINING_FILE = './ner_dataset.csv'
TRAINING_FILE = './dataset/ner_dataset.csv'

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [4]:
fasttext_model = ft.load_facebook_vectors("cc.en.300.bin")
# fasttext_model = ft.load_fasttext_format("cc.en.300.bin")

## Dataset

In [62]:
class EntityDataset:
    def __init__(self, texts, tags, max_len):
        self.texts = texts
        self.tags = tags
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        tags = np.array(self.tags[item])

        # while np.nan in text:
        #     text[text.index(np.nan)] = ""

        # Усеките текст до указанного значения max_len
        if len(text) < self.max_len:
            text = text + [''] * (self.max_len - len(text))
        else:
            text = text[:self.max_len]

        # Преобразуйте текст в векторы слов с использованием вашей FastText модели
        ids = [fasttext_model[s] for s in text]
        # ids = [fasttext_model.wv.get_vector(s) for s in text]

        # Создайте маску, чтобы определить, какие элементы являются реальными словами (1), а какие дополнены (0)
        mask = [1] * len(text) + [0] * (self.max_len - len(text))

        # Дополните или усечите теги до указанного max_len
        if len(tags) < self.max_len:
            tags = np.pad(tags, (0, self.max_len - len(tags)), mode='constant', constant_values=0)
        else:
            tags = tags[:self.max_len]

        return (torch.tensor(ids, dtype=torch.float32),
                torch.tensor(tags, dtype=torch.long),
                torch.tensor(mask, dtype=torch.long))
    


## Training and evaluation functions

In [63]:
def loss_fn(output, target, mask, num_labels):
    lfn = nn.CrossEntropyLoss()
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, num_labels)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )
    loss = lfn(active_logits, active_labels)
    return loss

In [64]:
def acc_stat(pred, target, mask):
    mask = mask.bool()
    pred = torch.masked_select(pred, mask)
    target = torch.masked_select(target, mask)
    # сколько элементов угадано корректно
    # correct = (pred == target).sum().item()
    correct = torch.tensor(torch.eq(pred, target).sum().item(),dtype=torch.float32)
    # сколько элементов было всего, не считая "пустых" с нулями
    # total = mask.sum().item()
    total = torch.tensor(len(pred), dtype=torch.float32)
    return correct, total

пример того как должно работать

In [65]:
acc_stat(torch.tensor([1,2,3,4,0,0,0,0]), torch.tensor([1,2,3,4,5,5,5,5]), torch.tensor([1,1,1,1,0,0,0,0]))

acc_stat(torch.tensor([1,2,3,4,0,0,0,0]), torch.tensor([1,2,3,4,5,5,5,5]), torch.tensor([0,0,0,0,1,1,1,1]))

(tensor(0.), tensor(4.))

## Loss function and model

In [66]:
class EntityModel(nn.Module):
    def __init__(self, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5, bidirectional=False):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob,
                            batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
    
    def forward(
        self, 
        embeds,
        hidden
    ):
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # Пропустим через дропаут
        out = self.dropout(lstm_out)
        # Проходите через линейный слой
        out = self.fc(out)
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        num_directions = 2 if self.lstm.bidirectional else 1
        print("num_directions", num_directions)
        h_zeros = torch.zeros(self.n_layers * num_directions,
                              batch_size, self.hidden_dim,
                              dtype=torch.float32, device=device)
        c_zeros = torch.zeros(self.n_layers * num_directions,
                              batch_size, self.hidden_dim,
                              dtype=torch.float32, device=device)

        return (h_zeros, c_zeros)

## Data processing

In [67]:
def process_data(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")

    enc_tag = preprocessing.LabelEncoder()

    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])

    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tag = df.groupby("Sentence #")["Tag"].apply(list).values
    return sentences, tag, enc_tag

## Training

In [68]:
sentences, tag, enc_tag = process_data(TRAINING_FILE)

  df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")


In [69]:
meta_data = {
    "enc_tag": enc_tag
}

joblib.dump(meta_data, "meta.bin")

num_tag = len(list(enc_tag.classes_))

# делим на трейн и тест с помощью train_test_split
(
    train_sentences,
    test_sentences,
    train_tag,
    test_tag
) = train_test_split(sentences, tag, test_size=0.2, random_state=42)

In [70]:
train_dataset = EntityDataset(
    texts=train_sentences, tags=train_tag, max_len=MAX_LEN
)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=0,
    shuffle=True, drop_last=True
)

valid_dataset = EntityDataset(
    texts=test_sentences, tags=test_tag, max_len=MAX_LEN
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=0,
    shuffle=False, drop_last=True
)

In [71]:
def eval_model(model, valid_data_loader):
    h = model.init_hidden(VALID_BATCH_SIZE)
    losses = []
    
    correct_sum, total_sum = 0, 0
    
    for inputs, labels, mask in valid_data_loader:
        h = tuple([each.data for each in h])
        # отправим inputs, labels и mask на GPU
        inputs = inputs.to(device)
        labels = labels.to(device)
        mask = mask.to(device)

        model.zero_grad()
        output, h = model(inputs, h)
        loss = loss_fn(output, labels.flatten(), mask, num_tag)
        losses.append(loss.item())
        
        correct, total = acc_stat(torch.argmax(output, dim=-1).flatten(), labels.flatten(), mask.flatten())
        correct_sum += correct
        total_sum += total
    return losses, correct_sum / total_sum

In [72]:
hidden_dim = 512
n_layers = 2

model = EntityModel(num_tag, EMBED_DIM, hidden_dim, n_layers, drop_prob=0.5, bidirectional=False)
model.to(device)

lr=0.005
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [73]:
from tqdm import tqdm

counter = 0
print_every = 100
clip = 5
valid_loss_min = np.Inf
writer = SummaryWriter('logs')

num_labels = len(list(enc_tag.classes_))

model.train()
for i in range(EPOCHS):
    h = model.init_hidden(TRAIN_BATCH_SIZE)
    
    correct_sum, total_sum = 0, 0
    
    for inputs, labels, mask in tqdm(train_data_loader):
        counter += 1
        h = tuple([e.data for e in h])

        inputs = inputs.to(device)
        labels = labels.to(device)
        mask = mask.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = loss_fn(output, labels.flatten(), mask, num_tag) # вызываем функцию для подсчета лосса
        loss.backward() # и делаем обратное распространение ошибки
        correct, total = acc_stat(torch.argmax(output, dim=-1).flatten(), labels.flatten(), mask.flatten())# вызываем функцию acc_stat
        correct_sum += correct
        total_sum += total

        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        # градиентный спуск
        
        if counter % print_every == 0:
            model.eval()
            val_losses, val_acc = eval_model(model, valid_data_loader)
            model.train()
            
            val_loss = np.mean(val_losses)
            writer.add_scalar('train/loss', loss.item(), counter)
            writer.add_scalar('val/loss', val_loss, counter)
            writer.add_scalar('train/acc', correct_sum / total_sum, counter)
            writer.add_scalar('val/acc', val_acc, counter)

            print("Epoch: {}/{}...".format(i+1, EPOCHS),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(val_loss),
                  "Train Acc: {:.6f}".format(correct_sum / total_sum),
                  "Val Acc: {:.6f}".format(val_acc))
                
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), MODEL_PATH)
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

num_directions 1


 32%|███▏      | 97/299 [00:04<00:08, 24.48it/s]

num_directions 1


 34%|███▍      | 103/299 [00:06<00:31,  6.13it/s]

Epoch: 1/5... Step: 100... Loss: 0.739289... Val Loss: 0.762800 Train Acc: 0.839555 Val Acc: 0.846498
Validation loss decreased (inf --> 0.762800).  Saving model ...


 67%|██████▋   | 199/299 [00:10<00:04, 24.71it/s]

num_directions 1


 68%|██████▊   | 202/299 [00:11<00:20,  4.62it/s]

Epoch: 1/5... Step: 200... Loss: 0.543601... Val Loss: 0.491143 Train Acc: 0.843570 Val Acc: 0.846498
Validation loss decreased (0.762800 --> 0.491143).  Saving model ...


100%|██████████| 299/299 [00:15<00:00, 18.88it/s]


num_directions 1


  0%|          | 0/299 [00:00<?, ?it/s]

num_directions 1


  1%|▏         | 4/299 [00:01<01:50,  2.68it/s]

Epoch: 2/5... Step: 300... Loss: 0.472838... Val Loss: 0.344045 Train Acc: 0.867615 Val Acc: 0.886777
Validation loss decreased (0.491143 --> 0.344045).  Saving model ...


 33%|███▎      | 100/299 [00:05<00:07, 25.31it/s]

num_directions 1


 34%|███▍      | 103/299 [00:07<00:42,  4.60it/s]

Epoch: 2/5... Step: 400... Loss: 0.220300... Val Loss: 0.222602 Train Acc: 0.912544 Val Acc: 0.938545
Validation loss decreased (0.344045 --> 0.222602).  Saving model ...


 67%|██████▋   | 199/299 [00:11<00:04, 24.81it/s]

num_directions 1


 69%|██████▊   | 205/299 [00:13<00:15,  5.97it/s]

Epoch: 2/5... Step: 500... Loss: 0.167140... Val Loss: 0.172367 Train Acc: 0.926581 Val Acc: 0.951215
Validation loss decreased (0.222602 --> 0.172367).  Saving model ...


100%|██████████| 299/299 [00:17<00:00, 17.22it/s]


num_directions 1


  0%|          | 0/299 [00:00<?, ?it/s]

num_directions 1


  2%|▏         | 5/299 [00:01<01:35,  3.09it/s]

Epoch: 3/5... Step: 600... Loss: 0.187312... Val Loss: 0.154255 Train Acc: 0.945888 Val Acc: 0.955231
Validation loss decreased (0.172367 --> 0.154255).  Saving model ...


 34%|███▍      | 101/299 [00:05<00:08, 24.47it/s]

num_directions 1


 35%|███▍      | 104/299 [00:07<00:42,  4.61it/s]

Epoch: 3/5... Step: 700... Loss: 0.158535... Val Loss: 0.150970 Train Acc: 0.953426 Val Acc: 0.955989
Validation loss decreased (0.154255 --> 0.150970).  Saving model ...


 67%|██████▋   | 200/299 [00:11<00:04, 24.71it/s]

num_directions 1


 69%|██████▉   | 206/299 [00:13<00:15,  6.06it/s]

Epoch: 3/5... Step: 800... Loss: 0.132532... Val Loss: 0.138388 Train Acc: 0.953841 Val Acc: 0.958571
Validation loss decreased (0.150970 --> 0.138388).  Saving model ...


100%|██████████| 299/299 [00:17<00:00, 17.03it/s]


num_directions 1


  0%|          | 0/299 [00:00<?, ?it/s]

num_directions 1


  2%|▏         | 6/299 [00:02<01:25,  3.43it/s]

Epoch: 4/5... Step: 900... Loss: 0.121712... Val Loss: 0.135167 Train Acc: 0.953327 Val Acc: 0.959252
Validation loss decreased (0.138388 --> 0.135167).  Saving model ...


 34%|███▍      | 102/299 [00:06<00:08, 23.33it/s]

num_directions 1


 35%|███▌      | 105/299 [00:08<00:44,  4.40it/s]

Epoch: 4/5... Step: 1000... Loss: 0.139385... Val Loss: 0.129985 Train Acc: 0.957616 Val Acc: 0.960126
Validation loss decreased (0.135167 --> 0.129985).  Saving model ...


 67%|██████▋   | 201/299 [00:12<00:04, 24.07it/s]

num_directions 1


 69%|██████▉   | 207/299 [00:14<00:15,  5.90it/s]

Epoch: 4/5... Step: 1100... Loss: 0.147673... Val Loss: 0.128872 Train Acc: 0.957413 Val Acc: 0.959986
Validation loss decreased (0.129985 --> 0.128872).  Saving model ...


100%|██████████| 299/299 [00:17<00:00, 16.69it/s]


num_directions 1


  1%|          | 3/299 [00:00<00:12, 24.59it/s]

num_directions 1


  2%|▏         | 6/299 [00:02<01:57,  2.49it/s]

Epoch: 5/5... Step: 1200... Loss: 0.124096... Val Loss: 0.127205 Train Acc: 0.959361 Val Acc: 0.960560
Validation loss decreased (0.128872 --> 0.127205).  Saving model ...


 34%|███▍      | 102/299 [00:06<00:08, 23.52it/s]

num_directions 1


 35%|███▌      | 105/299 [00:08<00:48,  4.00it/s]

Epoch: 5/5... Step: 1300... Loss: 0.121800... Val Loss: 0.125027 Train Acc: 0.959626 Val Acc: 0.960348
Validation loss decreased (0.127205 --> 0.125027).  Saving model ...


 67%|██████▋   | 201/299 [00:12<00:03, 24.89it/s]

num_directions 1


 69%|██████▉   | 207/299 [00:14<00:15,  5.93it/s]

Epoch: 5/5... Step: 1400... Loss: 0.144627... Val Loss: 0.121615 Train Acc: 0.959518 Val Acc: 0.961748
Validation loss decreased (0.125027 --> 0.121615).  Saving model ...


100%|██████████| 299/299 [00:18<00:00, 16.42it/s]


## Inference

In [74]:
meta_data = joblib.load("meta.bin")
enc_tag = meta_data["enc_tag"]

num_tag = len(list(enc_tag.classes_))

text = """
Natasha is traveling to New York
"""

device = torch.device("cuda")
model.to(device)

# так как это инференс, выключаем расчет градиентов:
inputs = torch.tensor([fasttext_model.wv[s] for s in word_tokenize(text)], dtype=torch.float32)
inputs = inputs.unsqueeze(0).to(device)
h = model.init_hidden(1)
tag, h = model(inputs, h)

print(
    enc_tag.inverse_transform(
        tag.argmax(-1).cpu().numpy().reshape(-1)
    )
)

num_directions 1
['O' 'O' 'O' 'O' 'B-geo' 'I-geo']
