# Entity extraction using Fasttext and LSTM

## Import everything important

In [69]:
import joblib
import torch
import torch.nn as nn
import transformers
import nltk

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import model_selection

from gensim.models import FastText as ft
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from nltk.tokenize import word_tokenize

from tqdm import tqdm
from sklearn.model_selection import train_test_split

## Some config

In [70]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 128
VALID_BATCH_SIZE = 128
EPOCHS = 5
EMBED_DIM = 300

MODEL_PATH = "./state_dict.pt"
TRAINING_FILE = './ner_dataset.csv'

In [71]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [72]:
fasttext_model = ft.load_fasttext_format("cc.en.300.bin")

  fasttext_model = ft.load_fasttext_format("cc.en.300.bin")


## Dataset

In [73]:
class EntityDataset:
    def __init__(self, texts, tags, max_len):
        self.texts = texts
        self.tags = tags
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        tags = np.array(self.tags[item])

        # Усеките текст до указанного значения max_len
        if len(text) < self.max_len:
            text = text + [''] * (self.max_len - len(text))
        else:
            text = text[:self.max_len]

        # Преобразуйте текст в векторы слов с использованием вашей FastText модели
        ids = [fasttext_model.wv.get_vector(s) for s in text]

        # Создайте маску, чтобы определить, какие элементы являются реальными словами (1), а какие дополнены (0)
        mask = [1] * len(text) + [0] * (self.max_len - len(text))

        # Дополните или усечите теги до указанного max_len
        if len(tags) < self.max_len:
            tags = np.pad(tags, (0, self.max_len - len(tags)), mode='constant', constant_values=0)
        else:
            tags = tags[:self.max_len]

        return (torch.tensor(ids, dtype=torch.float32),
                torch.tensor(tags, dtype=torch.long),
                torch.tensor(mask, dtype=torch.long))
    


## Training and evaluation functions

In [74]:
def loss_fn(output, target, mask, num_labels):
    lfn = nn.CrossEntropyLoss()
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, num_labels)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )
    loss = lfn(active_logits, active_labels)
    return loss

In [75]:
def acc_stat(pred, target, mask):
    mask = mask.bool()
    pred = torch.masked_select(pred, mask)
    target = torch.masked_select(target, mask)
    # сколько элементов угадано корректно
    # correct = (pred == target).sum().item()
    correct = torch.tensor(torch.eq(pred, target).sum().item(),dtype=torch.float32)
    # сколько элементов было всего, не считая "пустых" с нулями
    # total = mask.sum().item()
    total = torch.tensor(len(pred), dtype=torch.float32)
    return correct, total

пример того как должно работать

In [76]:
acc_stat(torch.tensor([1,2,3,4,0,0,0,0]), torch.tensor([1,2,3,4,5,5,5,5]), torch.tensor([1,1,1,1,0,0,0,0]))

acc_stat(torch.tensor([1,2,3,4,0,0,0,0]), torch.tensor([1,2,3,4,5,5,5,5]), torch.tensor([0,0,0,0,1,1,1,1]))

(tensor(0.), tensor(4.))

## Loss function and model

In [77]:
class EntityModel(nn.Module):
    def __init__(self, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5, bidirectional=False):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob,
                            batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
    
    def forward(
        self, 
        embeds,
        hidden
    ):
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # Пропустим через дропаут
        out = self.dropout(lstm_out)
        # Проходите через линейный слой
        out = self.fc(out)
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        num_directions = 2 if self.lstm.bidirectional else 1
        print("num_directions", num_directions)
        h_zeros = torch.zeros(self.n_layers * num_directions,
                              batch_size, self.hidden_dim,
                              dtype=torch.float32, device=device)
        c_zeros = torch.zeros(self.n_layers * num_directions,
                              batch_size, self.hidden_dim,
                              dtype=torch.float32, device=device)

        return (h_zeros, c_zeros)

## Data processing

In [78]:
def process_data(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")

    enc_tag = preprocessing.LabelEncoder()

    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])

    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tag = df.groupby("Sentence #")["Tag"].apply(list).values
    return sentences, tag, enc_tag

## Training

In [79]:
sentences, tag, enc_tag = process_data(TRAINING_FILE)

  df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")


In [80]:
meta_data = {
    "enc_tag": enc_tag
}

joblib.dump(meta_data, "meta.bin")

num_tag = len(list(enc_tag.classes_))

# делим на трейн и тест с помощью train_test_split
(
    train_sentences,
    test_sentences,
    train_tag,
    test_tag
) = train_test_split(sentences, tag, test_size=0.2, random_state=42)

In [81]:
train_dataset = EntityDataset(
    texts=train_sentences, tags=train_tag, max_len=MAX_LEN
)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=0,
    shuffle=True, drop_last=True
)

valid_dataset = EntityDataset(
    texts=test_sentences, tags=test_tag, max_len=MAX_LEN
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=0,
    shuffle=False, drop_last=True
)

In [82]:
def eval_model(model, valid_data_loader):
    h = model.init_hidden(VALID_BATCH_SIZE)
    losses = []
    
    correct_sum, total_sum = 0, 0
    
    for inputs, labels, mask in valid_data_loader:
        h = tuple([each.data for each in h])
        # отправим inputs, labels и mask на GPU
        inputs = inputs.to(device)
        labels = labels.to(device)
        mask = mask.to(device)

        model.zero_grad()
        output, h = model(inputs, h)
        loss = loss_fn(output, labels.flatten(), mask, num_tag)
        losses.append(loss.item())
        
        correct, total = acc_stat(torch.argmax(output, dim=-1).flatten(), labels.flatten(), mask.flatten())
        correct_sum += correct
        total_sum += total
    return losses, correct_sum / total_sum

In [83]:
hidden_dim = 512
n_layers = 2

model = EntityModel(num_tag, EMBED_DIM, hidden_dim, n_layers, drop_prob=0.5, bidirectional=False)
model.to(device)

lr=0.005
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [86]:
from tqdm import tqdm

counter = 0
print_every = 100
clip = 5
valid_loss_min = np.Inf
writer = SummaryWriter('logs')

num_labels = len(list(enc_tag.classes_))

model.train()
for i in range(EPOCHS):
    h = model.init_hidden(TRAIN_BATCH_SIZE)
    
    correct_sum, total_sum = 0, 0
    
    for inputs, labels, mask in tqdm(train_data_loader):
        counter += 1
        h = tuple([e.data for e in h])

        inputs = inputs.to(device)
        labels = labels.to(device)
        mask = mask.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = loss_fn(output, labels.flatten(), mask, num_tag) # вызываем функцию для подсчета лосса
        loss.backward() # и делаем обратное распространение ошибки
        correct, total = acc_stat(torch.argmax(output, dim=-1).flatten(), labels.flatten(), mask.flatten())
        correct_sum += correct
        total_sum += total

        nn.utils.clip_grad_norm_(model.parameters(), clip)
        # градиентный спуск
        optimizer.step()
        
        if counter % print_every == 0:
            model.eval()
            val_losses, val_acc = eval_model(model, valid_data_loader)
            model.train()
            
            val_loss = np.mean(val_losses)
            writer.add_scalar('train/loss', loss.item(), counter)
            writer.add_scalar('val/loss', val_loss, counter)
            writer.add_scalar('train/acc', correct_sum / total_sum, counter)
            writer.add_scalar('val/acc', val_acc, counter)

            print("Epoch: {}/{}...".format(i+1, EPOCHS),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(val_loss),
                  "Train Acc: {:.6f}".format(correct_sum / total_sum),
                  "Val Acc: {:.6f}".format(val_acc))
                
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), MODEL_PATH)
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

num_directions 1


 33%|███▎      | 99/299 [01:30<02:58,  1.12it/s]

num_directions 1


 33%|███▎      | 100/299 [02:35<1:06:31, 20.06s/it]

Epoch: 1/5... Step: 100... Loss: 0.547491... Val Loss: 0.476116 Train Acc: 0.819194 Val Acc: 0.829198
Validation loss decreased (inf --> 0.476116).  Saving model ...


 67%|██████▋   | 199/299 [04:04<01:29,  1.12it/s]  

num_directions 1


 67%|██████▋   | 200/299 [05:09<33:07, 20.07s/it]

Epoch: 1/5... Step: 200... Loss: 0.147518... Val Loss: 0.142635 Train Acc: 0.884854 Val Acc: 0.972353
Validation loss decreased (0.476116 --> 0.142635).  Saving model ...


100%|██████████| 299/299 [06:39<00:00,  1.34s/it]


num_directions 1


  0%|          | 0/299 [00:00<?, ?it/s]

num_directions 1


  0%|          | 0/299 [00:27<?, ?it/s]


KeyboardInterrupt: 

## Inference

In [None]:
meta_data = joblib.load("meta.bin")
enc_tag = meta_data["enc_tag"]

num_tag = len(list(enc_tag.classes_))

text = """
Natasha is traveling to New York
"""

device = torch.device("cuda")
model.to(device)

# так как это инференс, выключаем расчет градиентов:
inputs = torch.tensor([fasttext_model.wv[s] for s in word_tokenize(text)], dtype=torch.float32)
inputs = inputs.unsqueeze(0).to(device)
h = model.init_hidden(1)
tag, h = model(inputs, h)

print(
    enc_tag.inverse_transform(
        tag.argmax(-1).cpu().numpy().reshape(-1)
    )
)