In [11]:
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
from torch.utils.data import random_split
from sklearn.metrics import confusion_matrix

from BiDirectionalLSTM import NerDataSet, NerNN, padded_data_loader

## Data load and prep

In [3]:
df = pd.read_csv('https://dl.dropboxusercontent.com/s/tlijezgr8tnpeym/ner_dataset.csv?dl=0', 
                 header=0, 
                 encoding='latin')

df['Sentence #'].fillna(method='ffill', inplace=True)
grouped = df.groupby(by='Sentence #').agg(lambda x: list(x))

sentences = grouped['Word'].values
tags = grouped['Tag'].values

## Torch data prep

In [5]:
workers = 0

ds = NerDataSet(sentences, tags)
train_set, test_set = random_split(ds, [40000, 7959], generator=torch.Generator().manual_seed(42))
train_loader = padded_data_loader(data=train_set, workers=workers, batch_size=64)
test_loader = padded_data_loader(data=test_set, workers=workers, batch_size=64)

## Model Init and train

In [26]:
model = NerNN(len(ds.vocab), len(ds.tag_vocab), num_layers=1, dropout=0)
logger = pl.loggers.TensorBoardLogger('tb_logs', name='ner_lstm_logs')

In [27]:
trainer = pl.Trainer(max_epochs=20, min_epochs=5 , logger=logger, gpus=1)
trainer.fit(model, train_loader, test_loader)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.1 M 
1 | lstm      | LSTM      | 790 K 
2 | linear    | Linear    | 9.2 K 
3 | f1_metric | F1        | 0     
----------------------------------------
4.9 M     Trainable params
0         Non-trainable params
4.9 M     Total params


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

## Evaluation

In [28]:
labels = list(range(1,18))
model.eval()
conf_matrix = np.zeros((len(ds.tag_vocab)-1, len(ds.tag_vocab)-1))
for  sentences, tags in test_loader:
    tags = tags.flatten()
    tag_mask = tags != 0
    outputs = model(sentences)
    predicted = outputs.argmax(2)
    tags = tags[tag_mask]
    predicted = predicted.flatten()[tag_mask]
    conf_matrix += confusion_matrix(tags.numpy(), predicted.numpy(), labels=labels)
tp = np.diagonal(conf_matrix)
prec = tp / conf_matrix.sum(axis=0)
rec = tp / conf_matrix.sum(axis=1)
mask = np.logical_and(prec == 0, rec == 0)
f1 = 2 * (prec * rec /(prec + rec))
f1[mask] = 0

  prec = tp / conf_matrix.sum(axis=0)


In [29]:
labels = list(ds.tag_vocab.keys())[1:]
report = pd.DataFrame.from_dict({'labels': labels, 'recall': rec, 'precision': prec, 'f1': f1})
report.set_index('labels')

Unnamed: 0_level_0,recall,precision,f1
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B-nat,0.222222,0.615385,0.326531
I-tim,0.694767,0.812005,0.748825
B-art,0.0,,
B-geo,0.858993,0.827253,0.842824
I-per,0.817342,0.829128,0.823193
B-tim,0.851874,0.909578,0.879781
B-eve,0.194444,0.7,0.304348
I-org,0.556798,0.695182,0.618342
I-gpe,0.533333,1.0,0.695652
B-gpe,0.902128,0.937676,0.919558


In [30]:
f1.mean(), rec.mean(), prec.mean()

(nan, 0.5187155365896672, nan)