In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report

In [None]:
# Load the NER corpus from the remote CSV. The first row supplies the column names
# and the bytes are decoded as Latin-1 ('latin' is an alias of that codec).
df = pd.read_csv('https://dl.dropboxusercontent.com/s/tlijezgr8tnpeym/ner_dataset.csv?dl=0', 
                 header=0, 
                 encoding='latin')

In [None]:
# Forward-fill the sentence id so that every row carries the id of the sentence it
# belongs to (presumably only the first token of a sentence has it in the raw file —
# TODO confirm against the CSV).
# The result is assigned back instead of calling `fillna(..., inplace=True)` on the
# selected column: that chained in-place form is deprecated and leaves `df` untouched
# under pandas copy-on-write. `Series.ffill()` replaces the deprecated
# `fillna(method='ffill')`.
df['Sentence #'] = df['Sentence #'].ffill()

In [None]:
# Peek at five randomly chosen rows (no random_state is passed, so they differ between runs).
df.sample(5)

## EDA

In [None]:
# How many rows carry each distinct value of the Tag column.
df['Tag'].value_counts()

In [None]:
# Horizontal bar chart of the number of rows per tag, drawn on an explicit 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(y=df['Tag'], ax=ax);

In [None]:
# Number of distinct sentence ids, i.e. how many sentences the corpus contains.
num_sen = df['Sentence #'].nunique()
print(f'Number of sentences = {num_sen}')

In [None]:
# Rows per sentence id (one row per token, judging by the Word/Tag columns).
# The counts are computed once and reused; the original recomputed value_counts()
# for each of the three statistics.
tokens_per_sentence = df['Sentence #'].value_counts()
mean_words = tokens_per_sentence.mean()
min_words = tokens_per_sentence.min()
max_words = tokens_per_sentence.max()
# Printed labels: typo "tokems" corrected to "tokens".
print('Mean number of tokens in sentence= {:.3f}'.format(mean_words))
print('Min number of tokens in sentence= {}'.format(min_words))
print('Max number of tokens in sentence= {}'.format(max_words))

In [None]:
from collections import Counter

# Occurrence count of every distinct value in the Word column (values are counted as-is,
# so differently cased forms are separate entries).
cnt_words = Counter(df['Word'])

In [None]:
# Number of distinct word forms in the whole corpus (label typo "voxcabulary" fixed).
print('The size of vocabulary = {}'.format(len(cnt_words)))

## Data Prep

In [None]:
# Collapse the token-level frame into one row per sentence: each remaining column
# becomes the list of that sentence's values, in row order.
# The builtin `list` is passed directly; the `lambda x: list(x)` wrapper was redundant.
# NOTE(review): groupby sorts the group keys by default, so the rows follow the sorted
# order of the 'Sentence #' values rather than file order — the positional
# train/test split in the next cells depends on this ordering.
grouped = df.groupby(by='Sentence #').agg(list)

In [None]:
# Hold out the last 5000 sentences (in `grouped` row order) for testing.
# Columns are selected by name rather than by position ([0, 2]) so the cell cannot
# silently pick the wrong columns if the CSV column order changes; the cells that
# consume this frame read exactly 'Word' and 'Tag'.
test_dataset = grouped[['Word', 'Tag']].iloc[-5000:]

In [None]:
# Everything except the last 5000 sentences (in `grouped` row order) is used for training.
# Columns are selected by name rather than by position ([0, 2]) so the cell cannot
# silently pick the wrong columns if the CSV column order changes; the cells that
# consume this frame read exactly 'Word' and 'Tag'.
train_dataset = grouped[['Word', 'Tag']].iloc[:-5000]

In [None]:
# Vocabulary of the training split: flatten the per-sentence word lists, then let
# np.unique deduplicate and sort them before converting back to a plain Python list.
train_words = [token for sentence in train_dataset['Word'].values for token in sentence]
vocab = np.unique(np.array(train_words)).tolist()

In [None]:
# Distinct tags of the full corpus (train and test rows alike), in order of first appearance.
states = df['Tag'].unique().tolist()

In [None]:
# Number of distinct tags, i.e. the length of the state list handed to the HMM tagger below.
len(states)

## HMM

In [None]:
# One list per training sentence, built by pairing the sentence's 'Tag' list with its
# 'Word' list, so every tuple is (value from 'Tag', value from 'Word').
# NOTE(review): this is the order the original comprehension actually produced even
# though its loop variables were named the other way round — confirm that
# HMMTagger.fit expects (tag, word) pairs and not (word, tag).
train_hmm = [
    list(zip(sentence_tags, sentence_words))
    for sentence_words, sentence_tags in zip(train_dataset['Word'], train_dataset['Tag'])
]

In [None]:
# Word lists of the held-out sentences as a plain list of lists (no tags attached).
test_hmm = test_dataset['Word'].tolist()

In [None]:
from HMM import HMMTagger

In [None]:
# Instantiate the project's HMMTagger with the tag list (states) and the training-split vocabulary.
tagger = HMMTagger(states, vocab)

In [None]:
# Fit the tagger on the per-sentence training pairs built above.
tagger.fit(train_hmm)

In [None]:
# Tag the held-out sentences; the result is consumed below as one tag sequence per sentence.
predict = tagger.predict(test_hmm)

In [None]:
# Flatten the gold tags of the held-out sentences into one token-level list.
y_true = []
for sentence_tags in test_dataset['Tag']:
    y_true.extend(sentence_tags)

In [None]:
# Flatten the per-sentence HMM predictions into one token-level list, aligned with y_true.
prediction = []
for predicted_tags in predict:
    prediction.extend(predicted_tags)

In [None]:

# Per-tag precision / recall / F1 of the HMM predictions against the gold tags.
print(classification_report(y_true, prediction))

## CRF

In [None]:
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [None]:
# One list of (word, POS, tag) triples per sentence, built over the whole corpus.
crf_corpus = [
    list(zip(words, pos_tags, ner_tags))
    for words, pos_tags, ner_tags in zip(grouped['Word'], grouped['POS'], grouped['Tag'])
]

In [None]:
# Inspect the triples of the first sentence as a sanity check.
crf_corpus[0]

In [None]:
def word2features(sent, i):
    """Return the CRF feature dictionary for the token at index ``i`` of ``sent``.

    Args:
        sent: Sequence of token records; element 0 of a record is the word and
            element 1 its POS tag (any further elements are ignored here).
        i: Index of the token to describe.

    Returns:
        dict: A bias term, the lower-cased form, the 3- and 2-character suffixes,
        casing/digit flags and POS features of the token itself, followed by
        lower-cased form, casing flags and POS features of the previous (``-1:``)
        and next (``+1:``) token. ``BOS`` / ``EOS`` is set instead when there is no
        previous / next token.
    """
    def neighbour_features(prefix, record):
        # Lexical and POS features of an adjacent token, namespaced by `prefix`.
        other_word, other_pos = record[0], record[1]
        return {
            prefix + 'word.lower()': other_word.lower(),
            prefix + 'word.istitle()': other_word.istitle(),
            prefix + 'word.isupper()': other_word.isupper(),
            prefix + 'postag': other_pos,
            prefix + 'postag[:2]': other_pos[:2],
        }

    token, pos = sent[i][0], sent[i][1]

    # Features of the token itself.
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': pos,
        'postag[:2]': pos[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    features.update(neighbour_features('-1:', sent[i - 1]) if i > 0 else {'BOS': True})

    # Right context, or the end-of-sentence marker.
    features.update(neighbour_features('+1:', sent[i + 1]) if i < len(sent) - 1 else {'EOS': True})

    return features


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third element) of every (token, postag, label) triple in ``sent``."""
    labels_out = []
    for _token, _postag, label in sent:
        labels_out.append(label)
    return labels_out

def sent2tokens(sent):
    """Return the token (first element) of every (token, postag, label) triple in ``sent``."""
    tokens_out = []
    for token, _postag, _label in sent:
        tokens_out.append(token)
    return tokens_out


In [None]:
# Feature dicts (X) and gold tag sequences (y) for every sentence of the corpus.
X = list(map(sent2features, crf_corpus))
y = list(map(sent2labels, crf_corpus))

In [None]:
# CRF trained with L-BFGS, capped at 100 iterations. Per the sklearn-crfsuite API,
# c1 / c2 are the L1 / L2 regularisation coefficients.
crf = CRF(
    algorithm='lbfgs',
    c1=10,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)

In [None]:
# 5-fold cross-validated predictions for every sentence in X.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [None]:
# Classification report of the cross-validated CRF predictions against the gold tag sequences.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

## Bi-Directional LSTM

In [None]:
from BiDirectionalLSTM import *

### Data Prep

In [None]:
# Per-sentence word lists and tag lists as arrays; these feed NerDataSet in the next cell.
sentences = grouped['Word'].values
tags = grouped['Tag'].values

In [None]:
# Wrap the sentences and tags in NerDataSet (presumably provided by the BiDirectionalLSTM
# star import; how it encodes words and tags is not visible here).
ds = NerDataSet(sentences, tags)

In [None]:
# Number of items in the dataset (what random_split has to partition below).
len(ds)

In [None]:
# Training configuration for the BiLSTM section.
learning_rate = 0.01
num_epochs = 2
device = 'cpu'
# Tag ids scored in the confusion matrices. Id 0 is left out because both the loss
# (ignore_index=0) and the evaluation mask (tags != 0) skip it. The range is derived
# from the tag vocabulary instead of the hard-coded 1..17 so that it always matches
# the (len(ds.tag_vocab) - 1)-sized confusion matrices, even if the tag set changes.
labels = list(range(1, len(ds.tag_vocab)))
workers = 8

In [None]:
# `torch` is imported explicitly: this cell calls torch.Generator(), but the notebook's
# own `import torch` only appears in a later cell, so the name was previously available
# here only if the star import happened to leak it.
import torch
from torch.utils.data import random_split

# 40000 items for training, everything that remains for testing. The test size is
# derived from len(ds) because random_split requires the lengths to sum to the dataset
# size; the fixed seed keeps the split reproducible.
n_train = 40000
train_set, test_set = random_split(
    ds,
    [n_train, len(ds) - n_train],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
# Loader over the training split in batches of 32, passing the configured worker count.
# NOTE(review): presumably pads the sentences of a batch to a common length — confirm
# in padded_data_loader.
train_loader = padded_data_loader(data=train_set, workers=workers, batch_size=32)

In [None]:
# Loader over the held-out split, built with the same batch size and worker count as the training loader.
test_loader = padded_data_loader(data=test_set, workers=workers, batch_size=32)

### Model Training & Evaluation

In [None]:
# BiLSTM tagger constructed from the sizes of the word vocabulary and the tag vocabulary.
model = BiLSTM(len(ds.vocab), len(ds.tag_vocab))

In [None]:
import torch.nn.functional as F
import torch 

In [None]:
from sklearn.metrics import confusion_matrix, f1_score

In [None]:
# Train the BiLSTM with Adam and score it on the held-out loader after every epoch.
#
# Changes from the previous version of this cell:
#   * the confusion matrix is reset every epoch, so the reported F1 describes that
#     epoch only instead of accumulating predictions of all earlier epochs;
#   * precision / recall / F1 use guarded divisions, so a tag that is never predicted
#     (or never occurs) scores 0 instead of turning the macro average into NaN;
#   * batches use their own names, so the global `sentences` / `tags` arrays that
#     NerDataSet was built from are no longer overwritten with tensors;
#   * the model is moved to `device`, and tensors are brought back to the CPU before
#     the NumPy conversion, so changing `device` does not break the cell;
#   * both log lines number epochs from 1.
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
num_tags = len(ds.tag_vocab) - 1  # tag id 0 is excluded from scoring (masked below)
total_step = len(train_loader)
scores = []  # macro-averaged F1 on the held-out loader, one entry per epoch
for epoch in range(num_epochs):
    for i, (batch_sentences, batch_tags) in enumerate(train_loader):
        batch_sentences = batch_sentences.to(device)
        batch_tags = batch_tags.to(device)

        # Forward pass: the first two dimensions of the outputs and tags are merged so
        # every position is one classification example. Positions whose tag id is 0
        # (presumably padding — confirm in padded_data_loader) do not contribute.
        outputs = model(batch_sentences)
        loss = F.cross_entropy(
            torch.flatten(outputs, 0, 1),
            torch.flatten(batch_tags, 0, 1),
            ignore_index=0,
        )

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))

    # ---- evaluation on the held-out loader ----
    model.eval()
    conf_matrix = np.zeros((num_tags, num_tags))  # rows: gold tag, columns: predicted tag
    with torch.no_grad():
        for batch_sentences, batch_tags in test_loader:
            batch_sentences = batch_sentences.to(device)
            gold = batch_tags.to(device).flatten()
            keep = gold != 0  # drop the positions the loss ignores as well
            predicted = model(batch_sentences).argmax(2).flatten()
            conf_matrix += confusion_matrix(
                gold[keep].cpu().numpy(),
                predicted[keep].cpu().numpy(),
                labels=labels,
            )

    tp = np.diagonal(conf_matrix)
    predicted_totals = conf_matrix.sum(axis=0)
    gold_totals = conf_matrix.sum(axis=1)
    # Guarded divisions: entries with a zero denominator keep the 0 from `out`.
    prec = np.divide(tp, predicted_totals, out=np.zeros_like(tp), where=predicted_totals > 0)
    rec = np.divide(tp, gold_totals, out=np.zeros_like(tp), where=gold_totals > 0)
    pr_sum = prec + rec
    f1 = np.divide(2 * prec * rec, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
    print('Macro avg for f1 score on {} epoch = {:.3f}'.format(epoch + 1, f1.mean()))
    scores.append(f1.mean())
    model.train()
            
            



## Lightning model

In [None]:
import pytorch_lightning as pl


# NerNN built from the same two vocabulary sizes as the BiLSTM; note that this rebinds `model`.
model = NerNN(len(ds.vocab), len(ds.tag_vocab))

In [None]:
# Train for at least 5 and at most 30 epochs. `test_loader` is passed in the
# validation-dataloader position of fit(), so the held-out split doubles as validation data.
trainer = pl.Trainer(max_epochs=30, min_epochs=5 )
trainer.fit(model, train_loader, test_loader)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Score the trained model on the held-out loader and derive per-tag precision,
# recall and F1 from the accumulated confusion matrix.
model.eval()
num_tags = len(ds.tag_vocab) - 1  # tag id 0 is masked out below and not scored
# Integer tag ids are rebuilt here rather than read from the global `labels`, so this
# cell does not depend on what `labels` currently holds when it is re-run.
label_ids = list(range(1, num_tags + 1))
conf_matrix = np.zeros((num_tags, num_tags))  # rows: gold tag, columns: predicted tag
with torch.no_grad():  # inference only: no autograd graph is needed
    for batch_sentences, batch_tags in test_loader:
        gold = batch_tags.flatten()
        keep = gold != 0
        predicted = model(batch_sentences).argmax(2).flatten()
        conf_matrix += confusion_matrix(gold[keep].numpy(), predicted[keep].numpy(), labels=label_ids)

tp = np.diagonal(conf_matrix)
predicted_totals = conf_matrix.sum(axis=0)
gold_totals = conf_matrix.sum(axis=1)
# Guarded divisions: a tag that is never predicted (or never occurs) scores 0 instead
# of producing 0/0 = NaN, which the previous `== 0` mask did not catch.
prec = np.divide(tp, predicted_totals, out=np.zeros_like(tp), where=predicted_totals > 0)
rec = np.divide(tp, gold_totals, out=np.zeros_like(tp), where=gold_totals > 0)
pr_sum = prec + rec
f1 = np.divide(2 * prec * rec, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)

In [61]:
# Per-tag report table. The tag names get their own variable: the training/evaluation
# code passes `labels` (integer tag ids) to confusion_matrix, so rebinding that name to
# strings here would break it on a re-run.
# NOTE(review): the first key of ds.tag_vocab is skipped — presumably the entry for
# tag id 0; confirm in NerDataSet.
tag_names = list(ds.tag_vocab.keys())[1:]
report = pd.DataFrame.from_dict({'labels': tag_names, 'recall': rec, 'precision': prec, 'f1': f1})
report.set_index('labels')

Unnamed: 0_level_0,recall,precision,f1
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I-eve,0.233333,0.269231,0.25
B-org,0.635064,0.729313,0.678933
B-nat,0.333333,0.5,0.4
B-art,0.147541,0.243243,0.183673
B-gpe,0.936557,0.928654,0.932589
O,0.99048,0.982101,0.986273
I-gpe,0.7,0.724138,0.711864
I-per,0.831557,0.858086,0.844613
I-tim,0.766473,0.826541,0.795375
I-org,0.623873,0.749242,0.680834


In [62]:
# Macro averages (unweighted means over the scored tags) of F1, recall and precision.
f1.mean(), rec.mean(), prec.mean()

(0.6077967853585879, 0.583321623993766, 0.6562391965765241)