In [1]:
import matplotlib.pyplot as plt

In [2]:
!pip install sklearn_crfsuite



In [3]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

A simple sentence NER example:

[**ORG** U.N. ] official [**PER** Ekeus ] heads for [**LOC** Baghdad ] 

We will concentrate on four types of named entities:
 * persons (**PER**), 
 * locations (**LOC**) 
 * organizations (**ORG**)
 * Others (**O**)

In [4]:
def _generate_examples(filepath):
        with open(filepath, encoding="utf-8") as f:
            sent = []
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    if sent:
                        yield sent
                        sent = []
                else:
                    splits = line.split(" ")
                    token = splits[0]
                    pos_tag = splits[1]
                    ner_tag = splits[3].rstrip()
                    if 'MISC' in ner_tag:
                        ner_tag = 'O'
                    
                    sent.append((token, pos_tag, ner_tag))

In [5]:
%%time 
# hint use the above defined function
train_sents = list(_generate_examples('/content/train.txt'))
test_sents = list(_generate_examples('/content/test.txt'))

CPU times: user 281 ms, sys: 22.7 ms, total: 303 ms
Wall time: 332 ms


In [6]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'postag': postag,
    }
    
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True
    return features

In [7]:
test_sents[2]

[('United', 'NNP', 'B-LOC'),
 ('Arab', 'NNP', 'I-LOC'),
 ('Emirates', 'NNPS', 'I-LOC'),
 ('1996-12-06', 'CD', 'O')]

In [8]:
word2features(test_sents[2],0)

{'BOS': True, 'bias': 1.0, 'postag': 'NNP', 'word.lower()': 'united'}

In [9]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [10]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 384 ms, sys: 52.8 ms, total: 437 ms
Wall time: 439 ms


In [11]:
%%time 
#search for sklearn_crfsuite.CRF, use the lbfgs algorithm, c parameters should be 0.1 and max iterations 100, all possible transactions true
crf = sklearn_crfsuite.CRF(
    algorithm= 'lbfgs',
    c1=1.0,
    c2=1.0,
    max_iterations=100,
   all_possible_transitions = True   
)
# fit the model
crf.fit(X_train,y_train)

CPU times: user 13.5 s, sys: 69.4 ms, total: 13.5 s
Wall time: 13.6 s


In [12]:
# save a list of all labels in your model, hint crfs have a classes attribute
labels = crf.classes_

In [13]:
labels

['B-ORG', 'O', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-LOC']

In [14]:
#remove the label 'O' from your list
labels.remove('O')

In [15]:
labels

['B-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-LOC']

In [34]:
#perfrom a prediction on your test set
y_pred = crf.predict(X_test)

metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.7201260473598221

In [35]:
# group B and I results, use the sorted function on the list labels with a lambda function as the key
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
sorted_labels

['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [None]:
#DOES NOT WORK
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

In [33]:
# what is the number of transition features in our model, crfs have an attribute called transition_features_
len(crf.transition_features_)

49

In [38]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

#list the top 20 unlikely transitions
print("\nTop unlikely transitions:")
print_transitions( Counter(crf.transition_features_).most_common()[-20:] )


Top likely transitions:
B-PER  -> I-PER   5.552948
B-ORG  -> I-ORG   5.287075
I-ORG  -> I-ORG   5.083802
B-LOC  -> I-LOC   4.658365
I-LOC  -> I-LOC   4.143020
I-PER  -> I-PER   2.641588
O      -> B-PER   1.659222
O      -> B-LOC   1.033480
O      -> O       0.911247
I-ORG  -> O       0.576871
B-ORG  -> O       0.569215
O      -> B-ORG   0.458256
B-LOC  -> O       0.425135
I-PER  -> O       0.304995
B-PER  -> O       0.282101
I-LOC  -> O       -0.128107
I-ORG  -> I-LOC   -0.351794
B-ORG  -> B-ORG   -0.493818
I-LOC  -> B-LOC   -0.512541
I-PER  -> I-LOC   -0.557391

Top unlikely transitions:
I-PER  -> I-ORG   -1.169497
I-LOC  -> B-ORG   -1.194337
I-ORG  -> B-LOC   -1.200116
B-PER  -> I-ORG   -1.200640
I-ORG  -> B-PER   -1.280830
I-LOC  -> B-PER   -1.323725
B-ORG  -> I-PER   -1.364609
B-LOC  -> I-PER   -1.465976
I-PER  -> B-LOC   -1.526602
B-ORG  -> B-LOC   -1.562089
B-LOC  -> I-ORG   -1.736941
B-PER  -> B-ORG   -1.810994
I-PER  -> B-PER   -2.073647
B-ORG  -> B-PER   -2.111594
I-PER  -> B-

In [36]:
# what is the number of transition features in our model, crfs have an attribute called state_features_
len(crf.state_features_)

6188

In [None]:
from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))


def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

sorted_state_features_ = dict(sorted(crf.state_features_.items(), key=lambda item: item[1]))

#list the top 30 positive
print("Top positive:")

x = sorted_state_features_.items() #list(sorted_state_features_)[-30:]
print_state_features(list(x)[-30:])
#list the top 30 negative
print("\nTop negative:")

print_state_features(take(30, sorted_state_features_.items()))