# py-CRFsuite NER model

This version ignores capitalization (all words are lowercased with `.lower()`) in the hope of producing a case-independent model, for use in situations where case information is not available, such as when working with the output of an automatic speech recognition (speech-to-text) system.

UPDATE: we use the train/test indices from the `data_preprocessing` notebook so that we can make a direct comparison to the `keras` results

some code adapted from [scrapinghub's python-crfsuite](https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb) under the MIT license:

MIT License

Copyright (c) 2014-2017 ScrapingHub Inc. and contributors.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

In [1]:
import pandas as pd
import numpy as np
import pycrfsuite
from nltk import pos_tag
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
from collections import Counter

## read CoNLL data

read the data from `data_preprocessing` and split the data with the saved indices

In [2]:
# load binary files
# The three sentence arrays hold one variable-length list per sentence (the
# samples printed in the next cell have 23 and 13 tokens), so they are
# presumably stored as object arrays; numpy >= 1.16.3 refuses to load those
# unless allow_pickle=True is passed explicitly.
# NOTE(review): allow_pickle unpickles arbitrary objects -- only load files
# that were written by our own `data_preprocessing` notebook.
train_indices = list(np.load('encoded/train_idx.npy'))
test_indices  = list(np.load('encoded/test_idx.npy'))
sentence_text = list(np.load('encoded/sentence_text.npy', allow_pickle=True))
sentence_post = list(np.load('encoded/sentence_post.npy', allow_pickle=True))
sentence_ners = list(np.load('encoded/sentence_ners.npy', allow_pickle=True))
# pair every token with its POS tag: one list of (token, postag) tuples per sentence
sentence_zips = [list(zip(tokens, sentence_post[i])) for i, tokens in enumerate(sentence_text)]

In [3]:
# Materialise the train/test partitions by looking up the saved sentence indices
# in the (token, postag) sentences and in the NER label sequences.
train_sents, y_train = (
    [source[k] for k in train_indices] for source in (sentence_zips, sentence_ners)
)
test_sents, y_test = (
    [source[k] for k in test_indices] for source in (sentence_zips, sentence_ners)
)

In [4]:
# Show the first two sentences: tokens, POS tags, NER labels and the zipped
# (token, postag) pairs, each on its own line and followed by a blank line.
for idx, sent in enumerate(sentence_text[:2]):
    print(sent, sentence_post[idx], sentence_ners[idx], sentence_zips[idx], '', sep='\n')

['thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'london', 'to', 'protest', 'the', 'war', 'in', 'iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'british', 'troops', 'from', 'that', 'country']
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O']
[('thousands', 'NNS'), ('of', 'IN'), ('demonstrators', 'NNS'), ('have', 'VBP'), ('marched', 'VBN'), ('through', 'IN'), ('london', 'NNP'), ('to', 'TO'), ('protest', 'VB'), ('the', 'DT'), ('war', 'NN'), ('in', 'IN'), ('iraq', 'NNP'), ('and', 'CC'), ('demand', 'VB'), ('the', 'DT'), ('withdrawal', 'NN'), ('of', 'IN'), ('british', 'JJ'), ('troops', 'NNS'), ('from', 'IN'), ('that', 'DT'), ('country', 'NN')]

['they', 'marched', 'from', 'the', 'houses', 'of', 'parliament', 'to', 'a', 'rally', 'in', 'hyde', 'park']

## gazetteers

precompiled lists for feature extraction

In [5]:
def file2list(filepath):
    """Read a text file and return its distinct lines, lower-cased.

    Newline characters are removed from every line. Duplicates are collapsed
    through a set, so the order of the returned list is arbitrary.
    """
    with open(filepath, 'r') as handle:
        unique_entries = {line.lower().replace('\n', '') for line in handle}
    return list(unique_entries)

# The gazetteers are only ever used for `in` membership tests, and the feature
# extractor performs several of them per token. Storing them as frozensets makes
# each test a hash lookup instead of a linear scan over a list.
gaz_countries = frozenset(file2list('data/gazetteer_countries.txt'))
gaz_names = frozenset(file2list('data/gazetteer_names.txt'))
gaz_cities = frozenset(file2list('data/gazetteer_cities.txt'))
gaz_times = frozenset(file2list('data/gazetteer_datetime.txt'))
# only the second whitespace-separated field of each demonym line is kept
# NOTE(review): a line with fewer than two fields raises IndexError here -- confirm the file format
gaz_demonyms = frozenset(s.split()[1] for s in file2list('data/gazetteer_demonyms.txt'))
vowels = ['a', 'e', 'i', 'o', 'u']
# http://academic.regis.edu/jseibert/Crypto/Frequency.pdf
rareletters = ['w', 'k', 'v', 'x', 'z', 'j', 'q']
commontrigrams = ['the', 'ing', 'and', 'her', 'ere', 'ent', 'tha', 'nth', 'was', 'eth', 'for', 'dth']

## feature extractors

In [6]:
def _context_features(prefix, word, postag):
    """Return the features describing a neighbouring token.

    prefix -- string prepended to every feature name ('-1:' for the previous
              token, '+1:' for the next token)
    word   -- the neighbouring token
    postag -- the neighbouring token's POS tag
    """
    return [
        prefix + 'wordending[-3:]=' + word[-3:],
        prefix + 'wordending[-2:]=' + word[-2:],
        prefix + 'wordending[-1:]=' + word[-1:],
        prefix + 'word.isdigit=%s' % word.isdigit(),
        prefix + 'postag=' + postag,
        prefix + 'posclass=' + postag[:2],
        prefix + 'word.isname=%s' % (word in gaz_names),
        prefix + 'word.iscountry=%s' % (word in gaz_countries),
        prefix + 'word.iscity=%s' % (word in gaz_cities),
        prefix + 'word.isdatetime=%s' % (word in gaz_times),
        prefix + 'word.isdemonym=%s' % (word in gaz_demonyms),
    ]


def word2features(sent, i):
    """Build the list of feature strings for the token at position `i`.

    sent -- sequence of (token, postag) pairs
    i    -- index of the token to describe

    The result contains features of the token itself (length, suffixes, POS
    tag, gazetteer membership, simple letter statistics), followed by the
    features of the previous token (or 'BOS' when there is none) and of the
    next token (or 'EOS' when there is none).
    """
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'wordlength=' + str(len(word)),
        'wordending[-3:]=' + word[-3:],
        'wordending[-2:]=' + word[-2:],
        'wordending[-1:]=' + word[-1:],
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'posclass=' + postag[:2],
        'word.isname=%s' % (word in gaz_names),
        'word.iscountry=%s' % (word in gaz_countries),
        'word.iscity=%s' % (word in gaz_cities),
        'word.isdatetime=%s' % (word in gaz_times),
        'word.isdemonym=%s' % (word in gaz_demonyms),
        # guard against an empty token: indexing word[0] / word[-1] would raise
        'word.startsvowel=%s' % (bool(word) and word[0] in vowels),
        'word.endsvowel=%s' % (bool(word) and word[-1] in vowels),
        'word.rareletter=%s' % any(letter in rareletters for letter in word),
        'word.commontrigram=%s' % any(trigram in word for trigram in commontrigrams),
    ]

    if i > 0:
        features.extend(_context_features('-1:', sent[i-1][0], sent[i-1][1]))
    else:
        # first token of the sentence
        features.append('BOS')

    if i < len(sent)-1:
        features.extend(_context_features('+1:', sent[i+1][0], sent[i+1][1]))
    else:
        # last token of the sentence
        features.append('EOS')

    return features


def sent2features(sent):
    """Return one feature list per token of `sent`, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2tokens(sent):
    """Return the tokens of a sentence given as (token, postag) pairs."""
    tokens = []
    for token, _postag in sent:
        tokens.append(token)
    return tokens

In [7]:
%%time
# Turn every training and test sentence into its per-token feature lists.
X_train = list(map(sent2features, train_sents))
X_test = list(map(sent2features, test_sents))

CPU times: user 24 s, sys: 793 ms, total: 24.8 s
Wall time: 24.8 s


In [8]:
# Display the features generated for the first token of the first training sentence.
first_sentence_features = sent2features(train_sents[0])
first_sentence_features[0]

['bias',
 'wordlength=2',
 'wordending[-3:]=he',
 'wordending[-2:]=he',
 'wordending[-1:]=e',
 'word.isdigit=False',
 'postag=PRP',
 'posclass=PR',
 'word.isname=False',
 'word.iscountry=False',
 'word.iscity=False',
 'word.isdatetime=False',
 'word.isdemonym=False',
 'word.startsvowel=False',
 'word.endsvowel=True',
 'word.rareletter=False',
 'word.commontrigram=False',
 'BOS',
 '+1:wordending[-3:]=led',
 '+1:wordending[-2:]=ed',
 '+1:wordending[-1:]=d',
 '+1:word.isdigit=False',
 '+1:postag=VBD',
 '+1:posclass=VB',
 '+1:word.isname=False',
 '+1:word.iscountry=False',
 '+1:word.iscity=False',
 '+1:word.isdatetime=False',
 '+1:word.isdemonym=False']

## create model and train

we create a pycrfsuite model `Trainer` and train on the train data

In [9]:
%%time
# Create the trainer and hand it one (feature sequence, label sequence) pair per training sentence.
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

crf_params = dict(
    c1=1.0,              # coefficient for L1 penalty
    c2=1e-3,             # coefficient for L2 penalty
    max_iterations=200,  # stop earlier
)
# include transitions that are possible, but not observed
crf_params['feature.possible_transitions'] = True
trainer.set_params(crf_params)

CPU times: user 14.7 s, sys: 109 ms, total: 14.8 s
Wall time: 14.8 s


In [10]:
%%time
# Train on the sequences appended to the trainer; this is the path the Tagger opens afterwards.
trainer.train('model/conll2002-test.crfsuite')

CPU times: user 8min 5s, sys: 513 ms, total: 8min 5s
Wall time: 8min 4s


## create tagger, evaluation

we can use the trained model to label new sentences and evaluate its predictions on the test data

In [11]:
# Create a tagger and load the model file written by the training cell.
tagger = pycrfsuite.Tagger()
# the call's return value is what the notebook echoes as the cell output
tagger.open('model/conll2002-test.crfsuite')

<contextlib.closing at 0x7f7029c28160>

In [12]:
# Compare the predicted and the gold labels for a single test sentence.
idx = 4
example_sent = test_sents[idx]
example_tokens = sent2tokens(example_sent)
predicted_tags = tagger.tag(sent2features(example_sent))

print(' '.join(example_tokens), end='\n\n')
print("Predicted:", ' '.join(predicted_tags))
print("Correct:  ", ' '.join(y_test[idx]))

the trip by holmes is taking place in the aftermath of heavy fighting between allied somali-ethiopian forces and islamists in mogadishu that killed hundreds of somalis and forced up to 4,00,000 others to escape to makeshift camps on the city 's outskirts

Predicted: O O O B-org O O O O O O O O O O O O O O O O B-geo O O O O B-gpe O O O O O O O O O O O O O O O O
Correct:   O O O B-per O O O O O O O O O O O O O O B-org O B-geo O O O O B-gpe O O O O O O O O O O O O O O O O


In [13]:
def bio_classification_report(y_true, y_pred):
    """
    from scrapinghub's python-crfsuite example

    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    y_true -- iterable of gold label sequences, one sequence per sentence
    y_pred -- iterable of predicted label sequences, aligned with y_true
    returns -- the result of sklearn's classification_report for all labels
               except "O"
    """
    # flatten the per-sentence sequences into one token-level list each and
    # binarize them; the binarizer is fitted on the gold labels only
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    # drop "O" and order the tags by entity type first, then by B-/I- prefix
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    # map each label to its column index in the binarized matrices
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [14]:
%%time
# Tag every test sentence, then print the token-level report for the predictions.
y_pred = list(map(tagger.tag, X_test))
print(bio_classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      B-art       0.31      0.06      0.10        69
      I-art       0.00      0.00      0.00        54
      B-eve       0.52      0.35      0.42        46
      I-eve       0.35      0.22      0.27        36
      B-geo       0.85      0.90      0.87      5629
      I-geo       0.81      0.74      0.77      1120
      B-gpe       0.94      0.92      0.93      2316
      I-gpe       0.89      0.65      0.76        26
      B-nat       0.73      0.46      0.56        24
      I-nat       0.60      0.60      0.60         5
      B-org       0.78      0.69      0.73      2984
      I-org       0.77      0.76      0.76      2377
      B-per       0.81      0.81      0.81      2424
      I-per       0.81      0.90      0.85      2493
      B-tim       0.92      0.83      0.87      2989
      I-tim       0.82      0.70      0.75      1017

avg / total       0.83      0.82      0.82     23609

CPU times: user 3.02 s, sys: 24 ms, total: 

In [15]:
from collections import Counter
# Fetch the tagger's model information; its `state_features` attribute is ranked by weight below.
info = tagger.info()

def print_state_features(state_features):
    """Print one line per ((attribute, label), weight) entry.

    Each line shows the weight with six decimals, the label left-aligned in
    a six-character column, and the attribute.
    """
    row_template = "%0.6f %-6s %s"
    for feature_key, weight in state_features:
        attr, label = feature_key
        print(row_template % (weight, label, attr))

# Rank all ((attribute, label), weight) entries once, highest weight first,
# then show the 20 entries at each end of the ranking.
ranked_state_features = Counter(info.state_features).most_common()

print("Top positive:")
print_state_features(ranked_state_features[:20])

print("\nTop negative:")
print_state_features(ranked_state_features[-20:])

Top positive:
10.672160 B-tim  word.isdatetime=True
8.881390 I-gpe  +1:wordending[-3:]=yor
7.795847 B-org  wordending[-3:]=213
7.558754 B-gpe  wordending[-3:]=hen
7.199009 I-tim  word.isdatetime=True
6.781945 B-gpe  wordending[-3:]=pal
6.772639 B-tim  BOS
6.482320 O      BOS
6.140998 B-art  wordending[-3:]=gdp
6.081018 B-eve  wordending[-3:]=pic
5.880988 B-gpe  wordending[-3:]=ger
5.863983 B-org  wordending[-3:]=fm
5.607637 B-per  wordending[-3:]=ms.
5.596039 B-gpe  wordending[-3:]=khs
5.493595 B-per  -1:wordending[-3:]=sab
5.437427 B-tim  wordending[-2:]=0s
5.428340 B-org  -1:wordending[-3:]=ngh
5.424400 I-org  wordending[-3:]=rp.
5.336027 I-per  -1:wordending[-3:]=ofi
5.173353 B-gpe  wordending[-3:]=qis

Top negative:
-3.155254 I-tim  wordending[-3:]=his
-3.197098 I-tim  wordending[-3:]=ear
-3.198708 O      wordending[-3:]=iri
-3.245888 O      wordending[-3:]=6th
-3.304280 I-org  +1:wordending[-3:]=yat
-3.314575 O      wordending[-3:]=3th
-3.378080 O      wordending[-3:]=erb
-3.52268

## decode some results and save to csv

In [16]:
def decode(s):
    """Tag a raw text string and return the predicted label for each token.

    The string is split on whitespace, every token is lower-cased, the tokens
    are POS-tagged with `pos_tag`, and the resulting (token, postag) pairs are
    converted to features for the tagger.
    """
    lowered_tokens = [token.lower() for token in s.split()]
    tagged_tokens = pos_tag(lowered_tokens)
    return tagger.tag(sent2features(tagged_tokens))

In [17]:
# Split the (token, postag) pairs of the test sentences back into parallel token and tag lists.
test_strings = [sent2tokens(s) for s in test_sents]
test_postags = [[postag for _token, postag in s] for s in test_sents]
# test_strings[:3], test_postags[:3]

In [18]:
# For each of the first 500 test sentences, build a transposed table whose rows
# are: word, POS tag, true label, predicted label and a blank separator row.
decoded = []
column_order = ['word', 'pos', 'true', 'pred', 'skip']
for idx, sentlist in enumerate(test_strings[:500]):

    # join tokens into string and get preds
    preds = decode(' '.join(sentlist))

    # one entry per word of the sentence, looked up by position
    positions = range(len(sentlist))
    answ = pd.DataFrame({
        'word': list(sentlist),
        'pos': [test_postags[idx][jdx] for jdx in positions],
        'true': [y_test[idx][jdx] for jdx in positions],
        'pred': [preds[jdx] for jdx in positions],
        'skip': [' '] * len(sentlist),
    })
    # fix the row order, then transpose so that every word becomes a column
    answ = answ[column_order].T
    decoded.append(answ)

In [19]:
# Stack the per-sentence tables vertically into one frame and preview its first rows.
result = pd.concat(decoded, axis=0)
result.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
word,the,post,became,vacant,in,october,",",when,former,costa,...,,,,,,,,,,
pos,DT,NN,VBD,JJ,IN,NNP,",",WRB,JJ,NNP,...,,,,,,,,,,
true,O,O,O,O,O,B-tim,O,O,O,O,...,,,,,,,,,,
pred,O,O,O,O,O,B-tim,O,O,O,O,...,,,,,,,,,,
skip,,,,,,,,,,,...,,,,,,,,,,


In [20]:
# Write the stacked word / pos / true / pred / skip rows to a CSV file.
result.to_csv('results/pyCRF_sample.csv')