# py-CRFsuite NER model

This version ignores capitalization (every word is lowercased with `.lower()`) in the hope of producing a case-independent model, for use in situations where case information is not available, such as when working with the output of an automatic speech recognition (speech-to-text) system.

code modified from:
https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

In [28]:
import pandas as pd
import numpy as np
import pycrfsuite
from nltk import pos_tag
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
from collections import Counter

## read CoNLL data

In [2]:
# Load the NER dataset from CSV into a DataFrame.
data = pd.read_csv('data/ner_dataset_utf8.csv')
# Preview the first rows (shown as the cell output).
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
# Pull the columns of interest out of the DataFrame as plain Python lists.
# Sentence markers are converted to strings so that rows without a marker
# (NaN) become the literal string 'nan', which later code compares against.
sentmarks = list(map(str, data["Sentence #"].tolist()))
sentmarks[:5]
nertags = data["Tag"].tolist()
postags = data["POS"].tolist()
words = data["Word"].tolist()

## optional: strip the B-/I- prefixes

see Stanford NLP named entity video (Manning): B, I tags may not contribute much

In [4]:
# new_nertags = []
# for tag in nertags:
#     if '-' in tag:
#         tt = tag.split('-')
#         new_nertags.append(tt[1])
#     else:
#         new_nertags.append(tag)
# nertags = new_nertags
# set(nertags)

## format data

In [5]:
# Completed sentences, kept as four parallel lists:
# (word, POS) pairs, words only, POS tags only, NER tags only.
sentence_zips = []
sentence_text = []
sentence_post = []
sentence_ners = []

# Every lower-cased token seen, in file order (duplicates kept).
vocab = []

# Buffers for the sentence currently being read (this_zip is never filled).
this_zip = []
this_snt = []
this_pos = []
this_ner = []


def _store_sentence(snt, pos, ner):
    """Append one buffered sentence to the output lists if it qualifies.

    A sentence is kept only when its last token is the string '0' and its
    tags are not exclusively 'O'. The last token and its POS/NER entries are
    dropped from what is stored.
    """
    # NOTE(review): the sentinel compared against is the digit string '0';
    # presumably this is how sentence-final tokens appear in this CSV -- confirm.
    if len(snt) > 0 and snt[-1] == '0':
        # edit: ONLY IF HAS TAG!
        if list(set(ner)) != ['O']:
            sentence_zips.append(list(zip(snt[:-1], pos[:-1])))
            sentence_text.append(snt[:-1])
            sentence_post.append(pos[:-1])
            sentence_ners.append(ner[:-1])


for idx, s in enumerate(sentmarks):
    # A marker other than 'nan' starts a new sentence: store the finished
    # one and reset the buffers.
    if s != 'nan':
        _store_sentence(this_snt, this_pos, this_ner)
        this_snt = []
        this_pos = []
        this_ner = []

    # add the current token to the buffers
    this_snt.append(words[idx].lower())
    this_pos.append(postags[idx])
    this_ner.append(nertags[idx])
    vocab.append(words[idx].lower())

# Sentences are only stored when the *next* sentence marker is seen, so the
# final sentence of the file has to be stored explicitly after the loop.
_store_sentence(this_snt, this_pos, this_ner)

In [6]:
# Show the first two formatted sentences: tokens, POS tags, NER tags and the
# (token, POS) pairs, each sentence followed by an empty line.
for idx in range(len(sentence_text[:2])):
    for view in (sentence_text, sentence_post, sentence_ners, sentence_zips):
        print(view[idx])
    print('')

['thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'london', 'to', 'protest', 'the', 'war', 'in', 'iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'british', 'troops', 'from', 'that', 'country']
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O']
[('thousands', 'NNS'), ('of', 'IN'), ('demonstrators', 'NNS'), ('have', 'VBP'), ('marched', 'VBN'), ('through', 'IN'), ('london', 'NNP'), ('to', 'TO'), ('protest', 'VB'), ('the', 'DT'), ('war', 'NN'), ('in', 'IN'), ('iraq', 'NNP'), ('and', 'CC'), ('demand', 'VB'), ('the', 'DT'), ('withdrawal', 'NN'), ('of', 'IN'), ('british', 'JJ'), ('troops', 'NNS'), ('from', 'IN'), ('that', 'DT'), ('country', 'NN')]

['they', 'marched', 'from', 'the', 'houses', 'of', 'parliament', 'to', 'a', 'rally', 'in', 'hyde', 'park']

## gazetteers

precompiled lists for feature extraction

In [11]:
def file2list(filepath):
    """Read a text file and return its distinct lines, lower-cased.

    Newline characters are removed from every line. Duplicates are dropped by
    collecting the lines in a set, so the order of the returned list is
    arbitrary.
    """
    with open(filepath, 'r') as handle:
        unique_lines = {line.lower().replace('\n', '') for line in handle}
    return list(unique_lines)

# Gazetteers: precompiled word collections used for membership features.
# They are frozensets because word2features tests membership in each of them
# for every token and its neighbours; a set lookup avoids scanning a whole
# list on each test.
gaz_countries = frozenset(file2list('data/gazetteer_countries.txt'))
gaz_names = frozenset(file2list('data/gazetteer_names.txt'))
gaz_cities = frozenset(file2list('data/gazetteer_cities.txt'))
gaz_times = frozenset(file2list('data/gazetteer_datetime.txt'))
# Only the second whitespace-separated field of each demonym line is used.
gaz_demonyms = frozenset(s.split()[1] for s in file2list('data/gazetteer_demonyms.txt'))
vowels = ['a', 'e', 'i', 'o', 'u']
# http://academic.regis.edu/jseibert/Crypto/Frequency.pdf
rareletters = ['w', 'k', 'v', 'x', 'z', 'j', 'q']
commontrigrams = ['the', 'ing', 'and', 'her', 'ere', 'ent', 'tha', 'nth', 'was', 'eth', 'for', 'dth']

## feature extractors

In [12]:
def _shared_features(prefix, word, postag):
    """Return the features computed identically for a token and its neighbours.

    `prefix` is '' for the token itself and '-1:' / '+1:' for the previous /
    next token; it is prepended to every feature name.
    """
    return [
        prefix + 'wordending[-3:]=' + word[-3:],
        prefix + 'wordending[-2:]=' + word[-2:],
        prefix + 'wordending[-1:]=' + word[-1:],
        prefix + 'word.isdigit=%s' % word.isdigit(),
        prefix + 'postag=' + postag,
        prefix + 'posclass=' + postag[:2],
        prefix + 'word.isname=%s' % (word in gaz_names),
        prefix + 'word.iscountry=%s' % (word in gaz_countries),
        prefix + 'word.iscity=%s' % (word in gaz_cities),
        prefix + 'word.isdatetime=%s' % (word in gaz_times),
        prefix + 'word.isdemonym=%s' % (word in gaz_demonyms),
    ]


def word2features(sent, i):
    """Build the list of string features for token `i` of a sentence.

    Args:
        sent: sequence of (word, postag) pairs.
        i: index of the token to describe.

    Returns:
        A list of feature strings: features of the token itself, then the
        features of the previous token (or 'BOS' when `i` is the first
        position), then those of the next token (or 'EOS' when `i` is the
        last position).
    """
    word = sent[i][0]
    postag = sent[i][1]

    features = ['bias', 'wordlength=' + str(len(word))]
    features.extend(_shared_features('', word, postag))
    features.extend([
        # Slices instead of word[0] / word[-1] so an empty token yields
        # False rather than raising IndexError.
        'word.startsvowel=%s' % (word[:1] in vowels),
        'word.endsvowel=%s' % (word[-1:] in vowels),
        'word.rareletter=%s' % any(letter in rareletters for letter in word),
        'word.commontrigram=%s' % any(tri in word for tri in commontrigrams),
    ])

    if i > 0:
        features.extend(_shared_features('-1:', sent[i-1][0], sent[i-1][1]))
    else:
        features.append('BOS')

    if i < len(sent)-1:
        features.extend(_shared_features('+1:', sent[i+1][0], sent[i+1][1]))
    else:
        features.append('EOS')

    return features


def sent2features(sent):
    """Return one feature list per token of `sent`, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2tokens(sent):
    """Return the tokens of a sentence given as (token, postag) pairs."""
    tokens = []
    for token, _postag in sent:
        tokens.append(token)
    return tokens

In [13]:
# Split the (word, POS) sentences and their NER tag sequences in one call so
# the two stay aligned. No test_size or random_state is passed, so the
# scikit-learn defaults apply.
# NOTE(review): without a fixed random_state the split presumably differs
# between runs -- confirm whether reproducibility is wanted here.
train_sents, test_sents, y_train, y_test = train_test_split(sentence_zips, sentence_ners)

In [14]:
%%time
# Convert every sentence of both splits into its per-token feature lists.
X_train = list(map(sent2features, train_sents))
X_test = list(map(sent2features, test_sents))

CPU times: user 39.1 s, sys: 664 ms, total: 39.8 s
Wall time: 39.8 s


In [15]:
# Inspect the feature list of the first token of the first training sentence.
sent2features(train_sents[0])[0]

['bias',
 'wordlength=3',
 'wordending[-3:]=mr.',
 'wordending[-2:]=r.',
 'wordending[-1:]=.',
 'word.isdigit=False',
 'postag=NNP',
 'posclass=NN',
 'word.isname=False',
 'word.iscountry=False',
 'word.iscity=False',
 'word.isdatetime=False',
 'word.isdemonym=False',
 'word.startsvowel=False',
 'word.endsvowel=False',
 'word.rareletter=False',
 'word.commontrigram=False',
 'BOS',
 '+1:wordending[-3:]=awi',
 '+1:wordending[-2:]=wi',
 '+1:wordending[-1:]=i',
 '+1:word.isdigit=False',
 '+1:postag=NNP',
 '+1:posclass=NN',
 '+1:word.isname=False',
 '+1:word.iscountry=False',
 '+1:word.iscity=False',
 '+1:word.isdatetime=False',
 '+1:word.isdemonym=False']

## create model and train

In [16]:
%%time
# Create a quiet trainer and feed it every (features, labels) training pair.
trainer = pycrfsuite.Trainer(verbose=False)
for features, labels in zip(X_train, y_train):
    trainer.append(features, labels)

crf_params = {
    # coefficient for L1 penalty
    'c1': 1.0,
    # coefficient for L2 penalty
    'c2': 1e-3,
    # stop earlier
    'max_iterations': 200,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

CPU times: user 19.5 s, sys: 144 ms, total: 19.6 s
Wall time: 20 s


In [17]:
%%time
# Train on the appended sequences; the model path passed here is the same one
# the tagger opens afterwards.
trainer.train('model/conll2002-test.crfsuite')

CPU times: user 9min 55s, sys: 188 ms, total: 9min 55s
Wall time: 9min 55s


## create tagger, evaluation

In [18]:
# Create a tagger and open the model file produced by the training step.
tagger = pycrfsuite.Tagger()
tagger.open('model/conll2002-test.crfsuite')

<contextlib.closing at 0x7efbc666da58>

In [19]:
# Compare predicted and gold tags for a single test sentence.
idx = 4
example_sent = test_sents[idx]

example_tokens = sent2tokens(example_sent)
print(' '.join(example_tokens), end='\n\n')

predicted_tags = tagger.tag(sent2features(example_sent))
print("Predicted:", ' '.join(predicted_tags))
print("Correct:  ", ' '.join(y_test[idx]))

the pontiff made the comments in french saturday as he accepted a political courage award from a french catholic television station and french political magazine

Predicted: O O O O O O B-geo B-tim O O O O O O O O O B-gpe O O O O B-gpe O O
Correct:   O O O O O O B-art B-tim O O O O O O O O O B-gpe O O O O B-gpe O O


In [20]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Args:
        y_true: iterable of gold tag sequences (one sequence per sentence).
        y_pred: iterable of predicted tag sequences, parallel to `y_true`.

    Returns:
        The value returned by `classification_report` for all tags except "O".
    """
    # Flatten the per-sentence sequences into one token-level list each and
    # binarize them; the binarizer is fitted on the gold labels only.
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    # Report every class except "O", ordered by the part after the first '-'
    # and then by the part before it, so e.g. B-geo and I-geo end up adjacent.
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [21]:
%%time
# Tag every test sentence, then print the token-level report.
y_pred = list(map(tagger.tag, X_test))
report = bio_classification_report(y_test, y_pred)
print(report)

             precision    recall  f1-score   support

      B-art       0.41      0.10      0.16       109
      I-art       0.14      0.04      0.06        82
      B-eve       0.57      0.35      0.43        75
      I-eve       0.41      0.23      0.30        64
      B-geo       0.84      0.90      0.87      9222
      I-geo       0.78      0.79      0.78      1826
      B-gpe       0.95      0.91      0.93      3896
      I-gpe       0.83      0.51      0.63        57
      B-nat       0.81      0.21      0.34        61
      I-nat       0.50      0.22      0.31         9
      B-org       0.77      0.67      0.71      4900
      I-org       0.77      0.75      0.76      4068
      B-per       0.82      0.81      0.81      4048
      I-per       0.83      0.90      0.86      4110
      B-tim       0.92      0.83      0.87      5065
      I-tim       0.80      0.68      0.73      1649

avg / total       0.83      0.81      0.82     39241

CPU times: user 6.73 s, sys: 8 ms, total: 6

In [22]:
from collections import Counter
# Model details from the tagger; its `state_features` attribute is read below.
info = tagger.info()

def print_state_features(state_features):
    """Print one line per entry: weight, label and attribute.

    `state_features` is an iterable of ((attribute, label), weight) items.
    """
    for item in state_features:
        (attr, label), weight = item
        line = "%0.6f %-6s %s" % (weight, label, attr)
        print(line)

# Rank all state features by weight once (highest first), then show both ends.
ranked_features = Counter(info.state_features).most_common()

print("Top positive:")
print_state_features(ranked_features[:20])

print("\nTop negative:")
print_state_features(ranked_features[-20:])

Top positive:
8.573727 B-tim  word.isdatetime=True
7.688207 I-gpe  wordending[-3:]=ots
7.679906 I-gpe  +1:wordending[-3:]=yor
7.367344 B-gpe  wordending[-3:]=abs
7.344089 B-gpe  wordending[-3:]=hen
7.079502 B-gpe  wordending[-3:]=pal
6.563421 B-art  wordending[-3:]=gdp
6.062548 I-tim  wordending[-3:]=9th
5.788738 B-tim  wordending[-3:]=9th
5.722609 B-gpe  wordending[-3:]=ger
5.675574 I-org  wordending[-3:]=rp.
5.613537 B-tim  BOS
5.606815 B-gpe  wordending[-3:]=qis
5.573573 B-org  wordending[-3:]=ato
5.562263 O      BOS
5.515095 B-per  wordending[-3:]=ms.
5.477476 B-gpe  wordending[-3:]=nka
5.414745 B-tim  +1:word.isdatetime=True
5.411383 O      wordending[-3:]=ief
5.361535 B-org  wordending[-3:]=fm

Top negative:
-3.277264 O      +1:wordending[-3:]=rs.
-3.280711 I-tim  -1:wordending[-3:]=ate
-3.293249 O      postag=NNPS
-3.325322 O      wordending[-3:]=nos
-3.348904 O      -1:wordending[-3:]=ahr
-3.425627 B-gpe  postag=NNPS
-3.495621 I-tim  wordending[-3:]=his
-3.498225 I-per  +1:word

## decode some results and save to csv

In [23]:
def decode(s):
    """Tag a raw string and return the predicted tag sequence.

    The string is split on whitespace, each token is lower-cased, the tokens
    are POS-tagged with `pos_tag`, and the resulting (token, POS) pairs are
    turned into features and passed to the tagger.
    """
    lowered = [token.lower() for token in s.split()]
    tagged = pos_tag(lowered)
    return tagger.tag(sent2features(tagged))

In [27]:
# Separate each test sentence's (word, POS) pairs into parallel lists of
# words and POS tags.
test_strings = []
test_postags = []
for pairs in test_sents:
    test_strings.append([w for w, _ in pairs])
    test_postags.append([t for _, t in pairs])
# test_strings[:3], test_postags[:3]

In [29]:
# Build one small transposed DataFrame per test sentence (first 500 only)
# with rows word / pos / true / pred / skip and one column per token.
decoded = []
column_order = ['word', 'pos', 'true', 'pred', 'skip']
for idx, sentlist in enumerate(test_strings[:500]):

    # join tokens into string and get preds
    preds = decode(' '.join(sentlist))

    # Index-based lookups: one entry per token of the sentence.
    positions = range(len(sentlist))
    word = list(sentlist)
    pos = [test_postags[idx][jdx] for jdx in positions]
    tru = [y_test[idx][jdx] for jdx in positions]
    prd = [preds[jdx] for jdx in positions]

    answ = pd.DataFrame(
    {
        'word': word,
        'pos': pos,
        'true': tru,
        'pred': prd,
        # blank filler row
        'skip' : [' '] * len(word)
    })
    answ = answ[column_order].T
    decoded.append(answ)

In [30]:
# Stack the per-sentence frames into a single DataFrame.
result = pd.concat(decoded)
# Preview the first rows.
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
word,many,christians,attend,services,friday,to,pray,and,reflect,on,...,,,,,,,,,,
pos,JJ,NNPS,NN,NNS,NNP,TO,VB,CC,VB,IN,...,,,,,,,,,,
true,O,O,O,O,B-tim,O,O,O,O,O,...,,,,,,,,,,
pred,O,B-gpe,O,O,B-tim,O,O,O,O,O,...,,,,,,,,,,
skip,,,,,,,,,,,...,,,,,,,,,,


In [31]:
# Write the decoded sample to CSV.
# NOTE(review): presumably the 'results' directory must already exist -- confirm.
result.to_csv('results/pyCRF_sample.csv')

## get word count features for spelling clf

This section is an extension that explores intra-word features for NER, with a view to a possible extension to semi-supervised NER. See *Unsupervised Models for Named Entity Classification* (Collins and Singer 1999).

In [None]:
# labeled, unlabeled data split for testing:
# train_size=0.15 puts 15% of the sentences in the "labeled" part and the
# rest in the "unlabeled" part; sentences and tag sequences are split together.
labeled_sents, unlabeled_sents, labeled_ners, unlabeled_ners = train_test_split(sentence_zips, sentence_ners, train_size=0.15)

In [None]:
# Show the distinct NER tags present in the data.
set(nertags)

In [None]:
%%time
# Get the list of words belonging to each NER tag in the labeled split.
# wordlist maps every tag to the words carrying it (duplicates kept);
# vocab is reset here and collects every word of the labeled split.
wordlist = {tag: [] for tag in set(nertags)}
vocab = []
for sent, ners in zip(labeled_sents, labeled_ners):
    # Each sentence entry is a (word, POS) pair; the POS tag is not needed.
    for (word, _pos), ner in zip(sent, ners):
        vocab.append(word)
        wordlist[ner].append(word)

In [None]:
%%time
# get feature lists of bigrams, trigrams, 4-grams, etc?
def ngramSplitter(wordlist, n):
    """Count the character n-grams occurring in the given words.

    Args:
        wordlist: iterable of strings; repeated words are counted repeatedly.
        n: length of the n-grams to extract.

    Returns:
        A Counter mapping each n-gram to its number of occurrences. Words
        shorter than `n` contribute nothing.
    """
    ngramCounts = Counter()
    for word in wordlist:
        # A word of length L contains L - n + 1 n-grams; the range is empty
        # when the word is shorter than n.
        for start in range(len(word) - n + 1):
            ngramCounts[word[start:start + n]] += 1
    return ngramCounts

# Character n-gram counts over the vocabulary for n = 2, 3, 4 and 5.
bigrams, trigrams, fourgrams, fivegrams = [
    ngramSplitter(vocab, size) for size in (2, 3, 4, 5)
]

In [None]:
# Show the ten most frequent trigrams (without their counts).
[t[0] for t in trigrams.most_common(10)]

In [None]:
# top n-grams list
#ngramfeats = [t[0] for t in bigrams.most_common(100)]
# Feature n-grams: the most frequent trigrams, 4-grams and 5-grams, in that
# order, each group sorted by decreasing frequency.
ngramfeats = []
for counts, top_n in ((trigrams, 100), (fourgrams, 150), (fivegrams, 150)):
    ngramfeats.extend(gram for gram, _count in counts.most_common(top_n))

len(ngramfeats)

In [None]:
%%time
# Per-class n-gram feature counts: for every NER tag, a list parallel to
# ngramfeats giving how many of the tag's words contain each n-gram.
featcounts = {
    nertag: [
        sum(1 for word in tagged_words if feat in word)
        for feat in ngramfeats
    ]
    for nertag, tagged_words in wordlist.items()
}

In [None]:
# Show the first ten (n-gram, count) pairs for the 'geo' class.
# NOTE(review): the key 'geo' has no B-/I- prefix, while the tags printed
# earlier in this notebook look like 'B-geo'; this lookup presumably needs the
# optional prefix-stripping cell (currently commented out) to have been run.
list(zip(ngramfeats, featcounts['geo']))[:10]

In [None]:
%%time
# get strength by (in_class ft cnt + k) / (x_class ft cnt + vk)
# k is smooth param = 0.1 and v = # classes (len(set(nertags)))
# featweights maps each tag to a list of weights parallel to ngramfeats.
featweights = {}
k = 0.1
# Build the tag set once; nertags holds one entry per token, so rebuilding
# the set inside the loops below would rescan the whole list every time.
tagset = set(nertags)
v = len(tagset)
for tag in tagset:
    featweights[tag] = []
for idx, ngram in enumerate(ngramfeats):
    totalcount = sum(featcounts[tag][idx] for tag in tagset)
    denominator = totalcount + v * k
    for tag in tagset:
        # NOTE(review): an earlier comment here read "no smoothing according
        # to p.103 #4", but the smoothed formula is what is applied --
        # confirm which is intended.
        featweights[tag].append((featcounts[tag][idx] + k) / denominator)
    

In [None]:
# Show the first ten (n-gram, weight) pairs for the 'geo' class.
# NOTE(review): the key 'geo' has no B-/I- prefix, while the tags printed
# earlier in this notebook look like 'B-geo'; this lookup presumably needs the
# optional prefix-stripping cell (currently commented out) to have been run.
list(zip(ngramfeats, featweights['geo']))[:10]

In [None]:
%%time
# for each feat, get max value and assoc class
# topfeats holds one (tag, n-gram, weight) tuple per n-gram, where tag is the
# non-'O' class with the highest weight (the first such tag on ties).
topfeats = []
ntags = sorted(set(nertags))
# 'O' is excluded up front, so the winning tag can never be 'O'.
ntags.remove('O')
for idx, feat in enumerate(ngramfeats):
    tagweights = [featweights[tag][idx] for tag in ntags]
    best_weight = max(tagweights)
    top_tag = ntags[tagweights.index(best_weight)]
    topfeats.append((top_tag, feat, best_weight))

In [None]:
# All (tag, n-gram, weight) tuples ordered by decreasing weight.
x = sorted(topfeats, key=lambda entry: entry[2], reverse=True)
x[:10]

In [None]:
# For every tag, keep the n-gram entries whose weight exceeds 0.949 and print
# the tag's ten highest-weighted entries followed by an empty line.
word_clf = {}
for tag in ntags:
    tag_entries = [entry for entry in x if entry[0] == tag]
    word_clf[tag] = [entry for entry in tag_entries if entry[2] > 0.949]
    for tup in tag_entries[:10]:
        print(tup)
    print('')

In [None]:
# Display the per-tag n-gram classifier entries.
word_clf