# CRF for NER


Example of python-crfsuite

https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb


## Exercise:

Please redo assignment 4 (http://verbs.colorado.edu/~mahu0110/teaching/ling5832-2018/5832-hw4.html) with CRF, by using python-crfsuite


In [1]:
import pycrfsuite

### Read in the training file

In [2]:
def readconll(file):
    lines = [line.strip() for line in open(file)]
    while lines[-1] == '':  # Remove trailing empty lines
        lines.pop()
    s = [x.split('_') for x in '_'.join(lines).split('__')]  # Quick split corpus into sentences
    return [[y.split() for y in x] for x in s]

In [3]:
sentences = readconll('conll2003data/eng.train')

### Define features

In [4]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower='+word.lower(),
        'word[-3:]='+word[-3:],
        'word[-2:]='+word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag='+postag,
        'postag[:2]='+postag[:2]
    ]
    
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
                '-1:word.lower='+word1.lower(),
                '-1:word.istitle=%s' % word1.istitle(),
                '-1:word.isupper=%s' % word1.isupper(),
                '-1:postag='+postag1,
                '-1:postag[:2]='+postag1[:2]
            ])
    else:
        features.append('BOS')
        
    if i < len(sent) - 1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
                '-1:word.lower='+word1.lower(),
                '-1:word.istitle=%s' % word1.istitle(),
                '-1:word.isupper=%s' % word1.isupper(),
                '-1:postag='+postag1,
                '-1:postag[:2]='+postag1[:2]
            ])
    else:
        features.append('EOS')
    
    return features

In [5]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [6]:
def sent2labels(sent):
    return [label for token, postag, phraseloc, label in sent]

In [7]:
#Extract the features from the training data

X_train = [sent2features(s) for s in sentences]
y_train = [sent2labels(s) for s in sentences]

### Train the model

In [8]:
trainer = pycrfsuite.Trainer(verbose=False) #create pycrfsuite.Trainer
#verbose=False: don't print debug messages during training
#the default training algorithm is lbfgs, i.e. Gradient descent using the L-BFGS method.
#other optional training algorithms include:
    #l2sgd: Stochastic Gradient Descent with L2 regularization term 
    #ap: Averaged Perceptron
    #pa: Passive Aggressive
    #arow: Adaptive Rgularization of Weight Vector
#Please try different training algorithms

In [None]:
for xseq, yseq in zip(X_train, y_train): #load the training data to CRFsuite
    trainer.append(xseq, yseq)

In [9]:
#set training parameters
#the example use L-BFGS training algorithm (it is default) with Elastic Net (L1 + L2) regularization
trainer.set_params({
        'c1':1.0, #coefficient for L1 penalty
        'c2':1e-3, #coefficient for L2 penalty
        'max_iterations':50, #stop earlier
        'feature.possible_transitions':True #include transitions that are possible, but not observed
    })

In [17]:
#Get training parameters
trainer.get_params()

{'c1': 1.0,
 'c2': 0.001,
 'delta': 1e-05,
 'epsilon': 1e-05,
 'feature.minfreq': 0.0,
 'feature.possible_states': False,
 'feature.possible_transitions': True,
 'linesearch': 'MoreThuente',
 'max_iterations': 50,
 'max_linesearch': 20,
 'num_memories': 6,
 'period': 10}

In [10]:
#train the model
trainer.train('conll2003.crfsuite')

### Make predictions

In [11]:
tagger = pycrfsuite.Tagger() #create pycrfsuite.Tagger
tagger.open('conll2003.crfsuite') #open the model

<contextlib.closing at 0x105b05240>

In [12]:
!wget http://verbs.colorado.edu/~mahu0110/teaching/ling5832-2018/conlleval.perl

--2018-04-13 11:39:23--  http://verbs.colorado.edu/~mahu0110/teaching/ling5832-2018/conlleval.perl
Resolving verbs.colorado.edu... 128.138.73.54
Connecting to verbs.colorado.edu|128.138.73.54|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12728 (12K)
Saving to: 'conlleval.perl.3'


2018-04-13 11:39:23 (2.97 MB/s) - 'conlleval.perl.3' saved [12728/12728]



In [13]:
test_sentences = readconll('conll2003data/eng.testa')
fw = open('eng.guessa', 'w')
for test_s in test_sentences:
    tagged = tagger.tag(sent2features(test_s))
    for i, guess in enumerate(tagged):
        print (test_s[i], guess)
        fw.write(' '.join(x for x in test_s[i]) + ' ' + guess + '\n')
    fw.write('\n')
fw.close()

['-DOCSTART-', '-X-', 'O', 'O'] O
['CRICKET', 'NNP', 'I-NP', 'O'] O
['-', ':', 'O', 'O'] O
['LEICESTERSHIRE', 'NNP', 'I-NP', 'I-ORG'] I-ORG
['TAKE', 'NNP', 'I-NP', 'O'] I-ORG
['OVER', 'IN', 'I-PP', 'O'] O
['AT', 'NNP', 'I-NP', 'O'] O
['TOP', 'NNP', 'I-NP', 'O'] O
['AFTER', 'NNP', 'I-NP', 'O'] O
['INNINGS', 'NNP', 'I-NP', 'O'] O
['VICTORY', 'NN', 'I-NP', 'O'] O
['.', '.', 'O', 'O'] O
['LONDON', 'NNP', 'I-NP', 'I-LOC'] I-LOC
['1996-08-30', 'CD', 'I-NP', 'O'] O
['West', 'NNP', 'I-NP', 'I-MISC'] I-LOC
['Indian', 'NNP', 'I-NP', 'I-MISC'] I-LOC
['all-rounder', 'NN', 'I-NP', 'O'] O
['Phil', 'NNP', 'I-NP', 'I-PER'] I-PER
['Simmons', 'NNP', 'I-NP', 'I-PER'] I-PER
['took', 'VBD', 'I-VP', 'O'] O
['four', 'CD', 'I-NP', 'O'] O
['for', 'IN', 'I-PP', 'O'] O
['38', 'CD', 'I-NP', 'O'] O
['on', 'IN', 'I-PP', 'O'] O
['Friday', 'NNP', 'I-NP', 'O'] O
['as', 'IN', 'I-PP', 'O'] O
['Leicestershire', 'NNP', 'I-NP', 'I-ORG'] I-ORG
['beat', 'VBD', 'I-VP', 'O'] O
['Somerset', 'NNP', 'I-NP', 'I-ORG'] I-ORG
['by', 

In [19]:
sent2features(test_sentences[1])

[['bias',
  'word.lower=cricket',
  'word[-3:]=KET',
  'word[-2:]=ET',
  'word.isupper=True',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=NNP',
  'postag[:2]=NN',
  'BOS',
  '-1:word.lower=-',
  '-1:word.istitle=False',
  '-1:word.isupper=False',
  '-1:postag=:',
  '-1:postag[:2]=:'],
 ['bias',
  'word.lower=-',
  'word[-3:]=-',
  'word[-2:]=-',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=:',
  'postag[:2]=:',
  '-1:word.lower=cricket',
  '-1:word.istitle=False',
  '-1:word.isupper=True',
  '-1:postag=NNP',
  '-1:postag[:2]=NN',
  '-1:word.lower=leicestershire',
  '-1:word.istitle=False',
  '-1:word.isupper=True',
  '-1:postag=NNP',
  '-1:postag[:2]=NN'],
 ['bias',
  'word.lower=leicestershire',
  'word[-3:]=IRE',
  'word[-2:]=RE',
  'word.isupper=True',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=NNP',
  'postag[:2]=NN',
  '-1:word.lower=-',
  '-1:word.istitle=False',
  '-1:word.isupper=False',
  '-1:postag=:',
  '-1:po

In [14]:
!perl conlleval.perl < eng.guessa

processed 51578 tokens with 5942 phrases; found: 5654 phrases; correct: 4829.
accuracy:  96.94%; precision:  85.41%; recall:  81.27%; FB1:  83.29
              LOC: precision:  87.46%; recall:  82.04%; FB1:  84.66  1723
             MISC: precision:  88.03%; recall:  78.20%; FB1:  82.83  819
              ORG: precision:  78.69%; recall:  74.65%; FB1:  76.62  1272
              PER: precision:  86.96%; recall:  86.86%; FB1:  86.91  1840


In [15]:
testb_sentences = readconll('conll2003data/eng.testb')
fwb = open('eng.guessb', 'w')
for test_s in testb_sentences:
    tagged = tagger.tag(sent2features(test_s))
    for i, guess in enumerate(tagged):
        print (test_s[i], guess)
        fwb.write(' '.join(x for x in test_s[i]) + ' ' + guess + '\n')
    fwb.write('\n')
fwb.close()

['-DOCSTART-', '-X-', '-X-', 'O'] O
['SOCCER', 'NN', 'I-NP', 'O'] O
['-', ':', 'O', 'O'] O
['JAPAN', 'NNP', 'I-NP', 'I-LOC'] I-LOC
['GET', 'VB', 'I-VP', 'O'] O
['LUCKY', 'NNP', 'I-NP', 'O'] O
['WIN', 'NNP', 'I-NP', 'O'] O
[',', ',', 'O', 'O'] O
['CHINA', 'NNP', 'I-NP', 'I-PER'] I-LOC
['IN', 'IN', 'I-PP', 'O'] O
['SURPRISE', 'DT', 'I-NP', 'O'] O
['DEFEAT', 'NN', 'I-NP', 'O'] O
['.', '.', 'O', 'O'] O
['Nadim', 'NNP', 'I-NP', 'I-PER'] I-PER
['Ladki', 'NNP', 'I-NP', 'I-PER'] I-PER
['AL-AIN', 'NNP', 'I-NP', 'I-LOC'] I-ORG
[',', ',', 'O', 'O'] O
['United', 'NNP', 'I-NP', 'I-LOC'] I-ORG
['Arab', 'NNP', 'I-NP', 'I-LOC'] I-ORG
['Emirates', 'NNPS', 'I-NP', 'I-LOC'] I-ORG
['1996-12-06', 'CD', 'I-NP', 'O'] O
['Japan', 'NNP', 'I-NP', 'I-LOC'] I-LOC
['began', 'VBD', 'I-VP', 'O'] O
['the', 'DT', 'I-NP', 'O'] O
['defence', 'NN', 'I-NP', 'O'] O
['of', 'IN', 'I-PP', 'O'] O
['their', 'PRP$', 'I-NP', 'O'] O
['Asian', 'JJ', 'I-NP', 'I-MISC'] I-MISC
['Cup', 'NNP', 'I-NP', 'I-MISC'] I-MISC
['title', 'NN', 'I

In [16]:
!perl conlleval.perl < eng.guessb

processed 46666 tokens with 5648 phrases; found: 5379 phrases; correct: 4154.
accuracy:  95.10%; precision:  77.23%; recall:  73.55%; FB1:  75.34
              LOC: precision:  82.05%; recall:  75.36%; FB1:  78.56  1532
             MISC: precision:  75.24%; recall:  66.67%; FB1:  70.69  622
              ORG: precision:  71.52%; recall:  66.23%; FB1:  68.77  1538
              PER: precision:  78.78%; recall:  82.19%; FB1:  80.45  1687
