In [1]:
import nltk
from nltk.corpus.reader import ConllCorpusReader

In [2]:
emergingE = ConllCorpusReader('emerging_entities_17-master/','.conll',  ('words', 'pos', 'chunk'))

### Data Preparation

In [3]:
## Training and testing

train_sents = list(emergingE.tagged_sents('wnut17train.conll')) 
valid_sents = list(emergingE.tagged_sents('emerging.dev.conll'))
test_sents = list(emergingE.tagged_sents('emerging.test.conll'))

print(train_sents[0])
#each tuple contains token, syntactic tag, ner label


[('@paulwalk', 'O'), ('It', 'O'), ("'s", 'O'), ('the', 'O'), ('view', 'O'), ('from', 'O'), ('where', 'O'), ('I', 'O'), ("'m", 'O'), ('living', 'O'), ('for', 'O'), ('two', 'O'), ('weeks', 'O'), ('.', 'O'), ('Empire', 'B-location'), ('State', 'I-location'), ('Building', 'I-location'), ('=', 'O'), ('ESB', 'B-location'), ('.', 'O'), ('Pretty', 'O'), ('bad', 'O'), ('storm', 'O'), ('here', 'O'), ('last', 'O'), ('evening', 'O'), ('.', 'O')]


In [4]:
# functions of sentence representations for sequence labelling
def word2features(sent, i):
    
    word = sent[i][0]
    postag = sent[i][1]


    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        # Indicate that it is the 'beginning of a document'
        features['BOS'] = True
        
    
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        # Features for words that are not at the end of a document
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [5]:
# sentence representations for sequence labelling
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_valid = [sent2features(s) for s in valid_sents]
y_valid = [sent2labels(s) for s in valid_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [6]:
train_sents[0], y_train[0]

([('@paulwalk', 'O'),
  ('It', 'O'),
  ("'s", 'O'),
  ('the', 'O'),
  ('view', 'O'),
  ('from', 'O'),
  ('where', 'O'),
  ('I', 'O'),
  ("'m", 'O'),
  ('living', 'O'),
  ('for', 'O'),
  ('two', 'O'),
  ('weeks', 'O'),
  ('.', 'O'),
  ('Empire', 'B-location'),
  ('State', 'I-location'),
  ('Building', 'I-location'),
  ('=', 'O'),
  ('ESB', 'B-location'),
  ('.', 'O'),
  ('Pretty', 'O'),
  ('bad', 'O'),
  ('storm', 'O'),
  ('here', 'O'),
  ('last', 'O'),
  ('evening', 'O'),
  ('.', 'O')],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-location',
  'I-location',
  'I-location',
  'O',
  'B-location',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'])

In [7]:
X_train[0], y_train[0]

([{'bias': 1.0,
   'word.lower()': '@paulwalk',
   'word[-3:]': 'alk',
   'word[-2:]': 'lk',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'O',
   'postag[:2]': 'O',
   'BOS': True,
   '+1:word.lower()': 'it',
   '+1:word.istitle()': True,
   '+1:word.isupper()': False,
   '+1:postag': 'O',
   '+1:postag[:2]': 'O'},
  {'bias': 1.0,
   'word.lower()': 'it',
   'word[-3:]': 'It',
   'word[-2:]': 'It',
   'word.isupper()': False,
   'word.istitle()': True,
   'word.isdigit()': False,
   'postag': 'O',
   'postag[:2]': 'O',
   '-1:word.lower()': '@paulwalk',
   '-1:word.istitle()': False,
   '-1:word.isupper()': False,
   '-1:postag': 'O',
   '-1:postag[:2]': 'O',
   '+1:word.lower()': "'s",
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'O',
   '+1:postag[:2]': 'O'},
  {'bias': 1.0,
   'word.lower()': "'s",
   'word[-3:]': "'s",
   'word[-2:]': "'s",
   'word.isupper()': False,
   'word.istitle()': False,
  

### Training
Here we are using L-BFGS training algorithm (it is default) with Elastic Net (L1 + L2) regularization.

In [8]:
# train CRF model

import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)



In [9]:
crf

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [10]:
crf.fit(X_train, y_train, X_dev= X_valid, y_dev= y_valid)

# training model parameters

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

### Evaluation
There is much more O entities in data set, but we’re more interested in other entities. To account for this we’ll use averaged F1 score computed for all labels except for O. sklearn-crfsuite.metrics package provides some useful metrics for sequence classification task, including this one.

In [11]:
# get label set
labels = list(crf.classes_)
labels.remove('O')
print(labels)

['B-location', 'I-location', 'B-group', 'B-corporation', 'B-person', 'B-creative-work', 'B-product', 'I-person', 'I-creative-work', 'I-corporation', 'I-group', 'I-product']


In [12]:
# evaluate CRF model
from sklearn_crfsuite import metrics

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.8899135611696579