In [1]:
! pip install python-crfsuite



In [2]:
train_sents = [[('The', 'DT'), ('dog', 'N'), ('barks', 'V'), ('loudly', 'RB')],
            [('The', 'DT'), ('cat', 'N'), ('meows', 'V'), ('softly', 'RB')],
             [('The', 'DT'), ('boy', 'N'), ('ate', 'VZ'), ('rice', 'N')],
             [('The', 'DT'), ('girl', 'N'), ('drank', 'VZ'), ('water', 'N')]
            ]

In [7]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [postag for token, postag in sent]

def sent2tokens(sent):
    return [token for token, postag in sent]

In [9]:
X_train = [sent2features(s) for s in train_sents]
y_train =[sent2labels(s) for s in train_sents]


In [10]:
import pycrfsuite

trainer = pycrfsuite.Trainer(algorithm='lbfgs', verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [11]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

trainer.train('postagger.crfsuite')

In [32]:
from itertools import chain

tagger = pycrfsuite.Tagger()
tagger.open('postagger.crfsuite')

test_sents = [[('The'), ('cat'), ('drinks'), ('slowly')]
            ]

X_test = [sent2features(s) for s in test_sents]
y_pred = list(chain(*[tagger.tag(xseq) for xseq in X_test]))

In [33]:
y_pred

['V', 'RB', 'V', 'RB']

In [34]:
from itertools import chain

tagger = pycrfsuite.Tagger()
tagger.open('postagger.crfsuite')

test_sents = [[('The', 'DT'), ('cat'), ('drinks'), ('slowly')]
            ]

X_test = [sent2features(s) for s in test_sents]
y_pred = list(chain(*[tagger.tag(xseq) for xseq in X_test]))

In [35]:
y_pred

['DT', 'N', 'V', 'RB']