In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics as crf_metrics
from sklearn_crfsuite.metrics import flat_f1_score, flat_precision_score, flat_recall_score, flat_classification_report

from utils import load_data, print_statistics, preprocessing

In [2]:
train_data, test_data = load_data("../data")
basic_features = ["LEMMA", "POS", "DEPREL"]
train_features, train_labels = preprocessing(train_data, basic_features)
test_features, test_labels = preprocessing(test_data, basic_features)

In [6]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [7]:
# try:
crf.fit(train_features, train_labels)
# except AttributeError:
#     pass



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [8]:
y_pred = crf.predict(test_features)

In [9]:
flat_f1_score(test_labels,y_pred,
                        average='macro')

0.42488530457768525

In [10]:
flat_recall_score(test_labels,y_pred, average='macro')

0.41373093795937843

In [11]:
flat_precision_score(test_labels,y_pred, average='macro')

  _warn_prf(average, modifier, msg_start, len(result))


0.4648673251104133

In [12]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
C-AM-EXT -> C-AM-EXT 5.006387
AM-LOC -> R-AM-LOC 3.842399
_      -> A2      3.715848
A4     -> C-A4    3.615172
_      -> A1      3.509703
_      -> _       3.386811
_      -> A3      3.296518
AM-CAU -> C-AM-CAU 3.295259
_      -> A0      3.234089
_      -> AM-MNR  3.057011
A2     -> C-A2    3.052714
C-AM-ADV -> C-AM-ADV 2.927063
A1     -> A2      2.855698
C-A0   -> C-A0    2.600043
A0     -> A2      2.586647
A3     -> R-A3    2.559790
A0     -> R-A0    2.545416
A1     -> R-A1    2.541719
AM-EXT -> C-AM-EXT 2.520652
_      -> C-A1    2.507501

Top unlikely transitions:
AM-MOD -> AM-LOC  -2.779530
C-R-AM-TMP -> _       -2.786017
AM-CAU -> AM-TMP  -2.789634
AM-MNR -> AM-DIS  -2.810102
AM-MOD -> AM-PNC  -2.816723
R-AM-CAU -> AM-MOD  -2.857268
A1     -> C-A3    -2.907226
R-A4   -> A0      -2.920023
R-AM-MNR -> AM-MOD  -2.921969
AM-TM  -> _       -2.977551
AM-DIR -> AM-DIS  -3.040942
R-A4   -> _       -3.099440
R-AM-TMP -> A3      -3.104568
AM-EXT -> A4      -3.11421

In [13]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(5))

Top positive:
9.515160 A3       LEMMA:influenced
8.753281 _        DEPREL:P
8.637865 A3       LEMMA:vice
8.570740 AM-TMP   LEMMA:september
8.112278 AM-CAU   LEMMA:reason


In [14]:
print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-5:])


Top negative:
-3.611509 A0       LEMMA:net
-3.929277 A0       DEPREL:MNR
-3.934907 A0       DEPREL:TMP
-3.939587 A2       POS:WP
-3.947910 _        LEMMA:snag


In [15]:
labels = list(crf.classes_)

In [16]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(flat_classification_report(
    test_labels, y_pred, labels=sorted_labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           _      0.899     0.931     0.915     21622
        C-A0      0.000     0.000     0.000         0
        R-A0      0.683     0.853     0.759       116
        C-A1      0.394     0.272     0.322       136
        R-A1      0.574     0.391     0.466        69
        C-A2      0.000     0.000     0.000         3
        R-A2      0.000     0.000     0.000         4
        C-A3      0.000     0.000     0.000         0
        R-A3      0.000     0.000     0.000         0
        C-A4      0.000     0.000     0.000         0
        R-A4      0.000     0.000     0.000         0
        R-AA      0.000     0.000     0.000         0
    C-AM-ADV      0.000     0.000     0.000         0
    R-AM-ADV      0.000     0.000     0.000         0
    C-AM-CAU      0.000     0.000     0.000         1
    R-AM-CAU      1.000     1.000     1.000         2
    C-AM-DIR      0.000     0.000     0.000         1
    R-AM-DIR      0.000    

In [27]:
import scipy
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

In [28]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(train_features, train_labels)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
  _warn_prf(
  _warn_prf(
  _warn_prf(
  _warn_prf(
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 142.4min


KeyboardInterrupt: 