In [1]:
import nltk
import sklearn_crfsuite
import eli5

In [2]:
import pickle

In [4]:
from sklearn.model_selection import train_test_split

In [3]:
# Load the pickled corpus. Downstream cells iterate `data` as a collection of
# documents, each a sequence of (token, postag, label) triples.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
with open("word_data_file.obj", "rb") as infile:
    data = pickle.load(infile)

def word2features(doc, i):
    """Build the feature dict for the token at position ``i`` of ``doc``.

    Args:
        doc: Indexable sequence of items whose element 0 is the token string
            and element 1 is its POS tag (any further elements are ignored).
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to values. It always holds the features of
        the token itself; it additionally holds ``-1:``-prefixed features of
        the previous token (or ``'BOS'`` when there is none) and ``+1:``-prefixed
        features of the next token (or ``'EOS'`` when there is none).
    """
    current = doc[i]
    token, tag = current[0], current[1]

    # Features describing the token itself.
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag
    }

    # Left context: previous token, or a beginning-of-sequence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = doc[i - 1][0]
        prev_tag = doc[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag

    # Right context: next token, or an end-of-sequence marker.
    if i >= len(doc) - 1:
        feats['EOS'] = True
    else:
        next_token = doc[i + 1][0]
        next_tag = doc[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag

    return feats

# A function for extracting features in documents
def extract_features(doc):
    """Return one feature dict per token of ``doc``, in document order.

    Each position of ``doc`` is passed to ``word2features`` together with the
    whole document, so the per-token features can look at neighbouring tokens.
    """
    per_token = []
    for position in range(len(doc)):
        per_token.append(word2features(doc, position))
    return per_token

# A function for generating the list of labels for each document
def get_labels(doc):
    """Return the label of every ``(token, postag, label)`` triple in ``doc``.

    Every item of ``doc`` must unpack into exactly three values; only the
    third one (the label) is kept, in document order.
    """
    labels = []
    for _token, _postag, label in doc:
        labels.append(label)
    return labels

In [5]:
%%time
# Turn every document into two parallel sequences: per-token feature dicts (X)
# and per-token labels (y).
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]
# Hold out 30% of the documents for testing. A fixed random_state keeps the
# split -- and therefore the trained model and every weight table shown below --
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

CPU times: user 3.81 s, sys: 500 ms, total: 4.31 s
Wall time: 4.39 s


In [18]:
%%time
# Train a CRF sequence tagger on the training split.
# NOTE(review): the very specific c1/c2 values look like the result of a
# hyperparameter search that is not part of this notebook -- confirm their origin.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    # c1 / c2: L1 and L2 regularization coefficients (per the sklearn-crfsuite docs)
    c1=0.2049587713810429, 
    c2=0.035279048949617219, 
    max_iterations=200, 
    # per the sklearn-crfsuite docs: also generate transition features for label
    # pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 3min 9s, sys: 4.71 s, total: 3min 14s
Wall time: 3min 24s


In [19]:
# Render only the label-to-label transition weights of the fitted CRF.
eli5.show_weights(
    crf,
    top=5,
    show=['transition_features'],
)

From \ To,I,date,geogName,orgName,persName,placeName,time
I,2.805,-0.117,0.545,0.91,0.601,0.706,-0.261
date,-0.102,0.87,-1.956,-1.132,-1.5,-1.486,-2.078
geogName,0.563,-1.799,1.734,-0.718,-1.388,-1.418,-1.518
orgName,1.112,-0.798,-0.672,1.937,-1.085,-0.751,-0.661
persName,0.313,-0.855,-2.057,-1.874,0.988,-1.759,-1.821
placeName,0.249,-1.636,-1.652,-1.268,-1.712,0.911,-2.535
time,-0.181,-1.816,-1.507,-1.285,-1.998,-2.569,1.384


In [20]:
# Full explanation of the fitted CRF: the transition table followed by the
# top-weighted features for each label.
eli5.show_weights(
    crf,
    top=30,
)

From \ To,I,date,geogName,orgName,persName,placeName,time
I,2.805,-0.117,0.545,0.91,0.601,0.706,-0.261
date,-0.102,0.87,-1.956,-1.132,-1.5,-1.486,-2.078
geogName,0.563,-1.799,1.734,-0.718,-1.388,-1.418,-1.518
orgName,1.112,-0.798,-0.672,1.937,-1.085,-0.751,-0.661
persName,0.313,-0.855,-2.057,-1.874,0.988,-1.759,-1.821
placeName,0.249,-1.636,-1.652,-1.268,-1.712,0.911,-2.535
time,-0.181,-1.816,-1.507,-1.285,-1.998,-2.569,1.384

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6
+12.574,word.lower():wszyscy,,,,,
+9.967,word.lower():oklaski,,,,,
+9.474,postag:Adv,,,,,
+8.719,word.lower():wszystkich,,,,,
+8.615,word.lower():czego,,,,,
+8.305,postag:Ppron3,,,,,
+8.264,word.lower():tego,,,,,
+8.198,word.lower():wnioski,,,,,
+7.928,word.lower():jakie,,,,,
+7.802,word.lower():wszystkie,,,,,

Weight?,Feature
+12.574,word.lower():wszyscy
+9.967,word.lower():oklaski
+9.474,postag:Adv
+8.719,word.lower():wszystkich
+8.615,word.lower():czego
+8.305,postag:Ppron3
+8.264,word.lower():tego
+8.198,word.lower():wnioski
+7.928,word.lower():jakie
+7.802,word.lower():wszystkie

Weight?,Feature
+10.436,word.lower():lutego
+10.229,word.lower():kwietnia
+10.185,word.lower():stycznia
+9.648,word.lower():września
+9.617,word.lower():sierpnia
+9.342,word.lower():grudniu
+9.326,word.lower():listopadzie
+9.030,word.lower():czerwca
+8.879,word.lower():kwietniu
+8.852,word.lower():styczniu

Weight?,Feature
+7.429,word.lower():himalaistów
+6.947,word.lower():tatrach
+6.497,+1:word.lower():łączną
+6.478,word.lower():kaszubszczyźnie
+6.128,+1:word.lower():eiffla
+6.080,word.lower():pseudoeuropejczyków
+5.939,word.lower():l'alsacienne
+5.937,word.lower():wenus
+5.923,word.lower():kormoranów
+5.902,-1:word.lower():rz.

Weight?,Feature
+8.406,word[-2:]:iA
+8.223,word[-3:]:-ie
+8.178,-1:word.lower():parkować
+8.090,word.lower():benedyktyna
+7.745,+1:word.lower():nieobecnym
+7.744,"word.lower():""zurpex-ie"""
+7.496,+1:word.lower():dofinansowani
+7.414,word.lower():parlamentarzyście
+7.402,word.lower():esbeków
+7.361,word.lower():pis-owcy

Weight?,Feature
+9.211,word.lower():hitlerowskiego
+8.754,"word.lower():""babci"""
+8.664,word.lower():bożych
+8.284,word.lower():boże
+7.924,word.lower():jerzy
+7.299,word.lower():wiktor
+7.274,word.lower():nazirejczyków
+7.180,word.lower():alicja
+7.094,word.lower():hitlerowskich
+6.795,word.lower():boskich

Weight?,Feature
+9.138,word.lower():polskiego
+8.290,word.lower():rosyjskiego
+8.192,word.lower():francuskiego
+8.071,word.lower():niemieckiego
+7.955,word.lower():rybniczanie
+7.954,word.lower():amerykańskiego
+7.926,word.lower():polski
+7.797,word.lower():amerykanie
+7.370,word.lower():warszawiaków
+7.258,word.lower():warszawskiego

Weight?,Feature
+7.241,postag:Adv
+6.274,+1:word.lower():zadumie
+5.916,word.lower():godzinach
+5.888,word[-3:]:W
+5.888,word[-2:]:W
+5.876,-1:word.lower():wybiła
+5.802,word.lower():drugiej
+5.749,+1:word.lower():snuciu
+5.606,word.lower():godzina
+5.482,word.lower():godzinie


In [25]:
# Compare the weights of the current-token shape features
# (word.isupper / word.istitle / word.isdigit) across all six entity labels.
eli5.show_weights(
    crf,
    top=10,
    # Raw string: in a normal string literal '\.' is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on newer Pythons). The regex is unchanged.
    feature_re=r'^word\.is',
    horizontal_layout=True,
    show=['targets'],
    targets=['date', 'time', 'persName', 'placeName', 'geogName', 'orgName'],
)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
+5.042,word.isdigit(),,,,
+0.262,word.isupper(),,,,
-4.449,word.istitle(),,,,
+2.172,word.isdigit(),,,,
+0.014,word.isupper(),,,,
-3.442,word.istitle(),,,,
+3.222,word.istitle(),,,,
+1.239,word.isupper(),,,,
-1.683,word.isdigit(),,,,
+0.779,word.istitle(),,,,

Weight?,Feature
5.042,word.isdigit()
0.262,word.isupper()
-4.449,word.istitle()

Weight?,Feature
2.172,word.isdigit()
0.014,word.isupper()
-3.442,word.istitle()

Weight?,Feature
3.222,word.istitle()
1.239,word.isupper()
-1.683,word.isdigit()

Weight?,Feature
0.779,word.istitle()
0.002,word.isupper()
-3.209,word.isdigit()

Weight?,Feature
1.205,word.istitle()
0.034,word.isupper()
-0.794,word.isdigit()

Weight?,Feature
2.441,word.isupper()
1.768,word.istitle()
-0.557,word.isdigit()


In [26]:
# Current-token shape features (word.isupper / word.istitle / word.isdigit)
# for the date, time and persName labels only.
eli5.show_weights(
    crf,
    top=10,
    # Raw string: in a normal string literal '\.' is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on newer Pythons). The regex is unchanged.
    feature_re=r'^word\.is',
    horizontal_layout=True,
    show=['targets'],
    targets=['date', 'time', 'persName'],
)

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+5.042,word.isdigit(),
+0.262,word.isupper(),
-4.449,word.istitle(),
+2.172,word.isdigit(),
+0.014,word.isupper(),
-3.442,word.istitle(),
+3.222,word.istitle(),
+1.239,word.isupper(),
-1.683,word.isdigit(),
y=date  top features,y=time  top features,y=persName  top features

Weight?,Feature
5.042,word.isdigit()
0.262,word.isupper()
-4.449,word.istitle()

Weight?,Feature
2.172,word.isdigit()
0.014,word.isupper()
-3.442,word.istitle()

Weight?,Feature
3.222,word.istitle()
1.239,word.isupper()
-1.683,word.isdigit()


In [27]:
# Current-token shape features (word.isupper / word.istitle / word.isdigit)
# for the placeName, geogName and orgName labels only.
eli5.show_weights(
    crf,
    top=10,
    # Raw string: in a normal string literal '\.' is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on newer Pythons). The regex is unchanged.
    feature_re=r'^word\.is',
    horizontal_layout=True,
    show=['targets'],
    targets=['placeName', 'geogName', 'orgName'],
)

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+0.779,word.istitle(),
+0.002,word.isupper(),
-3.209,word.isdigit(),
+1.205,word.istitle(),
+0.034,word.isupper(),
-0.794,word.isdigit(),
+2.441,word.isupper(),
+1.768,word.istitle(),
-0.557,word.isdigit(),
y=placeName  top features,y=geogName  top features,y=orgName  top features

Weight?,Feature
0.779,word.istitle()
0.002,word.isupper()
-3.209,word.isdigit()

Weight?,Feature
1.205,word.istitle()
0.034,word.isupper()
-0.794,word.isdigit()

Weight?,Feature
2.441,word.isupper()
1.768,word.istitle()
-0.557,word.isdigit()


In [28]:
# POS-tag features (any feature whose name contains 'postag', i.e. the tag of
# the current, previous or next token) for placeName, geogName and orgName.
eli5.show_weights(
    crf,
    top=10,
    feature_re='postag',
    horizontal_layout=True,
    show=['targets'],
    targets=['placeName', 'geogName', 'orgName'],
)

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+1.444,postag:Noun,
+1.313,postag:Adj,
+1.248,postag:Xxx,
+0.862,postag:Brev,
+0.849,-1:postag:Xxx,
… 13 more positive …,… 13 more positive …,
… 20 more negative …,… 20 more negative …,
-0.825,-1:postag:Ppron12,
-0.958,+1:postag:Winien,
-1.747,postag:Qub,

Weight?,Feature
+1.444,postag:Noun
+1.313,postag:Adj
+1.248,postag:Xxx
+0.862,postag:Brev
+0.849,-1:postag:Xxx
… 13 more positive …,… 13 more positive …
… 20 more negative …,… 20 more negative …
-0.825,-1:postag:Ppron12
-0.958,+1:postag:Winien
-1.747,postag:Qub

Weight?,Feature
+0.991,-1:postag:Pred
+0.670,+1:postag:Num
+0.666,postag:Noun
… 14 more positive …,… 14 more positive …
… 21 more negative …,… 21 more negative …
-0.734,-1:postag:Imps
-0.757,-1:postag:Ppron12
-0.847,+1:postag:Qub
-0.913,postag:Prep
-0.991,+1:postag:Ppron12

Weight?,Feature
+1.530,postag:Ppas
+0.730,postag:Prep
+0.583,+1:postag:Xxx
+0.537,postag:Ppron3
+0.521,postag:Num
… 16 more positive …,… 16 more positive …
… 31 more negative …,… 31 more negative …
-0.532,+1:postag:Ppas
-0.601,+1:postag:Numcol
-0.628,-1:postag:Pcon


In [29]:
# POS-tag features (any feature whose name contains 'postag', i.e. the tag of
# the current, previous or next token) for date, time and persName.
eli5.show_weights(
    crf,
    top=10,
    feature_re='postag',
    horizontal_layout=True,
    show=['targets'],
    targets=['date', 'time', 'persName'],
)

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+1.228,+1:postag:Adjc,
+1.122,postag:Qub,
+0.877,-1:postag:Ppas,
+0.840,+1:postag:Ppron12,
+0.616,postag:Verbfin,
… 20 more positive …,… 20 more positive …,
… 14 more negative …,… 14 more negative …,
-0.606,+1:postag:Qub,
-0.633,postag:Noun,
-0.804,-1:postag:Pact,

Weight?,Feature
+1.228,+1:postag:Adjc
+1.122,postag:Qub
+0.877,-1:postag:Ppas
+0.840,+1:postag:Ppron12
+0.616,postag:Verbfin
… 20 more positive …,… 20 more positive …
… 14 more negative …,… 14 more negative …
-0.606,+1:postag:Qub
-0.633,postag:Noun
-0.804,-1:postag:Pact

Weight?,Feature
+7.241,postag:Adv
+1.387,-1:postag:Interj
+1.245,+1:postag:Winien
+0.647,-1:postag:Prep
+0.625,-1:postag:Pcon
+0.603,-1:postag:Verbfin
… 10 more positive …,… 10 more positive …
… 14 more negative …,… 14 more negative …
-0.703,+1:postag:Adv
-1.371,postag:Prep

Weight?,Feature
+3.187,postag:Brev
+2.166,postag:Noun
+1.582,+1:postag:Pant
+1.129,+1:postag:Xxx
… 14 more positive …,… 14 more positive …
… 26 more negative …,… 26 more negative …
-1.287,postag:Xxx
-1.314,-1:postag:Num
-1.943,postag:Adj
-2.138,postag:Conj
