# Named Entity Recognition
## sklearn_crfsuite

In [36]:
import nltk
import sklearn_crfsuite
import eli5

# Fetch the 'conll2002' corpus through NLTK's downloader; the log printed below
# shows this is a no-op when the package is already up to date.
nltk.download('conll2002')
# Materialise the 'esp.train' and 'esp.testb' splits as plain lists of sentences.
# As the printed sample shows, each sentence is a list of
# (token, POS tag, IOB entity label) tuples.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
# Print the first training sentence as a quick check of the data layout.
print(train_sents[0])

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Tumi\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


[('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]


## Feature Extraction

In [37]:
def word2features(sent, i):
    """Build the feature dict describing the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence
        A sentence given as a sequence of items whose first element is the
        token string and whose second element is its POS tag
        (e.g. ``(token, postag, label)`` tuples).
    i : int
        Index of the token to describe.

    Returns
    -------
    dict
        Features of the token itself (constant bias, lower-cased form,
        3-character suffix, casing/digit flags, POS tag and its 2-character
        prefix), followed by casing and POS features of the previous token
        under ``-1:`` keys (or ``BOS`` when there is none) and of the next
        token under ``+1:`` keys (or ``EOS`` when there is none).
    """
    token, tag = sent[i][0], sent[i][1]

    # Features of the current token.
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),      # lower-cased surface form (a string)
        'word[-3:]': token[-3:],            # last 3 characters of the word
        'word.isupper()': token.isupper(),  # boolean
        'word.istitle()': token.istitle(),  # boolean
        'word.isdigit()': token.isdigit(),  # boolean
        'postag': tag,
        'postag[:2]': tag[:2],              # first 2 characters of the POS tag
    }

    # Left context: either mark the beginning of the sentence or describe
    # the preceding token.
    if i <= 0:
        features['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        features['-1:word.lower()'] = prev_token.lower()
        features['-1:word.istitle()'] = prev_token.istitle()
        features['-1:word.isupper()'] = prev_token.isupper()
        features['-1:postag'] = prev_tag
        features['-1:postag[:2]'] = prev_tag[:2]

    # Right context: either mark the end of the sentence or describe the
    # following token.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        features['+1:word.lower()'] = next_token.lower()
        features['+1:word.istitle()'] = next_token.istitle()
        features['+1:word.isupper()'] = next_token.isupper()
        features['+1:postag'] = next_tag
        features['+1:postag[:2]'] = next_tag[:2]

    return features


def sent2features(sent):
    """Return the list of per-token feature dicts for ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third element) of every 3-item entry in ``sent``."""
    labels = []
    # Unpack all three fields so that an entry of the wrong length still raises.
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first element) of every 3-item entry in ``sent``."""
    tokens = []
    # Unpack all three fields so that an entry of the wrong length still raises.
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

# Training split: one list of feature dicts and one list of labels per sentence.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

# Test split, encoded the same way.
X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

# Inspect the features of the second token of the first training sentence.
print(X_train[0][1])


{'bias': 1.0, 'word.lower()': '(', 'word[-3:]': '(', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'Fpa', 'postag[:2]': 'Fp', '-1:word.lower()': 'melbourne', '-1:word.istitle()': True, '-1:word.isupper()': False, '-1:postag': 'NP', '-1:postag[:2]': 'NP', '+1:word.lower()': 'australia', '+1:word.istitle()': True, '+1:word.isupper()': False, '+1:postag': 'NP', '+1:postag[:2]': 'NP'}


## Training CRF Model

In [38]:
# Configure the CRF tagger. The hyper-parameters are fixed by hand here; nothing
# in this cell tunes them.
# NOTE(review): c1/c2 are presumably the L1/L2 regularisation coefficients, and
# max_iterations=20 looks like a deliberately small training budget; confirm
# both against the sklearn-crfsuite documentation.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.5,
    max_iterations=20,
    all_possible_transitions=False,
)
# Train on the training split. The printed value is the estimator's repr (see
# the output below), which echoes the configuration that was used.
print(crf.fit(X_train, y_train))

# Last expression of the cell: render the learned weights (the label-to-label
# transition table plus one feature table per label), limited by top=15.
eli5.show_weights(crf, top=15)





CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.5,
    keep_tempfiles=None, max_iterations=20)


From \ To,O,B-LOC,I-LOC,B-MISC,I-MISC,B-ORG,I-ORG,B-PER,I-PER
O,3.272,2.192,0.0,2.087,0.0,3.45,0.0,2.31,0.0
B-LOC,-0.258,-0.098,4.028,0.0,0.0,0.0,0.0,-0.212,0.0
I-LOC,-0.174,-0.605,3.411,0.0,0.0,0.0,0.0,0.0,0.0
B-MISC,-0.671,-0.338,0.0,0.0,4.039,-0.306,0.0,-0.329,0.0
I-MISC,-0.799,-0.991,0.0,-0.516,4.945,-0.811,0.0,-0.606,0.0
B-ORG,-0.095,-0.241,0.0,-0.566,0.0,-1.003,4.71,-0.305,0.0
I-ORG,-0.338,-1.744,0.0,-0.834,0.0,-1.371,5.031,-0.47,0.0
B-PER,-0.4,-0.765,0.0,0.0,0.0,-0.809,0.0,-0.929,4.303
I-PER,-0.673,-0.467,0.0,0.0,0.0,0.0,0.0,-0.654,3.726

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8
+4.063,postag[:2]:Fp,,,,,,,
+3.103,BOS,,,,,,,
+2.395,bias,,,,,,,
+2.107,postag:CC,,,,,,,
+2.107,postag[:2]:CC,,,,,,,
+2.089,"word.lower():,",,,,,,,
+2.089,"word[-3:]:,",,,,,,,
+2.089,postag:Fc,,,,,,,
+2.089,postag[:2]:Fc,,,,,,,
+1.981,EOS,,,,,,,

Weight?,Feature
+4.063,postag[:2]:Fp
+3.103,BOS
+2.395,bias
+2.107,postag:CC
+2.107,postag[:2]:CC
+2.089,"word.lower():,"
+2.089,"word[-3:]:,"
+2.089,postag:Fc
+2.089,postag[:2]:Fc
+1.981,EOS

Weight?,Feature
+2.516,word.istitle()
+2.210,-1:word.lower():en
+0.899,word[-3:]:rid
+0.898,word.lower():madrid
+0.641,word.lower():españa
+0.635,word[-3:]:ona
+0.593,+1:postag[:2]:Fp
+0.590,word[-3:]:aña
+0.511,word.lower():parís
… 2291 more positive …,… 2291 more positive …

Weight?,Feature
+0.883,-1:word.istitle()
+0.661,-1:word.lower():de
+0.580,word[-3:]:de
+0.576,word.lower():de
+0.525,-1:word.lower():san
+0.441,+1:word.istitle()
+0.439,word.istitle()
+0.333,-1:word.lower():la
+0.261,postag[:2]:SP
+0.261,postag:SP

Weight?,Feature
+1.756,word.isupper()
+0.690,word.istitle()
+0.602,postag:Fe
+0.602,postag[:2]:Fe
+0.602,"word.lower():"""
+0.602,"word[-3:]:"""
+0.534,+1:word.istitle()
+0.506,"-1:word.lower():"""
+0.506,-1:postag:Fe
+0.506,-1:postag[:2]:Fe

Weight?,Feature
+1.357,-1:word.istitle()
+0.671,-1:word.lower():de
+0.594,+1:postag[:2]:Fe
+0.594,+1:postag:Fe
+0.594,"+1:word.lower():"""
+0.368,-1:postag:NC
+0.368,-1:postag[:2]:NC
+0.322,-1:word.lower():liga
+0.316,word[-3:]:de
… 3686 more positive …,… 3686 more positive …

Weight?,Feature
+2.673,word.lower():efe
+2.510,word.isupper()
+2.068,word[-3:]:EFE
+1.165,word.lower():gobierno
+1.137,word.istitle()
+1.014,-1:word.lower():del
+0.951,word[-3:]:rno
+0.665,word.lower():pp
+0.665,word[-3:]:PP
… 3527 more positive …,… 3527 more positive …

Weight?,Feature
+1.493,-1:word.istitle()
+1.194,-1:word.lower():de
+0.535,-1:word.lower():real
+0.507,word[-3:]:rid
+0.445,word[-3:]:de
+0.432,word.lower():de
+0.426,-1:postag[:2]:SP
+0.426,-1:postag:SP
+0.396,word.lower():madrid
… 3483 more positive …,… 3483 more positive …

Weight?,Feature
+1.692,word.istitle()
+0.678,-1:postag:VMI
+0.599,+1:postag[:2]:VM
+0.587,+1:postag:VMI
+0.587,postag[:2]:NP
+0.587,postag:NP
… 4126 more positive …,… 4126 more positive …
… 357 more negative …,… 357 more negative …
-0.691,-1:word.istitle()
-0.708,postag[:2]:SP

Weight?,Feature
+2.731,-1:word.istitle()
+0.734,word.istitle()
+0.655,-1:word.lower():josé
+0.597,-1:postag:AQ
+0.597,-1:postag[:2]:AQ
+0.508,-1:postag[:2]:VM
+0.483,-1:word.lower():juan
+0.416,-1:word.lower():maría
+0.411,-1:postag:VMI
… 3908 more positive …,… 3908 more positive …
