In [41]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

# Show which scikit-learn version is installed in this kernel.
print(sklearn.__version__)

0.19.0


In [1]:
import csv
# Read the training corpus as a flat list of lines.
txt = 'train.txt'
with open(txt, 'r') as train_file:
    data = train_file.readlines()

# data2features walks the corpus three lines at a time (tokens, POS tags,
# NE labels), so the split point must be a multiple of three; otherwise both
# halves start in the middle of a record and every line is misread.
# Floor division also keeps the slice index an int on Python 3.
# Roughly the first fifth of the sentences is held out for validation.
validation_len = (len(data) // 3 // 5) * 3
validation_set = data[:validation_len]
train_set = data[validation_len:]

In [20]:
# Extract features from words. 
def word2features(word_line, pos_line, i):
    """Build the list of string features describing token ``i`` of a sentence.

    ``word_line`` and ``pos_line`` are parallel sequences holding the tokens
    and their POS tags. The returned list contains, in order:

    * features of the token itself (bias, lowercase form, 3- and 2-character
      suffixes, upper/title/digit flags, POS tag and its first two characters);
    * the same kind of descriptors for the previous token, prefixed with
      ``-1:``, or the single marker ``'BOS'`` when ``i`` is not greater than 0;
    * descriptors for the next token, prefixed with ``+1:``, or ``'EOS'``
      when ``i`` is not before the last position of ``word_line``.
    """
    def neighbour_features(position, prefix):
        # Both neighbours get the same five descriptors; only the prefix differs.
        near_word = word_line[position]
        near_tag = pos_line[position]
        return [
            prefix + ':word.lower=' + near_word.lower(),
            prefix + ':word.istitle=%s' % near_word.istitle(),
            prefix + ':word.isupper=%s' % near_word.isupper(),
            prefix + ':postag=' + near_tag,
            prefix + ':postag[:2]=' + near_tag[:2],
        ]

    token = word_line[i]
    tag = pos_line[i]
    features = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + tag,
        'postag[:2]=' + tag[:2],
    ]

    # Left context, or the beginning-of-sentence marker.
    features += neighbour_features(i - 1, '-1') if i > 0 else ['BOS']

    # Right context, or the end-of-sentence marker.
    has_next = i < len(word_line) - 1
    features += neighbour_features(i + 1, '+1') if has_next else ['EOS']

    return features

In [42]:
# Construct the feature sequences and label sequences that the trainer consumes
def data2features(data):
    """Convert raw corpus lines into per-sentence feature and label sequences.

    ``data`` is a list of text lines in which every sentence occupies three
    consecutive lines: whitespace-separated tokens, their POS tags, and their
    NE labels.

    Returns a pair ``(X, Y)`` where ``X[k]`` is the list of per-token feature
    lists for sentence ``k`` (as produced by ``word2features``) and ``Y[k]``
    is that sentence's list of labels.

    Whitespace-only lines at the end of ``data`` are ignored. Raises
    ``ValueError`` if the remaining line count is not a multiple of three,
    because a truncated or misaligned corpus would otherwise be misread.
    """
    # Drop trailing blank lines so a final newline does not look like a record.
    usable = len(data)
    while usable > 0 and not data[usable - 1].strip():
        usable -= 1

    if usable % 3 != 0:
        raise ValueError(
            'expected 3 lines per sentence (tokens, POS tags, labels), '
            'got %d lines' % usable
        )

    X = []
    Y = []
    for start in range(0, usable, 3):
        word_line = data[start].split()
        pos_line = data[start + 1].split()
        NE_line = data[start + 2].split()

        # Use a distinct index name: reusing the outer loop variable inside
        # the comprehension overwrites it on Python 2.
        feature = [word2features(word_line, pos_line, pos)
                   for pos in range(len(word_line))]

        X.append(feature)
        Y.append(NE_line)
    return X, Y

In [39]:
# Featurize the training split and the held-out validation split.
X_train, y_train = data2features(train_set)
X_test, y_test = data2features(validation_set)

# Hand every training sentence (features + labels) to a non-verbose trainer.
trainer = pycrfsuite.Trainer(verbose=False)
for sentence_features, sentence_labels in zip(X_train, y_train):
    trainer.append(sentence_features, sentence_labels)

In [44]:
# set parameters
# Training hyper-parameters, named so they can be inspected after the call.
crf_params = {
    # coefficient for L1 penalty
    'c1': 1.0,
    # coefficient for L2 penalty
    'c2': 1e-3,
    # stop earlier
    'max_iterations': 50,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

In [47]:
# Fit the CRF on the appended sequences; the model is stored under the name 'NER'.
trainer.train('NER')
# Open that same model in a fresh tagger so it can label new sequences.
tagger = pycrfsuite.Tagger()
tagger.open('NER')

CPU times: user 12.5 s, sys: 79.9 ms, total: 12.5 s
Wall time: 12.6 s


In [52]:
def bio_classification_report(y_true, y_pred):
    """
    Build a classification report for lists of BIO-encoded label sequences.

    Both arguments are flattened to token level, so the metrics are per
    token, and the "O" label is left out of the report.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    binarizer = LabelBinarizer()

    # Fit on the gold labels, then encode the predictions with the same classes.
    gold_tokens = [label for sequence in y_true for label in sequence]
    y_true_combined = binarizer.fit_transform(gold_tokens)
    predicted_tokens = [label for sequence in y_pred for label in sequence]
    y_pred_combined = binarizer.transform(predicted_tokens)

    # Column position of every class in the binarized matrices.
    column_of = {}
    for column, label in enumerate(binarizer.classes_):
        column_of[label] = column

    # Report every class except "O", grouped by entity type and then by the
    # B/I prefix (the sort key is the label split on its first dash, reversed).
    reported = [label for label in binarizer.classes_ if label != 'O']
    reported.sort(key=lambda label: label.split('-', 1)[::-1])

    report_columns = [column_of[label] for label in reported]
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=report_columns,
        target_names=reported,
    )

In [53]:
# Label every validation sentence, then print the token-level report.
y_pred = list(map(tagger.tag, X_test))
validation_report = bio_classification_report(y_test, y_pred)
print(validation_report)

CPU times: user 318 ms, sys: 28.7 ms, total: 347 ms
Wall time: 348 ms


In [55]:
# helper function to output the NE results
def tagging(begin_index, end_index, tag):
    """Append the span ``"<begin_index>-<end_index> "`` to one global accumulator.

    ``tag`` selects which of the module-level strings PER, LOC, ORG or MISC
    receives the span; any other tag leaves all four untouched.
    """
    global PER, LOC, ORG, MISC
    span = '%s-%s ' % (begin_index, end_index)
    if tag == 'PER':
        PER = PER + span
    elif tag == 'LOC':
        LOC = LOC + span
    elif tag == 'ORG':
        ORG = ORG + span
    elif tag == 'MISC':
        MISC = MISC + span

In [72]:
# Second helper: walk one sentence's tags and record each entity span via tagging()
def tagging2(tags, num_line):
    """Report every entity span found in one tagged sentence via ``tagging``.

    ``tags`` is the sequence of predicted tags for the sentence, where a tag
    starting with 'B' opens an entity and a tag starting with 'O' is outside
    any entity. ``num_line`` holds the identifier of each token, aligned
    with ``tags``.

    For each entity, ``tagging(first_id, last_id, type)`` is called, where
    the ids come from ``num_line`` and ``type`` is the text after the
    two-character prefix of the entity's last tag (e.g. 'PER' for 'I-PER').
    An open entity is closed by the next 'B' tag, by an 'O' tag, or by the
    end of the sentence.
    """
    open_start = None  # position of the 'B' tag of the entity being read
    for i, tag in enumerate(tags):
        starts_entity = tag.startswith('B')

        # Flush the open entity *before* recording a new start, so that two
        # adjacent entities do not end up sharing the second one's start id.
        if open_start is not None and (starts_entity or tag.startswith('O')):
            tagging(num_line[open_start], num_line[i - 1], tags[i - 1][2:])
            open_start = None

        if starts_entity:
            open_start = i

    # An entity that runs up to the final token has no closing tag after it.
    if open_start is not None:
        last = len(tags) - 1
        tagging(num_line[open_start], num_line[last], tags[last][2:])

In [56]:
# Retrain on the complete training file (no validation hold-out).
txt = 'train.txt'
with open(txt, 'r') as file:
    data = file.readlines()

X_train, y_train = data2features(data)

# A fresh, non-verbose trainer loaded with every sentence.
trainer = pycrfsuite.Trainer(verbose=False)
for sentence_features, sentence_labels in zip(X_train, y_train):
    trainer.append(sentence_features, sentence_labels)

crf_params = {
    # coefficient for L1 penalty
    'c1': 1.0,
    # coefficient for L2 penalty
    'c2': 1e-3,
    # stop earlier
    'max_iterations': 50,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

# Train, store the model under the name 'NER', and reopen it for tagging.
trainer.train('NER')
tagger = pycrfsuite.Tagger()
tagger.open('NER')

<contextlib.closing at 0x11666cfd0>

In [79]:
headers = ['Type','Prediction']

# Global accumulators: tagging() appends one "begin-end " span per entity.
PER = ''
LOC = ''
ORG = ''
MISC = ''

txt = 'test.txt'
with open(txt, 'r') as file:
    data = file.readlines()

# Each test sentence takes three lines: tokens, POS tags, and the per-token
# identifiers that are reported in the output spans.
for i in range(0, len(data), 3):
    word_line = data[i].split()
    pos_line = data[i+1].split()
    num_line = data[i+2].split()

    # The token index gets its own name; reusing ``i`` here would overwrite
    # the record index on Python 2, where comprehension variables leak.
    feature = [word2features(word_line, pos_line, token_idx)
               for token_idx in range(len(word_line))]
    tags = tagger.tag(feature)
    tagging2(tags, num_line)

# One output row per entity type, in the fixed order PER, LOC, ORG, MISC.
rows = [
    ('PER', PER),
    ('LOC', LOC),
    ('ORG', ORG),
    ('MISC', MISC),
]

with open('CRF.csv','w') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)