In [2]:
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

In [3]:
from konlpy.tag import Mecab;m=Mecab()
import pickle
import re

In [4]:
label = pickle.load(open('data/flower_label.p','rb'))

In [5]:
train = label[:90]
test = label[10:]

In [6]:
len(train)

90

In [7]:
train[3]

[('원', 'NNG', 'O'),
 ('하', 'XSV', 'O'),
 ('시', 'EP', 'O'),
 ('는', 'ETM', 'O'),
 ('화분', 'NNG', 'B-PRD'),
 ('이', 'JKS', 'O'),
 ('있', 'VA', 'O'),
 ('으신', 'EP+ETM', 'O'),
 ('가요', 'NNG', 'O'),
 ('?', 'SF', 'O')]

# Feature 

In [8]:

def word2features(sent, i):
    """
    sent는 문장을 형태소 단위로 쪼개 POS 태깅을 한
    [(word, POS tag),...,] 리스트
    i는 각 형태소 튜플을 참조할 인덱스
    """
    
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-1:]=' + word[:-1],
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0: # 앞 뒤 단어 하나씩 더 본다.
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else: # 맨 앞 단어거나 맨 뒷 단어면 BOS, EOS 피처
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [9]:
sent2features(train[0])[0]

['bias',
 'word.lower=1',
 'word[-1:]=',
 'word.isdigit=True',
 'postag=SN',
 'postag[:2]=SN',
 'BOS',
 '+1:word.lower=분',
 '+1:postag=NNBC',
 '+1:postag[:2]=NN']

In [10]:
%%time
X_train = [sent2features(s) for s in train]
y_train = [sent2labels(s) for s in train]

X_test = [sent2features(s) for s in test]
y_test = [sent2labels(s) for s in test]

CPU times: user 20 ms, sys: 4 ms, total: 24 ms
Wall time: 21.2 ms


In [30]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 48 ms, sys: 0 ns, total: 48 ms
Wall time: 43.6 ms


In [31]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [32]:
%%time
trainer.train('flower-100.crfsuite')

CPU times: user 112 ms, sys: 8 ms, total: 120 ms
Wall time: 116 ms


In [11]:
tagger = pycrfsuite.Tagger()
tagger.open('flower-100.crfsuite')

<contextlib.closing at 0x7fbdf4463908>

In [36]:
example_sent = test[5]
print(' '.join(sent2tokens(example_sent)))

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

안녕 하 세요 ~ 윤여훈 님 로앤비 의 의뢰 를 받 은 문비서 입니다 . 10 월 19 일 결혼기념일 에 과일 바구니 와 꽃바구니 중 에 어떤 걸로 받 기 원 하 시 는지요 ?
Predicted: INT O O O INT O B-ORG O O O O O B-PER O O O O O O B-EVE O B-PRD I-PRD O O O O O O O O O O O O O
Correct:   INT O O O B-PER O B-ORG O O O O O B-PER O O O O O O B-EVE O B-PRD I-PRD O B-PRD O O O O O O O O O O O


In [1]:
X_test

NameError: name 'X_test' is not defined

In [12]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [13]:
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 12.1 ms


In [23]:
y_pred

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'INT',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'INT',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O'],
 ['INT', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['INT',
  'O',
  'O',
  'O',
  'INT',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-EVE',
  'O',
  'B-PRD',
  'I-PRD',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-PRD', 'I-PRD', 'O'],
 ['O',
  'O',
  'B-PRD',
  'I-PRD',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'INT',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'I-LOC',
  'I-LOC',
  'I-LOC',
  'I-LOC',
  'I-LOC',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  '

In [39]:
print(bio_classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      B-EVE       1.00      1.00      1.00         2
        INT       0.92      0.78      0.85        78
      B-LOC       0.80      0.63      0.71        19
      I-LOC       0.53      0.96      0.69        25
      B-ORG       0.82      0.64      0.72        22
      I-ORG       0.30      0.20      0.24        15
      B-PER       0.91      0.50      0.65        20
      I-PER       1.00      0.67      0.80         6
      B-PRD       0.94      0.67      0.78        24
      I-PRD       0.83      0.71      0.77        14

avg / total       0.82      0.69      0.73       225

