In [1]:
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-

from time import time

from konlpy import tag
from konlpy.corpus import kolaw
from konlpy.utils import csvwrite, pprint


def tagging(tagger, text):
    """Run one tagger from ``konlpy.tag`` over ``text``.

    Args:
        tagger: Name of an attribute of the ``tag`` module; it is looked up
            with ``getattr`` and called with no arguments to build the
            tagger object.
        text: The text handed to the tagger's ``pos`` method.

    Returns:
        Whatever ``pos(text)`` returns, or an empty list when looking up,
        constructing or running the tagger raised an exception.
    """
    try:
        return getattr(tag, tagger)().pos(text)
    except Exception as e:
        # Deliberately broad: this is a best-effort benchmark, so a tagger
        # that is missing or broken is reported and skipped, not fatal.
        # A single pre-formatted string prints identically on Python 2 and 3.
        print("Uhoh, %s" % (e,))
        return []


def measure_time(taggers, mult=6):
    """Time every tagger on growing prefixes of the constitution corpus.

    Prefix lengths are ``10**0, 10**1, ..., 10**(mult - 1)`` characters.

    Args:
        taggers: Sequence of tagger names accepted by ``tagging``.
        mult: Number of prefix lengths (powers of ten) to measure.

    Returns:
        A list of rows. The first row is the header ``['n'] + taggers``;
        each following row is ``[doclen, seconds_for_tagger_1, ...]``.
    """
    base = kolaw.open('constitution.txt').read()

    # The corpus is repeated so that the longest requested prefix really
    # has that many characters. The repetition count is derived from
    # ``mult`` (it used to be a fixed 6), otherwise larger ``mult`` values
    # would silently time a shorter text than the reported length.
    longest = 10 ** (mult - 1) if mult > 0 else 0
    repeats = max(1, -(-longest // len(base))) if base else 1  # ceil division
    doc = base * repeats

    data = [['n'] + list(taggers)]
    for i in range(mult):
        doclen = 10 ** i
        sample = doc[:doclen]
        diffs = [doclen]
        for tagger in taggers:
            # Take a fresh start time per tagger so that printing the
            # previous tagger's results is not counted in this measurement.
            start = time()
            r = tagging(tagger, sample)
            elapsed = time() - start
            diffs.append(elapsed)
            print('%s\t%s\t%s' % (tagger[:5], doclen, elapsed))
            pprint(r[:5])
        data.append(diffs)
        print('')  # blank line between prefix lengths
    return data


def measure_accuracy(taggers, text):
    """Tag ``text`` with every tagger and collect the results as strings.

    Args:
        taggers: Sequence of tagger names accepted by ``tagging``.
        text: The sentence to analyse; it is echoed to stdout first.

    Returns:
        One list per tagger: the tagger name followed by each item of the
        tagging result joined with ``' / '``.
    """
    import sys

    print('\n%s' % text)
    result = []
    for tagger in taggers:
        # Label the pretty-printed output that follows on the same line.
        sys.stdout.write('%s ' % tagger)
        r = tagging(tagger, text)
        pprint(r)
        # A list comprehension (not ``map``) so the concatenation also works
        # on Python 3, where ``map`` returns an iterator.
        result.append([tagger] + [' / '.join(s) for s in r])
    return result


def _table_is_empty(table):
    """Return True when ``table`` is None, has zero length, or is falsy."""
    if table is None:
        return True
    try:
        return len(table) == 0
    except TypeError:
        # Objects without a length (e.g. ``0`` or ``False``) fall back to
        # plain truthiness.
        return not table


def _table_has_header(table):
    """Return True when the first cell of ``table`` is not a number."""
    try:
        float(table[0][0])
    except ValueError:
        return True
    except (TypeError, IndexError):
        return False
    return False


def plot(result):
    """Plot tagging time against input length and save ``images/time.png``.

    Args:
        result: The timing table. Three shapes are accepted:

            * ``None`` or empty: the table is loaded from ``morph.csv``
              (header line skipped) and transposed.
            * Rows starting with a non-numeric header row, as built by
              ``measure_time``: the header is dropped and the remaining
              rows are transposed.
            * Anything else is used as is: ``result[0]`` holds the x values
              and every following entry is one series of y values.
    """
    import os

    import numpy as np
    from matplotlib import pylab as pl

    if _table_is_empty(result):
        # ndmin=2 keeps a CSV with a single data row two-dimensional.
        result = np.loadtxt('morph.csv', delimiter=',', skiprows=1,
                            ndmin=2).T
    elif _table_has_header(result):
        result = list(zip(*result[1:]))

    if len(result):
        x, y = result[0], result[1:]
    else:
        x, y = [], []

    for series in y:
        pl.plot(x, series)

    pl.xlabel('Number of characters')
    pl.ylabel('Time (sec)')
    pl.xscale('log')
    pl.grid(True)

    # savefig does not create missing directories.
    if not os.path.isdir('images'):
        os.makedirs('images')
    pl.savefig("images/time.png")
    pl.show()


if __name__ == '__main__':

    try:
        from itertools import zip_longest
    except ImportError:  # Python 2 calls it izip_longest
        from itertools import izip_longest as zip_longest

    PLOT = False
    MULT = 6

    examples = [
        u'아버지가방에들어가신다',  # word spacing
        u'나는 밥을 먹는다',  # disambiguation
        u'하늘을 나는 자동차',  # disambiguation
        u'아이폰 기다리다 지쳐 애플공홈에서 언락폰질러버렸다 6+ 128기가실버ㅋ',  # slang
    ]

    # Every capitalised name exported by konlpy.tag is treated as a tagger.
    taggers = [t for t in dir(tag) if t[0].isupper()]

    # Time
    data = measure_time(taggers, mult=MULT)
    with open('morph.csv', 'w') as f:
        csvwrite(data, f)

    # Accuracy
    for i, example in enumerate(examples):
        rows = measure_accuracy(taggers, example)
        # Transpose to one column per tagger, padding the shorter results
        # with empty strings so every CSV row has the same width.
        columns = [list(col) for col in zip_longest(*rows, fillvalue='')]
        with open('morph-%s.csv' % i, 'w') as f:
            csvwrite(columns, f)

    # Plot
    if PLOT:
        # Passing None makes plot() load the timings written to morph.csv
        # above. The accuracy rows of the last example are not timing data
        # (and would not exist at all if ``examples`` were empty).
        plot(None)

Hanna	1	0.355000019073
[]
Kkma	1	7.23500013351
[(대, NNG)]
Komor	1	5.79499983788
[(대, NNB)]
Uhoh, global name 'Tagger' is not defined
Mecab	1	0.00100016593933
[]
Twitt	1	1.20799994469
[(대, Verb)]

Hanna	10	0.00600004196167
[]
Kkma	10	0.134999990463
[(대한민국, NNG),
 (헌법, NNG),
 (유구, XR)]
Komor	10	3.00900006294
[(대한민국헌법, NNP), (유구, XR)]
Uhoh, global name 'Tagger' is not defined
Mecab	10	0.00200009346008
[]
Twitt	10	0.565999984741
[(대한민국, Noun),
 (헌법, Noun),
 (유구, Noun)]

Hanna	100	0.00600004196167
[]
Kkma	100	0.195999860764
[(대한민국, NNG),
 (헌법, NNG),
 (유구, NNG),
 (하, XSV),
 (ㄴ, ETD)]
Komor	100	3.91700005531
[(대한민국헌법, NNP),
 (유구, XR),
 (하, XSA),
 (ㄴ, ETM),
 (역사, NNG)]
Uhoh, global name 'Tagger' is not defined
Mecab	100	0.00200009346008
[]
Twitt	100	0.0349998474121
[(대한민국, Noun),
 (헌법, Noun),
 (유구, Noun),
 (한, Josa),
 (역사, Noun)]

Hanna	1000	0.00599980354309
[]
Kkma	1000	0.898000001907
[(대한민국, NNG),
 (헌법, NNG),
 (유구, NNG),
 (하, XSV),
 (ㄴ, ETD)]
Komor	1000	4.38499999046
[(대한민국헌법, NNP),
 (유구, XR

In [None]:
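# What each example sentence compares:
#   아버지가방에들어가신다 => word-spacing (segmentation) ability
#   나는 밥을 먹는다 / 하늘을 나는 자동차 => ability to tell the two readings of
#       "나는" apart (나 "I" + topic particle vs. a form of 날다 "to fly")
#   아이폰 기다리다 지쳐 ~ => morphological analysis of words missing from the dictionary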