In [22]:
import pandas as pd
from konlpy.tag import Kkma
import nltk

In [4]:
kkma = Kkma()
example = '한국어 분석을 시작합니다. 기대되어요'
kkma.sentences(example) #문장 분석 결과

['한국어 분석을 시작합니다.', '기대되어요']

In [5]:
kkma.nouns(example) #명사 분석 결과

['한국어', '분석']

In [6]:
kkma.pos(example) #형태소 분석 결과

[('한국어', 'NNG'),
 ('분석', 'NNG'),
 ('을', 'JKO'),
 ('시작하', 'VV'),
 ('ㅂ니다', 'EFN'),
 ('.', 'SF'),
 ('기대되', 'VV'),
 ('어요', 'EFN')]

In [14]:
train = [('떼껄룩이 좋다','pos'),('고양이도 좋다','pos'),('난 수업이 지루하다','neg'),('떼껄룩은 사랑스러워','pos'),('나는 수업마치고 떼껄룩이랑 놀거야','pos'),('떼껄룩이 사고치면 화가나','neg')]

In [15]:
def tokenize(doc):
    return ['/'.join(t) for t in kkma.pos(doc)]
train_docs = [(tokenize(row[0]),row[1]) for row in train]

In [16]:
train_docs

[(['떼껄룩/UN', '이/JKS', '좋/VA', '다/EFN'], 'pos'),
 (['고양이/NNG', '도/JX', '좋/VA', '다/EFN'], 'pos'),
 (['낳/VV', 'ㄴ/ETD', '수업/NNG', '이/JKS', '지루/XR', '하/XSA', '다/EFN'], 'neg'),
 (['떼껄룩/UN', '은/JX', '사랑/NNG', '스럽/XSA', '어/ECS'], 'pos'),
 (['나/NP',
   '는/JX',
   '수업/NNG',
   '마/NNG',
   '치고/JX',
   '떼껄룩/UN',
   '이랑/JC',
   '놀/VV',
   'ㄹ/ETD',
   '거/NNB',
   '야/JX'],
  'pos'),
 (['떼껄룩/UN', '이/JKS', '사고/NNG', '치/VV', '면/ECE', '화가/NNG', '낳/VV'], 'neg')]

In [17]:
tokens = [t for d in train_docs for t in d[0]]

In [18]:
tokens

['떼껄룩/UN',
 '이/JKS',
 '좋/VA',
 '다/EFN',
 '고양이/NNG',
 '도/JX',
 '좋/VA',
 '다/EFN',
 '낳/VV',
 'ㄴ/ETD',
 '수업/NNG',
 '이/JKS',
 '지루/XR',
 '하/XSA',
 '다/EFN',
 '떼껄룩/UN',
 '은/JX',
 '사랑/NNG',
 '스럽/XSA',
 '어/ECS',
 '나/NP',
 '는/JX',
 '수업/NNG',
 '마/NNG',
 '치고/JX',
 '떼껄룩/UN',
 '이랑/JC',
 '놀/VV',
 'ㄹ/ETD',
 '거/NNB',
 '야/JX',
 '떼껄룩/UN',
 '이/JKS',
 '사고/NNG',
 '치/VV',
 '면/ECE',
 '화가/NNG',
 '낳/VV']

In [19]:
def term_exists(doc):
    return {'exists({})'.format(word) : (word in set(doc)) for word in tokens}

In [20]:
train_xy = [(term_exists(d),c) for d,c in train_docs]
train_xy

[({'exists(떼껄룩/UN)': True,
   'exists(이/JKS)': True,
   'exists(좋/VA)': True,
   'exists(다/EFN)': True,
   'exists(고양이/NNG)': False,
   'exists(도/JX)': False,
   'exists(낳/VV)': False,
   'exists(ㄴ/ETD)': False,
   'exists(수업/NNG)': False,
   'exists(지루/XR)': False,
   'exists(하/XSA)': False,
   'exists(은/JX)': False,
   'exists(사랑/NNG)': False,
   'exists(스럽/XSA)': False,
   'exists(어/ECS)': False,
   'exists(나/NP)': False,
   'exists(는/JX)': False,
   'exists(마/NNG)': False,
   'exists(치고/JX)': False,
   'exists(이랑/JC)': False,
   'exists(놀/VV)': False,
   'exists(ㄹ/ETD)': False,
   'exists(거/NNB)': False,
   'exists(야/JX)': False,
   'exists(사고/NNG)': False,
   'exists(치/VV)': False,
   'exists(면/ECE)': False,
   'exists(화가/NNG)': False},
  'pos'),
 ({'exists(떼껄룩/UN)': False,
   'exists(이/JKS)': False,
   'exists(좋/VA)': True,
   'exists(다/EFN)': True,
   'exists(고양이/NNG)': True,
   'exists(도/JX)': True,
   'exists(낳/VV)': False,
   'exists(ㄴ/ETD)': False,
   'exists(수업/NNG)': False

In [23]:
classifier = nltk.NaiveBayesClassifier.train(train_xy)

In [24]:
classifier.show_most_informative_features()

Most Informative Features
           exists(이/JKS) = True              neg : pos    =      2.8 : 1.0
          exists(화가/NNG) = False             pos : neg    =      1.8 : 1.0
           exists(면/ECE) = False             pos : neg    =      1.8 : 1.0
           exists(하/XSA) = False             pos : neg    =      1.8 : 1.0
           exists(지루/XR) = False             pos : neg    =      1.8 : 1.0
          exists(사고/NNG) = False             pos : neg    =      1.8 : 1.0
           exists(ㄴ/ETD) = False             pos : neg    =      1.8 : 1.0
            exists(치/VV) = False             pos : neg    =      1.8 : 1.0
          exists(떼껄룩/UN) = False             neg : pos    =      1.7 : 1.0
          exists(수업/NNG) = True              neg : pos    =      1.7 : 1.0


In [25]:
test_sentence = ['나는 수업을 마치면 떼껄룩이랑 놀거야']
test_docs = tokenize(test_sentence[0])
test_docs

['나/NP',
 '는/JX',
 '수업/NNG',
 '을/JKO',
 '마치/VV',
 '면/ECE',
 '떼껄룩/UN',
 '이랑/JC',
 '놀/VV',
 'ㄹ/ETD',
 '거/NNB',
 '야/JX']

In [26]:
test_sent_features = {word: (word in tokens) for word in test_docs}
test_sent_features

{'나/NP': True,
 '는/JX': True,
 '수업/NNG': True,
 '을/JKO': False,
 '마치/VV': False,
 '면/ECE': True,
 '떼껄룩/UN': True,
 '이랑/JC': True,
 '놀/VV': True,
 'ㄹ/ETD': True,
 '거/NNB': True,
 '야/JX': True}

In [27]:
classifier.classify(test_sent_features)

'pos'