In [29]:
from nltk.tokenize import word_tokenize
from konlpy.tag import Okt
import nltk

In [3]:
# Fetch the NLTK 'punkt' package (the recorded output shows it was already up to date).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\outda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
# Module-level Okt tagger instance; pos_tokenize() calls okt.pos() on it.
okt = Okt()

In [31]:
# Toy English corpus: one (sentence, label) pair per entry.
train = [
    ('I like you', 'pos'),
    ('I hate you', 'neg'),
    ('I enjoyed it', 'pos'),
    ('I hate it', 'neg'),
]

In [32]:
def pos_tokenize(raw_sent):
    """Tag a sentence with the module-level ``okt`` tagger and flatten the result.

    Args:
        raw_sent: Sentence text passed straight to ``okt.pos``.

    Returns:
        A single string in which every item returned by
        ``okt.pos(raw_sent, norm=True, stem=True)`` is rendered as
        ``word/tag`` and the items are separated by one space.
    """
    tagged = okt.pos(raw_sent, norm=True, stem=True)
    # Only the first two fields of each tagged item (word, tag) are used.
    return ' '.join(item[0] + '/' + item[1] for item in tagged)

# Vocabulary: every distinct token that appears in any training sentence.
all_words = set()

for sent, label in train:
    # Case is preserved, e.g. 'I like you' -> ['I', 'like', 'you']
    all_words.update(word_tokenize(sent))

all_words

{'I', 'enjoyed', 'hate', 'it', 'like', 'you'}

In [33]:
# Encode each training sentence as a {vocabulary word: present?} dict paired
# with its label, which is the input shape handed to the classifier below.
train_features = []

for sent, label in train:
    words = word_tokenize(sent)
    # One boolean feature per vocabulary word: True when the sentence contains it.
    features = {set_word: (set_word in words) for set_word in all_words}
    train_features.append((features, label))

train_features

[({'you': True,
   'like': True,
   'it': False,
   'hate': False,
   'enjoyed': False,
   'I': True},
  'pos'),
 ({'you': True,
   'like': False,
   'it': False,
   'hate': True,
   'enjoyed': False,
   'I': True},
  'neg'),
 ({'you': False,
   'like': False,
   'it': True,
   'hate': False,
   'enjoyed': True,
   'I': True},
  'pos'),
 ({'you': False,
   'like': False,
   'it': True,
   'hate': True,
   'enjoyed': False,
   'I': True},
  'neg')]

In [34]:
# Fit NLTK's Naive Bayes classifier on the (feature dict, label) pairs in
# train_features, then print the features it ranks as most informative.
classifier = nltk.NaiveBayesClassifier.train(train_features)
classifier.show_most_informative_features()

Most Informative Features
                 enjoyed = False             neg : pos    =      1.7 : 1.0
                    like = False             neg : pos    =      1.7 : 1.0
                       I = True              neg : pos    =      1.0 : 1.0
                      it = False             neg : pos    =      1.0 : 1.0
                      it = True              neg : pos    =      1.0 : 1.0
                     you = False             neg : pos    =      1.0 : 1.0


In [35]:
# Unseen sentence to classify.
test_sent = 'I like it'

# Encode it against the training vocabulary, one boolean per known word.
words = word_tokenize(test_sent)
test_feature = {
    vocab_word: (vocab_word in words)
    for vocab_word in all_words
}

In [36]:
# Display the feature dict computed for the test sentence.
test_feature

{'you': False,
 'like': True,
 'it': True,
 'hate': False,
 'enjoyed': False,
 'I': True}

In [37]:
# Ask the trained classifier for the label of the test sentence's feature dict.
classifier.classify(test_feature)

'pos'

In [38]:
# Toy Korean corpus: one (sentence, label) pair per entry.
# NOTE(review): this rebinds `train`, replacing the English corpus defined earlier.
train = [
    ('사과가 좋아', 'pos'),
    ('밤에 먹는 사과는 비추야', 'neg'),
    ('사과가 잘 익었어 맛있겠다', 'pos'),
]

In [39]:
# NOTE(review): pos_tokenize is also defined in an earlier cell with the same
# behavior; one of the two definitions can be dropped.
def pos_tokenize(raw_sent):
    """Return ``raw_sent`` as a space-separated string of ``word/tag`` items.

    The items come from ``okt.pos(raw_sent, norm=True, stem=True)``, where
    ``okt`` is the module-level tagger instance.
    """
    word_tags = [
        pair[0] + '/' + pair[1]
        for pair in okt.pos(raw_sent, norm=True, stem=True)
    ]
    return ' '.join(word_tags)

In [40]:
# Vocabulary for the Korean corpus: each sentence is first rewritten as
# 'word/tag' items by pos_tokenize, then split into tokens by word_tokenize.
all_words = set()
for sent, label in train:
    sent = pos_tokenize(sent)
    all_words.update(word_tokenize(sent))

all_words

{'가/Josa',
 '는/Josa',
 '맛있다/Adjective',
 '먹다/Verb',
 '밤/Noun',
 '비추다/Verb',
 '사과/Noun',
 '에/Josa',
 '익다/Verb',
 '자다/Verb',
 '좋다/Adjective'}

In [41]:
# Encode each Korean training sentence as a {vocabulary item: present?} dict
# paired with its label.
train_features = []

for sent, label in train:
    sent = pos_tokenize(sent)
    words = word_tokenize(sent)
    presence = {
        vocab_item: (vocab_item in words)
        for vocab_item in all_words
    }
    train_features.append((presence, label))

train_features

[({'밤/Noun': False,
   '자다/Verb': False,
   '비추다/Verb': False,
   '익다/Verb': False,
   '좋다/Adjective': True,
   '가/Josa': True,
   '는/Josa': False,
   '에/Josa': False,
   '먹다/Verb': False,
   '맛있다/Adjective': False,
   '사과/Noun': True},
  'pos'),
 ({'밤/Noun': True,
   '자다/Verb': False,
   '비추다/Verb': True,
   '익다/Verb': False,
   '좋다/Adjective': False,
   '가/Josa': False,
   '는/Josa': True,
   '에/Josa': True,
   '먹다/Verb': True,
   '맛있다/Adjective': False,
   '사과/Noun': True},
  'neg'),
 ({'밤/Noun': False,
   '자다/Verb': True,
   '비추다/Verb': False,
   '익다/Verb': True,
   '좋다/Adjective': False,
   '가/Josa': True,
   '는/Josa': False,
   '에/Josa': False,
   '먹다/Verb': False,
   '맛있다/Adjective': True,
   '사과/Noun': True},
  'pos')]

In [42]:
# Re-fit the Naive Bayes classifier on the current train_features (this
# rebinds `classifier`), then print the features it ranks as most informative.
classifier = nltk.NaiveBayesClassifier.train(train_features)
classifier.show_most_informative_features()

Most Informative Features
           맛있다/Adjective = False             neg : pos    =      1.5 : 1.0
                 익다/Verb = False             neg : pos    =      1.5 : 1.0
                 자다/Verb = False             neg : pos    =      1.5 : 1.0
            좋다/Adjective = False             neg : pos    =      1.5 : 1.0


In [43]:
# Unseen Korean sentence, rewritten as 'word/tag' items before tokenizing.
test_sent = '사과는 비추야'
test_sent = pos_tokenize(test_sent)
words = word_tokenize(test_sent)

# One boolean per vocabulary item: True when the test sentence contains it.
test_feature = {
    vocab_item: (vocab_item in words)
    for vocab_item in all_words
}
print(test_feature)

{'밤/Noun': False, '자다/Verb': False, '비추다/Verb': True, '익다/Verb': False, '좋다/Adjective': False, '가/Josa': False, '는/Josa': True, '에/Josa': False, '먹다/Verb': False, '맛있다/Adjective': False, '사과/Noun': True}


In [44]:
# Ask the trained classifier for the label of the test sentence's feature dict.
classifier.classify(test_feature)

'neg'