In [31]:
from nltk.tokenize import word_tokenize
from konlpy.tag import Okt
import nltk

In [60]:
okt = Okt()

In [32]:
def load_data(file_path):
    train = []
    
    count = 0
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            if count == 500:
                break
                
            line = line.strip()
            id, doc, label = line.split('\t')
            if label == '1': 
                label = 'pos'
            elif label == '0': 
                label = 'neg'
            train.append((doc, label))
            count += 1
    return train

In [39]:
def pos_tokenize(raw_sent):
    pos_sent = []
    sent = okt.pos(raw_sent, norm=True, stem=True)
    
    for tup in sent:
        word, tag = tup[0], tup[1]
        word_tag = word + '/' + tag
        pos_sent.append(word_tag)
    return ' '.join(pos_sent)

In [40]:
def make_word_dict(train, use_morph=False):
    all_words = set()
    
    for tup in train:
        sent, label = tup[0], tup[1]
        if use_morph:
            sent = pos_tokenize(sent)
        words = word_tokenize(sent)
        for word in words:
            all_words.add(word)
    return all_words

In [41]:
def make_train_feats(train, all_words, use_morph=False):
    train_features = []
    
    for tup in train:
        sent, label = tup[0], tup[1]
        if use_morph:
            sent = pos_tokenize(sent)
        words = word_tokenize(sent)
        tmp = {set_word: (set_word in words) for set_word in all_words}
        sent_tup = (tmp, label)
        train_features.append(sent_tup)
    return train_features

In [42]:
train = load_data('ratings_train.txt')
print(train[:5])

[('document', 'label'), ('아 더빙.. 진짜 짜증나네요 목소리', 'neg'), ('흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 'pos'), ('너무재밓었다그래서보는것을추천한다', 'neg'), ('교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', 'neg')]


In [63]:
use_morph = True
all_words = make_word_dict(train, use_morph)
train_feature = make_train_feats(train, all_words, use_morph)

In [46]:
import urllib

In [48]:
test = urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt",
    filename="ratings_test.txt"
)

In [66]:
classifier = nltk.NaiveBayesClassifier.train(train_feature)
classifier.show_most_informative_features()

Most Informative Features
           재밌다/Adjective = True              pos : neg    =      7.8 : 1.0
                쓰레기/Noun = True              neg : pos    =      6.9 : 1.0
                  뭐/Noun = True              neg : pos    =      5.9 : 1.0
                해주다/Verb = True              pos : neg    =      5.2 : 1.0
            안/VerbPrefix = True              neg : pos    =      4.8 : 1.0
                 최고/Noun = True              pos : neg    =      4.6 : 1.0
                  요/Josa = True              pos : neg    =      4.2 : 1.0
            내/Determiner = True              neg : pos    =      4.0 : 1.0
                  냐/Josa = True              neg : pos    =      4.0 : 1.0
                  못/Noun = True              neg : pos    =      4.0 : 1.0


In [72]:
test_sent = str(load_data('ratings_test.txt')[1])
print(test_sent)
test_sent = pos_tokenize(test_sent)
words = word_tokenize(test_sent)
test_feature = {set_word: (set_word in words) for set_word in all_words}

('굳 ㅋ', 'pos')


In [73]:
classifier.classify(test_feature)

'neg'