In [53]:
import urllib
from nltk.tokenize import word_tokenize
from konlpy.tag import Okt
import nltk

okt = Okt()

In [54]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")

('ratings_train.txt', <http.client.HTTPMessage at 0x22487fcf370>)

In [55]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x2248774db20>)

In [65]:
def load_data(file_path, max_count=500):
    train = []
    
    count = 0
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            if count == max_count:
                break
                
            line = line.strip()
            id, doc, label = line.split('\t')
            if label == '1': 
                label = 'pos'
            elif label == '0': 
                label = 'neg'
            train.append((doc, label))
            count += 1
    return train

In [57]:
def pos_tokenize(raw_sent):
    pos_sent = []
    sent = okt.pos(raw_sent, norm=True, stem=True)
    
    for tup in sent:
        word, tag = tup[0], tup[1]
        word_tag = word + '/' + tag
        pos_sent.append(word_tag)
    return ' '.join(pos_sent)

In [58]:
def make_word_dict(train, use_morph=False):
    all_words = set()
    
    for tup in train:
        sent, label = tup[0], tup[1]
        if use_morph:
            sent = pos_tokenize(sent)
        words = word_tokenize(sent)
        for word in words:
            all_words.add(word)
    return all_words

In [59]:
def make_train_feats(train, all_words, use_morph=False):
    train_features = []
    
    for tup in train:
        sent, label = tup[0], tup[1]
        if use_morph:
            sent = pos_tokenize(sent)
        words = word_tokenize(sent)
        tmp = {set_word: (set_word in words) for set_word in all_words}
        sent_tup = (tmp, label)
        train_features.append(sent_tup)
    return train_features

# 500개

In [85]:
test = load_data('ratings_test.txt', max_count=500)
test = test[1:]

In [86]:
use_morph = True
all_words = make_word_dict(test, use_morph)
test_feature = make_train_feats(test, all_words, use_morph)

In [87]:
classifier = nltk.NaiveBayesClassifier.train(test_feature)

In [88]:
classifier.show_most_informative_features()

Most Informative Features
           재밌다/Adjective = True              pos : neg    =      7.8 : 1.0
                쓰레기/Noun = True              neg : pos    =      6.9 : 1.0
                  뭐/Noun = True              neg : pos    =      5.9 : 1.0
                해주다/Verb = True              pos : neg    =      5.2 : 1.0
            안/VerbPrefix = True              neg : pos    =      4.8 : 1.0
                 최고/Noun = True              pos : neg    =      4.6 : 1.0
                  요/Josa = True              pos : neg    =      4.2 : 1.0
            내/Determiner = True              neg : pos    =      4.0 : 1.0
                  냐/Josa = True              neg : pos    =      4.0 : 1.0
                  못/Noun = True              neg : pos    =      4.0 : 1.0


In [89]:
nltk.classify.accuracy(classifier, test_features)

0.748

# 1000개

In [93]:
test = load_data('ratings_test.txt', max_count=1000)
test = test[1:]
all_words = make_word_dict(test, use_morph)
test_feature = make_train_feats(test, all_words, use_morph)
classifier = nltk.NaiveBayesClassifier.train(test_feature)

In [94]:
classifier.show_most_informative_features()

Most Informative Features
                 다시/Noun = True              pos : neg    =     13.2 : 1.0
          재미없다/Adjective = True              neg : pos    =     12.8 : 1.0
                해주다/Verb = True              pos : neg    =      8.7 : 1.0
                  냐/Josa = True              neg : pos    =      7.9 : 1.0
                 최고/Noun = True              pos : neg    =      7.6 : 1.0
                 가슴/Noun = True              pos : neg    =      7.4 : 1.0
                  뭐/Noun = True              neg : pos    =      7.2 : 1.0
                쓰레기/Noun = True              neg : pos    =      7.2 : 1.0
          재미있다/Adjective = True              pos : neg    =      6.8 : 1.0
                 마음/Noun = True              pos : neg    =      6.8 : 1.0


In [95]:
nltk.classify.accuracy(classifier, test_features)

0.794

# 결과 비교

500개 일 때는 정확도가 약 75% 정도이고 1000개 일 때는 80%이다.

학습한 문장이 1000개면 500개일 때보다 약 5% 정도 향상된다.