#### 데이터 전처리

In [1]:
# %%time
# !rm -f ratings_train.txt ratings_test.txt
# !wget -nc https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
# !wget -nc https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt

In [2]:
import codecs
# codecs = streaming encoder

import numpy as np

# Load the training split: a UTF-8, tab-separated text file whose first line
# is a header. The built-in open() is used instead of codecs.open(), which the
# standard library documents as superseded by open().
with open('ratings_train.txt', encoding='utf-8') as f:
    # One list of tab-separated fields per line; [1:] skips the header row.
    data = [line.split('\t') for line in f.read().splitlines()][1:]

In [3]:
from pprint import pprint
# pprint = pretty print.
# Sanity check: show the first parsed row (the header was already dropped when
# `data` was built). The recorded output shows three string fields per row.
pprint(data[0])

['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0']


In [4]:
# Split the parsed rows into model inputs and targets.
# Each column is extracted in a single pass instead of transposing the whole
# dataset with zip(*data) twice. Field 1 of a row is the review text and
# field 2 is the label string, converted to an integer array.
X = tuple(row[1] for row in data)
y = np.array([row[2] for row in data], dtype=int)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Baseline model: raw term counts fed into a multinomial Naive Bayes classifier.
# The vectorizer keeps its defaults; the recorded repr of the fitted pipeline
# shows token_pattern='(?u)\b\w\w+\b', i.e. only tokens of two or more word
# characters are counted.
model1 = Pipeline(steps=[
    ("vect", CountVectorizer()),  # text -> sparse term-count matrix
    ("mb", MultinomialNB()),      # classifier on top of the counts
])

In [6]:
%%time
# Learn the vocabulary and the Naive Bayes parameters from the training reviews.
model1.fit(X, y)

Wall time: 3.6 s


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('mb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [7]:
# Load the held-out test split; same layout as the training file (UTF-8,
# tab-separated, header on the first line). The built-in open() is used instead
# of codecs.open(), which the standard library documents as superseded by open().
with open('ratings_test.txt', encoding='utf-8') as f:
    # One list of tab-separated fields per line; [1:] skips the header row.
    data_test = [line.split('\t') for line in f.read().splitlines()][1:]

In [8]:
# Split the test rows into inputs and targets, one pass per column instead of
# transposing the whole dataset with zip(*data_test) twice.
# Field 1 of a row is the review text, field 2 the label string.
X_test = tuple(row[1] for row in data_test)
y_test = np.array([row[2] for row in data_test], dtype=int)

# Per-class precision / recall / F1 of the baseline model on the held-out split.
print(classification_report(y_test, model1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83     24827
           1       0.84      0.81      0.82     25173

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000



In [None]:
# 성능이 정말 80% 정도 나올까? 실험

In [11]:
# Probe: "It's the best" (slang) - a positive phrase.
model1.predict(['캡이다'])

array([1])

In [12]:
# Probe: "It's a dud" - a negative phrase.
model1.predict(['꽝이야'])

array([0])

In [13]:
# Probe: "It's moving" - a positive phrase.
model1.predict(['감동이다'])

array([1])

In [14]:
# Probe: "So annoying" - a negative phrase.
model1.predict(['짜증나네'])

array([0])

In [15]:
# Probe: "Not a waste of money" - a negated negative phrase, positive in meaning.
# NOTE(review): the recorded output below is 0; presumably the unigram
# bag-of-words features cannot tie the negation to the word it negates.
model1.predict(['돈이 아깝지 않다'])

array([0])

In [16]:
# Probe: "A waste of money" - a negative phrase.
model1.predict(['돈 아깝다'])

array([0])

In [17]:
# Probe: "It's a masterpiece" - a positive phrase.
model1.predict(['명작이군'])

array([1])

In [18]:
# Probe: "It's a flop" - a negative phrase.
model1.predict(['망작이야'])

array([0])

In [19]:
# Probe: "It's fun" - a positive phrase.
model1.predict(['재밌다'])

array([1])

In [20]:
# Probe: "It's not great" - a negative phrase.
model1.predict(['별로다'])

array([0])

In [21]:
# Probe: "I want to watch it again" - a positive phrase.
model1.predict(['또 보고싶다'])

array([1])

In [22]:
# Probe: "I'm sleepy" - implies a boring film, so negative.
model1.predict(['졸립다'])

array([0])

In [34]:
# Probe: recent slang, roughly "popcorn-worthy"; sentiment is ambiguous.
# NOTE(review): possibly absent from the training vocabulary - TODO confirm.
model1.predict(['팝콘각'])

array([0])

In [24]:
# Probe: "Disgusting" - a negative phrase.
model1.predict(['역겹다'])

array([0])

In [25]:
# Probe: "As expected" - neutral on its own; checks which class the word leans to.
model1.predict(['역시'])

array([1])

In [26]:
# Probe: "Cola" - a sentiment-free noun; checks which class the word leans to.
model1.predict(['콜라'])

array([1])

In [27]:
# Probe: "Terrible acting" (literally "foot acting") - a negative phrase.
model1.predict(['발연기'])

array([0])

In [28]:
# Probe: an actor's name (Choi Min-sik); checks what the model associates with it.
model1.predict(['최민식'])

array([1])

In [29]:
# Probe: an actor's name (Oh Ji-ho); checks what the model associates with it.
model1.predict(['오지호'])

array([0])

In [30]:
# Probe: "It's not bad acting" - a negated negative phrase.
# NOTE(review): the recorded output below is 0, the same as for the un-negated phrase.
model1.predict(['발연기가 아니다'])

array([0])

In [31]:
# Probe: "Wow, I feel bad" - a negative phrase.
model1.predict(['우와 기분 나쁘다'])

array([0])

In [32]:
# Probe: "Wow, I feel good" - a positive phrase.
model1.predict(['우와 기분 좋다'])

array([1])

In [33]:
# Probe: "I don't feel good" - a negative phrase.
model1.predict(['기분 안좋다'])

array([0])

In [35]:
# Probe: "Good" written as a single Hangul syllable - a positive word.
# NOTE(review): the vectorizer's token_pattern '(?u)\b\w\w+\b' only matches
# tokens of two or more word characters, so this input presumably produces no
# features and the prediction reflects the class prior alone - TODO confirm.
model1.predict(['굿'])

array([0])

In [39]:
#### Tfidf 사용

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Variant 2: same classifier as the baseline, but the features are TF-IDF
# weights instead of raw counts. The vectorizer keeps its default settings.
model2 = Pipeline(steps=[
    ("vect", TfidfVectorizer()),  # text -> sparse TF-IDF matrix
    ("mb", MultinomialNB()),      # classifier on top of the weights
])

In [44]:
%%time
# Fit the TF-IDF pipeline on the same training reviews and labels.
model2.fit(X, y)

Wall time: 4.15 s


Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('mb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [45]:
# Evaluate the TF-IDF pipeline on the held-out test split.
print(classification_report(y_test, model2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83     24827
           1       0.84      0.81      0.83     25173

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000



In [49]:
#### 형태소 분석기 사용

In [46]:
from konlpy.tag import Okt
# Single Okt tagger instance, created once and reused by tokenize_pos.
pos_tagger = Okt()

def tokenize_pos(doc):
    """Tokenize ``doc`` with the module-level ``pos_tagger``.

    Every item returned by ``pos_tagger.pos(doc)`` is joined with ``'/'`` so
    that the pieces of one tagged item become a single string token
    (presumably ``morpheme/tag``; each item must be an iterable of strings).

    Args:
        doc: Text to analyse.

    Returns:
        A list of ``'/'``-joined strings, one per item yielded by the tagger,
        in the tagger's order.
    """
    tagged_tokens = []
    for tagged_item in pos_tagger.pos(doc):
        tagged_tokens.append('/'.join(tagged_item))
    return tagged_tokens

In [47]:
# Variant 3: raw counts again, but tokens come from the morphological analyser
# (tokenize_pos) instead of the vectorizer's built-in regex tokenizer.
model3 = Pipeline(steps=[
    ("vect", CountVectorizer(tokenizer=tokenize_pos)),
    ("mb", MultinomialNB()),
])

In [48]:
%%time
# Fit the morpheme-based count pipeline.
# NOTE(review): the recorded run took about 7 minutes versus seconds for the
# regex-tokenized models; presumably the tagger call per document dominates.
# Consider caching the tokenized corpus if this cell is re-run often.
model3.fit(X, y)

Wall time: 7min 1s


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_pos at 0x00000270B086FC18>,
                                 vocabulary=None)),
                ('mb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [50]:
# Evaluate the morpheme-based count pipeline on the held-out test split.
print(classification_report(y_test, model3.predict(X_test)))

              precision    recall  f1-score   support

           0       0.85      0.86      0.85     24827
           1       0.86      0.85      0.85     25173

    accuracy                           0.85     50000
   macro avg       0.85      0.85      0.85     50000
weighted avg       0.85      0.85      0.85     50000



In [51]:
# Variant 4: morpheme tokens from tokenize_pos, TF-IDF weighting, and both
# unigrams and bigrams (ngram_range=(1, 2)) as features.
model4 = Pipeline(steps=[
    ("vect", TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1, 2))),
    ("mb", MultinomialNB()),
])

In [52]:
%%time
# Fit the morpheme + bigram TF-IDF pipeline.
# NOTE(review): the recorded run took about 11 minutes; this is the slowest
# cell in the notebook.
model4.fit(X, y)

Wall time: 11min 14s


Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_pos at 0x00000270B086FC18>,
                                 use_idf=True, vocabulary=None)),
                ('mb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [53]:
# Evaluate the morpheme + bigram TF-IDF pipeline on the held-out test split.
print(classification_report(y_test, model4.predict(X_test)))

              precision    recall  f1-score   support

           0       0.86      0.87      0.87     24827
           1       0.87      0.86      0.87     25173

    accuracy                           0.87     50000
   macro avg       0.87      0.87      0.87     50000
weighted avg       0.87      0.87      0.87     50000



In [54]:
# Probe: "Not a waste of money" - a negated negative phrase, positive in meaning.
# The recorded output below is 1, unlike the baseline model on the same input.
model4.predict(['돈이 아깝지 않다'])

array([1])

In [55]:
# Probe: "A waste of money" - a negative phrase.
model4.predict(['돈 아깝다'])

array([0])

In [56]:
# Probe: recent slang, roughly "popcorn-worthy"; sentiment is ambiguous.
model4.predict(['팝콘각'])

array([0])

In [57]:
# Probe: "It's not bad acting" - a negated negative phrase.
# NOTE(review): the recorded output below is still 0, so this negation is not
# captured even with morpheme bigrams.
model4.predict(['발연기가 아니다'])

array([0])

In [58]:
# Probe: "Good" written as a single Hangul syllable - a positive word.
# The recorded output below is 1; with the custom tokenizer the one-character
# input presumably survives as a token, unlike with the default regex pattern.
model4.predict(['굿'])

array([1])