## 네이버 영화평 감성분석 - TfidfVectorizer

In [2]:
import pandas as pd
import numpy as np

In [6]:
# Read both splits with the same tab-separated parser settings.
# NOTE(review): the test file name carries a `_df` suffix that the train file
# name lacks - confirm it matches the file on disk.
train_df, test_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('naver_movie_train.tsv', 'naver_movie_test_df.tsv')
)
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


- Tokenizer 함수 정의

In [5]:
from konlpy.tag import Okt
# Module-level Okt tagger instance; okt_tokenizer calls its morphs() method.
okt = Okt()

In [8]:
# Tokens that okt_tokenizer removes from its output.
stopwords = [
    '은', '는', '이', '가', '의', '들', '좀', '잘', '걍', '과',
    '도', '을', '를', '으로', '자', '에', '와', '한', '하다',
]

In [9]:
def okt_tokenizer(text):
    """Split *text* into stemmed Okt morphemes, dropping stopwords.

    Args:
        text: The string to tokenize.

    Returns:
        The tokens produced by ``okt.morphs(text, stem=True)``, in their
        original order, minus any token present in the module-level
        ``stopwords`` list.
    """
    return [
        morph
        for morph in okt.morphs(text, stem=True)
        if morph not in stopwords
    ]

In [21]:
# Sample review text used to try out the tokenizer.
review1 = ' 운 좋게 시사회로 봤는데 진짜 재밌었음.. 올해 본 영화중 최고였음 ㅠ 문화의 날로 할인받아서 엄마랑 볼려고 또 예매함ㅎ'

In [22]:
# Display the tokens okt_tokenizer produces for the sample review.
okt_tokenizer(review1)

['운',
 '좋다',
 '시사회',
 '로',
 '보다',
 '진짜',
 '재밌다',
 '..',
 '올해',
 '보다',
 '영화',
 '중',
 '최고',
 '이다',
 'ㅠ',
 '문화',
 '날로',
 '할인',
 '받다',
 '엄마',
 '랑',
 '보다',
 '또',
 '예매',
 '함',
 'ㅎ']

### TfidfVectorizer 변환

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter (no category/module restriction), so
# it also hides unrelated warnings. Presumably it was added to hide the
# vectorizer's "token_pattern will not be used" warning - confirm, and prefer
# a narrower filter if so.
warnings.filterwarnings(action='ignore')

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer over Okt morphemes.
#   tokenizer     - okt_tokenizer replaces the built-in regex tokenization.
#   token_pattern - None: the pattern is never used once a custom tokenizer
#                   is supplied, and leaving the default in place makes
#                   scikit-learn emit an "unused token_pattern" UserWarning.
#   ngram_range   - build features from unigrams and bigrams.
#   max_df        - drop terms appearing in more than 90% of the documents.
tvect = TfidfVectorizer(
    tokenizer=okt_tokenizer,
    token_pattern=None,
    ngram_range=(1, 2),
    max_df=0.9,
)

In [11]:
# Fit the vectorizer on the training reviews (timed with the %time magic).
# NOTE(review): the training corpus is tokenized again by the later
# tvect.transform(train_df.document) cell; a single
# tvect.fit_transform(train_df.document) would avoid the second pass.
%time tvect.fit(train_df.document)

Wall time: 6min 42s


TfidfVectorizer(max_df=0.9, ngram_range=(1, 2),
                tokenizer=<function okt_tokenizer at 0x0000028904FB6AF0>)

In [15]:
# Convert the training reviews into TF-IDF features with the fitted vectorizer.
%time X_train_tv = tvect.transform(train_df.document)

Wall time: 7min 40s


In [12]:
# Transform (not fit) the test reviews so they share the training feature space.
X_test_tv = tvect.transform(test_df.document)

In [16]:
# Target vectors: the `label` column of each split.
y_train = train_df['label'].values
y_test = test_df['label'].values

### Naive Bayes 분류기로 학습/예측/평가

In [14]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier constructed with no arguments.
nb = MultinomialNB()
nb.get_params()  # display the parameter values in effect

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [17]:
# Train the classifier on the TF-IDF training features and their labels.
nb.fit(X_train_tv, y_train)

MultinomialNB()

In [18]:
from sklearn.metrics import accuracy_score
# Predict labels for the test features and display the accuracy against y_test.
pred = nb.predict(X_test_tv)
accuracy_score(y_test, pred)

0.8613531993060516

### 실제 테스트

In [19]:
# Two hand-written reviews (not taken from train_df/test_df) to classify.
reviews = ['아름다운 음악과 아름다운 풍광~ 그렇지 못한 현실이 찡하네요~',
'메시지와 작위성의 불협화음!!!']

In [20]:
# Vectorize the ad-hoc reviews with the already-fitted vectorizer (transform
# only, no refit) and classify them.
# NOTE(review): label meaning is presumably 1 = positive, 0 = negative - confirm
# against the dataset description.
reviews_tv = tvect.transform(reviews)
pred = nb.predict(reviews_tv)  # rebinds `pred` from the evaluation cell
pred

array([1, 0], dtype=int64)