## 네이버 영화 리뷰 감성 분석

In [1]:
import re
import joblib
import numpy as np 
import pandas as pd

In [2]:
train_df = pd.read_csv('../../../Machine-Learning/00.data/NaverMovie/ratings_train.txt', sep='\t')
test_df = pd.read_csv('../../../Machine-Learning/00.data/NaverMovie/ratings_test.txt', sep='\t')
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [3]:
train_df.shape, test_df.shape
#((150000, 3), (50000, 3))

((150000, 3), (50000, 3))

### 데이터 전처리

In [4]:
# 중복 여부 확인
train_df['document'].nunique() #중복되지않은 유니크한녀석이 146182개(3천개이상이 중복이다..)
#146182

146182

In [5]:
# 중복 샘플 제거
train_df.drop_duplicates(subset=['document'], inplace=True)
train_df.shape
#(146183, 3)

(146183, 3)

In [6]:
# Null값 확인
train_df.isnull().sum()

id          0
document    1
label       0
dtype: int64

In [7]:
# Null값 제거
train_df = train_df.dropna(how = 'any')
train_df.shape
#최종 훈련데이터갯수 146182

(146182, 3)

In [8]:
# 긍정, 부정 레이블 값의 갯수
train_df.label.value_counts()
#부정0:73342개 긍정1:72840개 거진 1:1 비율 확인

0    73342
1    72840
Name: label, dtype: int64

- 테스트 데이터 셋에도 마찬가지로 적용

In [9]:
# 중복 제거
test_df.drop_duplicates(subset=['document'], inplace=True)
test_df.shape
#840개정도 중복이 삭제됨
#(49158, 3)

(49158, 3)

In [10]:
# Null 제거
test_df = test_df.dropna(how='any')
test_df.shape
#(49157, 3)

(49157, 3)

In [11]:
test_df.head(3)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0


In [12]:
#일단저장 (중복,널값제거됨 / 영어,구둣점있음)
test_df.to_csv('../static/data/naver/movie_test0.tsv', sep='\t', index=False)

## 텍스트 전처리

In [13]:
#한글,공백남기고 모두 삭제(영어,구둣점)...널값등장;;
train_df['document'] = train_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [14]:
# 널값(빈값)을 nan으로만들어 dropna처리
train_df['document'].replace('', np.nan, inplace=True)
train_df.isnull().sum()

id            0
document    391
label         0
dtype: int64

In [15]:
train_df = train_df.dropna(how = 'any')
train_df.shape
#(145791, 3)

(145791, 3)

In [16]:
train_df.to_csv('../static/data/naver/movie_train.tsv', sep='\t', index=False)

- 테스트 데이터셋

In [17]:
#한글,공백남기고 모두 삭제(영어,구둣점)..널값등장;;
test_df['document'] = test_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")

In [18]:
#인덱스만구하는것 "변환했더니 아무것도 없는것만 남기고 나머지 삭제?!"
indices = list(test_df['document'] != '')
indices[:3]
#[True, False, True]

[True, False, True]

In [19]:
#(중복,널값제거됨 / 영어,구둣점있음)movie_test0.tsv
df = pd.read_csv('../static/data/naver/movie_test0.tsv', sep='\t')
new_df = df.loc[indices,:] #필터링??한것..
new_df.shape
#제대로된데이터 (48995,3)

(48995, 3)

In [20]:
#제대로된데이터를 다시 저장
new_df.to_csv('../static/data/naver/movie_test.tsv', sep='\t', index=False)

In [21]:
#실제 테스트에 쓸 데이터 (영어, 숫자, 구둣점 삭제)
test_df = pd.read_csv('../static/data/naver/movie_test.tsv', sep='\t')
test_df['document'] = test_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
test_df.shape  #구두점을 보호하려고???????하셨다고여???ㅠ^ㅠ
#(48995, 3)

(48995, 3)

## 훈련

In [22]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

## 토큰화

In [23]:
from konlpy.tag import Okt
okt = Okt()

In [24]:
#조사삭제
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을']

In [25]:
#한글,블랭크만 있는 데이터로 형태소분석(원형) 중복여부 확인
from tqdm import tqdm_notebook
X_train = []
for sentence in tqdm_notebook(train_df['document']):
    morphs = okt.morphs(sentence, stem=True) # 토큰화 형태소분석
    temp_X = ' '.join([word for word in morphs if not word in stopwords]) # 불용어 제거, 스트링의 리스트 만듦
    X_train.append(temp_X)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=145791.0), HTML(value='')))




In [26]:
#한글,블랭크만 있는 데이터로 형태소분석(원형) 중복여부 확인
X_test = []
for sentence in tqdm_notebook(test_df['document']):
    morphs = okt.morphs(sentence, stem=True) # 토큰화 형태소분석
    temp_X = ' '.join([word for word in morphs if not word in stopwords]) # 불용어 제거, 스트링의 리스트 만듦
    X_test.append(temp_X)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=48995.0), HTML(value='')))




In [27]:
y_train = train_df.label.values
y_test = test_df.label.values

In [28]:
#확인
X_train[:3]

['아 더빙 진짜 짜증나다 목소리',
 '흠 포스터 보고 초딩 영화 줄 오버 연기 조차 가볍다 않다',
 '너 무재 밓었 다그 래서 보다 추천 다']

In [29]:
#확인
y_train[:3]

array([0, 1, 0], dtype=int64)

## Case 1. CountVectorizer + LogisticRegression

In [30]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer()),
    ('lr_clf', LogisticRegression())
])
params = ({
    'count_vect__ngram_range': [(1,1), (1,2)],
    'count_vect__max_df': [300, 700],
    'lr_clf__C': [1, 10]
})

In [31]:
%time pipeline.fit(X_train, y_train)
#4.39 s초만에

Wall time: 4.39 s


Pipeline(steps=[('count_vect', CountVectorizer()),
                ('lr_clf', LogisticRegression())])

In [32]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)
# 결과값 Fitting 3 folds for each of 8 candidates, totalling 24 fits
# [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
# [Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.8min finished
# {'count_vect__max_df': 700, 'count_vect__ngram_range': (1, 2), 'lr_clf__C': 1} 0.8180820489604983

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.8min finished
{'count_vect__max_df': 700, 'count_vect__ngram_range': (1, 2), 'lr_clf__C': 1} 0.8180820489604983


In [33]:
best_count_lr = grid_pipe.best_estimator_
pred_count_lr = best_count_lr.predict(X_test)
accuracy_score(y_test, pred_count_lr)
#결과값0.8196958873354424

0.8196958873354424

In [70]:
joblib.dump(best_count_lr, '../static/model/naver_count_lr8196.pkl')

['../static/model/naver_count_lr8196.pkl']

## Case 2. CountVectorizer + NaiveBayes

In [35]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer()),
    ('nb_clf', MultinomialNB())
])
params = ({
    'count_vect__ngram_range': [(1,1), (1,2)],
    'count_vect__max_df': [300, 700]
})

In [36]:
%time pipeline.fit(X_train, y_train)
#Wall time: 1.74 s

Wall time: 1.74 s


Pipeline(steps=[('count_vect', CountVectorizer()), ('nb_clf', MultinomialNB())])

In [37]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)
# 결과값 Fitting 3 folds for each of 4 candidates, totalling 12 fits
# [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
# [Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   15.5s finished
# {'count_vect__max_df': 700, 'count_vect__ngram_range': (1, 2)} 0.8266491072837144

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   15.5s finished
{'count_vect__max_df': 700, 'count_vect__ngram_range': (1, 2)} 0.8266491072837144


In [38]:
best_count_nb = grid_pipe.best_estimator_
pred_count_nb = best_count_nb.predict(X_test)
accuracy_score(y_test, pred_count_nb)
#결과값 0.8284722930911318

0.8284722930911318

In [69]:
joblib.dump(best_count_nb, '../static/model/naver_count_nb8284.pkl')

['../static/model/naver_count_nb8284.pkl']

## Case 3. TfidfVectorizer + LogisticRegression

In [40]:
#미리 형태소분석기로 morphs했기때문에, tokenizer함수(tw_tokenizer)안써도됨!!!
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer()),
    ('lr_clf', LogisticRegression())
])
params = ({
    'tfidf_vect__ngram_range': [(1,1), (1,2)],
    'tfidf_vect__max_df': [300, 700],
    'lr_clf__C': [1, 10]
})

In [41]:
%time pipeline.fit(X_train, y_train)
#Wall time: 4.03 s

Wall time: 4.03 s


Pipeline(steps=[('tfidf_vect', TfidfVectorizer()),
                ('lr_clf', LogisticRegression())])

In [42]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)
# 결과값 Fitting 3 folds for each of 8 candidates, totalling 24 fits
# [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
# [Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.4min finished
# {'lr_clf__C': 10, 'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)} 0.8204553093126462

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.4min finished
{'lr_clf__C': 10, 'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)} 0.8204553093126462


In [62]:
best_tfidf_lr = grid_pipe.best_estimator_
pred_tfidf_lr = best_tfidf_lr.predict(X_test)
accuracy_score(y_test, pred_tfidf_lr)
#결과값 0.8203490152056332

0.8298806000612308

In [68]:
joblib.dump(best_tfidf_lr, '../static/model/naver_tfidf_lr8298.pkl')

['../static/model/naver_tfidf_lr8298.pkl']

## Case 4. TfidfVectorizer + NaiveBayes

In [45]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer()),
    ('nb_clf', MultinomialNB())
])
params = ({
    'tfidf_vect__ngram_range': [(1,1), (1,2)],
    'tfidf_vect__max_df': [300, 700]
})

In [46]:
%time pipeline.fit(X_train, y_train)
# Wall time: 1.89 s

Wall time: 1.89 s


Pipeline(steps=[('tfidf_vect', TfidfVectorizer()), ('nb_clf', MultinomialNB())])

In [47]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)
#결과값 Fitting 3 folds for each of 4 candidates, totalling 12 fits
# [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
# [Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   16.2s finished
# {'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)} 0.8286108195979175

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   16.2s finished
{'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)} 0.8286108195979175


In [48]:
best_tfidf_nb = grid_pipe.best_estimator_
pred_tfidf_nb = best_tfidf_nb.predict(X_test)
accuracy_score(y_test, pred_tfidf_nb)
#결과값 0.8298806000612308

0.8298806000612308

In [67]:
joblib.dump(best_tfidf_nb, '../static/model/naver_tfidf_nb8298.pkl')

['../static/model/naver_tfidf_nb8298.pkl']

## 테스트
- 테스트 데이터셋

In [50]:
index = 100
review = test_df.document[index]
label = test_df.label[index]
review, label
#('이렇게 지겨울수가', 0)

('이렇게 지겨울수가', 0)

In [51]:
test_data = []
morphs = okt.morphs(review, stem=True) # 토큰화(형태소분석기를 돌리는 과정)
temp_X = ' '.join([word for word in morphs if not word in stopwords]) # 불용어 제거, 스트링의 리스트로 만듦
test_data.append(temp_X)
test_data
#['이렇게 지겹다']

['이렇게 지겹다']

In [52]:
pred_cl = best_count_lr.predict(test_data)
pred_cn = best_count_nb.predict(test_data)
pred_tl = best_tfidf_lr.predict(test_data)
pred_tn = best_tfidf_nb.predict(test_data)

In [53]:
label, pred_cl[0], pred_cn[0], pred_tl[0], pred_tn[0]

(0, 0, 0, 0, 0)

- 직접 입력

In [54]:
review = '이런 사랑영화가 다시 나올 수 있을까?'

In [55]:
review = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", review) #한글과빈칸남기고 모두제거
test_data = []
morphs = okt.morphs(review, stem=True) # 토큰화(형태소분석기)
temp_X = ' '.join([word for word in morphs if not word in stopwords]) # 불용어 제거
test_data.append(temp_X)
test_data
#결과값:['이렇다 사랑 영화 다시 나오다 수 있다'] 이렇게 바꿔서 넣어줘야함..
#학습시킨대로 넣어주기;;;

['이렇다 사랑 영화 다시 나오다 수 있다']

In [56]:
pred_cl = best_count_lr.predict(test_data)
pred_cn = best_count_nb.predict(test_data)
pred_tl = best_tfidf_lr.predict(test_data)
pred_tn = best_tfidf_nb.predict(test_data)

In [57]:
pred_cl[0], pred_cn[0], pred_tl[0], pred_tn[0]

(1, 1, 1, 1)