# 네이버 영화평 감성 분석

In [57]:
import numpy as np 
import matplotlib.pyplot as plt 
%matttplotlib inline

UsageError: Line magic function `%matttplotlib` not found.


In [107]:
#'네이버 영화 리뷰 데이터' 로 검색
import pandas as pd 

train_df = pd.read_csv('/Users/daeun/Downloads/Machine-Learning-2021-2-main/06.TextAnalysis/naver_movie_train.tsv', sep='\t')
test_df = pd.read_csv('/Users/daeun/Downloads/Machine-Learning-2021-2-main/06.TextAnalysis/naver_movie_test.tsv', sep='\t')

#ex. 네이버 영화평 크롤링해서 평점 6이상: 긍정 / 그 외 부정..이런식으로

In [108]:
train_df.shape, test_df.shape

((145791, 4), (48995, 4))

### 데이터 전처리

- 중복 제거

In [110]:
# 중복 여부 확인
train_df.document.nunique()

143681

In [111]:
# 중복 데이터 제거
train_df.drop_duplicates(subset=['document'], inplace=True)
train_df.shape

(143681, 4)

In [112]:

# Null 데이터가 있는지 확인
train_df.isnull().sum()

Unnamed: 0    0
id            0
document      0
label         0
dtype: int64

In [113]:
# Null 데이터 제거
train_df = train_df.dropna(how='any')
train_df.shape

(143681, 4)

In [114]:
# 긍정, 부정 레이블의 분포
train_df.label.value_counts()

0    72215
1    71466
Name: label, dtype: int64

In [115]:
# 중복 여부 확인
test_df.document.nunique()

48417

In [116]:
# 중복 데이터 제거
test_df.drop_duplicates(subset=['document'], inplace=True)
test_df.shape

(48417, 4)

In [117]:
# Null 데이터가 있는지 확인
test_df.isnull().sum()

Unnamed: 0    0
id            0
document      0
label         0
dtype: int64

In [118]:
# Null 데이터 제거
test_df = test_df.dropna(how='any')
test_df.shape

(48417, 4)

In [72]:
test['label'].value_counts()

1    24712
0    24446
Name: label, dtype: int64

In [119]:
# 긍정, 부정 레이블의 분포
test_df.label.value_counts()

1    24294
0    24123
Name: label, dtype: int64

### 텍스트 전처리

In [120]:
# 한글과 공백 이외는 제거
train_df['document'] = train_df.document.str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','')
train_df.head(3)

Unnamed: 0.1,Unnamed: 0,id,document,label
0,0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,2,10265843,너무재밓었다그래서보는것을추천한다,0


In [121]:
# '' 데이터는 Nan으로 변환한 후 제거
train_df['document'].replace('', np.nan, inplace=True)
train_df.document.isnull().sum()

0

In [122]:

train_df = train_df.dropna(how='any')
train_df.shape

(143681, 4)

In [123]:
test_df['document'] = test_df.document.str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','')
test_df['document'].replace('', np.nan, inplace=True)
test_df = test_df.dropna(how='any')
test_df.shape

(48417, 4)

In [124]:
train_df.to_csv('naver_movie_train.tsv', sep='\t')
test_df.to_csv('naver_movie_test.tsv', sep='\t')

### 한글처리

In [125]:
from konlpy.tag import Okt
okt = Okt()

In [126]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을','ㅋㅋ','ㅠㅠ','ㅎㅎ']

In [127]:

text = '교도소 이야기구먼 솔직히 재미는 없다평점 조정'
okt.morphs(text)

['교도소', '이야기', '구먼', '솔직히', '재미', '는', '없다', '평점', '조정']

In [128]:
okt.morphs(text, stem=True)

['교도소', '이야기', '구먼', '솔직하다', '재미', '는', '없다', '평점', '조정']

In [129]:
from tqdm import tqdm_notebook
X_train = []
for sentence in tqdm_notebook(train_df.document):
    morphs = okt.morphs(sentence, stem=True)
    temp_X = ' '.join([word for word in morphs if not word in stopwords])
    X_train.append(temp_X)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sentence in tqdm_notebook(train_df.document):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=143681.0), HTML(value='')))




In [140]:
from tqdm import tqdm_notebook
X_test = []
for sentence in tqdm_notebook(test_df.document):
    morphs = okt.morphs(sentence, stem=True)
    temp_X = ' '.join([word for word in morphs if not word in stopwords])
    X_test.append(temp_X)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sentence in tqdm_notebook(test_df.document):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=48417.0), HTML(value='')))




In [141]:
y_train = train_df.label.values
y_test = test_df.label.values

### 4. Feature 변환, 모델 학습/예측/평가

In [142]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction.text import CountVectorizer

In [143]:
cvect = CountVectorizer()
cvect.fit(X_train)
X_train_cv = cvect.transform(X_train)
X_test_cv = cvect.transform(X_test) 

In [144]:
lr = LogisticRegression()
lr.fit(X_train_cv, y_train)
pred = lr.predict(X_test_cv)
accuracy_score(y_test, pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8265898341491625

### 실제 테스트

- 텍스트전처리

In [167]:
review1 = '아름다운 음악과 아름다운 풍광~ 그렇지 못한 현실이 찡하네요~'
review2 = '메세지와 작위성의 불협화음~!~!!!'

In [168]:
review1 = review1.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','')
review1

'아름다운 음악과 아름다운 풍광~ 그렇지 못한 현실이 찡하네요~'

In [169]:
#import re 
#review1 = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','',review1)

In [170]:
morphs = okt.morphs(review1, stem=True)
review = ' '.join([word for word in morphs if word not in stopwords])
review

'아름답다 음악 아름답다 풍 광 ~ 그렇다 못 현실 찡하다 ~'

In [171]:
review_cv = cvect.transform([review]) #리스트로 넣기 #리뷰를 countvectorizer 로 피쳐 변환
pred = lr.predict(review_cv)
pred[0]

#결과가 1이면 긍정, 0이면 부정

1

In [172]:
review2 = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','',review2)

In [173]:
morphs = okt.morphs(review2, stem=True)
review = ' '.join([word for word in morphs if word not in stopwords])
review_cv = cvect.transform([review]) #리스트로 넣기 #리뷰를 countvectorizer 로 피쳐 변환
pred = lr.predict(review_cv)
pred[0]

0

In [178]:
reviews= ['아름다운 음악과 아름다운 풍광~ 그렇지 못한 현실이 찡하네요~',
            '메세지와 작위성의 불협화음']

In [188]:
review_list = []
for review in reviews:
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', review)
    morphs = okt.morphs(review, stem=True)
    temp_X = ' '.join([word for word in morphs if not word in stopwords])
    review_list.append(temp_X)



In [189]:
review_cv = cvect.transform(review_list)
pred = lr.predict(review_cv)
pred

array([1, 0])

### 6. GridSearchCV로 최적 파라미터 찾기

In [199]:
from sklearn.pipeline import Pipeline 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV 

In [204]:
pipeline = Pipeline([
    ('cvect', CountVectorizer()),
    ('lr', LogisticRegression())
])

params = {
    'cvect__ngram_range':[(1,1),(1,2)],
    'cvect__max_df': [0.9,],
    'cvect__min)df': [1,3],
    'lr__C': [1,5]
}

In [205]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', n_jobs=-1)
%time grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_score_, grid_pipe.best_params_)

ValueError: Invalid parameter min)df for estimator CountVectorizer(max_df=0.9). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
pred = grid_pipe.best_estimator_.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'CounterVectorizer + LogisticRegression 정확도: {acc:.4f}')