# 원더우먼 리뷰 감성 분석

In [109]:
import pandas as pd

df = pd.read_excel('./py_data/crawling/원더우먼_리뷰.xlsx', engine='openpyxl')

In [110]:
def return_pn(number):
    if number >= 5:
        result = '긍정'
    else:
        result = '부정'
    return result

In [111]:
ls_pos_neg = []
for point in df['점수']:
    ls_pos_neg.append(return_pn(point))

In [112]:
df['pos_neg'] = ls_pos_neg

In [113]:
len(df)

4173

In [114]:
df = df[:4000]

In [115]:
df = df[['댓글', 'pos_neg']]
df.head()

Unnamed: 0,댓글,pos_neg
0,탁당이 More More하면서 원더우먼과 대치할때 수어사이드 스쿼드의 엉덩이 춤의 ...,부정
1,DC가 DC했다.,부정
2,"원더 우먼이라는 캐릭터 자체가 별다른 매력없이 소비되고, 흥미롭게 시작하는 첫부분은...",부정
3,오락영화가 이정도면 괜찮을듯한데... 스토리도 괜찮았음. 갤가돗은 예뻤고.. 마블...,긍정
4,진짜 보다가 잤음 노젬 돈 아까움,부정


In [116]:
df = df.astype('str')

In [118]:
df_train = df[:3600]
df_test = df[3600:]

In [119]:
print(len(df_train))
print(len(df_test))
print(len(df_test)/(len(df_train)+len(df_test)))

3600
400
0.1


In [120]:
train = []
for t in df_train.iterrows():
    train.append((t[1][0], t[1][1]))

test = []
for t in df_test.iterrows():
    test.append((t[1][0], t[1][1]))

- 긍/부정 비율

In [176]:
train_pos , train_neg, test_pos , test_neg = 0, 0, 0, 0

for t in train:
    if t[1] == '긍정':
        train_pos += 1
    elif t[1] == '부정':
        train_neg += 1
        
for t in test:
    if t[1] == '긍정':
        test_pos += 1
    elif t[1] == '부정':
        test_neg += 1
        
print(f'학습 데이터 >>> pos: {train_pos}, neg: {train_neg}, 비율: {train_pos/(train_pos+train_neg):.2f} / {train_neg/(train_pos+train_neg):.2f}')
print(f'평가 데이터 >>> pos: {test_pos}, neg: {test_neg}, 비율: {train_pos/(train_pos+train_neg):.2f} / {train_neg/(train_pos+train_neg):.2f}')

학습 데이터 >>> pos: 1756, neg: 1844, 비율: 0.49 / 0.51
평가 데이터 >>> pos: 290, neg: 110, 비율: 0.49 / 0.51


In [125]:
from textblob.classifiers import NaiveBayesClassifier
from tqdm import tqdm
import time

# train
start = time.time()
pos_neg = NaiveBayesClassifier(train)
print('training time: {} min'.format((time.time()-start)/60))

# test
start = time.time()
accuracy = pos_neg.accuracy(test)
print('test time: {} min'.format((time.time()-start)/60))
print('Accuracy: ', accuracy)

training time: 1.1191531658172607 min
test time: 1.502736254533132 min
Accuracy:  0.745


In [127]:
pos_neg.show_informative_features()

Most Informative Features
           contains(최악의) = True               부정 : 긍정     =     20.0 : 1.0
         contains(좋았습니다) = True               긍정 : 부정     =     15.8 : 1.0
           contains(아깝다) = True               부정 : 긍정     =     15.6 : 1.0
          contains(재밌어요) = True               긍정 : 부정     =     15.1 : 1.0
           contains(쓰레기) = True               부정 : 긍정     =     14.3 : 1.0
          contains(재미없음) = True               부정 : 긍정     =     11.1 : 1.0
           contains(영화라) = True               긍정 : 부정     =     10.9 : 1.0
            contains(이걸) = True               부정 : 긍정     =     10.5 : 1.0
           contains(봤어요) = True               긍정 : 부정     =     10.4 : 1.0
           contains(여전히) = True               긍정 : 부정     =     10.2 : 1.0


# 원더우먼 리뷰 감성분석 - 형태소 분석

In [153]:
from konlpy.tag import Okt

okt = Okt()

train_data = [(['/'.join(token) for token in okt.pos(sentence)], result) for sentence, result in train]
test_data = [(['/'.join(token) for token in okt.pos(sentence)], result) for sentence, result in test]

In [154]:
print(train_data[0])

(['탁/Noun', '당/Suffix', '이/Josa', 'More/Alpha', 'More/Alpha', '하면서/Verb', '원더우먼/Noun', '과/Josa', '대치/Noun', '할/Verb', '때/Noun', '수어/Noun', '사이드/Noun', '스쿼드/Noun', '의/Josa', '엉덩이/Noun', '춤/Noun', '의/Josa', '악몽/Noun', '이/Josa', '떠오르더군요/Verb', './Punctuation', 'DC/Alpha', '는/Verb', '대본/Noun', '과/Josa', '연출/Noun', '이/Josa', '마블/Noun', '대비/Noun', '고루/Noun', '한/Josa', '것/Noun', '같아요/Adjective', './Punctuation', '보통/Noun', '미드/Noun', '의/Josa', '작가진은/Adjective', '탄탄하던데/Adjective', 'DC/Alpha', '영화/Noun', '작가진은/Adjective', '어설픈것/Adjective', '같아/Adjective', '안타깝습니다/Adjective', './Punctuation', '수어/Noun', '사이드/Noun', '스쿼드/Noun', ',/Punctuation', '샤잠/Noun', '.../Punctuation', '저스티스리그/Noun', '는/Josa', '조금/Noun', '낫긴/Verb', '했는데/Verb', '.../Punctuation', '아쉽게도/Adjective', '마블/Noun', '과/Josa', '너무/Adverb', '대비/Noun', '되네요/Verb', './Punctuation', '재미있는/Adjective', 'DC/Alpha', '영화/Noun', '가/Josa', '나오길/Verb', '아직/Adverb', '기다려/Verb', '봅니다/Verb', './Punctuation'], '부정')


In [155]:
print(test_data[0])

(['잘생긴/Adjective', '남주/Verb', '여주/Noun', '얼굴/Noun', '말고/Josa', '볼거/Verb', '없음/Adjective'], '부정')


In [156]:
from textblob.classifiers import NaiveBayesClassifier
from tqdm import tqdm
import time

# train
start = time.time()
pos_neg = NaiveBayesClassifier(train_data)
print('training time: {} min'.format((time.time()-start)/60))

# test
start = time.time()
accuracy = pos_neg.accuracy(test_data)
print('test time: {} min'.format((time.time()-start)/60))
print('Accuracy: ', accuracy)

training time: 0.7658995588620504 min
test time: 0.9329940954844157 min
Accuracy:  0.7725


In [157]:
pos_neg.show_informative_features()

Most Informative Features
       contains(최악/Noun) = True               부정 : 긍정     =     37.1 : 1.0
contains(재밌어요/Adjective) = True               긍정 : 부정     =     22.8 : 1.0
 contains(아깝다/Adjective) = True               부정 : 긍정     =     21.9 : 1.0
      contains(쓰레기/Noun) = True               부정 : 긍정     =     20.8 : 1.0
      contains(봤어요/Verb) = True               긍정 : 부정     =     17.3 : 1.0
contains(좋았습니다/Adjective) = True               긍정 : 부정     =     17.2 : 1.0
       contains(역대/Noun) = True               부정 : 긍정     =     16.8 : 1.0
      contains(0/Number) = True               부정 : 긍정     =     14.3 : 1.0
contains(ㅡㅡ/KoreanParticle) = True               부정 : 긍정     =     14.3 : 1.0
contains(재미없음/Adjective) = True               부정 : 긍정     =     14.3 : 1.0


# 원더우먼 리뷰 감성분석 - 형태소 분석 / 조사 제거

In [159]:
from konlpy.tag import Okt

okt = Okt()

train_data = [(['/'.join(token) for token in okt.pos(sentence) if token[1] != 'Josa'], result) for sentence, result in train]
test_data = [(['/'.join(token) for token in okt.pos(sentence) if token[1] != 'Josa'], result) for sentence, result in test]

In [160]:
print(train_data[0])

(['탁/Noun', '당/Suffix', 'More/Alpha', 'More/Alpha', '하면서/Verb', '원더우먼/Noun', '대치/Noun', '할/Verb', '때/Noun', '수어/Noun', '사이드/Noun', '스쿼드/Noun', '엉덩이/Noun', '춤/Noun', '악몽/Noun', '떠오르더군요/Verb', './Punctuation', 'DC/Alpha', '는/Verb', '대본/Noun', '연출/Noun', '마블/Noun', '대비/Noun', '고루/Noun', '것/Noun', '같아요/Adjective', './Punctuation', '보통/Noun', '미드/Noun', '작가진은/Adjective', '탄탄하던데/Adjective', 'DC/Alpha', '영화/Noun', '작가진은/Adjective', '어설픈것/Adjective', '같아/Adjective', '안타깝습니다/Adjective', './Punctuation', '수어/Noun', '사이드/Noun', '스쿼드/Noun', ',/Punctuation', '샤잠/Noun', '.../Punctuation', '저스티스리그/Noun', '조금/Noun', '낫긴/Verb', '했는데/Verb', '.../Punctuation', '아쉽게도/Adjective', '마블/Noun', '너무/Adverb', '대비/Noun', '되네요/Verb', './Punctuation', '재미있는/Adjective', 'DC/Alpha', '영화/Noun', '나오길/Verb', '아직/Adverb', '기다려/Verb', '봅니다/Verb', './Punctuation'], '부정')


In [161]:
print(test_data[0])

(['잘생긴/Adjective', '남주/Verb', '여주/Noun', '얼굴/Noun', '볼거/Verb', '없음/Adjective'], '부정')


In [162]:
from textblob.classifiers import NaiveBayesClassifier
from tqdm import tqdm
import time

# train
start = time.time()
pos_neg = NaiveBayesClassifier(train_data)
print('training time: {} min'.format((time.time()-start)/60))

# test
start = time.time()
accuracy = pos_neg.accuracy(test_data)
print('test time: {} min'.format((time.time()-start)/60))
print('Accuracy: ', accuracy)

training time: 0.5741668740908304 min
test time: 0.8835611740748087 min
Accuracy:  0.7725


In [163]:
pos_neg.show_informative_features()

Most Informative Features
       contains(최악/Noun) = True               부정 : 긍정     =     37.1 : 1.0
contains(재밌어요/Adjective) = True               긍정 : 부정     =     22.8 : 1.0
 contains(아깝다/Adjective) = True               부정 : 긍정     =     21.9 : 1.0
      contains(쓰레기/Noun) = True               부정 : 긍정     =     20.8 : 1.0
      contains(봤어요/Verb) = True               긍정 : 부정     =     17.3 : 1.0
contains(좋았습니다/Adjective) = True               긍정 : 부정     =     17.2 : 1.0
       contains(역대/Noun) = True               부정 : 긍정     =     16.8 : 1.0
      contains(0/Number) = True               부정 : 긍정     =     14.3 : 1.0
contains(ㅡㅡ/KoreanParticle) = True               부정 : 긍정     =     14.3 : 1.0
contains(재미없음/Adjective) = True               부정 : 긍정     =     14.3 : 1.0
