# 감성사전

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [77]:
# !pip install afinn

In [78]:
from afinn import Afinn
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

## 분석할 데이터: 부정적인 댓글, 긍정적인 댓글 데이터

In [79]:
import glob
neg_reviews = glob.glob('./data/neg/*.txt')
pos_reviews = glob.glob('./data/pos/*.txt')

In [80]:
# pos_text
f= open(pos_reviews[20], 'r')
pos_text = f.read()
print(pos_text)

Critics need to review what they class as a quality movie. I think the critics have seen too many actions films and have succumbed to the Matrix style of films. Europa is a breath of fresh air, a film with so many layers that one viewing is not enough to understand or appreciate this outstanding film. Lars von Trier shows that old styles of filming can produce marvellous cinema and build drama and tension. The back projection effect he uses during the film arouses and enhances the characters, and the focus of the conversation they are having. Other effects he uses such as the colour and black and white in one scene much like Hitchcock and the girl with the red coat grabs attention and enhances the drama and meaning of the scene. The commentary is superb and has a hypnotic effect, again maintaining the focus on the central characters in the scene and there actions.<br /><br />I could talk about the effects more but I think you all would agree they push this film into a category of its o

In [81]:
# neg_text
f= open(neg_reviews[20], 'r')
neg_text = f.read()
print(neg_text)

Whoever wrote the screenplay for this movie obviously never consulted any books about Lucille Ball, especially her autobiography. I've never seen so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years with Desi. I could write a whole list of factual errors, but it would go on for pages. In all, I believe that Lucille Ball is one of those inimitable people who simply cannot be portrayed by anyone other than themselves. If I were Lucie Arnaz and Desi, Jr., I would be irate at how many mistakes were made in this film. The filmmakers tried hard, but the movie seems awfully sloppy to me.


### 감성어 사전1 - Afinn(모듈)

In [82]:
afinn = Afinn()

In [84]:
afinn.score(pos_text)

14.0

### 감성어 사전2 - NRC(csv파일)

In [85]:
nrc = pd.read_csv('./data/NRC.txt',engine='python',header=None, sep='\t')
nrc

Unnamed: 0,0,1,2
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0
...,...,...,...
141815,zoom,negative,0
141816,zoom,positive,0
141817,zoom,sadness,0
141818,zoom,surprise,0


In [86]:
# 총 10개의 감성
nrc.iloc[:,1].value_counts()

anger           14182
anticipation    14182
disgust         14182
fear            14182
joy             14182
negative        14182
positive        14182
sadness         14182
surprise        14182
trust           14182
Name: 1, dtype: int64

In [87]:
# 14181 개의 단어가 10개의 감성에 포함되면 1 안되면 0
nrc.iloc[:,0].value_counts()

aback            10
poker            10
poignant         10
point            10
pointedly        10
                 ..
fallow           10
falsehood        10
falsely          10
falsification    10
zoom             10
Name: 0, Length: 14181, dtype: int64

#### nrc 전처리

In [88]:
# 1값들만 모아주기
nrc = nrc[(nrc != 0).all(1)]
nrc.head()

Unnamed: 0,0,1,2
19,abacus,trust,1
23,abandon,fear,1
25,abandon,negative,1
27,abandon,sadness,1
30,abandoned,anger,1


In [89]:
# 단어 happy에 들어간 감정: anticipation, joy, positive,trust
nrc[nrc[0]== 'happy']

Unnamed: 0,0,1,2
57871,happy,anticipation,1
57874,happy,joy,1
57876,happy,positive,1
57879,happy,trust,1


In [27]:
# 인덱스 다시 설정
nrc = nrc.reset_index(drop = True)
nrc

Unnamed: 0,0,1,2
0,abacus,trust,1
1,abandon,fear,1
2,abandon,negative,1
3,abandon,sadness,1
4,abandoned,anger,1
...,...,...,...
13896,zest,anticipation,1
13897,zest,joy,1
13898,zest,positive,1
13899,zest,trust,1


#### nrc를 활용한 긍정댓글 감성 분석

In [90]:
lines1 = pos_text

In [91]:
tokenizer = RegexpTokenizer('[\w]+')
stop_words = stopwords.words('english')
p_stemmer = PorterStemmer()

In [53]:
raw = lines1.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [i for i in tokens if i not in stop_words]
stopped_tokens

['critics',
 'need',
 'review',
 'class',
 'quality',
 'movie',
 'think',
 'critics',
 'seen',
 'many',
 'actions',
 'films',
 'succumbed',
 'matrix',
 'style',
 'films',
 'europa',
 'breath',
 'fresh',
 'air',
 'film',
 'many',
 'layers',
 'one',
 'viewing',
 'enough',
 'understand',
 'appreciate',
 'outstanding',
 'film',
 'lars',
 'von',
 'trier',
 'shows',
 'old',
 'styles',
 'filming',
 'produce',
 'marvellous',
 'cinema',
 'build',
 'drama',
 'tension',
 'back',
 'projection',
 'effect',
 'uses',
 'film',
 'arouses',
 'enhances',
 'characters',
 'focus',
 'conversation',
 'effects',
 'uses',
 'colour',
 'black',
 'white',
 'one',
 'scene',
 'much',
 'like',
 'hitchcock',
 'girl',
 'red',
 'coat',
 'grabs',
 'attention',
 'enhances',
 'drama',
 'meaning',
 'scene',
 'commentary',
 'superb',
 'hypnotic',
 'effect',
 'maintaining',
 'focus',
 'central',
 'characters',
 'scene',
 'actions',
 'br',
 'br',
 'could',
 'talk',
 'effects',
 'think',
 'would',
 'agree',
 'push',
 'film',
 

In [54]:
match_tokens = [x for x in stopped_tokens if x in list(nrc[0])] # nrc[0]은 단어목록
match_tokens

['outstanding',
 'build',
 'tension',
 'focus',
 'black',
 'white',
 'attention',
 'superb',
 'focus',
 'talk',
 'agree',
 'heighten',
 'artistic']

In [55]:
# emotion 단어 모아주기
emotion = []

for i in match_tokens:
    em_word = nrc[nrc[0]==i][1].values
    for j in em_word:
        emotion.append(j)
emotion

['joy',
 'negative',
 'positive',
 'positive',
 'anger',
 'positive',
 'negative',
 'sadness',
 'anticipation',
 'joy',
 'positive',
 'trust',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'fear',
 'negative',
 'positive']

In [56]:
sent_result = pd.Series(emotion).value_counts()
sent_result

positive        10
negative         3
joy              2
anger            1
sadness          1
anticipation     1
trust            1
fear             1
dtype: int64

In [94]:
def emotion_score(series): # series: 감성 단어의 점수 series
    emot_score = 0
    for j in range(len(series)):
        if series.index[j] in ('positive', 'anticipation', 'trust', 'joy', 'surprise'):
            emot_score += series.iloc[j]
        else:
            emot_score -= series.iloc[j]
    return emot_score

In [95]:
emotion_score(sent_result)

8

In [97]:
# 부정 댓글
lines2 = neg_text

raw = lines2.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [i for i in tokens if not i in stop_words]
match_tokens = [x for x in stopped_tokens if x in list(nrc[0])]

emotion = []
for i in match_tokens:
    em_word = nrc[nrc[0] == i][1].values
    for j in em_word:
        emotion.append(j)
        
sent_result = pd.Series(emotion).value_counts()

emotion_score(sent_result)

-2

## 한글 감성어 사전

In [98]:
ko_dict = pd.read_csv('./data/polarity.csv')
ko_dict.head(10)

Unnamed: 0,ngram,freq,COMP,NEG,NEUT,None,POS,max.value,max.prop
0,가*/JKS,1,0.0,0.0,0.0,0.0,1.0,POS,1.0
1,가*/JKS;있/VV,1,0.0,0.0,0.0,0.0,1.0,POS,1.0
2,가*/JKS;있/VV;었/EP,1,0.0,0.0,0.0,0.0,1.0,POS,1.0
3,가*/VV,3,0.0,0.0,0.0,0.0,1.0,POS,1.0
4,가*/VV;ㄴ다*/EF,1,0.0,0.0,0.0,0.0,1.0,POS,1.0
5,가/JKC,17,0.0,0.470588,0.235294,0.0,0.294118,NEG,0.470588
6,가/JKC;되/VV,11,0.0,0.363636,0.272727,0.0,0.363636,NEG,0.363636
7,가/JKC;되/VV;ㄴ/ETM,2,0.0,0.0,1.0,0.0,0.0,NEUT,1.0
8,가/JKC;되/VV;ㄹ/ETM,1,0.0,0.0,1.0,0.0,0.0,NEUT,1.0
9,가/JKC;되/VV;어/EC,2,0.0,1.0,0.0,0.0,0.0,NEG,1.0


In [99]:
import csv

In [69]:
table = dict()

with open('./data/polarity.csv', 'r', -1, 'utf-8') as polarity:
    next(polarity)
    
    for line in csv.reader(polarity):
        key = str()
        for word in line[0].split(':'):
            key += word.split('/')[0]
        table[key] = {'neg': line[3], 'neut': line[4], 'pos': line[6]}

In [None]:
columns = ['neg', 'neut', 'pos']

df= pd.DataFrame(colunms)