# 비지도학습 기반 감성분석 - Lexicon 기반
 

In [24]:
import numpy as np
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')

WordNet Synset 및 SentiWordNet SentiSynset 클래스

In [10]:
from nltk.corpus import wordnet

term = 'present' 
# As a noun 'present' means a gift or the current time; as a verb it means to show or to announce.
synsets = wordnet.synsets(term)

In [11]:
type(synsets), len(synsets)

(list, 18)

In [12]:
print(synsets)

[Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [14]:
# Show the name, lexname() category (e.g. 'noun.time'), definition and lemma
# names of the first five senses.
for synset in synsets[:5]:
    print(f'#### name: {synset.name()} ####')
    fields = (
        ('POS:', synset.lexname()),
        ('정의:', synset.definition()),
        ('표제어:', synset.lemma_names()),
    )
    for label, value in fields:
        print(label, value)

#### name: present.n.01 ####
POS: noun.time
정의: the period of time that is happening now; any continuous stretch of time including the moment of speech
표제어: ['present', 'nowadays']
#### name: present.n.02 ####
POS: noun.possession
정의: something presented as a gift
표제어: ['present']
#### name: present.n.03 ####
POS: noun.communication
정의: a verb tense that expresses actions or states at the time of speaking
표제어: ['present', 'present_tense']
#### name: show.v.01 ####
POS: verb.perception
정의: give an exhibition of to an interested audience
표제어: ['show', 'demo', 'exhibit', 'present', 'demonstrate']
#### name: present.v.02 ####
POS: verb.communication
정의: bring forward and present to the mind
표제어: ['present', 'represent', 'lay_out']


In [15]:
# Synset 클래스의 자료구조: 위와 같은 메소드(name, lexname, definition, lemma_names)를 가지고 있는 클래스.

- 어휘간의 유사도

In [16]:
# Print every WordNet sense of 'tiger' together with its definition.
tiger_senses = wordnet.synsets('tiger')
for sense in tiger_senses:
    print(sense.name(), sense.definition())

tiger.n.01 a fierce or audacious person
tiger.n.02 large feline of forests in most of Asia having a tawny coat with black stripes; endangered


In [60]:
# When the word, its POS and the sense number are known, wordnet.synset()
# fetches that exact sense directly.
tiger, tree, lion, cat, dog = (
    wordnet.synset(sense_id)
    for sense_id in ('tiger.n.02', 'tree.n.01', 'lion.n.01', 'cat.n.01', 'dog.n.01')
)

In [63]:
# Path similarity between senses: lion scores highest against tiger, i.e. it is the closest in meaning.
tiger.path_similarity(lion), tiger.path_similarity(dog),tiger.path_similarity(tree)

(0.3333333333333333, 0.16666666666666666, 0.07142857142857142)

In [22]:
# Pairwise path similarity between the five senses: one row per sense, one
# column per sense, both in the order of `entities`.
entities = [tree, lion, tiger, cat, dog]
similarities = [
    [entity.path_similarity(another) for another in entities]
    for entity in entities
]

In [25]:
# Wrap the similarity matrix in a DataFrame; the column labels follow the
# order in which the senses were placed in `entities`.
labels = ['tree', 'lion', 'tiger', 'cat', 'dog']
df = pd.DataFrame(data=similarities, columns=labels)
df

Unnamed: 0,tree,lion,tiger,cat,dog
0,1.0,0.066667,0.066667,0.071429,0.111111
1,0.066667,1.0,0.333333,0.25,0.166667
2,0.066667,0.333333,1.0,0.25,0.166667
3,0.071429,0.25,0.25,1.0,0.2
4,0.111111,0.166667,0.166667,0.2,1.0


- SentiSynset 객체
앞에서 본 Synset 객체와 달리 감성 지수(긍정·부정·객관성 점수)를 가지고 있음



In [39]:
from nltk.corpus import sentiwordnet

# Look up 'mother' so this cell agrees with the recorded output of the next
# cell, which lists the seven SentiSynsets of 'mother'.
senti_synsets = list(sentiwordnet.senti_synsets('mother'))

In [64]:
print(type(senti_synsets))
print(len(senti_synsets))
print(senti_synsets)

<class 'list'>
7
[SentiSynset('mother.n.01'), SentiSynset('mother.n.02'), SentiSynset('mother.n.03'), SentiSynset('mother.n.04'), SentiSynset('mother.n.05'), SentiSynset('mother.v.01'), SentiSynset('beget.v.01')]


In [41]:
from nltk.corpus import sentiwordnet

senti_synsets = list(sentiwordnet.senti_synsets('father'))

In [42]:
print(type(senti_synsets))
print(len(senti_synsets))
print(senti_synsets)

<class 'list'>
9
[SentiSynset('father.n.01'), SentiSynset('forefather.n.01'), SentiSynset('father.n.03'), SentiSynset('church_father.n.01'), SentiSynset('father.n.05'), SentiSynset('father.n.06'), SentiSynset('founder.n.02'), SentiSynset('don.n.03'), SentiSynset('beget.v.01')]


In [44]:
# Positive, negative and objectivity scores of the sense father.n.01.
father = sentiwordnet.senti_synset('father.n.01')
father.pos_score(), father.neg_score(), father.obj_score()

(0.0, 0.0, 1.0)

In [59]:
# Positive, negative and objectivity scores of the sense mother.n.01.
# The result is bound to `mother` (not `father`) so the name matches the sense
# and the earlier `father` synset is not silently overwritten.
# senti_synsets = list(sentiwordnet.senti_synsets('mother'))
mother = sentiwordnet.senti_synset('mother.n.01')
mother.pos_score(), mother.neg_score(), mother.obj_score()

(0.0, 0.0, 1.0)

In [53]:
# Positive, negative and objectivity scores of the adjective sense fabulous.a.01.
fabulous = sentiwordnet.senti_synset('fabulous.a.01')
fabulous.pos_score(), fabulous.neg_score(), fabulous.obj_score()

(0.875, 0.125, 0.0)

In [58]:
# How are adverbs represented? Adverb senses carry the POS code 'r' in their name.
list(sentiwordnet.senti_synsets('just'))

[SentiSynset('just.a.01'),
 SentiSynset('equitable.a.01'),
 SentiSynset('fair.a.01'),
 SentiSynset('good.s.07'),
 SentiSynset('merely.r.01'),
 SentiSynset('precisely.r.01'),
 SentiSynset('just.r.03'),
 SentiSynset('just.r.04'),
 SentiSynset('barely.r.01'),
 SentiSynset('just.r.06')]

In [67]:
# Positive, negative and objectivity scores of the verb sense work.v.01.
# NOTE: verbs are not score-free in general; this particular sense is simply
# rated fully objective, whereas love.v.01 has a positive score of 0.5.
work = sentiwordnet.senti_synset('work.v.01')
work.pos_score(), work.neg_score(), work.obj_score()

(0.0, 0.0, 1.0)

In [68]:
love = sentiwordnet.senti_synset('love.v.01')
love.pos_score(), love.neg_score(), love.obj_score()

(0.5, 0.0, 0.5)

In [69]:
wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB

('n', 'a', 'r', 'v')

- 감성지수 계산

In [70]:
# from nltk.tokenize import word_tokenize
# from nltk.tag import pos_tag
# sentence = "It's good to see you again."

In [71]:
from nltk import word_tokenize, pos_tag
sentence = "It's good to see you again."
word_list = word_tokenize(sentence)
word_list

['It', "'s", 'good', 'to', 'see', 'you', 'again', '.']

In [72]:
pos_tag(word_list)
# pos_tag returns Penn Treebank tags (e.g. 'JJ', 'VBZ'), not WordNet's POS codes ('a', 'n', 'r', 'v').

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('good', 'JJ'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('you', 'PRP'),
 ('again', 'RB'),
 ('.', '.')]

In [73]:
tag = ('good', 'JJ')
tag[1].startswith('J')

True

In [82]:
# (Textbook p. 508) Convert a Penn Treebank tag into a WordNet POS tag.
# Penn Treebank adjective tags start with 'J' (e.g. 'JJ').
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    The first letter of ``tag`` decides the result: J -> adjective, N -> noun,
    R -> adverb, V -> verb. Any other tag yields ``None``.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            # Resolve the constant only for a matching prefix, as the if-chain did.
            return getattr(wordnet, attr)
    return None

In [83]:
for word, tag in pos_tag(word_list):
    print(word, penn_to_wn(tag))

good a
see n
you None
again r


In [None]:
# 이번에는 태그 자리에 penn_to_wn???



In [80]:
# Build the token list that will be turned into SentiSynset objects.
# Tokens of two characters or fewer are dropped (presumably a crude filter for
# punctuation and short function words).
# NOTE(review): filtering before pos_tag() removes context from the tagger; in
# the recorded output 'see' ends up mapped to a noun. Consider tagging the full
# token list first and filtering afterwards.
sentence = "It's good to see you again."
word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
word_list

['good', 'see', 'you', 'again']

In [85]:
# Print the first SentiSynset of every token whose tag maps to a WordNet POS.
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if not wn_tag:
        continue  # tag has no WordNet counterpart (e.g. pronouns)
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
    if not synsets:
        continue  # no SentiWordNet entry; synsets[0] would raise IndexError
    synset = synsets[0]
    print(synset)

<good.a.01: PosScore=0.75 NegScore=0.0>
<see.n.01: PosScore=0.0 NegScore=0.0>
<again.r.01: PosScore=0.0 NegScore=0.0>


In [86]:
# Sentence polarity = sum over tokens of (positive score - negative score) of
# the first SentiSynset. For the example sentence only 'good' contributes
# (0.75 - 0.0), so the total is 0.75, i.e. a positive sentence.
sentiment = 0

for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if not wn_tag:
        continue  # tag has no WordNet counterpart (e.g. pronouns)
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
    if not synsets:
        continue  # no SentiWordNet entry; synsets[0] would raise IndexError
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()
sentiment

0.75

In [92]:
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [93]:
# Same polarity sum as before, but each token is lemmatized first so that the
# SentiWordNet lookup is done on the lemma rather than the surface form.
sentiment = 0
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if not wn_tag:
        continue  # tag has no WordNet counterpart (e.g. pronouns)
    lemma = lemmatizer.lemmatize(word, wn_tag)
    synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
    if not synsets:
        continue  # no SentiWordNet entry; synsets[0] would raise IndexError
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()
sentiment


0.75

In [95]:
# 실습: 위 과정을 실제 리뷰 데이터에 적용해 본다.
# 사용할 데이터: labeledTrainData.tsv
# 06.Text분석\data\labeledTrainData.tsv 의 9번째 줄

In [96]:
from nltk import sent_tokenize
document = "I watched this video at a friend's house. I'm glad I did not waste money buying this one. The video cover has a scene from the 1975 movie Capricorn One. The movie starts out with several clips of rocket blow-ups, most not related to manned flight. Sibrel's smoking gun is a short video clip of the astronauts preparing a video broadcast. He edits in his own voice-over instead of letting us listen to what the crew had to say. The video curiously ends with a showing of the Zapruder film. His claims about radiation, shielding, star photography, and others lead me to believe is he extremely ignorant or has some sort of ax to grind against NASA, the astronauts, or American in general. His science is bad, and so is this video."

In [99]:
# Document-level polarity: sentence by sentence, add up the
# (positive - negative) score of the first SentiSynset of every lemmatized token.
sentiment = 0.0
for sentence in sent_tokenize(document):
    # Keep only tokens longer than two characters.
    word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
    for word, tag in pos_tag(word_list):
        wn_tag = penn_to_wn(tag)
        if not wn_tag:
            continue  # tag has no WordNet counterpart
        lemma = lemmatizer.lemmatize(word, wn_tag)
        synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
        if not synsets:
            # No SentiWordNet entry for this lemma/POS: report the word and skip it.
            print(word)
            continue
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()
# A total of zero or more is reported as positive, anything below as negative.
print('긍정긍정' if sentiment >= 0 else '부정')

scene
blow-ups
Sibrel
voice-over
Zapruder
others
부정


In [None]:
# senti_synsets()가 빈 리스트를 반환해 건너뛴 단어들과 최종 감성 점수
# scene
# blow-ups
# Sibrel
# voice-over
# Zapruder
# others
# -1.25