# SentiAnalysis.py
# -*- coding: utf-8 -*-
"""
Created on Thu April 21 12:54:39 2016
@author: Robin
"""
import jieba
from BuildDict import sentiDictionary, featureDictionary
class commentSentiCalc:
    """Lexicon-based sentiment scorer for a single comment.

    A comment is tokenized with jieba and split into clauses on punctuation.
    In each clause every sentiment word is scored, adjusted by the negation
    words and degree adverbs that precede it, and the clause scores are
    averaged into one score for the comment.

    The word lists come from ``sentiDictionary().buildSentiDic()``; their
    exact container types are not visible here, only that they support ``in``
    and that the degree-adverb collection can be indexed with 0..5.
    """

    def __init__(self, commentSentence=""):
        """Load the dictionaries and set the default weights.

        Args:
            commentSentence: The comment to analyse; it can also be supplied
                later through ``groupSentiCalc``.
        """
        # Number of comments that produced no scorable clause at all.
        self.invalid_chat = 0
        self.commentSentence = commentSentence
        self.sentiDic = sentiDictionary()
        # Stored for callers; not used by any method of this class.
        self.featureDic = featureDictionary()
        self.positiveValue = 4  # default score of a positive sentiment word
        self.negativeValue = -4  # default score of a negative sentiment word
        self.notValue = -1  # default score of a negation word
        self.extremeLevel = 2  # weight of "extremely"-class degree adverbs
        self.veryLevel = 1.25  # weight of "very"-class degree adverbs
        self.moreLevel = 1.2  # weight of "rather"-class degree adverbs
        self.ishLevel = 0.8  # weight of "slightly"-class degree adverbs
        self.insufficientLevel = 0.5  # weight of "insufficiently"-class degree adverbs
        self.overLevel = 1.5  # weight of "over/super"-class degree adverbs
        self.notvery = 0.5  # weight of a "not very" combination
        (self.positiveWords_unicode,
         self.negativeWords_unicode,
         self.denyWords_unicode,
         self.levelDic_unicode) = self.sentiDic.buildSentiDic()

    def segByPunc(self):
        """Split ``self.commentSentence`` into clauses of tokenized words.

        The comment is tokenized with ``jieba.cut``; every token that equals
        one of the punctuation marks below ends the current clause and is
        itself dropped.

        Returns:
            A list of clauses. Each clause is a non-empty list of
            ``(token_index, token)`` tuples, where ``token_index`` is the
            position of the token in the whole tokenized comment.
        """
        punctuation = [u',', u'/', u'!', u'?', u'。', u' ', u'\'']
        wordSequenceList = []
        subWordSequenceList = []
        for wordTuple in enumerate(jieba.cut(self.commentSentence)):
            if wordTuple[1] in punctuation:
                # Consecutive punctuation must not create empty clauses.
                if subWordSequenceList:
                    wordSequenceList.append(subWordSequenceList)
                    subWordSequenceList = []
            else:
                subWordSequenceList.append(wordTuple)
        # Flush the trailing clause that is not terminated by punctuation.
        if subWordSequenceList:
            wordSequenceList.append(subWordSequenceList)
        return wordSequenceList

    def sentiVec(self, wordSequence):
        """Find sentiment words and the modifiers that precede each of them.

        Args:
            wordSequence: The words of one clause, in order.

        Returns:
            A list with one entry per sentiment word, in clause order. Each
            entry is ``(polar, notGroup, levelGroup)`` where ``polar`` is
            ``(index, 'positive' | 'negative', score)``, ``notGroup`` is a
            list of ``(index, notValue)`` for negation words and
            ``levelGroup`` is a list of ``(index, weight)`` for degree
            adverbs. A modifier belongs to the first sentiment word that
            follows it; modifiers after the last sentiment word are ignored.
            Returns ``[]`` when the clause has no sentiment word.
        """
        # Weight for each degree-adverb category, aligned with the indices
        # 0..5 of self.levelDic_unicode.
        levelWeights = (
            self.extremeLevel,
            self.veryLevel,
            self.moreLevel,
            self.ishLevel,
            self.insufficientLevel,
            self.overLevel,
        )
        polarVec = []
        notVec = []
        levelVec = []
        for i, word in enumerate(wordSequence):
            # Order matters: a word is classified by the first list it is in.
            if word in self.positiveWords_unicode:
                polarVec.append((i, 'positive', self.positiveValue))
            elif word in self.negativeWords_unicode:
                polarVec.append((i, 'negative', self.negativeValue))
            elif word in self.denyWords_unicode:
                notVec.append((i, self.notValue))
            else:
                for category, weight in enumerate(levelWeights):
                    if word in self.levelDic_unicode[category]:
                        levelVec.append((i, weight))
                        break

        GroupVec = []
        # Indices start at 0, so -1 makes the first sentiment word collect
        # every modifier in front of it.
        prevPolarPos = -1
        for polar in polarVec:
            polarPos = polar[0]
            notGroupVec = [item for item in notVec
                           if prevPolarPos < item[0] < polarPos]
            levelGroupVec = [item for item in levelVec
                             if prevPolarPos < item[0] < polarPos]
            GroupVec.append((polar, notGroupVec, levelGroupVec))
            prevPolarPos = polarPos
        return GroupVec

    def sentiValueCalc(self, wordSequence):
        """Compute the sentiment value of one clause.

        For every sentiment word the base score is multiplied by a degree
        weight and a negation sign:

        * Only the degree adverb with the largest weight is used.
        * Without degree adverbs, an odd number of negations flips the sign.
        * With degree adverbs and an odd number of negations, the weight
          becomes ``self.notvery`` if the chosen adverb comes after the last
          negation ("not very"), otherwise the adverb weight is negated.
        * An even number of negations cancels out.

        Args:
            wordSequence: The words of one clause, in order.

        Returns:
            The summed score of all sentiment words, or ``0`` if none.
        """
        sentiValue = 0
        for polarVec, notVec, levelVec in self.sentiVec(wordSequence):
            W_level = 1
            W_not = 1
            oddNegation = len(notVec) % 2 != 0
            # NOTE: this is a simplification when a group holds several
            # negations AND several degree adverbs.
            if levelVec:
                # sorted() is stable, so among equal weights the adverb that
                # occurs last in the clause ends up last and is the one used.
                levelVec = sorted(levelVec, key=lambda x: x[-1])
                strongest = levelVec[-1]
                W_level = strongest[-1]
                if oddNegation:
                    if strongest[0] > notVec[-1][0]:
                        W_level = self.notvery
                    else:
                        W_level = -1 * W_level
            elif oddNegation:
                W_not = -1
            sentiValue += polarVec[-1] * W_level * W_not
        return sentiValue

    def _averageSentiValue(self, wordSequenceList):
        """Return the mean sentiment value of a non-empty list of clauses."""
        total_score = 0
        for wordSequenceTuple in wordSequenceList:
            wordSequence = [seg_word for _, seg_word in wordSequenceTuple]
            total_score += self.sentiValueCalc(wordSequence)
        # float() forces true division; on Python 2 an all-integer total
        # would otherwise be floor-divided (4 / 3 -> 1, -4 / 3 -> -2).
        return total_score / float(len(wordSequenceList))

    def groupSentiCalc(self, commentSentence):
        """Score a whole comment as the average of its clause scores.

        Stores ``commentSentence`` on the instance, splits it into clauses
        with ``segByPunc`` and averages the per-clause sentiment values.
        No per-feature classification of clauses is performed here.

        Args:
            commentSentence: The comment text to analyse.

        Returns:
            The average clause score as a float, or ``0`` when the comment
            yields no clause; in that case the comment is printed and
            ``self.invalid_chat`` is incremented.
        """
        self.commentSentence = commentSentence
        wordSequenceList = self.segByPunc()
        if not wordSequenceList:
            # Single-argument print(...) works on both Python 2 and 3.
            print(self.commentSentence)
            self.invalid_chat += 1
            return 0
        return self._averageSentiValue(wordSequenceList)