In [1]:
! pip install jieba



In [2]:
import numpy as np
import jieba
import pandas as pd

# Load sentiment dictionary

In [3]:
stop_words = open("./emotion_dict/stop_words.txt", encoding="utf-8").readlines()
stop_words = [s.strip() for s in stop_words]
for sw in stop_words:
    jieba.del_word(sw)
    
def cutword(x):
    seg = jieba.cut(x) 
    new_seg = []
    for key in seg:
        if not(key.strip() in stop_words) and (len(key.strip()) > 1):
            new_seg.append(key)
    return new_seg
    

def cut_sentence(words):
    start = 0
    i = 0
    token = 'meaningless'
    sents = []
    punt_list = ',.!?;~，。！？；～… '
    for word in words:
        if word not in punt_list:  
            i += 1
            token = list(words[start:i+2]).pop()
        elif word in punt_list and token in punt_list:  
            i += 1
            token = list(words[start:i+2]).pop()
        else:
            sents.append(words[start:i+1])  
            start = i + 1
            i += 1
    if start < len(words): 
        sents.append(words[start:])
        
    return sents


def read_lines(filename):
    fp = open(filename, 'r', encoding="utf-8")
    lines = []
    for line in fp.readlines():
        line = line.strip()
        line = line
        lines.append(line)
    return lines


def del_stopwords(seg_sent):
    stopwords = read_lines("./emotion_dict/stop_words.txt")  
    new_sent = []   
    for word in seg_sent:
        if word in stopwords:
            continue
        else:
            new_sent.append(word)
    return new_sent


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/f9/nxp8wrj16mq509ksxvv0krj00000gn/T/jieba.cache
Loading model cost 0.440 seconds.
Prefix dict has been built successfully.


In [4]:
posdict = read_lines("./emotion_dict/pos_all_dict.txt")
negdict = read_lines("./emotion_dict/neg_all_dict.txt")

mostdict = read_lines('./degree_dict/most.txt')   
verydict = read_lines('./degree_dict/very.txt')  
moredict = read_lines('./degree_dict/more.txt')  
nearlydict = read_lines('./degree_dict/nearly.txt')   
barelydict = read_lines('./degree_dict/barely.txt') 
inversedict = read_lines('./degree_dict/inversion.txt')  

print(inversedict)

['\ufeffno', '不', '没', '不要', '难以', '未曾', '无', '非', '莫', '弗', '毋', '未', '否', '别', '无', '不够', '不是', '不曾', '未必', '没有']


# Calculate sentiment score

In [5]:
def match(word, sentiment_value):
    if word in mostdict:
        sentiment_value *= 2.0
        
    elif word in verydict:
        sentiment_value *= 1.75
        
    elif word in moredict:
        sentiment_value *= 1.5
        
    elif word in nearlydict:
        sentiment_value *= 1
        
    elif word in barelydict:
        sentiment_value *= 0.5
        
    elif word in inversedict:
        sentiment_value *= -1

    return sentiment_value

In [6]:
def transform_to_positive_num(poscount, negcount):
    pos_count = 0
    neg_count = 0
    
    if poscount < 0 and negcount >= 0:
        neg_count += negcount - poscount
        pos_count = 0
        
    elif negcount < 0 and poscount >= 0:
        pos_count = poscount - negcount
        neg_count = 0
        
    elif poscount < 0 and negcount < 0:
        neg_count = -poscount
        pos_count = -negcount
    else:
        pos_count = poscount
        neg_count = negcount
        
    return (pos_count, neg_count)

In [7]:
def single_review_sentiment_score(content):
    single_review_senti_score = []
    cuted_review = cut_sentence(content)  

    for sent in cuted_review:
        seg_sent = cutword(sent)
        seg_sent = del_stopwords(seg_sent)[:]

        i = 0    
        s = 0   
        poscount = 0    
        negcount = 0

        for word in seg_sent:   
            if word in posdict: 
                poscount += 1  
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)
                s = i + 1  

            elif word in negdict: 
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)
                s = i + 1

            elif word == "！" or word == "!":
                for w2 in seg_sent[::-1]:  
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            i += 1
        single_review_senti_score.append(transform_to_positive_num(poscount, negcount))   
    pos_result, neg_result = 0, 0
    
    for res1, res2 in single_review_senti_score:  
        pos_result += res1
        neg_result += res2
        
    result = pos_result - neg_result  
    result = round(result, 1)
    return result

# Load public opinion and conduct prediction

In [8]:
def run_score(file_name):
    fp_test = open(file_name, 'r')
    
    contents = []
    for content in fp_test.readlines():
        content = content.strip()
        contents.append(content)
    
    results = []
    for content in contents:
        score = single_review_sentiment_score(content)  
        results.append((score, content))
        
    return results

In [9]:
def write_results(file_name, results):
    fp_result = open(file_name, 'w')
    
    fp_result.write("Public opinion")
    fp_result.write('\t')
    fp_result.write("Sentiment score")
    fp_result.write('\n')
    
    for result in results:
        fp_result.write(result[1])
        fp_result.write('\t')
        fp_result.write(str(result[0]))
        fp_result.write('\n')
        
    fp_result.close()

    
def summary(results):
    pos_number, neg_number = 0, 0
    pos_mean, neg_mean = 0, 0
    pos_variance, neg_variance, var_ratio = 0, 0, 0
    pos_list, neg_list, total_list = [], [], []
    
    for result in results:
        total_list.append(result[0])
        if result[0] >= 0:
            pos_list.append(result[0])   
        else:
            neg_list.append(result[0])   

    pos_number = len(pos_list)
    neg_number = len(neg_list)
    
    total_number = pos_number + neg_number
    pos_number_ratio = round(float(pos_number)/float(total_number), 2)
    neg_number_ratio = round(float(neg_number)/float(total_number), 2)
    text_pos_number = "Number of positive smaples: " + str(pos_number) + " Percentage: " + str(pos_number_ratio*100)
    text_neg_number = "Number of negative smaples: " + str(neg_number) + " Percentage: " + str(neg_number_ratio*100)
    
    pos_array = np.array(pos_list)
    neg_array = np.array(neg_list)   
    total_array = np.array(total_list)
    pos_mean = pos_array.mean()
    neg_mean = neg_array.mean()
    total_mean = total_array.mean()   
        
    pos_variance = pos_array.var(axis=0)
    neg_variance = neg_array.var(axis=0)
    total_variance = total_array.var(axis=0)

        
    result_dict = {}
    result_dict['pos_number'] = pos_number   
    result_dict['neg_number'] = neg_number  
    
    result_dict['pos_mean'] = round(pos_mean, 1)  
    result_dict['neg_mean'] = round(neg_mean, 1)  
    result_dict['total_mean'] = round(total_mean, 1) 
    
    result_dict['pos_variance'] = round(pos_variance, 1)  
    result_dict['neg_variance'] = round(neg_variance, 1)  
    result_dict['total_variance'] = round(total_variance, 1)
    
    result_dict['text_pos_number'] = text_pos_number   
    result_dict['text_neg_number'] = text_neg_number

    return result_dict

In [10]:
results = run_score("./data/test_public_opinion.txt")
write_results("./data/predicted_results.txt", results)
result_dict = summary(results)   

for key, value in result_dict.items():
    print(key, " | ", value)

pos_number  |  2
neg_number  |  3
pos_mean  |  3.0
neg_mean  |  -1.3
total_mean  |  0.4
pos_variance  |  9.0
neg_variance  |  0.2
total_variance  |  8.2
text_pos_number  |  Number of positive smaples: 2 Percentage: 40.0
text_neg_number  |  Number of negative smaples: 3 Percentage: 60.0


In [11]:
df = pd.read_csv("./data/predicted_results.txt", sep="\t",)
df.head()

Unnamed: 0,Public opinion,Sentiment score
0,【#老顾客投诉被店家说手法娴熟# 消费者认为对方在诽谤】2月5号，张先生点了一份麻辣香锅外卖...,-1
1,『县政府副县长王海娟到夏蔚镇检查食品安全工作』,0
2,点了个肯德基嫩牛五方 它竟然是臭的？你敢信！我吃了几口不对劲直接吐了！国内食品安全我想说，在...,-1
3,2月10日，河南省市场监管局召开全省落实食品安全属地管理和企业主体“两个责任”工作推进会议。...,6
4,民以食为天，食以安为先。食品安全的重要性实在不必多说，但如此重要的领域，却仍然年年都能被媒体...,-2
