In [4]:
import jieba.posseg as seg
from snownlp import SnowNLP
import pynlpir
import numpy as np
import pandas as pd

def sentiment_analysis_with_jieba(text):
    """Classify ``text`` as "P" or "N" from the SnowNLP scores of its jieba keywords.

    The text is segmented with ``jieba.posseg``.  Tokens whose part-of-speech
    flag is exactly "a", "d" or "v" are kept as keywords, and every keyword is
    scored on its own with SnowNLP.  A keyword counts as positive when its
    score is above 0.6 and as negative when it is below 0.4; scores in
    between are ignored.

    Args:
        text: The comment to classify.

    Returns:
        "P" when at least half of the polarised keywords are positive, or
        when no keyword is polarised at all; "N" otherwise.

    Side effects:
        Prints the selected keywords and the list of (keyword, score) pairs.
    """
    # Keep only tokens flagged "a", "d" or "v" (adjective, adverb and verb in
    # jieba's tag set).  Nouns are NOT selected.
    keywords = [
        token.word for token in seg.cut(text) if token.flag in ("a", "d", "v")
    ]
    print("keywords=", keywords)

    positive_num = 0
    negative_num = 0
    sentiments_list = []
    for keyword in keywords:
        # `sentiments` is a property that classifies on access, so read it
        # once per keyword instead of once per comparison.
        score = SnowNLP(keyword).sentiments
        if score > 0.6:
            positive_num += 1
        elif score < 0.4:
            negative_num += 1
        sentiments_list.append((keyword, score))
    print(sentiments_list)

    polarised = positive_num + negative_num
    # Without any polarised keyword the ratio is undefined; 0.5 is used as a
    # neutral stand-in, which the threshold below maps to "P".
    sentiment = positive_num / polarised if polarised else 0.5

    return "P" if sentiment >= 0.5 else "N"
    
def sentiment_analysis_without_jieba(text):
    """Label the whole of ``text`` as "P" or "N" using a single SnowNLP score.

    The text is scored as one unit, without any prior segmentation step in
    this function.  The text and its score are printed.

    Returns:
        "P" when the score is 0.5 or higher, "N" otherwise.
    """
    score = SnowNLP(text).sentiments
    print(text, score)

    label = "P" if score >= 0.5 else "N"
    return label

def sentiment_analysis_for_distinct_word(text):
    """Classify ``text`` as "P" or "N" by scoring every character with SnowNLP.

    Each element obtained by iterating ``text`` (one character at a time for
    a string) is scored on its own.  A score above 0.6 counts as positive, a
    score below 0.4 as negative, and anything in between is ignored.

    Args:
        text: The comment to classify.

    Returns:
        "P" when at least half of the polarised characters are positive, or
        when none is polarised; "N" otherwise.
    """
    positive_num = 0
    negative_num = 0
    for char in text:
        # `sentiments` is a property that classifies on access, so read it
        # once per character instead of once per comparison.
        score = SnowNLP(char).sentiments
        if score > 0.6:
            positive_num += 1
        elif score < 0.4:
            negative_num += 1

    polarised = positive_num + negative_num
    # Without any polarised character the ratio is undefined; 0.5 is used as
    # a neutral stand-in, which the threshold below maps to "P".
    sentiment = positive_num / polarised if polarised else 0.5

    return "P" if sentiment >= 0.5 else "N"

def sentiment_analysis_for_distinct_word_without_snownlp(text):
    """Classify ``text`` as "P" or "N" by looking each character up in the lexicons.

    The positive and negative lexicon files are loaded with ``read_lexicon``
    on every call.  Each element obtained by iterating ``text`` (one
    character at a time for a string) counts as positive if it is in the
    positive lexicon, otherwise as negative if it is in the negative lexicon.

    Args:
        text: The comment to classify.

    Returns:
        "P" when at least half of the matched characters are positive, or
        when nothing matched; "N" otherwise.
    """
    # Load the lexicons.
    # NOTE(review): they are re-read from disk for every text; callers that
    # classify many texts may want to hoist this.
    path_pos = "lexicon_positive_chinese.txt"
    path_neg = "lexicon_negative_chinese.txt"
    lexicon_pos, lexicon_neg = read_lexicon(path_pos, path_neg)
    # Sets make each membership test O(1) instead of a scan of the lexicon.
    positive_entries = set(lexicon_pos)
    negative_entries = set(lexicon_neg)

    positive_num = 0
    negative_num = 0
    for char in text:
        # The positive lexicon wins when an entry appears in both files.
        if char in positive_entries:
            positive_num += 1
        elif char in negative_entries:
            negative_num += 1

    matched = positive_num + negative_num
    # Without any match the ratio is undefined; 0.5 is used as a neutral
    # stand-in, which the threshold below maps to "P".
    sentiment = positive_num / matched if matched else 0.5

    return "P" if sentiment >= 0.5 else "N"

def build_ngrams(sentence_list, N):
    """Join every run of ``N`` consecutive items of ``sentence_list`` into a string.

    Args:
        sentence_list: Sequence of strings to window over.
        N: Window length.

    Returns:
        ``sentence_list`` itself (the same object, not a copy) when ``N`` is
        below 2; ``None`` when the sequence holds fewer than ``N`` items;
        otherwise a new list with one concatenated string per window, in
        left-to-right order.
    """
    if N < 2:
        return sentence_list

    window_count = len(sentence_list) - N + 1
    if window_count < 1:
        return None

    return [
        "".join(sentence_list[start:start + N]) for start in range(window_count)
    ]

def sentiment_with_ngrams(input_text, Max_N = 5):
    """Classify ``input_text`` as "P" or "N" by scoring character n-grams with SnowNLP.

    Window sizes are tried from ``Max_N`` down to 1.  For each size the
    n-grams are built with ``build_ngrams`` and scanned left to right.  An
    n-gram is skipped when it contains one of the listed punctuation marks or
    is shorter than the window (which happens once some of its characters
    were consumed by an earlier match).  An n-gram scoring above 0.6 is
    recorded as positive and one scoring below 0.4 as negative; its
    characters are then blanked so smaller windows cannot match them again.

    Args:
        input_text: The comment to classify.
        Max_N: Largest n-gram length to try.

    Returns:
        "P" when at least as many positive as negative n-grams were recorded
        (ties included), "N" otherwise.

    Side effects:
        Prints the n-gram list for every window size and the running
        positive/negative lists after every n-gram that was scored.
    """
    # Recorded positive / negative n-grams.
    pc = []
    nc = []
    # One entry per character; matched characters are replaced by "".
    chars = list(input_text)
    punctuations = ["，", "。", "、", "？", "：", "“", "”", " "]

    # Largest windows first, so long expressions win over their parts.
    for N in range(Max_N, 0, -1):
        seq_ngrams = build_ngrams(chars, N)
        print(seq_ngrams)
        if not seq_ngrams:
            continue

        i = 0
        while i < len(seq_ngrams):
            w = seq_ngrams[i]
            # A window shorter than N contains already-consumed (blanked)
            # characters; this also covers the fully blank "" case.
            if len(w) != N or any(mark in w for mark in punctuations):
                i += 1
                continue

            score = SnowNLP(w).sentiments
            found = True
            if score > 0.6:
                pc.append(w)
            elif score < 0.4:
                nc.append(w)
            else:
                found = False

            if found:
                # Consume the matched characters and jump past the windows
                # that overlap them.
                chars[i:i + N] = [""] * N
                i += N
            else:
                i += 1
            print("pc=", pc)
            print("nc=", nc)

    return "P" if len(pc) >= len(nc) else "N"

def read_lexicon(path_pos, path_neg):
    """Load the positive and negative lexicon files, one entry per line.

    Both files are read as UTF-8.  Newline characters are stripped from both
    ends of every line; other whitespace and empty lines are kept as they
    are.

    Args:
        path_pos: Path of the positive lexicon file.
        path_neg: Path of the negative lexicon file.

    Returns:
        A ``(lexicon_pos, lexicon_neg)`` tuple of lists of strings, in file
        order.

    Raises:
        OSError: If either file cannot be opened or read.
    """
    # Context managers close each file even when reading (or opening the
    # second file) raises.
    with open(path_pos, encoding="utf-8") as f_pos:
        lexicon_pos = [line.strip("\n") for line in f_pos]
    with open(path_neg, encoding="utf-8") as f_neg:
        lexicon_neg = [line.strip("\n") for line in f_neg]
    return lexicon_pos, lexicon_neg


def sentiment_with_ngrams_without_snownlp(input_text, Max_N = 5):
    """Classify ``input_text`` as "P" or "N" by looking character n-grams up in the lexicons.

    The positive and negative lexicon files are loaded with ``read_lexicon``
    on every call.  Window sizes are tried from ``Max_N`` down to 1.  For
    each size the n-grams are built with ``build_ngrams`` and scanned left to
    right.  An n-gram is skipped when it contains one of the listed
    punctuation marks or is shorter than the window (which happens once some
    of its characters were consumed by an earlier match).  An n-gram found in
    the positive lexicon is recorded as positive, otherwise one found in the
    negative lexicon is recorded as negative; its characters are then blanked
    so smaller windows cannot match them again.

    Args:
        input_text: The comment to classify.
        Max_N: Largest n-gram length to try.

    Returns:
        "P" when at least as many positive as negative n-grams were recorded
        (ties included), "N" otherwise.

    Side effects:
        Prints the n-gram list for every window size and the running
        positive/negative lists after every n-gram that was looked up.
    """
    # Load the lexicons.
    path_pos = "lexicon_positive_chinese.txt"
    path_neg = "lexicon_negative_chinese.txt"
    lexicon_pos, lexicon_neg = read_lexicon(path_pos, path_neg)
    # Sets make each membership test O(1) instead of a scan of the lexicon.
    positive_entries = set(lexicon_pos)
    negative_entries = set(lexicon_neg)

    # Recorded positive / negative n-grams.
    pc = []
    nc = []
    # One entry per character; matched characters are replaced by "".
    chars = list(input_text)
    punctuations = ["，", "。", "、", "？", "：", "“", "”", " "]

    # Largest windows first, so long expressions win over their parts.
    for N in range(Max_N, 0, -1):
        seq_ngrams = build_ngrams(chars, N)
        print(seq_ngrams)
        if not seq_ngrams:
            continue

        i = 0
        while i < len(seq_ngrams):
            w = seq_ngrams[i]
            # A window shorter than N contains already-consumed (blanked)
            # characters; this also covers the fully blank "" case.
            if len(w) != N or any(mark in w for mark in punctuations):
                i += 1
                continue

            # The positive lexicon wins when an entry appears in both files.
            found = True
            if w in positive_entries:
                pc.append(w)
            elif w in negative_entries:
                nc.append(w)
            else:
                found = False

            if found:
                # Consume the matched characters and jump past the windows
                # that overlap them.
                chars[i:i + N] = [""] * N
                i += N
            else:
                i += 1
            print("pc=", pc)
            print("nc=", nc)

    return "P" if len(pc) >= len(nc) else "N"

def sentiment_analysis_with_jieba_without_snownlp(text):
    """Classify ``text`` as "P" or "N" by looking its jieba keywords up in the lexicons.

    The positive and negative lexicon files are loaded with ``read_lexicon``
    on every call.  The text is segmented with ``jieba.posseg``; tokens whose
    part-of-speech flag is exactly "a", "d" or "v" are kept as keywords.  A
    keyword found in the positive lexicon counts as positive, otherwise one
    found in the negative lexicon counts as negative.

    Args:
        text: The comment to classify.

    Returns:
        "P" when at least as many keywords are positive as negative (ties and
        "nothing matched" included), "N" otherwise.

    Side effects:
        Prints the selected keywords.
    """
    # Load the lexicons.
    path_pos = "lexicon_positive_chinese.txt"
    path_neg = "lexicon_negative_chinese.txt"
    lexicon_pos, lexicon_neg = read_lexicon(path_pos, path_neg)
    # Sets make each membership test O(1) instead of a scan of the lexicon.
    positive_entries = set(lexicon_pos)
    negative_entries = set(lexicon_neg)

    # Keep only tokens flagged "a", "d" or "v" (adjective, adverb and verb in
    # jieba's tag set).  Nouns are NOT selected.
    keywords = [
        token.word for token in seg.cut(text) if token.flag in ("a", "d", "v")
    ]
    print("keywords=", keywords)

    positive_num = 0
    negative_num = 0
    for keyword in keywords:
        # The positive lexicon wins when an entry appears in both files.
        if keyword in positive_entries:
            positive_num += 1
        elif keyword in negative_entries:
            negative_num += 1

    return "P" if positive_num >= negative_num else "N"


In [5]:
# Run every analyser over the comments and export the labelled table.
comments = pd.read_csv("comments.csv")
print("shape_csv=", comments.shape)
print(list(comments))

# (result column, analyser) pairs, in the order the columns are added.
# Note that two column names differ from the name of the function filling them.
analysers = [
    ("sentiment_analysis_with_jieba",
     sentiment_analysis_with_jieba),
    ("sentiment_analysis_without_jieba",
     sentiment_analysis_without_jieba),
    ("sentiment_analysis_distinct_word",
     sentiment_analysis_for_distinct_word),
    ("sentiment_analysis_for_distinct_word_without_snownlp",
     sentiment_analysis_for_distinct_word_without_snownlp),
    ("sentiment_analysis_with_ngrams",
     sentiment_with_ngrams),
    ("sentiment_with_ngrams_without_snownlp",
     sentiment_with_ngrams_without_snownlp),
    ("sentiment_analysis_with_jieba_without_snownlp",
     sentiment_analysis_with_jieba_without_snownlp),
]

for column, analyser in analysers:
    # Assigning the result creates the column directly; no NaN placeholder
    # is needed beforehand.
    comments[column] = comments["Comments_in_chinese"].apply(analyser)
    print("shape_csv=", comments.shape)
    print(comments.head(10))

comments.to_excel("Comment_analysised.xlsx", index=False)

shape_csv= (6, 1)
['Comments_in_chinese']
keywords= []
[]
keywords= ['不坏']
[('不坏', 0.8999999999999999)]
keywords= ['不好']
[('不好', 0.24509033778476041)]
keywords= ['坏']
[('坏', 0.33423913043478237)]
keywords= []
[]
keywords= ['坏']
[('坏', 0.33423913043478237)]
shape_csv= (6, 2)
  Comments_in_chinese sentiment_analysis_with_jieba
0                 好消息                             P
1               不坏的消息                             P
2               不好的消息                             N
3                坏的消息                             N
4                只是消息                             P
5               坏的好消息                             N
好消息 0.4467309078976355
不坏的消息 0.792230848408126
不好的消息 0.12091746448919471
坏的消息 0.1753936531064676
只是消息 0.2574107392864008
坏的好消息 0.26737349498904595
shape_csv= (6, 3)
  Comments_in_chinese sentiment_analysis_with_jieba  \
0                 好消息                             P   
1               不坏的消息                             P   
2               不好的消息          