# Part 1. Calculation and Analysis of Sentiment Value of Review Text

In [1]:
import json
import pandas as pd
from snownlp import SnowNLP

## 1. Building a sentiment lexicon

### Dictionary for degree adverbs

In [2]:
# Degree adverbs, strongest level. Every word in this group carries the
# weight 2; the weight is used as a multiplier on the sentiment word that
# follows the adverb.
category1 = dict.fromkeys(
    [
        '绝对', '绝对化', '绝对性', '绝顶', '万分',
        '不得了', '不可开交', '不亦乐乎', '不折不扣', '彻头彻尾',
        '充分', '到头', '地地道道', '非常', '极',
        '极度', '极端', '极其', '极为', '截然',
        '尽', '惊人地', '绝', '刻骨', '酷',
        '满', '满贯', '满心', '莫大', '奇',
        '入骨', '甚为', '十二分', '十分', '十足',
        '死', '滔天', '痛', '透', '完全',
        '完完全全', '万', '万般', '无比', '无度',
        '无可估量', '无以复加', '无以伦比', '要命', '要死',
        '已极', '已甚', '异常', '逾常', '贼',
        '之极', '之至', '至极', '卓绝', '最为',
        '佼佼', '郅', '綦', '齁', '最',
    ],
    2,
)

# Degree adverbs expressing excess ("over-", "too much"): weight 1.8.
# NOTE: '多', '老' and '强' are listed again in later categories. The merge
# further down applies dict.update in list order, so the weight from the
# later category is the one that ends up in the merged dictionary.
category2 = dict.fromkeys(
    [
        '不为过', '超', '超额', '超外差', '超微结构',
        '超物质', '出头', '多', '浮', '过',
        '过度', '过分', '过火', '过劲', '过了头',
        '过猛', '过热', '过甚', '过头', '过于',
        '过逾', '何止', '何啻', '开外', '苦',
        '老', '偏', '强', '溢', '忒',
    ],
    1.8,
)

# Degree adverbs of the "very" level: weight 1.5.
# NOTE: '多' and '老' also appear in category2, and '多' again in category4;
# the merged dictionary keeps the weight of whichever category is merged last.
category3 = dict.fromkeys(
    [
        "不过", "不少", "不胜", "惨", "沉",
        "沉沉", "出奇", "大为", "多", "多多",
        "多加", "多么", "分外", "格外", "够瞧的",
        "够戗", "好不", "何等", "很", "很是",
        "坏", "可", "老", "老大", "良",
        "颇", "颇为", "甚", "实在", "太",
        "太甚", "特", "特别", "尤", "尤其",
        "尤为", "尤以", "远", "着实", "曷",
        "碜",
    ],
    1.5,
)

# Comparative degree adverbs ("more", "even more"): weight 1.2.
# NOTE: the keys '愈 ... 愈' and '越 ... 越' contain spaces, so they can never
# equal a single token produced by splitting a review on whitespace.
# NOTE: '多' and '强' also appear in earlier categories, and '还' again in
# category5; the merge keeps the weight of the category merged last.
category4 = dict.fromkeys(
    [
        '大不了', '多', '更', '更加', '更进一步',
        '更为', '还', '还要', '较', '较比',
        '较为', '进一步', '那般', '那么', '那样',
        '强', '如斯', '益', '益发', '尤甚',
        '逾', '愈', '愈 ... 愈', '愈发', '愈加',
        '愈来愈', '愈益', '远远', '越 ... 越', '越发',
        '越加', '越来越', '越是', '这般', '这样',
        '足', '足足',
    ],
    1.2,
)

# Mild degree adverbs ("slightly", "a little"): weight 0.8, i.e. they dampen
# the sentiment word they modify.
# NOTE: '还' is also listed in category4; this category is merged later, so
# its 0.8 is the value kept in the merged dictionary.
category5 = dict.fromkeys(
    [
        '点点滴滴', '多多少少', '怪', '好生', '还',
        '或多或少', '略', '略加', '略略', '略微',
        '略为', '稍', '稍稍', '稍微', '稍为',
        '稍许', '挺', '未免', '相当', '些',
        '些微', '些小', '一点', '一点儿', '一些',
        '有点', '有点儿', '有些',
    ],
    0.8,
)

# Weakest degree adverbs ("hardly", "not very"): weight 0.5.
category6 = dict.fromkeys(
    [
        "半点", "不大", "不丁点儿", "不甚",
        "不怎么", "聊", "没怎么", "轻度",
        "弱", "丝毫", "微", "相对",
    ],
    0.5,
)


In [3]:
# The six weight groups, ordered from strongest to weakest
dict_list = [category1, category2, category3, category4, category5, category6]

# Flatten them into one word -> weight table. A word listed in more than one
# group keeps the weight of the last group that contains it.
merged_dict = {
    word: weight
    for category in dict_list
    for word, weight in category.items()
}

# Persist the table as a single JSON object (UTF-8, Chinese kept readable)
with open('sentiment_dic/adv_dic.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(merged_dict, ensure_ascii=False))


### Dictionary for negation words

In [4]:
# Define a function to read a txt file and return its contents as a list of lines
def read_txt_file(file_path, encoding='utf-8'):
    """Return the lines of a text file without their line terminators.

    Args:
        file_path: Path of the file to read.
        encoding: Codec used to decode the file. Defaults to UTF-8, so the
            one-argument call form keeps working.

    Returns:
        list[str]: The decoded text split with ``str.splitlines()``, with a
        leading byte-order mark removed.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        text = f.read()
    # Codecs such as 'utf-8' and 'utf-16le' keep a leading byte-order mark as
    # U+FEFF; left in place it would be glued to the first word of the file.
    if text.startswith('\ufeff'):
        text = text[1:]
    return text.splitlines()

# Read the negation words, one per line, from negative_dic.txt
neg_words = read_txt_file('sentiment_dic/base/negative_dic.txt')

# Map every word to the value 1. Surrounding whitespace is stripped and blank
# lines are skipped: a padded or empty key could never equal a token obtained
# by splitting a review on whitespace.
neg_dict = {word.strip(): 1 for word in neg_words if word.strip()}

# Save the negation dictionary to a txt file as one JSON object (UTF-8)
with open('sentiment_dic/neg_dic.txt', 'w', encoding='utf-8') as f:
    json.dump(neg_dict, f, ensure_ascii=False)

### Dictionary for sentiment words

In [5]:
def read_txt_file(file_path, encoding='utf-8'):
    """Return the lines of a text file decoded with the given codec.

    This redefinition replaces the earlier one-argument ``read_txt_file``;
    ``encoding`` defaults to UTF-8 so that one-argument calls still work after
    this cell has been executed.

    Args:
        file_path: Path of the file to read.
        encoding: Codec used to decode the file (e.g. 'utf-16le', 'gbk').

    Returns:
        list[str]: The decoded text split with ``str.splitlines()``, with a
        leading byte-order mark removed.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        text = f.read()
    # 'utf-16le' does not consume the byte-order mark; it is decoded as U+FEFF
    # and would otherwise become part of the first word.
    if text.startswith('\ufeff'):
        text = text[1:]
    return text.splitlines()

# Define the file paths and encodings for each sentiment word file
SENTIMENT_FILES = [
    ('sentiment_dic/base/ntusd_dic/NTUSD_negative.txt', 'utf-16le'),
    ('sentiment_dic/base/ntusd_dic/NTUSD_positive.txt', 'utf-16le'),
    ('sentiment_dic/base/hownet_dic/hownet_negative_review.txt', 'gbk'),
    ('sentiment_dic/base/hownet_dic/hownet_positive_review.txt', 'gbk'),
    ('sentiment_dic/base/hownet_dic/hownet_negative_sentiment.txt', 'gbk'),
    ('sentiment_dic/base/hownet_dic/hownet_positive_sentiment.txt', 'gbk')
]

# Define the weights for each sentiment word file (same order as SENTIMENT_FILES)
SENTIMENT_WEIGHTS = [
    -1, # negative NTUSD
    1,  # positive NTUSD
    -1, # negative HowNet review
    1,  # positive HowNet review
    -2, # negative HowNet sentiment
    2   # positive HowNet sentiment
]

# The two lists are paired by position, so they must stay the same length
if len(SENTIMENT_FILES) != len(SENTIMENT_WEIGHTS):
    raise ValueError('SENTIMENT_FILES and SENTIMENT_WEIGHTS must have the same length')

# Create a dictionary of sentiment words and their weights.
# zip() pairs each file with its weight without consuming SENTIMENT_WEIGHTS,
# so this cell can be executed more than once. A word that occurs in several
# files keeps the weight of the last file that lists it.
sentiment_dict = {}
for (file_path, encoding), weight in zip(SENTIMENT_FILES, SENTIMENT_WEIGHTS):
    for word in read_txt_file(file_path, encoding):
        # Remove a stray byte-order mark and surrounding whitespace; a padded
        # key would never equal a whitespace-split token. Skip blank lines.
        word = word.replace('\ufeff', '').strip()
        if word:
            sentiment_dict[word] = weight

# Save the dictionary as a JSON file
with open('sentiment_dic/senti_dic.txt', 'w', encoding='utf-8') as f:
    json.dump(sentiment_dict, f, ensure_ascii=False)

In [6]:
# Formatting the stored dictionaries
# Remove stray whitespace around the keys of the stored sentiment dictionary.
# The file is parsed as JSON instead of being patched as raw text, and it is
# read and written explicitly as UTF-8: without an encoding argument open()
# uses the platform's locale codec, which need not match the UTF-8 the file
# was saved in.
with open('sentiment_dic/senti_dic.txt', 'r', encoding='utf-8') as file:
    entries = json.load(file)

# If stripping makes two keys equal, the later entry's weight is kept, which
# is also what json.load does with duplicate keys.
content = json.dumps(
    {word.strip(): weight for word, weight in entries.items()},
    ensure_ascii=False,
)
with open('sentiment_dic/senti_dic.txt', 'w', encoding='utf-8') as file:
    file.write(content)

In [7]:
# Load the three stored lexicons: sentiment words (dict1), negation words
# (dict2) and degree adverbs (dict3)
with open('sentiment_dic/senti_dic.txt', 'r', encoding='utf-8') as senti_file:
    dict1 = json.loads(senti_file.read())

with open('sentiment_dic/neg_dic.txt', 'r', encoding='utf-8') as neg_file:
    dict2 = json.loads(neg_file.read())

with open('sentiment_dic/adv_dic.txt', 'r', encoding='utf-8') as adv_file:
    dict3 = json.loads(adv_file.read())

# A word that is a negation word or a degree adverb must not also count as a
# sentiment word: drop every such key from dict1 (missing keys are ignored)
for key in (*dict2, *dict3):
    dict1.pop(key, None)

# Store the pruned sentiment lexicon under a new name
with open('sentiment_dic/senti_dic_pro.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(dict1, ensure_ascii=False))

## 2. Calculation of sentiment value

### Algorithm design

In [8]:
# Load the three lexicons used by the scoring function:
#   sentiment_dict - sentiment word -> polarity weight (pruned lexicon)
#   degree_dict    - degree adverb  -> multiplier
#   negation_dict  - negation word  -> 1
with open('sentiment_dic/senti_dic_pro.txt', encoding='utf-8') as senti_file, \
        open('sentiment_dic/adv_dic.txt', encoding='utf-8') as adv_file, \
        open('sentiment_dic/neg_dic.txt', encoding='utf-8') as neg_file:
    sentiment_dict = json.load(senti_file)
    degree_dict = json.load(adv_file)
    negation_dict = json.load(neg_file)

# Function to calculate sentiment score for a single sentence
def sentiment_score(sen, window=3, *, sentiment=None, degree=None, negation=None):
    """Score one pre-segmented sentence with the lexicon method.

    Each token found in the sentiment lexicon contributes its weight,
    multiplied by every degree adverb among the ``window`` tokens in front of
    it, and sign-flipped when an odd number of negation words occurs in that
    same span.

    Args:
        sen: Text whose words are separated by whitespace.
        window: Number of preceding tokens inspected for degree adverbs and
            negation words. Defaults to 3.
        sentiment: Mapping word -> polarity weight. Defaults to the
            module-level ``sentiment_dict``.
        degree: Mapping degree adverb -> multiplier. Defaults to the
            module-level ``degree_dict``.
        negation: Container of negation words. Defaults to the module-level
            ``negation_dict``.

    Returns:
        The total score rounded to two decimals; 0 when no token is a
        sentiment word.
    """
    sentiment = sentiment_dict if sentiment is None else sentiment
    degree = degree_dict if degree is None else degree
    negation = negation_dict if negation is None else negation

    seg_list = sen.split()  # Convert text to list of words
    emotion_score = 0
    for i, word in enumerate(seg_list):
        if word not in sentiment:
            continue
        degree_score = 1  # Product of the degree adverbs in the window
        neg_count = 0  # Negation words in the window
        for modifier in seg_list[max(0, i - window):i]:
            # A word present in both lexicons is treated as a degree adverb
            if modifier in degree:
                degree_score *= degree[modifier]
            elif modifier in negation:
                neg_count += 1
        weighted = sentiment[word] * degree_score
        # An even number of negations cancels out; an odd number flips the sign
        if neg_count % 2 == 0:
            emotion_score += weighted
        else:
            emotion_score -= weighted
    return round(emotion_score, 2)

In [9]:
# Testing a typical positive comment
# The review is already segmented: words are separated by single spaces,
# which is the form sentiment_score() splits on.
sen = '位置 优秀 从来不 担心 车 没 地方 停 收费 贵 点儿 地下 停车场 设计 很 好 停车 极其 楼 基础设施 完善 逛起来 很 舒服 好评 点赞'
score = sentiment_score(sen)
print(score)

11.0


In [10]:
# Testing a typical negative comment
# Pre-segmented review; a score below zero is what the batch step later
# labels as polarity 'N'.
sen = '座位 没有 想 坐下 休息 地方 没有 逛累 顾客 滚蛋'
score = sentiment_score(sen)
print(score)

-2


In [11]:
# Testing a second comment that the author labels as negative
# NOTE(review): the recorded output for this cell is 1.02, which the batch
# step would label 'P' (score >= 1) - presumably this cell illustrates a case
# the lexicon method gets wrong; confirm the intended label.
sen = '商场 整体 环境 还 行 有点 闷 月份 很 热 吃 饭 热 不行'
score = sentiment_score(sen)
print(score)

1.02


### Apply to previously processed comment text

In [12]:
# Score the four pre-segmented review files and label each review's polarity
for i in range(1, 5):
    # Read in text file: one whitespace-segmented review per line
    with open(f'../data/review/processed/reviews_{i}.txt', 'r', encoding='utf-8') as f:
        # Drop the line terminator so it is not stored as part of the review
        # text in the CSV written below
        reviews = [line.rstrip('\n') for line in f]

    # Calculate the lexicon-based sentiment score of every review
    sentiment_scores = [sentiment_score(review) for review in reviews]

    # Derive the polarity label from the score
    polarity = []
    for score in sentiment_scores:
        if score < 0:
            polarity.append('N') # Any score below zero is negative
        elif score >= 1:
            polarity.append('P') # Only scores of at least 1 count as positive
        else:
            # Scores in [0, 1) are neutral.
            # NOTE(review): the bands are asymmetric (e.g. -0.5 is 'N' but
            # 0.5 is 'I') - confirm this is intended.
            polarity.append('I')

    # Create a DataFrame for the reviews and their sentiment scores and polarity
    df = pd.DataFrame(reviews, columns=['review_text'])
    df['sentiment_score'] = sentiment_scores
    df['polarity'] = polarity

    # Save the DataFrame as a CSV file
    df.to_csv(f'../data/review/processed/with_sentiment_score/reviews_{i}.csv', index=False)


## 3. Use snownlp to help verify comment polarity

In [13]:
# Add a SnowNLP score and polarity label to each of the four scored CSV files
for i in range(1, 5):
    # Read the CSV file produced by the lexicon-based scoring step
    df = pd.read_csv(f'../data/review/processed/with_sentiment_score/reviews_{i}.csv')

    # Read the review text file (raw, unsegmented reviews, one per line)
    with open(f'../data/review/reviews_{i}.txt', 'r', encoding='utf-8') as f:
        reviews = f.readlines()

    # Calculate sentiment score and polarity for each review and store in lists
    sentiment_scores = []
    polarities = []
    for review in reviews:
        # `sentiments` is a property that runs the classifier on every
        # access, so evaluate it once per review and reuse the value
        sentiment = SnowNLP(review).sentiments
        sentiment_scores.append(sentiment)
        if sentiment < 0.4:
            polarities.append('N')
        elif sentiment < 0.6:
            polarities.append('I')
        else:
            polarities.append('P')

    # Add the sentiment scores and polarities as new columns to the DataFrame.
    # Rows are matched by position; pandas raises ValueError if the text file
    # and the CSV do not contain the same number of reviews.
    df['snownlp'] = sentiment_scores
    df['polarity_snow_nlp'] = polarities

    # Write the modified data back to the original CSV file
    df.to_csv(f'../data/review/processed/with_sentiment_score/reviews_{i}.csv', index=False)

## 4. Data Analysis

### Data Concatenation and Organization

In [24]:
# Put the scored columns and the raw review table side by side, one output
# file per review set. Rows are aligned by their position in the two files.
for i in range(1, 5):
    scored = pd.read_csv(f'../data/review/processed/with_sentiment_score/reviews_{i}.csv')
    raw = pd.read_csv(f'../data/review/reviews_{i}.csv')
    combined = pd.concat([scored, raw], axis='columns')
    combined.to_csv(f'../data/review/analysis/reviews_{i}.csv', index=False)

In [31]:
# Tidy the four combined tables and keep them in a dict keyed 'reviews_<i>'
dfs = {}

for i in range(1, 5):
    df = pd.read_csv(f'../data/review/analysis/reviews_{i}.csv')

    # The last column becomes the first one; all other columns keep their
    # relative order. Positions are used rather than labels.
    last_col = df.columns[-1]
    n_cols = df.shape[1]
    df = df.iloc[:, [n_cols - 1, *range(n_cols - 1)]]

    # Give the columns the names used in the analysis
    new_names = {
        last_col: 'review',
        'review_text': 'review_splitting',
        'sentiment_score': 'senti_score',
        'polarity': 'polarity',
        'snownlp': 'senti_score_snownlp',
        'polarity_snow_nlp': 'polarity_snownlp'
    }
    df = df.rename(columns=new_names)

    dfs[f'reviews_{i}'] = df

# Expose each table under its own variable name
reviews_1_df, reviews_2_df, reviews_3_df, reviews_4_df = (
    dfs[f'reviews_{n}'] for n in range(1, 5)
)

In [32]:
# Display the tidied table built from the first review file
reviews_1_df

Unnamed: 0,review,review_splitting,senti_score,polarity,senti_score_snownlp,polarity_snownlp
0,"特别赞的一家商场,在王府井这个商场云集和大牌云集的地方,apm商场算是人气特别高的了,交通,...",特别 赞 一家 商场 王府井 商场 云集 大牌 云集 地方 apm 商场 算是 人气 特别 ...,18.25,P,1.000000,P
1,"LINLEE在王府井apm也开新店了,超级喜欢他们家的口味,和朋友逛街无意看到的,果断去买一...",LINLEE 王府井 apm 开 新店 超级 喜欢 家 口味 朋友 逛街 无意 果断 买 一...,10.30,P,1.000000,P
2,"北京apm,东城区商场热门榜第一名️,王府井大街号,环境,商场环境干净整齐,布局分明,美食购...",北京 apm 东城区 商场 热门 榜 第一名 ️ 王府井大街 号 环境 商场 环境 干净 整...,4.00,P,0.999969,P
3,"一定要错过下午点半以后,不然吃饭只能排队等半个小时,而且拿到号了一定别因为还有十几桌就走开,...",错过 下午 点半 吃饭 只能 排队 半个 小时 拿到 号 别 十几桌 走开 过 号 好久\n,0.00,I,0.016099,N
4,"花,Young的年华,年️日,月日,北京apm首层中庭,迎来位艺术家作品联展,春意盎然的️,...",花 Young 年华 年 ️ 日 月 日 北京 apm 首层 中庭 迎来 位 艺术家 作品 ...,7.00,P,0.999991,P
...,...,...,...,...,...,...
4253,"在王府井较大的商场,亮堂宽敞,品牌云集,原来叫新东安商场,周围也云集了很多商场,商场之间有些...",王府井 较大 商场 亮堂 宽敞 品牌 云集 新东安 商场 云集 商场 商场 之间 有些 差异...,-2.30,N,0.999937,P
4254,"新年伊始,出门看个天使陷落,打卡完小吃,正好到最近的apm百老汇,王府井的老商场了,各种品牌...",新年伊始 出门 看个 天使 陷落 打卡 完 小吃 正好 apm 百老汇 王府井 老 商场 品...,2.60,P,0.999997,P
4255,"常去apm不是为逛街,主要为了吃,当然吃完也要逛逛消食,北京apm,新东安广场,地处北京王府...",常去 apm 不是 逛街 吃 吃 完 逛逛 消食 北京 apm 新东安 广场 地处 北京王府...,6.20,P,0.999995,P
4256,王府井apm经常和媳妇儿过来逛因为有的吃有的逛而且媳妇儿喜欢这儿的外婆家这次过来是陪朋友买手...,王府井 apm 媳妇儿 逛 吃 逛 媳妇儿 喜欢 外婆家 陪 朋友 买手机 顺便 逛逛 媳妇...,1.00,P,0.977859,P
