In [1]:
import pandas as pd
import re
import os
import regex as re
import string
import csv
import jieba
import jieba.posseg as pseg
from collections import Counter

## 1. Extract the review data from csv and save it as txt

In [2]:
# Define the file paths to read from
file_paths = ['../data/北京apm.csv', '../data/王府中環.csv', '../data/王府井百货.csv', '../data/东方新天地.csv']

# Loop over each file, extract the data from the 'review' column and save it to a text file
for i, path in enumerate(file_paths):
    # Read the CSV file
    df = pd.read_csv(path, index_col=0)  # Set the index_col to 0 to use the first column as the index
    # Define the path for the output text file
    txt_path = f'../data/review/reviews_{i+1}.txt'
    # Write the review data to the output text file
    with open(txt_path, 'w') as f:
        for j, review in enumerate(df['review']):
            # Clean up the text data by removing special characters
            review = re.sub('[\n\r\t ]+', '', review)
            # Write the review data to the output text file
            f.write(str(review) + '\n')

## 2. Formatting (Remove emoji, spaces & Replace punctuations)

In [3]:
# Define the file paths to modify
txt_paths = ['../data/review/reviews_1.txt', '../data/review/reviews_2.txt', '../data/review/reviews_3.txt', '../data/review/reviews_4.txt']

# Define a string of all punctuation marks
all_punct = string.punctuation + '，。？！：；（）“”‘’【】『』《》〈〉·—…～、｜「」'

# Define regular expression matching pattern
pattern = re.compile(',+')

# Loop over each file and modify its contents
for txt_path in txt_paths:
    # Open the file for reading
    with open(txt_path, 'r') as file:
        # Read the contents of the file and replace all punctuation marks with commas
        contents = file.read().translate(str.maketrans(all_punct, ',' * len(all_punct)))
        
        # Remove all emoji symbols
        contents = re.sub(r'\p{Emoji}', '', contents)
        
        # Replacing consecutive commas
        contents = re.sub(pattern, ',', contents)
        
        # Remove trailing commas from each line
        contents = '\n'.join(line.rstrip(',') for line in contents.split('\n'))
        
    # Open the file for writing
    with open(txt_path, 'w') as file:
        # Write the modified contents to the file
        file.write(contents)


## 3. Merge into a new txt

In [4]:
# Define the file paths to read from
txt_paths = ['../data/review/reviews_1.txt', '../data/review/reviews_2.txt', '../data/review/reviews_3.txt', '../data/review/reviews_4.txt']

# Define the path for the output merged text file
merged_txt_path = '../data/review/reviews_merged.txt'

# Open the output file for writing
with open(merged_txt_path, 'w', newline='\n') as merged_file:
    # Loop over each input file
    for txt_path in txt_paths:
        # Open the input file for reading
        with open(txt_path, 'r', newline='') as input_file:
            # Loop over each line in the input file
            for line in input_file:
                # Write the line to the output file
                merged_file.write(line.rstrip('\r\n') + '\n')


### Read into dataframe to validate

In [5]:
# Define the path to the merged text file
merged_txt_path = '../data/review/reviews_merged.txt'

# Read the merged text file into a DataFrame
df = pd.read_csv(merged_txt_path, delimiter='\t', names=['review'])

# Print the DataFrame
pd.set_option('display.max_colwidth', None)

df['review'].head(3)

0      特别赞的一家商场,在王府井这个商场云集和大牌云集的地方,apm商场算是人气特别高的了,交通,地铁号线金鱼胡同地铁口,交通便利,环境,整个商场特别收拾的很干净而且吃喝完了种类特别的全,这次来正好看到有画展,很有意思还在画展逛了一圈的,很不错,就逛逛别的地方,这里正的是可以让你带一天的好地方
1    LINLEE在王府井apm也开新店了,超级喜欢他们家的口味,和朋友逛街无意看到的,果断去买一杯比较推荐他家的泰绿手打柠檬茶和手打柠檬茶,如果喜欢喝酸一点的是特浓手打柠檬茶,个人推荐三分甜,感觉酸涩度刚刚的好,夏天了,喝柠檬茶解渴而且还补充维C,来一杯,元气满满,超赞而且还有小鸭子送,太可爱了
2                                        北京apm,东城区商场热门榜第一名️,王府井大街号,环境,商场环境干净整齐,布局分明,美食购物️丽人,汽车为一体,坐落于王府井大街,每一层都有每一层的特色,其中就有必吃的小大董,还有捞王锅物料理,楼全是美食,麻六记,局气
Name: review, dtype: object

## 4. Convert to csv file for storage

In [6]:
# Define the list of txt file paths that need to be converted
txt_paths = ['../data/review/reviews_1.txt', '../data/review/reviews_2.txt', '../data/review/reviews_3.txt', '../data/review/reviews_4.txt', '../data/review/reviews_merged.txt']

# Loop through each txt file and convert it to a csv file
for txt_path in txt_paths:
    # Generate the corresponding csv file path
    csv_path = txt_path.replace('.txt', '.csv')
    
    # Open the txt file and csv file
    with open(txt_path, 'r', encoding='utf-8') as f_txt, open(csv_path, 'w', encoding='utf-8', newline='') as f_csv:
        # Create a csv.writer object to write data to the csv file
        writer = csv.writer(f_csv)
        
        # Write the header row
        writer.writerow(['review'])
        
        # Read each line in the txt file and write it to the csv file
        for line in f_txt:
            # Wrap each line of data in a list to represent a row of data in the csv file
            writer.writerow([line.strip()])



## 5. Splitting

### Custom stopwords dictionary（two different versions）

In [7]:
# Define a list of file paths for the stop word files
stopword_paths = ['stopwords/source/hit_stopwords.txt', 'stopwords/source/scu_stopwords.txt', 'stopwords/source/baidu_stopwords.txt', 'stopwords/source/cn_stopwords.txt']

# Read all stop words into a single list
stopwords = []
for path in stopword_paths:
    with open(path, 'r', encoding='utf-8') as f:
        # Read each line in the file, strip whitespace, and add the resulting words to the list
        words = [w.strip() for w in f.readlines()]
        stopwords.extend(words)

# Remove duplicate stop words
stopwords = list(set(stopwords))

# Sort the stop words alphabetically
stopwords.sort()

# Write all stop words to a file
with open('stopwords/stopwords.txt', 'w', encoding='utf-8') as f:
    # Join the stop words into a single string separated by newlines, and write to the file
    f.write('\n'.join(stopwords))

# Read adverb words from file
adv_path = 'sentiment_dic/base/adv_dic.txt'
with open(adv_path, 'r', encoding='utf-8') as f:
    adverbs = [w.strip() for w in f.readlines()]

# Remove adverb words from stopwords
stopwords = [w for w in stopwords if w not in adverbs]

# Read negative words from file
neg_path = 'sentiment_dic/base/negative_dic.txt'
with open(neg_path, 'r', encoding='utf-8') as f:
    negatives = [w.strip() for w in f.readlines()]

# Remove negative words from stopwords
stopwords = [w for w in stopwords if w not in negatives]

# Sort the stop words alphabetically
stopwords.sort()

# Write modified stop words to a file
with open('stopwords/stopwords_pro.txt', 'w', encoding='utf-8') as f:
    # Join the stop words into a single string separated by newlines, and write to the file
    f.write('\n'.join(stopwords))


### Split word processing

In [8]:
# Define the file paths to read from and write to
input_file_paths = ['../data/review/reviews_1.txt', '../data/review/reviews_2.txt', '../data/review/reviews_3.txt', '../data/review/reviews_4.txt', '../data/review/reviews_merged.txt']
output_directory = '../data/review/processed/'

# Set the file path for the stop words
stopword_path = 'stopwords/stopwords_pro.txt'

# Create a set of stop words by opening the file and using a set comprehension
with open(stopword_path, 'r', encoding='utf-8') as f:
    stopwords = {word.strip() for word in f}

# Load the custom dictionary
custom_dict_path = 'stopwords/custom_dict.txt'
jieba.load_userdict(custom_dict_path)

# Define a function to process each review text
def process_text(text):
    # Tokenize the text with Jieba and enable HMM
    tokens = jieba.cut(text, cut_all=False, HMM=True)
    # Remove stop words and punctuation
    tokens = [token for token in tokens if token not in stopwords and token.strip() and not token.isnumeric()]
    # Return the processed text as a string
    return ' '.join(tokens)

# Loop over each file, process the review text and save it to a new text file
for input_file_path in input_file_paths:
    # Create the output file path
    output_file_path = os.path.join(output_directory, os.path.basename(input_file_path))
    # Open the input and output files
    with open(input_file_path, 'r', encoding='utf-8') as input_file, open(output_file_path, 'w', encoding='utf-8') as output_file:
        # Process each line of the input file
        for line in input_file:
            # Process the review text
            processed_text = process_text(line.strip())
            # Write the processed text to the output file
            output_file.write(processed_text + '\n')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/st/jr2134cd02575p7pnq8xy5440000gn/T/jieba.cache
Loading model cost 0.283 seconds.
Prefix dict has been built successfully.


## 6.Part-of-speech tagging and word frequency statistics.

### 北京APM

In [9]:
# Read file
with open('../data/review/processed/reviews_1.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Perform Part-of-Speech tagging on the text
words = pseg.cut(content)

# Count the frequency of nouns and verbs
word_counts = Counter([word for word, flag in words if flag.startswith('n') or flag.startswith('v')])

# Print the 50 most frequently occurring words
for word, count in word_counts.most_common(50):
    print(word, count)

商场 3607
王府井 1894
逛 1654
品牌 1290
吃 1189
北京 1137
店 1125
买 1004
喜欢 1001
活动 982
没有 775
感觉 766
没 636
吃饭 604
逛街 567
疫情 540
地方 511
环境 478
购物 467
打卡 465
逛逛 454
开 422
餐饮 422
拍照 421
朋友 407
店铺 400
时尚 395
美食 391
想 391
适合 388
东西 387
东安市场 382
说 378
步行街 371
衣服 366
装修 351
排队 349
苹果 348
展览 347
王府井大街 331
玩 331
做 326
爱 320
人 313
餐厅 311
可爱 305
大牌 300
体验 290
整体 286
位置 286


### 王府中環

In [10]:
# Read file
with open('../data/review/processed/reviews_2.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Perform Part-of-Speech tagging on the text
words = pseg.cut(content)

# Count the frequency of nouns and verbs
word_counts = Counter([word for word, flag in words if flag.startswith('n') or flag.startswith('v')])

# Print the 50 most frequently occurring words
for word, count in word_counts.most_common(50):
    print(word, count)

商场 2240
王府井 1050
品牌 827
环境 826
咖啡 764
喜欢 758
逛 746
吃 583
北京 572
没有 548
感觉 544
餐厅 518
高端 501
地方 485
买 484
大牌 481
店 453
没 403
人 374
活动 370
说 367
打卡 354
购物 351
拍照 335
吃饭 328
带 325
适合 311
卫生间 280
孩子 280
停车场 271
中环 269
疫情 264
餐饮 259
服务 252
开 247
整体 246
东西 244
草坪 239
爱 237
喝 227
娃 227
奢侈品 224
停车 223
步行街 220
想 220
设计 219
排队 217
朋友 215
拍 210
体验 208


### 王府井百货

In [11]:
# Read file
with open('../data/review/processed/reviews_3.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Perform Part-of-Speech tagging on the text
words = pseg.cut(content)

# Count the frequency of nouns and verbs
word_counts = Counter([word for word, flag in words if flag.startswith('n') or flag.startswith('v')])

# Print the 50 most frequently occurring words
for word, count in word_counts.most_common(50):
    print(word, count)

百货大楼 1609
北京 1228
王府井 1019
商场 702
和平 423
逛 412
买 384
没有 373
感觉 357
地方 294
没 287
小时候 285
品牌 250
回忆 242
北京市 235
局 233
打卡 230
步行街 221
王府井大街 217
说 210
张秉贵 204
吃 201
东西 194
胡同 191
喜欢 183
店 181
带 177
疫情 174
逛逛 171
中国 169
记忆 162
孩子 161
化妆品 160
人 158
购物 158
果局 152
卖 152
环境 152
玩具 143
想 136
来 135
活动 135
大楼 133
拍照 124
一楼 115
位于 115
感 114
朋友 114
值得 113
游客 113


### 东方新天地

In [12]:
# Read file
with open('../data/review/processed/reviews_4.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Perform Part-of-Speech tagging on the text
words = pseg.cut(content)

# Count the frequency of nouns and verbs
word_counts = Counter([word for word, flag in words if flag.startswith('n') or flag.startswith('v')])

# Print the 50 most frequently occurring words
for word, count in word_counts.most_common(50):
    print(word, count)

商场 970
王府井 513
品牌 379
逛 348
吃 240
购物 232
没有 231
环境 217
疫情 205
北京 202
东单 201
地方 196
地铁 196
店 196
喜欢 194
没 193
感觉 191
吃饭 169
买 165
停车 162
店铺 158
走 140
长安街 136
餐饮 134
开 126
逛街 123
说 122
东方广场 118
东西 117
新天地 112
服务 107
大牌 106
想 102
位置 102
交通 98
人 96
活动 91
高端 91
逛逛 89
免费 88
美食 87
适合 86
选择 85
时间 85
来 84
找 80
朋友 78
餐厅 78
位于 78
步行街 74
