In [1]:
import pandas as pd
import re
import os
import regex as re
import string
import csv

## 1. Extract the review data from csv and save it as txt

In [2]:
# Define the file paths to read from
file_paths = ['../data/北京apm.csv', '../data/王府中環.csv', '../data/王府井百货.csv', '../data/东方新天地.csv']

# Loop over each file, extract the data from the 'review' column and save it to a text file
for i, path in enumerate(file_paths):
    # Read the CSV file
    df = pd.read_csv(path, index_col=0)  # Set the index_col to 0 to use the first column as the index
    # Define the path for the output text file
    txt_path = f'../data/review/reviews_{i+1}.txt'
    # Write the review data to the output text file
    with open(txt_path, 'w') as f:
        for j, review in enumerate(df['review']):
            # Clean up the text data by removing special characters
            review = re.sub('[\n\r\t ]+', '', review)
            # Write the review data to the output text file
            f.write(str(review) + '\n')

## 2. Formatting (Remove emoji, spaces & Replace punctuations)

In [3]:
# Define the file paths to modify
txt_paths = ['../data/review/reviews_1.txt', '../data/review/reviews_2.txt', '../data/review/reviews_3.txt', '../data/review/reviews_4.txt']

# Define a string of all punctuation marks
all_punct = string.punctuation + '，。？！：；（）“”‘’【】『』《》〈〉·—…～、｜「」'

# Define regular expression matching pattern
pattern = re.compile(',+')

# Loop over each file and modify its contents
for txt_path in txt_paths:
    # Open the file for reading
    with open(txt_path, 'r') as file:
        # Read the contents of the file and replace all punctuation marks with commas
        contents = file.read().translate(str.maketrans(all_punct, ',' * len(all_punct)))
        
        # Remove all emoji symbols
        contents = re.sub(r'\p{Emoji}', '', contents)
        
        # Replacing consecutive commas
        contents = re.sub(pattern, ',', contents)
        
        # Remove trailing commas from each line
        contents = '\n'.join(line.rstrip(',') for line in contents.split('\n'))
        
    # Open the file for writing
    with open(txt_path, 'w') as file:
        # Write the modified contents to the file
        file.write(contents)


## 3. Merge into a new txt

In [4]:
# Define the file paths to read from
txt_paths = ['../data/review/reviews_1.txt', '../data/review/reviews_2.txt', '../data/review/reviews_3.txt', '../data/review/reviews_4.txt']

# Define the path for the output merged text file
merged_txt_path = '../data/review/reviews_merged.txt'

# Open the output file for writing
with open(merged_txt_path, 'w', newline='\n') as merged_file:
    # Loop over each input file
    for txt_path in txt_paths:
        # Open the input file for reading
        with open(txt_path, 'r', newline='') as input_file:
            # Loop over each line in the input file
            for line in input_file:
                # Write the line to the output file
                merged_file.write(line.rstrip('\r\n') + '\n')


### Read into dataframe to validate

In [5]:
# Define the path to the merged text file
merged_txt_path = '../data/review/reviews_merged.txt'

# Read the merged text file into a DataFrame
df = pd.read_csv(merged_txt_path, delimiter='\t', names=['review'])

# Print the DataFrame
df

Unnamed: 0,review
0,"特别赞的一家商场,在王府井这个商场云集和大牌云集的地方,apm商场算是人气特别高的了,交通,..."
1,"LINLEE在王府井apm也开新店了,超级喜欢他们家的口味,和朋友逛街无意看到的,果断去买一..."
2,"北京apm,东城区商场热门榜第一名️,王府井大街号,环境,商场环境干净整齐,布局分明,美食购..."
3,"一定要错过下午点半以后,不然吃饭只能排队等半个小时,而且拿到号了一定别因为还有十几桌就走开,..."
4,"花,Young的年华,年️日,月日,北京apm首层中庭,迎来位艺术家作品联展,春意盎然的️,..."
...,...
10479,"商场好大,要不是朋友告诉我,穿过这个商场,就可以走到地铁"
10480,"年国庆成立年阅兵的时候这个商场刚刚开,那个时候这里全是高消费的地方,各种名牌荟聚,转眼已经多..."
10481,"很久没有来东方广场了,今天路过来看看,硬件保持的还是挺好的,里面加了好多珠宝品牌,珠宝什么时..."
10482,"嗨,就你,你快乐,新年到,马上就喜迎金鼠啦,现在在东方新天地,你可以遇到各种姿态的超萌的,跟..."


## 4. Convert to csv file for storage

In [6]:
# Define the list of txt file paths that need to be converted
txt_paths = ['../data/review/reviews_1.txt', '../data/review/reviews_2.txt', '../data/review/reviews_3.txt', '../data/review/reviews_4.txt', '../data/review/reviews_merged.txt']

# Loop through each txt file and convert it to a csv file
for txt_path in txt_paths:
    # Generate the corresponding csv file path
    csv_path = txt_path.replace('.txt', '.csv')
    
    # Open the txt file and csv file
    with open(txt_path, 'r', encoding='utf-8') as f_txt, open(csv_path, 'w', encoding='utf-8', newline='') as f_csv:
        # Create a csv.writer object to write data to the csv file
        writer = csv.writer(f_csv)
        
        # Write the header row
        writer.writerow(['review'])
        
        # Read each line in the txt file and write it to the csv file
        for line in f_txt:
            # Wrap each line of data in a list to represent a row of data in the csv file
            writer.writerow([line.strip()])



## 5. Remove stopwords

In [7]:
# Define the file paths to read from and write to
input_file_paths = ['../data/review/reviews_1.txt', '../data/review/reviews_2.txt', '../data/review/reviews_3.txt', '../data/review/reviews_4.txt']
output_file_paths = ['../data/review/processed/processed_sw_1.txt', '../data/review/processed/processed_sw_2.txt', '../data/review/processed/processed_sw_3.txt', '../data/review/processed/processed_sw_4.txt']

# Define the paths to your custom stop word dictionaries
stopword_paths = ['stopwords/hit_stopwords.txt', 'stopwords/scu_stopwords.txt', 'stopwords/baidu_stopwords.txt', 'stopwords/cn_stopwords.txt']

# Combine all stop word dictionaries into one set
stopwords = set([word.strip() for path in stopword_paths for word in open(path, 'r', encoding='utf-8')])

# Define a function to process each review text
def process_text(text):
    # Remove all emoji
    text = re.sub(r'\p{Emoji}', '', text)
    # Remove all punctuation
    text = re.sub(r'[^\w\s]+', '', text)
    # Remove stop words
    for word in stopwords:
        text = text.replace(word, '')
    # Return the processed text as a string
    return text.strip()

# Loop over each file, process the review text and save it to a new text file
for i, input_file_path in enumerate(input_file_paths):
    with open(input_file_path, 'r', encoding='utf-8') as input_file, open(output_file_paths[i], 'w', encoding='utf-8') as output_file:
        for line in input_file:
            # Process the review text
            processed_text = process_text(line.strip())
            # Write the processed text to the output file
            output_file.write(processed_text + '\n')