In [8]:
import pandas as pd
import re
import os
import regex as re

## 1. Extract the review data from the CSV files and save it as text files

In [2]:
# Define the file paths to read from
file_paths = ['../data/北京apm.csv', '../data/王府中環.csv', '../data/王府井百货.csv', '../data/东方新天地.csv']

# For each CSV, write its 'review' column to reviews_<n>.txt, one review per line
for i, path in enumerate(file_paths):
    # Read the CSV file, using its first column as the index
    df = pd.read_csv(path, index_col=0)
    # Define the path for the output text file (numbered from 1)
    txt_path = f'../data/review/reviews_{i+1}.txt'
    # Make sure the output folder exists so a fresh checkout can run this cell
    os.makedirs(os.path.dirname(txt_path), exist_ok=True)
    # Empty cells are read as NaN (a float), which re.sub cannot process;
    # turn them into empty strings so every CSV row still yields one line.
    reviews = df['review'].fillna('').astype(str)
    # Write as UTF-8 explicitly: the later cleaning cells read these files
    # with encoding='utf-8', and the locale default is not UTF-8 everywhere.
    with open(txt_path, 'w', encoding='utf-8') as f:
        for review in reviews:
            # Collapse each review onto a single line by dropping newlines,
            # carriage returns, tabs and spaces
            review = re.sub('[\n\r\t ]+', '', review)
            f.write(review + '\n')


### Merge the review text files into a single file

In [3]:
# Define the file paths to read from
txt_paths = ['../data/review/reviews_1.txt', '../data/review/reviews_2.txt', '../data/review/reviews_3.txt', '../data/review/reviews_4.txt']

# Define the path for the output merged text file
merged_txt_path = '../data/review/review_merged.txt'

# Concatenate the files byte-for-byte. Binary mode avoids decoding and
# re-encoding with the locale's default codec, which can fail or corrupt text
# when the files are UTF-8 (as the later cleaning cells expect) but the
# locale is not.
with open(merged_txt_path, 'wb') as merged_file:
    for txt_path in txt_paths:
        with open(txt_path, 'rb') as input_file:
            line = b''
            for line in input_file:
                merged_file.write(line)
            # If a file does not end with a newline, add one so its last
            # review is not glued to the first review of the next file.
            if line and not line.endswith(b'\n'):
                merged_file.write(b'\n')


## 2. Remove punctuation and emoji

In [10]:
# Review files to clean and the destination for each one; the two lists are
# paired by position (reviews_<n>.txt -> processed/processed_<n>.txt, n = 1..4)
input_file_paths = [f'../data/review/reviews_{n}.txt' for n in range(1, 5)]
output_file_paths = [f'../data/review/processed/processed_{n}.txt' for n in range(1, 5)]

# Define a function to process each review text
def process_text(text):
    """Strip emoji and punctuation from a single review.

    Note that `re` in this notebook is the third-party `regex` module
    (imported as `re`), which is what makes the \\p{...} property syntax
    available.

    Args:
        text: One review as a string.

    Returns:
        The review with emoji removed, every character that is neither a word
        character nor whitespace removed, and surrounding whitespace trimmed.
    """
    # Remove all emoji. The Unicode Emoji property also includes the ASCII
    # digits, '#' and '*' (keycap bases), so a bare \p{Emoji} would delete
    # every number in a review; the lookahead keeps them out of the match.
    text = re.sub(r'(?![0-9#*])\p{Emoji}', '', text)
    # Remove all punctuation (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]+', '', text)
    # Return the processed text as a string
    return text.strip()

# Clean every review file line by line and store the result in the output
# file that sits at the same position in output_file_paths
for i, input_file_path in enumerate(input_file_paths):
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        with open(output_file_paths[i], 'w', encoding='utf-8') as output_file:
            # One cleaned review per output line
            output_file.writelines(
                process_text(raw_line.strip()) + '\n' for raw_line in input_file
            )


## 3. Remove stopwords

In [11]:
# Review files to clean and the destination for each one; the two lists are
# paired by position (reviews_<n>.txt -> processed/processed_sw_<n>.txt, n = 1..4)
input_file_paths = [f'../data/review/reviews_{n}.txt' for n in range(1, 5)]
output_file_paths = [f'../data/review/processed/processed_sw_{n}.txt' for n in range(1, 5)]

# Define the paths to your custom stop word dictionaries
stopword_paths = ['stopwords/hit_stopwords.txt', 'stopwords/scu_stopwords.txt', 'stopwords/baidu_stopwords.txt', 'stopwords/cn_stopwords.txt']

# Combine all stop word dictionaries into one set
stopwords = set()
for stopword_path in stopword_paths:
    # Use a context manager so each dictionary file is closed after reading
    with open(stopword_path, 'r', encoding='utf-8') as stopword_file:
        stopwords.update(word.strip() for word in stopword_file)
# Blank lines in a dictionary become '' after strip(); that is not a stop word
stopwords.discard('')

# Removal order matters when stop words overlap (e.g. one is a substring of
# another), and iterating a set of strings can give a different order on every
# kernel start. Fix the order: longest first, ties broken alphabetically.
_ordered_stopwords = sorted(stopwords, key=lambda word: (-len(word), word))

# Define a function to process each review text
def process_text(text):
    """Strip emoji, punctuation and stop words from a single review.

    Note that `re` in this notebook is the third-party `regex` module
    (imported as `re`), which is what makes the \\p{...} property syntax
    available.

    Args:
        text: One review as a string.

    Returns:
        The review with emoji, punctuation and every occurrence of a stop
        word removed, and surrounding whitespace trimmed.
    """
    # Remove all emoji. The Unicode Emoji property also includes the ASCII
    # digits, '#' and '*' (keycap bases), so a bare \p{Emoji} would delete
    # every number in a review; the lookahead keeps them out of the match.
    text = re.sub(r'(?![0-9#*])\p{Emoji}', '', text)
    # Remove all punctuation (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]+', '', text)
    # Remove stop words, longest first.
    # NOTE(review): this is plain substring replacement, so a stop word is also
    # removed when it appears inside a longer word; filtering whole tokens
    # would need a word segmenter, which this notebook does not use.
    for word in _ordered_stopwords:
        text = text.replace(word, '')
    # Return the processed text as a string
    return text.strip()

# Run process_text over every line of each review file and store the result
# in the output file that sits at the same position in output_file_paths
for i, input_file_path in enumerate(input_file_paths):
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        with open(output_file_paths[i], 'w', encoding='utf-8') as output_file:
            # One processed review per output line
            output_file.writelines(
                process_text(raw_line.strip()) + '\n' for raw_line in input_file
            )