In [1]:
import pandas as pd
import re

## 1. Extract the review data from the CSV files and save it as plain-text files

In [9]:
# Define the CSV files to read from
file_paths = ['../data/北京apm.csv', '../data/王府中環.csv', '../data/王府井百货.csv', '../data/东方新天地.csv']

# For each CSV file, write its 'review' column to a text file, one review per line
for i, path in enumerate(file_paths):
    # Read the CSV file, using its first column as the index
    df = pd.read_csv(path, index_col=0)
    # Output files are numbered from 1, in the order of file_paths
    txt_path = f'../data/review/reviews_{i+1}.txt'
    # Write UTF-8 explicitly: the platform default encoding may not be able to
    # represent the review text, and a later cell reads these files as UTF-8.
    with open(txt_path, 'w', encoding='utf-8') as f:
        for review in df['review']:
            # An empty CSV cell comes back from pandas as NaN (a float), which
            # re.sub() would reject with a TypeError. Convert to text first and
            # emit an empty line so every CSV row still maps to one output line.
            review_text = '' if pd.isna(review) else str(review)
            # Drop newlines, carriage returns, tabs and spaces so that each
            # review occupies exactly one line of the output file
            review_text = re.sub(r'[\n\r\t ]+', '', review_text)
            f.write(review_text + '\n')


### Merge the per-file reviews into a single text file

In [12]:
# Define the review files to merge, in the order they are concatenated
txt_paths = ['../data/review/reviews_1.txt', '../data/review/reviews_2.txt', '../data/review/reviews_3.txt', '../data/review/reviews_4.txt']

# Define the path for the output merged text file
merged_txt_path = '../data/review/review_merged.txt'

# Read and write UTF-8 explicitly so the result does not depend on the
# platform default encoding (a later cell reads the review files as UTF-8).
with open(merged_txt_path, 'w', encoding='utf-8') as merged_file:
    for txt_path in txt_paths:
        with open(txt_path, 'r', encoding='utf-8') as input_file:
            for line in input_file:
                # Guarantee a line terminator so the last review of one file
                # is never glued to the first review of the next file
                if not line.endswith('\n'):
                    line += '\n'
                merged_file.write(line)


## 2. Remove stop words, punctuation, and emoji

### Method without word segmentation

In [44]:
import os
import re
import emoji

# Review text files used as input
file_paths = [
    '../data/review/reviews_1.txt',
    '../data/review/reviews_2.txt',
    '../data/review/reviews_3.txt',
    '../data/review/reviews_4.txt',
]

# Destination files for the processed text, in the same order as file_paths
save_paths = [
    '../data/processed/processed_1.txt',
    '../data/processed/processed_2.txt',
    '../data/processed/processed_3.txt',
    '../data/processed/processed_4.txt',
]

# Stop word dictionary files
stopwords_paths = [
    'stopwords/baidu_stopwords.txt',
    'stopwords/cn_stopwords.txt',
    'stopwords/hit_stopwords.txt',
    'stopwords/scu_stopwords.txt',
]

# Define a function to remove punctuation marks from a string
def remove_punctuation(text):
    """Strip symbols from ``text`` while keeping a small set of punctuation.

    Characters that are kept: word characters, whitespace, the ASCII comma,
    the marks ，。！？ and every code point in U+FF00-U+FFEF. Every other
    character (for example ``!``, ``@`` or ``#``) is removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the disallowed characters.
    """
    # Raw string: in a plain literal '\w' and '\s' are invalid string escapes
    # (reported as a warning by recent Python versions). The re module itself
    # understands the \uXXXX escapes, so the pattern is unchanged.
    return re.sub(r'[^\w\s,，。！？\uff00-\uffef]', '', text)

# Load every stop word dictionary into one set, once, before processing.
# Picking the dictionary by file index would tie the stop words applied to a
# review file to its position in file_paths and break if the lists differ in length.
stopwords = set()
for stopwords_path in stopwords_paths:
    # Best effort: a missing dictionary is skipped instead of raising
    if os.path.exists(stopwords_path):
        with open(stopwords_path, 'r', encoding='utf-8') as f:
            stopwords.update(line.strip() for line in f)

# Loop over each file, read the text, clean it, drop stop words and save the result
for path, save_path in zip(file_paths, save_paths):
    # Read the whole input file; UTF-8 is given explicitly so the result does
    # not depend on the platform default encoding
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    # NOTE(review): emojize() does not turn emoji into text (that would be
    # demojize()). Emoji characters are instead dropped by remove_punctuation()
    # because they fall outside the character set it keeps - confirm this is
    # the intent.
    processed_text = remove_punctuation(emoji.emojize(text))

    # Tokens are whitespace-delimited chunks, so a stop word is only dropped
    # when an entire chunk equals it; the kept chunks are joined by single spaces
    processed_text = ' '.join(word for word in processed_text.split() if word not in stopwords)

    # Write the processed text to the output file
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(processed_text)

In [52]:
import re
import os

# Review text files to process
file_paths = [
    '../data/review/reviews_1.txt',
    '../data/review/reviews_2.txt',
    '../data/review/reviews_3.txt',
    '../data/review/reviews_4.txt',
]

# Output files, matched to file_paths by position
output_paths = [
    '../data/processed/processed_1.txt',
    '../data/processed/processed_2.txt',
    '../data/processed/processed_3.txt',
    '../data/processed/processed_4.txt',
]

# Custom stop word dictionary files
stopwords_paths = [
    'stopwords/baidu_stopwords.txt',
    'stopwords/cn_stopwords.txt',
    'stopwords/hit_stopwords.txt',
    'stopwords/scu_stopwords.txt',
]

# Merge the entries of every stop word dictionary into a single set;
# each line of a dictionary file contributes one entry, stripped of
# surrounding whitespace
stopwords = set()
for path in stopwords_paths:
    with open(path, 'r', encoding='utf-8') as f:
        stopwords.update(entry.strip() for entry in f)

# Define a function to process each review text
# Memoised removal orders, keyed by the content of the stop word collection
_REMOVAL_ORDER_CACHE = {}


def _removal_order(words):
    """Return the non-empty entries of ``words``, longest first (ties alphabetical)."""
    key = frozenset(words)
    if key not in _REMOVAL_ORDER_CACHE:
        _REMOVAL_ORDER_CACHE[key] = sorted((w for w in key if w), key=lambda w: (-len(w), w))
    return _REMOVAL_ORDER_CACHE[key]


def process_text(text, stop_words=None):
    """Strip symbols and stop words from a single review.

    First removes every run of characters that are not word characters,
    whitespace, or code points in U+1F300-U+1F5FF / U+1F600-U+1F64F. Then
    deletes every occurrence of each stop word as a plain substring (no word
    segmentation is involved, so a stop word is also removed when it appears
    inside a longer word).

    Args:
        text: The review text to clean.
        stop_words: Optional collection of stop words. When omitted, the
            module-level ``stopwords`` set is used.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    if stop_words is None:
        stop_words = stopwords
    # Remove everything except word characters, whitespace and the two emoji ranges
    text = re.sub(r'[^\w\s\U0001F600-\U0001F64F\U0001F300-\U0001F5FF]+', '', text)
    # Remove stop words longest first. Iterating the set directly gives an
    # order that can differ between runs, and removing a short stop word first
    # can break up a longer one so that it is no longer matched.
    for word in _removal_order(stop_words):
        text = text.replace(word, '')
    return text.strip()

# Process every review file line by line, writing one cleaned review per output line
for idx, src_path in enumerate(file_paths):
    dst_path = output_paths[idx]
    with open(src_path, 'r', encoding='utf-8') as src, open(dst_path, 'w', encoding='utf-8') as dst:
        for raw_line in src:
            # Clean the review after trimming its surrounding whitespace
            cleaned = process_text(raw_line.strip())
            # Each review stays on its own line in the output
            dst.write(f'{cleaned}\n')

### Method with word segmentation (jieba)

In [51]:
import os
import re
import jieba

# Review text files used as input
file_paths = [
    '../data/review/reviews_1.txt',
    '../data/review/reviews_2.txt',
    '../data/review/reviews_3.txt',
    '../data/review/reviews_4.txt',
]

# Destination files for the segmented text, in the same order as file_paths
save_paths = [
    '../data/processed/split_1.txt',
    '../data/processed/split_2.txt',
    '../data/processed/split_3.txt',
    '../data/processed/split_4.txt',
]

# Stop word dictionary files
stopwords_paths = [
    'stopwords/baidu_stopwords.txt',
    'stopwords/cn_stopwords.txt',
    'stopwords/hit_stopwords.txt',
    'stopwords/scu_stopwords.txt',
]

# Define a function to remove punctuation marks and emojis from a string
def remove_punctuation(text):
    """Remove every character that is neither a word character nor whitespace.

    NOTE: a helper with the same name but a more permissive character set is
    defined in an earlier cell; whichever definition was executed last is the
    one in effect.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` containing only word characters and whitespace.
    """
    # Raw string: in a plain literal '\w' and '\s' are invalid string escapes
    # (reported as a warning by recent Python versions)
    return re.sub(r'[^\w\s]+', '', text)

# Load every stop word dictionary into one set, once, before processing.
# Picking the dictionary by file index would tie the stop words applied to a
# review file to its position in file_paths and break if the lists differ in length.
stopwords = set()
for stopwords_path in stopwords_paths:
    # Best effort: a missing dictionary is skipped instead of raising
    if os.path.exists(stopwords_path):
        with open(stopwords_path, 'r', encoding='utf-8') as f:
            stopwords.update(line.strip() for line in f)

# Loop over each file, segment the text, drop stop words and symbols, and save the result
for path, save_path in zip(file_paths, save_paths):
    # Read the whole input file; UTF-8 is given explicitly so the result does
    # not depend on the platform default encoding
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    processed_words = []
    # Tokenize the text using jieba
    for word in jieba.cut(text):
        # Skip stop words and whitespace-only tokens
        if word in stopwords or not word.strip():
            continue
        # Strip symbols per token, before joining: cleaning the joined string
        # instead leaves an empty slot (a run of spaces) wherever a token
        # consisted only of punctuation or emoji
        cleaned = remove_punctuation(word).strip()
        if cleaned:
            processed_words.append(cleaned)

    processed_text = ' '.join(processed_words)

    # Write the processed text to the output file
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(processed_text)