### 1. Conver .txt file to .json file

In [13]:
import json

input_file = "Openrice_Cantonese.txt"  
output_file = "Openrice_Cantonese.json"  

formatted_data = []

with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()  
        if not line:  
            continue
        try:
            # Separate CLASS and TEXT using \t\t
            rating, review = line.split("\t\t", 1)  
            formatted_data.append({"text": review, "label": int(rating)})  
        except ValueError:
            print(f"跳过格式错误的行: {line}") #"Misformed lines are skipped:"

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(formatted_data, f, ensure_ascii=False, indent=2)

print(f"转换完成，数据集已保存为 JSON 文件：{output_file}") #"The conversion is complete and the dataset has been saved as a JSON file:"


转换完成，数据集已保存为 JSON 文件：Openrice_Cantonese.json


### 2. Divided into training set and testing set
#### “After stochastic shuffling, 90% of dataset is used as the training set, while the other 10% reviews are used as the testing set.”

In [14]:
import json
import random

input_file = "Openrice_Cantonese.json"  # 转换后的 JSON 文件路径
train_file = "Openrice_train.json"  # 训练集文件路径
test_file = "Openrice_test.json"  # 测试集文件路径

with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# shuffle data randomly
random.shuffle(data)

# Split training and test set at 90% and 10%
train_size = int(0.9 * len(data))  # 90% used as training set
train_data = data[:train_size]
test_data = data[train_size:]

with open(train_file, "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)

with open(test_file, "w", encoding="utf-8") as f:
    json.dump(test_data, f, ensure_ascii=False, indent=2)

print(f"训练集已保存为：{train_file}")
print(f"测试集已保存为：{test_file}")


训练集已保存为：Openrice_train.json
测试集已保存为：Openrice_test.json


### 3. Before Dataset preprocessing, the label is changed to have positive and negative emotions (1,2 is negative; 3,4,5 are heads)
### 3. 在Dataset preprocessing前，先将label改为正面情绪和负面情绪两种（1，2为负面；3，4，5为正面）

In [8]:
import json

def relabel_data(input_file, output_file):
    # 读取 JSON 数据
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # 重新标注 label
    for entry in data:
        if entry["label"] in [1, 2]:
            entry["label"] = "负面"
        elif entry["label"] in [3, 4, 5]:
            entry["label"] = "正面"

    # 保存修改后的数据
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"数据已处理并保存为 {output_file}")

# 执行代码
input_file = "Openrice_train.json"
output_file = "Openrice_train_labeled.json"
input_file = "Openrice_test.json"
output_file = "Openrice_test_labeled.json"

relabel_data(input_file, output_file)

数据已处理并保存为 Openrice_test_labeled.json


### 4. Adjust sample balance (training set only)
#### ① First, count the number of positive and negative samples
#### ② The number of additional positive samples is randomly removed, so that the number of positive samples and negative samples are the same
#### Satisfies: the total number of samples is unchanged; Positive and negative sample equalization; Reduce positive samples (random sampling)

### 4. 调整样本平衡 (仅针对训练集)
#### ①先计算正面和负面的样本数量
#### ②随机去除多出的正面样本数量，令正面样本数和负面样本数相同
#### 满足：总样本数不变； 正负面样本均衡； 减少正面样本（随机采样） 

In [1]:
import json

def count_labels(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Count positive and negative samples
    positive_count = sum(1 for entry in data if entry["label"] == "正面")
    negative_count = sum(1 for entry in data if entry["label"] == "负面")

    print(f"正面样本数量: {positive_count}")
    print(f"负面样本数量: {negative_count}")

file_path = "Openrice_train_labeled.json"
# file_path = "Openrice_train_balanced2.json"

count_labels(file_path)

正面样本数量: 49485
负面样本数量: 5964


In [5]:
import json
import random
import re
from tqdm import tqdm

input_file = "Openrice_train_labeled.json"
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# distinguish between positive and negative examples
positive_samples = [d for d in data if d["label"] == "正面"]  # positive
negative_samples = [d for d in data if d["label"] == "负面"]  # negative

print(f"原始数据集: 正面 = {len(positive_samples)}，负面 = {len(negative_samples)}")

# Adjust the sample balance: reduce the number of positive samples (random samples)
target_size = len(negative_samples) * 2
positive_samples = random.sample(positive_samples, target_size - len(negative_samples))

final_data = positive_samples + negative_samples

print(f"新数据集: {len(final_data)} 条数据（正负平衡）")

output_file = "Openrice_train_balanced2.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(final_data, f, ensure_ascii=False, indent=4)

print(f"增强数据已保存至 {output_file}！")

📊 原始数据集: 正面 = 49485，负面 = 5964
✅ 新数据集: 11928 条数据（正负平衡）
✅ 增强数据已保存至 Openrice_train_balanced2.json！


### 5. Dataset preprocessing

#### 删除特殊字符，并删除多余的空格。
#### 删除非粤语评论。在数据集中包含非粤语评论，例如英语或其他语言的评论，可能会干扰模型的训练。
#### 和“评论的长度限制为250个字符”
#### 
#### Remove special characters, and remove excess spaces.
#### Delete non-Cantonese comments. The inclusion of non-Cantonese comments in the dataset, such as in English or other languages, may interfere with the training of the model.
#### And "The length of comment is limited to 250 characters."

#### 1. no emojis 去除表情

In [8]:
import json
import re

# 输入和输出文件路径
input_train_file = "Openrice_train_balanced2.json"
# input_test_file = "Openrice_test_labeled.json" 

output_train_file = "Openrice_train_cleaned_no_emojis.json"  # 清理后的训练集文件
# output_test_file = "Openrice_test_cleaned_no_emojis.json"  # 清理后的测试集文件

# 定义清理函数
def clean_text_no_emojis(text):
    # 移除 <sssss> 或其他无效占位符
    text = re.sub(r'<sssss>', '', text)
    # 移除 emoji 和无效特殊字符
    text = re.sub(r'[^\w\s.,!?]', '', text)
    # 去除多余空格
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 定义语言检测函数（过滤非粤语评论）
def is_cantonese(text):
    # 判断评论中英文字母比例
    num_english = sum(1 for char in text if char.isalpha() and ord(char) < 128)
    return num_english / len(text) < 0.5

# 定义预处理函数
def preprocess_data(input_file, output_file):
    # 读取 JSON 数据
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # 清理和过滤数据
    cleaned_data = [
        {"text": clean_text_no_emojis(item["text"]), "label": item["label"]}
        for item in data
        if is_cantonese(item["text"]) and len(item["text"]) > 5 and len(item["text"]) <= 250
    ]

    # 保存清理后的数据
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

    print(f"预处理完成，数据集已保存为：{output_file}")

# 对训练集和测试集分别进行预处理
preprocess_data(input_train_file, output_train_file)
# preprocess_data(input_test_file, output_test_file)

预处理完成，数据集已保存为：Openrice_train_cleaned_no_emojis.json


#### 2. with emojis 保留表情

In [2]:
!pip install emoji



In [3]:
import json
import emoji
import re

input_train_file = "Openrice_train_balanced2.json"
# input_test_file = "Openrice_test_labeled.json" 

output_train_file = "Openrice_train_cleaned_with_emojis.json"  # Cleaned training set file
# output_test_file = "Openrice_test_cleaned_with_emojis.json"  

def clean_text_with_emojis(text):
    # Remove <sssss> or other invalid placeholders
    text = re.sub(r'<sssss>', '', text)
    # Keep emoji, text, punctuation, and whitespace
    text = ''.join(char for char in text if emoji.is_emoji(char) or char.isalnum() or char.isspace() or char in ".,!?")
    # Remove extra white space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Define language detection function (filter non-Cantonese reviews)
def is_cantonese(text):
    # Determine the ratio of letters in Chinese to English in a comment
    num_english = sum(1 for char in text if char.isalpha() and ord(char) < 128)
    return num_english / len(text) < 0.5

def preprocess_data(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Clean and filter data
    cleaned_data = [
        {"text": clean_text_with_emojis(item["text"]), "label": item["label"]}
        for item in data
        if is_cantonese(item["text"]) and len(item["text"]) > 5 and len(item["text"]) <= 250
    ]

    # save the cleaned data
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

    print(f"预处理完成，数据集已保存为：{output_file}")

# Preprocess the training and test data separately
preprocess_data(input_train_file, output_train_file)
# preprocess_data(input_test_file, output_test_file)

预处理完成，数据集已保存为：Openrice_train_cleaned_with_emojis.json


#### The text containing emojis is further divided into two processing methods: 
#### 1. Retain the emojis themselves.  （No changes to the original code）
#### 2. Translate emojis into text descriptions (e.g. "😉" into "眨眼咁笑")  （Additional code is required to handle emojis in the text）
#### 对于包含表情符号的文本，进一步分为两种处理方式：
#### 1. 保留表情符号本身。（原始代码没有改变）
#### 2. 将表情符号翻译成文字描述(例如：将“😉”转换为"眨眼咁笑")（需要额外代码来处理文本中的表情符号）

In [10]:
import json
import re

input_train_file = "Openrice_train_cleaned_with_emojis.json"  
# input_test_file = "Openrice_test_cleaned_with_emojis.json"  
output_train_file = "Openrice_train_cleaned_with_emojis_texts.json"  
# output_test_file = "Openrice_test_cleaned_with_emojis_texts.json"  

# Define a dictionary of emoji to Cantonese descriptions
emoji_to_cantonese = {
  "🙂": "微微笑",
  "😊": "眼微笑，面紅紅",
  "🤗": "笑住開手抱抱",
  "😇": "戴住光環笑",
  "😉": "眨眼咁笑",
  "🙃": "反轉咗嘅樣",
  "😀": "露齒大笑",
  "😃": "大眼大笑露齒",
  "😄": "眼笑齒露",
  "😁": "眉飛色舞",
  "😆": "咪埋眼齒露",
  "😅": "咪埋眼齒露出汗",
  "🤣": "滾地大笑到抽筋",
  "😂": "開心到喊",
  "🥲": "邊笑邊喊",
  "🤤": "流晒口水",
  "😌": "輕鬆晒",
  "🤓": "書蟲樣",
  "😎": "戴晒墨鏡笑",
  "🤠": "戴住牛仔帽齒露笑",
  "🥳": "戴派對帽撒花",
  "☺️": "微微笑",
  "🥰": "甜到漏",
  "😍": "眼仔發心心",
  "🤩": "眼仔發星星",
  "😗": "嘟嘴錫人",
  "😙": "咪埋眼嘟嘴錫人",
  "😚": "閉埋眼嘟嘴錫人",
  "😘": "飛個香吻",
  "😳": "面紅晒",
  "🥺": "楚楚可憐",
  "😕": "好疑惑",
  "🙁": "有啲唔開心",
  "☹️": "唔開心",
  "😟": "憂心忡忡",
  "😮": "張開嘴巴好驚訝",
  "😯": "眉飛色舞好驚訝",
  "😦": "驚到沮喪晒",
  "😧": "皺埋眉驚沮晒",
  "😲": "嘩咁大反應",
  "😨": "驚到震",
  "😰": "冒冷汗",
  "😥": "失望又鬆一口氣",
  "😢": "喊緊",
  "😱": "尖叫晒",
  "😖": "好困惑",
  "😣": "愁眉苦臉",
  "😞": "失望透頂",
  "😓": "流冷汗",
  "😩": "煩到爆",
  "😫": "煩到攰晒",
  "😶": "無話可講",
  "😐": "一般般",
  "😑": "面癱樣",
  "😒": "唔屑樣",
  "😏": "奸笑",
  "🤨": "單邊挑眉",
  "🙄": "翻白眼",
  "😬": "尷尬到爆",
  "🤐": "拉鏈封晒嘴",
  "🤥": "講大話",
  "🤫": "噓，靜啲",
  "🤭": "掩住嘴笑",
  "🤔": "諗緊計",
  "🧐": "戴單片眼鏡好嚴肅",
  "😜": "眨眼伸舌頭",
  "😝": "咪埋眼伸舌頭",
  "😛": "伸舌頭",
  "😋": "舔舔舌頭",
  "🤪": "癲癲哋",
  "🤑": "眼仔發錢錢",
  "😠": "嬲嬲豬",
  "😡": "紅晒面嬲爆",
  "🤬": "嬲到爆粗",
  "😤": "噴氣嬲",
  "😈": "壞笑戴角",
  "👿": "嬲到現形",
  "😴": "瞓著咗",
  "😪": "打瞌睡",
  "😵": "頭暈",
  "😵‍💫": "頭暈目眩",
  "🤯": "腦袋爆晒炸",
  "🤒": "發緊燒",
  "🤕": "頭部撞傷",
  "🤢": "作嘔",
  "🤮": "嘔晒",
  "🤧": "打晒乞嗤",
  "🥵": "熱到暈",
  "🥶": "凍到震",
  "😷": "戴住口罩", 
  "😭": "喊",
  "😔": "心唔舒服",    
  "❤️": "鍾意",
  "💔": "心痛",    
  "💕": "好甜蜜",    
  "🔥": "火爆",
  "👍": "讚好", 
  "👌": "好正",
  "🍴": "食好味",
  "⭐": "星星",
  "🐮": "牛肉",
  "🐷": "豬肉",
  "🐔": "雞肉",
  "🐑": "羊肉",
  "🐟": "魚",
  "🦐": "蝦",
  "🦀": "蟹",
  "🦌": "鹿肉",
  "🦆": "鴨肉"
}

# Conversion function: Convert emojis to Cantonese descriptions
def convert_emojis_to_cantonese(text):
    # Replace emojis in the text with their Cantonese counterparts using a dictionary
    for emoji_char, cantonese_word in emoji_to_cantonese.items():
        text = text.replace(emoji_char, cantonese_word)
    return text

def preprocess_emojis_to_cantonese(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Convert emojis in each review
    converted_data = [
        {"text": convert_emojis_to_cantonese(item["text"]), "label": item["label"]}
        for item in data
    ]

    # save the transformed data
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(converted_data, f, ensure_ascii=False, indent=2)

    print(f"emoji 转粤语文本的转换完成，数据集已保存为：{output_file}")

# Convert training and test data
preprocess_emojis_to_cantonese(input_train_file, output_train_file)
# preprocess_emojis_to_cantonese(input_test_file, output_test_file)

emoji 转粤语文本的转换完成，数据集已保存为：Openrice_train_cleaned_with_emojis_texts.json


### 6. Convert the data set to Alpaca format

In [11]:
import json

input_train_no_emojis = "Openrice_train_cleaned_no_emojis.json"
# input_test_no_emojis = "Openrice_test_cleaned_no_emojis.json"
input_train_with_emojis = "Openrice_train_cleaned_with_emojis.json"
# input_test_with_emojis = "Openrice_test_cleaned_with_emojis.json"
input_train_with_emojis_texts = "Openrice_train_cleaned_with_emojis_texts.json" 
# input_test_with_emojis_texts = "Openrice_test_cleaned_with_emojis_texts.json"  

# Output file path (after converting to Alpaca)
output_train_no_emojis = "Openrice_train_alpaca_no_emojis.json"
# output_test_no_emojis = "Openrice_test_alpaca_no_emojis.json"
output_train_with_emojis = "Openrice_train_alpaca_with_emojis.json"
# output_test_with_emojis = "Openrice_test_alpaca_with_emojis.json"
output_train_with_emojis_texts = "Openrice_train_alpaca_with_emojis_texts.json" 
# output_test_with_emojis_texts = "Openrice_test_alpaca_with_emojis_texts.json"

def convert_to_alpaca_format(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    alpaca_data = [
        {
            "instruction": "你是一个经过训练的人工智能，可以分析文本输入，并根据上下文将其分类为最合适的情感类型。你的任务是仔细评估输入并确定输入属于哪种情绪。从以下情绪中选择：1. “正面”，2.“负面”。只提供情绪的名称作为输出，不提供额外的解释。",
            "input": item["text"],  # Comment text as input
            "output": str(item["label"])  # Emotion labels as output
        }
        for item in data
    ]

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(alpaca_data, f, ensure_ascii=False, indent=2)

    print(f"转换完成，Alpaca 格式数据已保存为：{output_file}")

convert_to_alpaca_format(input_train_with_emojis, output_train_with_emojis)
# convert_to_alpaca_format(input_test_with_emojis, output_test_with_emojis)
convert_to_alpaca_format(input_train_no_emojis, output_train_no_emojis)
# convert_to_alpaca_format(input_test_no_emojis, output_test_no_emojis)
convert_to_alpaca_format(input_train_with_emojis_texts, output_train_with_emojis_texts)
# convert_to_alpaca_format(input_test_with_emojis_texts, output_test_with_emojis_texts)

转换完成，Alpaca 格式数据已保存为：Openrice_train_alpaca_with_emojis.json
转换完成，Alpaca 格式数据已保存为：Openrice_train_alpaca_no_emojis.json
转换完成，Alpaca 格式数据已保存为：Openrice_train_alpaca_with_emojis_texts.json
