In [5]:
import random
import os

In [6]:
def load_data(filepath):
    """Read a token-per-line tagged corpus and group it into sentences.

    Every non-blank line must hold exactly one ``token<TAB>label`` pair.
    Leading/trailing whitespace is stripped from each line and blank lines
    are skipped. A sentence is closed right after a token equal to '。';
    tokens remaining after the last '。' form one final sentence.

    Args:
        filepath: Path of the text file to read. The file is decoded as
            UTF-8; a leading byte-order mark, if present, is ignored.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists: ``texts[k]`` is
        the list of tokens of sentence ``k`` and ``labels[k]`` is the list of
        label strings for those tokens.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields. The message names the file and the
            1-based line number.
    """
    texts = []
    labels = []

    current_text = []
    current_label = []

    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise end up glued onto the first token.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        # Stream the file line by line instead of loading it all up front.
        for line_no, line in enumerate(f, start=1):
            line = line.strip()  # drop surrounding whitespace and the newline
            if not line:
                continue

            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f"{filepath}: line {line_no}: expected 'token<TAB>label', "
                    f"got {line!r}"
                )
            word, label = fields
            current_text.append(word)
            current_label.append(label)

            if word == '。':  # the full stop closes the current sentence
                texts.append(current_text)
                labels.append(current_label)
                current_text = []
                current_label = []

    # Keep a trailing sentence that is not terminated by '。'.
    if current_text:
        texts.append(current_text)
        labels.append(current_label)

    return texts, labels

def numericalize_labels(labels, label_map):
    """Translate label strings into their numeric ids, sentence by sentence.

    Args:
        labels: Iterable of sentences, each an iterable of label keys.
        label_map: Mapping from label key to the value that replaces it.

    Returns:
        A list with one inner list per sentence, holding ``label_map[label]``
        for every label of that sentence in the original order.

    Raises:
        KeyError: If a label is not present in ``label_map``.
    """
    return [
        [label_map[tag] for tag in sentence_tags]
        for sentence_tags in labels
    ]

def create_samples(texts, numericalized_labels, max_len=128, min_overlap_rate=0.0, max_overlap_rate=0.5, rng=None):
    """Pack consecutive sentences into samples with randomly sized overlap.

    Starting at the first sentence, whole sentences are appended to the
    current sample as long as the total token count stays within
    ``max_len``. The next sample then starts a few sentences *before* the
    point where the previous one stopped; how many is the number of sentences
    in the previous sample times a rate drawn uniformly from
    ``[min_overlap_rate, max_overlap_rate]``, truncated to an integer. The
    start always advances by at least one sentence, so the loop terminates.

    Generation stops as soon as a sample reaches the last sentence. Stepping
    back from there could only yield samples that are pure suffixes of the
    one just emitted.

    NOTE: a single sentence longer than ``max_len`` never fits into a sample
    and is skipped without any error.

    Args:
        texts: List of sentences, each a list of token strings.
        numericalized_labels: List parallel to ``texts``; element ``k`` holds
            one label per token of ``texts[k]``.
        max_len: Maximum number of tokens per sample.
        min_overlap_rate: Lower bound of the random overlap rate.
        max_overlap_rate: Upper bound of the random overlap rate.
        rng: Optional object providing ``uniform(a, b)`` (for example a
            ``random.Random`` instance) to make the overlaps reproducible.
            Defaults to the global ``random`` module.

    Returns:
        A list of ``(text, labels)`` tuples where ``text`` is the tokens of
        the packed sentences joined without separator and ``labels`` is the
        concatenation of their label lists.
    """
    rng = random if rng is None else rng
    total = len(texts)
    samples = []

    i = 0
    while i < total:
        start_index = i
        sample_text = []
        sample_label = []
        current_len = 0

        # Greedily take whole sentences while they still fit the budget.
        while current_len < max_len and i < total:
            sentence_len = len(texts[i])
            if current_len + sentence_len > max_len:
                break
            sample_text.extend(texts[i])
            sample_label.extend(numericalized_labels[i])
            current_len += sentence_len
            i += 1

        if sample_text:
            samples.append(("".join(sample_text), sample_label))

        # The corpus is exhausted: stepping back for overlap would only
        # re-emit tails of the sample that was just produced.
        if i >= total:
            break

        # Step back by a random share of the sentences just consumed, but
        # always move at least one sentence past the previous start.
        consumed = i - start_index
        overlap_rate = rng.uniform(min_overlap_rate, max_overlap_rate)
        overlap = int(consumed * overlap_rate)
        i = max(i - overlap, start_index + 1)

    return samples

In [7]:
# filepath = '../data/EvaHan2025_traingdata/trainset_A.txt'
# texts, labels = load_data(filepath)

# label_map = {
#     "O": 0,
#     "B-NR": 1,
#     "M-NR": 2,
#     "E-NR": 3,
#     "S-NR": 4,
#     "B-NS": 5,
#     "M-NS": 6,
#     "E-NS": 7,
#     "S-NS": 8,
#     "B-NB": 9,
#     "M-NB": 10,
#     "E-NB": 11,
#     "S-NB": 12,
#     "B-NO": 13,
#     "M-NO": 14,
#     "E-NO": 15,
#     "S-NO": 16,
#     "B-NG": 17,
#     "M-NG": 18,
#     "E-NG": 19,
#     "S-NG": 20,
#     "B-T": 21,
#     "M-T": 22,
#     "E-T": 23,
#     "S-T": 24,
# }
# numericalized_labels = numericalize_labels(labels, label_map)

# samples = create_samples(texts, numericalized_labels)

# for text, label in samples:
#     print(text)
#     print(label)


In [8]:
# --- Configuration ---------------------------------------------------------
SEED = 42  # arbitrary fixed value so overlaps and the split are repeatable
TRAIN_FRACTION = 0.9  # share of the samples written to the training files

filepath = '../data/EvaHan2025_traingdata/trainset_A.txt'

text_output_path = '../data/text_A.txt'  # training texts, one sample per line
label_output_path = '../data/label_A.txt'  # training labels, one sample per line
text_test_output_path = '../data/text_A_test.txt'  # test texts
label_test_output_path = '../data/label_A_test.txt'  # test labels

# Label string -> integer id used in the written label files.
label_map = {
    "O": 0,
    "B-NR": 1,
    "M-NR": 2,
    "E-NR": 3,
    "S-NR": 4,
    "B-NS": 5,
    "M-NS": 6,
    "E-NS": 7,
    "S-NS": 8,
    "B-NB": 9,
    "M-NB": 10,
    "E-NB": 11,
    "S-NB": 12,
    "B-NO": 13,
    "M-NO": 14,
    "E-NO": 15,
    "S-NO": 16,
    "B-NG": 17,
    "M-NG": 18,
    "E-NG": 19,
    "S-NG": 20,
    "B-T": 21,
    "M-T": 22,
    "E-T": 23,
    "S-T": 24,
}


def write_samples(samples_to_write, text_path, label_path):
    """Write samples to a pair of parallel files, one sample per line.

    Line ``k`` of ``text_path`` holds the text of sample ``k``; line ``k`` of
    ``label_path`` holds ``str()`` of its label list.
    """
    with open(text_path, 'w', encoding='utf-8') as text_file, \
            open(label_path, 'w', encoding='utf-8') as label_file:
        for text, label in samples_to_write:
            text_file.write(text + '\n')
            label_file.write(str(label) + '\n')


# --- Build samples ---------------------------------------------------------
# Seed before any random call: both the sample overlaps and the shuffle below
# draw from the global `random` generator.
random.seed(SEED)

texts, labels = load_data(filepath)
numericalized_labels = numericalize_labels(labels, label_map)
samples = create_samples(texts, numericalized_labels)

# --- Train / test split ----------------------------------------------------
# NOTE(review): create_samples lets neighbouring samples share sentences
# (default max_overlap_rate=0.5). Shuffling and splitting at sample level can
# therefore place the same sentence in both train and test files. Confirm
# this is acceptable for evaluation, or split before building samples.
split_point = int(len(samples) * TRAIN_FRACTION)
random.shuffle(samples)
train_samples = samples[:split_point]
test_samples = samples[split_point:]

# --- Write outputs ---------------------------------------------------------
write_samples(train_samples, text_output_path, label_output_path)
write_samples(test_samples, text_test_output_path, label_test_output_path)