In [28]:
import random
import os

In [29]:
def load_data(filepath):
    """Read a tab-separated token/label file and group it into sentences.

    Each non-blank line is expected to hold exactly ``token<TAB>label``.
    A sentence is closed as soon as the token ``'。'`` is seen (the full
    stop is kept as the last token of that sentence). Tokens left over at
    the end of the file form a final sentence of their own.

    Args:
        filepath: Path of the UTF-8 encoded input file.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists; ``texts[k]`` is
        the list of tokens of sentence ``k`` and ``labels[k]`` the list of
        label strings for those tokens.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    texts, labels = [], []
    sentence_tokens, sentence_tags = [], []

    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # Whitespace is removed on both sides, so blank lines vanish here.
            row = raw_line.strip()
            if not row:
                continue

            token, tag = row.split('\t')
            sentence_tokens.append(token)
            sentence_tags.append(tag)

            # Only the full stop terminates a sentence.
            if token == '。':
                texts.append(sentence_tokens)
                labels.append(sentence_tags)
                sentence_tokens, sentence_tags = [], []

    # Keep a trailing sentence that was not closed by a full stop.
    if sentence_tokens:
        texts.append(sentence_tokens)
        labels.append(sentence_tags)

    return texts, labels

def numericalize_labels(labels, label_map):
    """Map every label string to its integer id.

    Args:
        labels: List of per-sentence label lists.
        label_map: Mapping from label string to integer id.

    Returns:
        A list with the same nesting as ``labels`` where each label has
        been replaced by ``label_map[label]``.

    Raises:
        KeyError: If a label is missing from ``label_map``.
    """
    return [
        [label_map[tag] for tag in sentence_tags]
        for sentence_tags in labels
    ]

def no_samples(texts, numericalized_labels, max_len=510):
    """Create one sample per sentence, truncated to ``max_len`` characters.

    Every sentence becomes a ``(text, labels)`` pair where ``text`` is the
    concatenation of its tokens. A sentence whose concatenated text is
    longer than ``max_len`` is cut on a token boundary: only the leading
    tokens that fit completely are kept, together with exactly their
    labels, so the text and the label list always describe the same
    tokens. (Cutting the text at a raw character offset could leave part
    of a multi-character token in the text while its label was dropped.)

    Args:
        texts: List of sentences, each a list of token strings.
        numericalized_labels: List of per-sentence label-id lists,
            parallel to ``texts``.
        max_len: Maximum number of characters of text per sample.

    Returns:
        A list of ``(text, labels)`` tuples, one per sentence.
    """
    samples = []

    for tokens, token_labels in zip(texts, numericalized_labels):
        # Count how many leading tokens fit completely into max_len characters.
        kept = 0
        used_chars = 0
        for token in tokens:
            if used_chars + len(token) > max_len:
                break
            used_chars += len(token)
            kept += 1

        if kept == len(tokens):
            # The whole sentence fits; pass the label list through unchanged.
            samples.append(("".join(tokens), token_labels))
        else:
            samples.append(("".join(tokens[:kept]), token_labels[:kept]))

    return samples

def create_samples(texts, numericalized_labels, max_len=512, min_overlap_rate=0.0, max_overlap_rate=0.5):
    """Pack consecutive sentences into samples with a random sentence overlap.

    Starting at the current sentence, whole sentences are appended to a
    window for as long as the window stays within ``max_len`` tokens
    (length is counted in list elements, not characters). A sentence that
    does not fit ends the window; a sentence that is by itself longer than
    ``max_len`` therefore produces no sample and is skipped.

    After each window an overlap rate is drawn uniformly from
    ``[min_overlap_rate, max_overlap_rate]`` and the next window starts
    that fraction of the consumed sentences earlier, but always at least
    one sentence after the previous start so the loop terminates.

    Args:
        texts: List of sentences, each a list of token strings.
        numericalized_labels: List of per-sentence label-id lists,
            parallel to ``texts``.
        max_len: Maximum number of tokens per sample.
        min_overlap_rate: Lower bound of the random overlap rate.
        max_overlap_rate: Upper bound of the random overlap rate.

    Returns:
        A list of ``(text, labels)`` tuples where ``text`` is the
        concatenation of the window's tokens.
    """
    samples = []
    sentence_count = len(texts)

    cursor = 0
    while cursor < sentence_count:
        window_start = cursor
        window_tokens = []
        window_tags = []

        # Greedily take whole sentences while they still fit in the window.
        while len(window_tokens) < max_len and cursor < sentence_count:
            sentence = texts[cursor]
            if len(window_tokens) + len(sentence) > max_len:
                break
            window_tokens.extend(sentence)
            window_tags.extend(numericalized_labels[cursor])
            cursor += 1

        if window_tokens:
            samples.append(("".join(window_tokens), window_tags))

        # Step back by a random share of the sentences just consumed.
        overlap_rate = random.uniform(min_overlap_rate, max_overlap_rate)
        consumed = cursor - window_start
        overlap = int(consumed * overlap_rate)
        # Always move forward by at least one sentence.
        cursor = max(window_start + (consumed - overlap), window_start + 1)

    return samples

In [30]:
# Build the text/label files for training set C.
# Load the tagged file and group its tokens and labels into sentences.
filepath = '../data/EvaHan2025_traingdata/trainset_C.txt'
texts, labels = load_data(filepath)

# Alternative label maps, kept commented out. Presumably they belong to the
# other training sets -- TODO confirm. Only the active map further below is used.
# label_map = {
#     "O": 0,
#     "B-NR": 1,
#     "M-NR": 2,
#     "E-NR": 3,
#     "S-NR": 4,
#     "B-NS": 5,
#     "M-NS": 6,
#     "E-NS": 7,
#     "S-NS": 8,
#     "B-NB": 9,
#     "M-NB": 10,
#     "E-NB": 11,
#     "S-NB": 12,
#     "B-NO": 13,
#     "M-NO": 14,
#     "E-NO": 15,
#     "S-NO": 16,
#     "B-NG": 17,
#     "M-NG": 18,
#     "E-NG": 19,
#     "S-NG": 20,
#     "B-T": 21,
#     "M-T": 22,
#     "E-T": 23,
#     "S-T": 24,
# }

# label_map = {
#     "O": 0,
#     "B-NR": 1,
#     "M-NR": 2,
#     "E-NR": 3,
#     "S-NR": 4,
#     "B-NS": 5,
#     "M-NS": 6,
#     "E-NS": 7,
#     "S-NS": 8,
#     "B-T": 9,
#     "M-T": 10,
#     "E-T": 11,
#     "S-T": 12,
# }

# Active mapping from label string to integer id.
label_map = {
    "O": 0,
    "B-ZD": 1,
    "M-ZD": 2,
    "E-ZD": 3,
    "S-ZD": 4,
    "B-ZZ": 5,
    "M-ZZ": 6,
    "E-ZZ": 7,
    "S-ZZ": 8,
    "B-ZF": 9,
    "M-ZF": 10,
    "E-ZF": 11,
    "S-ZF": 12,
    "B-ZP": 13,
    "M-ZP": 14,
    "E-ZP": 15,
    "S-ZP": 16,
    "B-ZS": 17,
    "M-ZS": 18,
    "E-ZS": 19,
    "S-ZS": 20,
    "B-ZA": 21,
    "M-ZA": 22,
    "E-ZA": 23,
    "S-ZA": 24,
}

# Convert the label strings to ids and build one sample per sentence.
# The overlapping-window variant (create_samples) is left disabled.
numericalized_labels = numericalize_labels(labels, label_map)
# samples = create_samples(texts, numericalized_labels)
samples = no_samples(texts, numericalized_labels)

# Compute the split point. With a factor of 1 it equals len(samples), so
# after shuffling every sample goes to train_samples and test_samples is empty.
split_point = int(len(samples) * 1)
random.shuffle(samples)
train_samples = samples[:split_point]
test_samples = samples[split_point:]

text_output_path = '../data/text_C.txt'  # path of the training text file
label_output_path = '../data/label_C.txt'  # path of the training label file
text_test_output_path = '../data/text_C_test.txt' # path of the test text file
label_test_output_path = '../data/label_C_test.txt' # path of the test label file

# Write the test split. Both files are opened in 'w' mode, so they are
# created (or truncated) even when test_samples is empty.
with open(text_test_output_path, 'w', encoding='utf-8') as text_test_file, \
    open(label_test_output_path, 'w', encoding='utf-8') as label_test_file:
    for text, label in test_samples:
        text_test_file.write(text + '\n')
        label_test_file.write(str(label) + '\n')

# Write the training split; line k of the text file pairs with line k of the label file.
with open(text_output_path, 'w', encoding='utf-8') as text_file, \
        open(label_output_path, 'w', encoding='utf-8') as label_file:
    for text, label in train_samples:
        text_file.write(text + '\n')  # write the text, one sample per line
        label_file.write(str(label) + '\n')  # write the str() of the label-id list, one sample per line

In [18]:
def load_sentences(filepath):
    """Read a file and split its content into sentences at the full stop.

    The whole file is split on ``'。'``. Each piece is stripped of leading
    and trailing whitespace (whitespace inside a piece is kept) and empty
    pieces are discarded. Every piece gets its full stop back, except the
    very last piece of the file, which was not followed by one and is
    returned as-is.

    Args:
        filepath: Path of the UTF-8 encoded input file.

    Returns:
        The list of sentence strings in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    pieces = content.split('。')
    tail_index = len(pieces) - 1

    result_sentences = []
    for position, piece in enumerate(pieces):
        piece = piece.strip()
        if not piece:
            continue
        # Only the final piece had no full stop after it in the source text.
        result_sentences.append(piece if position == tail_index else piece + '。')

    return result_sentences

def save_sentences(sentences, output_path):
    """Write the sentences to ``output_path``, one per line.

    The file is opened for writing as UTF-8 (replacing existing content)
    and each sentence is followed by a newline.

    Args:
        sentences: Iterable of sentence strings.
        output_path: Path of the file to write.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(sentence + '\n' for sentence in sentences)

# Input and output file paths
input_file_path = '../data/TestSet/raw/testset_B.txt'  # path of the raw input file
output_file_path = '../data/TestSet/test_B.txt'  # path of the output file

# Load the raw text and split it into sentences at the full stop
sentences = load_sentences(input_file_path)

# Save the sentences to the output file, one per line
save_sentences(sentences, output_file_path)