In [7]:
import chardet
import os

def detect_encoding_chardet(file_path):
    """
    Detect the text encoding of a file with the chardet library.

    The file is read in binary mode in one go and handed to
    ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        ``(encoding, confidence)`` taken from chardet's result dict.
        If anything raises along the way, ``(None, "错误: <message>")`` is
        returned instead, i.e. the second element is then a string rather
        than a number.
    """
    try:
        with open(file_path, 'rb') as handle:
            detection = chardet.detect(handle.read())
            return detection['encoding'], detection['confidence']
    except Exception as exc:
        # Best-effort helper: report the failure through the return value.
        return None, f"错误: {exc!s}"

# Usage example
file_path = "NER-test.txt"
encoding, confidence = detect_encoding_chardet(file_path)
if isinstance(confidence, (int, float)):
    print(f"文件编码: {encoding}, 置信度: {confidence:.2f}")
else:
    # On failure detect_encoding_chardet returns (None, "错误: ..."); applying
    # the ".2f" format spec to that string would raise ValueError, so the
    # error message is printed as-is instead.
    print(confidence)

文件编码: utf-8, 置信度: 0.99


In [14]:
import chardet
import os

def convert_to_utf8(file_path, output_path=None):
    """
    Detect a file's encoding with chardet and rewrite its text as UTF-8.

    The encoding reported by chardet is tried first. If chardet reports no
    encoding, or reading with the reported encoding fails, a short list of
    common encodings is tried in order. As a last resort the file is read as
    UTF-8 with undecodable bytes dropped.

    Args:
        file_path: Path of the file to convert.
        output_path: Where to write the UTF-8 text. Defaults to ``file_path``
            (in-place conversion; the source is fully read before the
            destination is opened for writing).

    Returns:
        The name of the encoding that was used to read the file, or the
        string ``'utf-8 (errors ignored)'`` when the lossy fallback was used.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    if output_path is None:
        output_path = file_path  # convert in place

    # Detect the original encoding from the raw bytes.
    with open(file_path, 'rb') as f:
        encoding_result = chardet.detect(f.read())

    original_encoding = encoding_result['encoding']
    confidence = encoding_result['confidence']

    print(f"检测到原始编码: {original_encoding} (置信度: {confidence:.2f})")

    # First attempt: the detected encoding. When chardet reports no encoding
    # we go straight to the fallbacks. (Raising UnicodeDecodeError with a
    # single argument, as was done before, itself fails with TypeError
    # because that exception's constructor takes five arguments.)
    content = None
    if original_encoding:
        try:
            with open(file_path, 'r', encoding=original_encoding) as f:
                content = f.read()
        except (UnicodeDecodeError, LookupError):
            # Wrong guess, or a codec name Python does not know.
            content = None

    if content is None:
        print("检测编码读取失败，尝试常见编码...")
        # Second attempt: common encodings, first one that decodes wins.
        common_encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030']

        for enc in common_encodings:
            try:
                with open(file_path, 'r', encoding=enc) as f:
                    content = f.read()
                original_encoding = enc
                print(f"成功使用 {enc} 编码读取")
                break
            except UnicodeDecodeError:
                continue

        if content is None:
            # Last resort: lossy read that silently drops undecodable bytes.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            original_encoding = 'utf-8 (errors ignored)'

    # Write the text back out as UTF-8.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"成功转换为UTF-8格式: {output_path}")
    return original_encoding

# Usage example: read NER-train-GBK.txt and write the UTF-8 result to NER-train.txt
file_path = "NER-train-GBK.txt"
original_encoding = convert_to_utf8(file_path,output_path="NER-train.txt")
print(f"原始编码: {original_encoding} → 转换后: UTF-8")

检测到原始编码: GB2312 (置信度: 0.99)
检测编码读取失败，尝试常见编码...
成功使用 gbk 编码读取
成功转换为UTF-8格式: NER-train.txt
原始编码: gbk → 转换后: UTF-8


In [17]:
def view_first_lines(file_path, num_lines=10):
    """
    Print the first lines of a UTF-8 text file, each prefixed by its number.

    Args:
        file_path: Path of the file to preview (opened as UTF-8).
        num_lines: Maximum number of lines to print.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        for line_no, text in enumerate(handle, start=1):
            if line_no > num_lines:
                break
            # Lines keep their own newline, so print must not add another.
            print(f"{line_no}: {text}", end='')

# Usage example: preview the first 10 lines of NER-train.txt
view_first_lines('NER-train.txt', 10)

1: 迈 N
2: 向 N
3: 充 N
4: 满 N
5: 希 N
6: 望 N
7: 的 N
8: 新 N
9: 世 N
10: 纪 N


In [20]:
def _bmes_tag(label, next_label, use_single_tag=False):
    """Map one BIN tag to its BMES tag, given the tag on the following line (or None)."""
    if label == 'N':
        return 'O'

    prefix, sep, entity_type = label.partition('-')
    if not sep or prefix not in ('B', 'I'):
        # Anything that is not N / B-* / I-* (e.g. O) passes through unchanged.
        return label

    # The entity continues only if the next token is an I- tag of the SAME type.
    continues = next_label == f"I-{entity_type}"

    if prefix == 'B':
        if use_single_tag and not continues:
            return f"S-{entity_type}"
        return label

    return f"{'M' if continues else 'E'}-{entity_type}"


def convert_bin_to_bmes(input_file, output_file, use_single_tag=False):
    """
    Convert a character-per-line BIN-tagged file to BMES tags.

    Each non-blank input line is expected to hold ``<char> <tag>`` separated
    by whitespace; blank lines are kept as sentence separators.

    Conversion rules:
        * ``B-X`` is kept as ``B-X`` (or becomes ``S-X`` for a one-character
          entity when ``use_single_tag`` is true).
        * ``I-X`` becomes ``M-X`` when the next line is also ``I-X`` and
          ``E-X`` otherwise (different tag, different entity type, blank or
          malformed line, end of file).
        * ``N`` becomes ``O``.
        * Any other tag is written unchanged.
        * Lines with fewer than two fields are copied through (stripped).

    Args:
        input_file: Path of the UTF-8 BIN-tagged file.
        output_file: Path of the UTF-8 file to write. May equal
            ``input_file``: the input is read completely and closed before
            the output is opened, so in-place conversion does not lose data.
        use_single_tag: When true, emit ``S-X`` for a ``B-X`` that is not
            followed by ``I-X``. Defaults to False, which leaves every
            ``B-X`` untouched.
    """
    # Read everything first so that opening the output for writing can never
    # truncate the input when both paths are the same file.
    with open(input_file, 'r', encoding='utf-8') as f_in:
        rows = [raw.split() for raw in f_in]

    total = len(rows)
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for idx, parts in enumerate(rows):
            if len(parts) < 2:
                # Blank line -> "\n"; single-field line -> that field alone.
                f_out.write(''.join(parts) + '\n')
                continue

            char, label = parts[0], parts[1]

            # Look ahead one line; blank/malformed/absent lines have no tag.
            following = rows[idx + 1] if idx + 1 < total else []
            next_label = following[1] if len(following) >= 2 else None

            f_out.write(f"{char} {_bmes_tag(label, next_label, use_single_tag)}\n")

def main():
    """
    Convert NER-dev.txt to BMES tags and print a before/after preview.

    Writes the converted data to dev.char.bmes, then prints the first ten
    non-blank lines of the input followed by the first ten non-blank lines
    of the output.
    """
    input_file = "NER-dev.txt"
    output_file = "dev.char.bmes"

    def show_head(path, limit=10):
        """Print the first `limit` non-blank lines of `path`, stripped."""
        collected = []
        with open(path, 'r', encoding='utf-8') as handle:
            for raw in handle:
                text = raw.strip()
                if text:
                    collected.append(text)
                if len(collected) >= limit:
                    break
            for text in collected:
                print(text)

    print("开始转换BIN标记到BMES标记...")
    convert_bin_to_bmes(input_file, output_file)
    print(f"转换完成！结果已保存到 {output_file}")

    # Side-by-side sample of the data before and after conversion.
    print("\n转换前后对比示例：")
    print("原数据（BIN格式）：")
    show_head(input_file)

    print("\n转换后（BMES格式）：")
    show_head(output_file)

# Run the conversion only when this code executes as the top-level module.
if __name__ == "__main__":
    main()

开始转换BIN标记到BMES标记...
转换完成！结果已保存到 dev.char.bmes

转换前后对比示例：
原数据（BIN格式）：
国 B-ORG
家 I-ORG
禁 I-ORG
毒 I-ORG
委 I-ORG
员 I-ORG
会 I-ORG
副 N
主 N
任 N

转换后（BMES格式）：
国 B-ORG
家 M-ORG
禁 M-ORG
毒 M-ORG
委 M-ORG
员 M-ORG
会 E-ORG
副 O
主 O
任 O


In [18]:
import random

def _write_sentences(path, sentences):
    """Write sentences to `path` as UTF-8, one token line each, blank line after every sentence."""
    with open(path, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            for line in sentence:
                f.write(line + '\n')
            f.write('\n')  # blank line separates sentences


def split_ner_dataset(input_file, train_file, dev_file, split_ratio=0.9, seed=42):
    """
    Split a blank-line-separated NER dataset into training and dev files.

    The input is read as UTF-8; consecutive non-blank lines form one
    sentence. Sentences are shuffled with a seeded generator and the first
    ``int(len(sentences) * split_ratio)`` go to the training file, the rest
    to the dev file.

    The input is read completely before anything is written, so
    ``train_file`` or ``dev_file`` may be the same path as ``input_file``;
    in that case the original file is overwritten.

    Args:
        input_file: Path of the dataset to split.
        train_file: Output path for the training portion.
        dev_file: Output path for the dev portion.
        split_ratio: Fraction of sentences for the training set, between 0
            and 1 inclusive. Defaults to 0.9.
        seed: Seed for the shuffle. The default 42 yields the same order as
            seeding the global ``random`` module with 42. ``None`` gives a
            non-reproducible shuffle.

    Raises:
        ValueError: If ``split_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= split_ratio <= 1:
        # A negative ratio would turn into a negative slice index and
        # silently produce a wrong split.
        raise ValueError(f"split_ratio 必须在 0 到 1 之间，当前为 {split_ratio}")

    # Group lines into sentences (blank line = sentence boundary).
    sentences = []
    current_sentence = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.strip()
            if line:
                current_sentence.append(line)
            elif current_sentence:
                sentences.append(current_sentence)
                current_sentence = []

    # The file may end without a trailing blank line.
    if current_sentence:
        sentences.append(current_sentence)

    print(f"总句子数: {len(sentences)}")

    # Use a private generator so the process-wide random state is untouched.
    random.Random(seed).shuffle(sentences)

    split_point = int(len(sentences) * split_ratio)
    train_sentences = sentences[:split_point]
    dev_sentences = sentences[split_point:]

    print(f"训练集句子数: {len(train_sentences)}")
    print(f"验证集句子数: {len(dev_sentences)}")

    _write_sentences(train_file, train_sentences)
    _write_sentences(dev_file, dev_sentences)

    print("数据集划分完成！")

# Usage example
if __name__ == "__main__":
    input_file = "NER-train.txt"
    train_output = "NER-train.txt"  # NOTE: same path as input_file, so the input file is overwritten with the training portion; re-running this cell splits the already-reduced file again
    dev_output = "NER-dev.txt"
    
    split_ner_dataset(input_file, train_output, dev_output)
    
    print(f"训练集已保存至: {train_output}")
    print(f"验证集已保存至: {dev_output}")

总句子数: 47317
训练集句子数: 42585
验证集句子数: 4732
数据集划分完成！
训练集已保存至: NER-train.txt
验证集已保存至: NER-dev.txt
