In [None]:
# 安装必要的库
!pip install python-docx pyyaml pandas

In [None]:
# 导入所需的库
from docx import Document
import re
import yaml
import pandas as pd
from collections import defaultdict
import json

In [None]:
# 1. 文档预处理函数
def extract_word_content(doc_path):
    """Return the non-blank paragraphs of a Word document as one string.

    Args:
        doc_path: Value passed straight to ``Document``.

    Returns:
        The text of every paragraph whose stripped text is non-empty, joined
        with newline characters, or ``None`` when any exception is raised
        while opening or reading the document (the error is printed).
    """
    try:
        paragraphs = Document(doc_path).paragraphs
        # Blank paragraphs are dropped; kept paragraphs retain their raw text.
        kept = [para.text for para in paragraphs if para.text.strip()]
        return "\n".join(kept)
    except Exception as e:
        print(f"读取Word文档时出错: {e}")
        return None

# Smoke-test the document reader
doc_content = extract_word_content("AT_Commands.docx")  # replace with the path to your document
if doc_content:
    print(f"成功读取文档，共 {len(doc_content)} 字符")
    # Preview the first 500 characters; the ellipsis is only appended when the
    # text was actually truncated (same rule as the YAML preview).
    print("\n文档预览:")
    print((doc_content[:500] + "...") if len(doc_content) > 500 else doc_content)
else:
    print("无法读取文档，请检查路径是否正确")

In [None]:
# 2. Parsing patterns and rules
# Regular expressions for the structures expected in the document.
# NOTE(review): none of these constants is referenced in the visible part of
# this file (the parser below uses its own inline regex) - confirm whether
# they are still needed.
PATTERNS = {
    'command_start': r'^(\d+\.\d+\s+)?(AT[+\w]+)-',  # start of a command heading, e.g. "1.1 ATI-"
    'parameter_section': r'参数|参数\s*$',  # start of the parameters section
    'example_section': r'示例|示例\s*$',   # start of the examples section
    'command_format': r'命令格式',        # command-format section
}

# Stricter heading pattern: section number, command name, then the
# description up to the first full stop ("。")
COMMAND_PATTERN = r'^(\d+\.\d+)\s+(AT[+\w]+)-([^。]+)'

# Parameter pattern: "<name>" followed by the text up to the next "<"
PARAM_PATTERN = r'<([^>]+)>([^<]+)'

In [None]:
# Smart parser - focuses on extracting section, parameters and examples

# Command heading: optional "N.N " section number, the AT command name, one of
# the separators en dash / hyphen / full-width colon / colon, then the
# description up to the first full stop.
_COMMAND_HEADING_RE = re.compile(r'^(\d+\.\d+\s+)?(AT[+\w]+)[–\-：:]\s*([^。]+)')

# Exact heading text -> internal section name
_SECTION_HEADINGS = {
    "参数": "parameters",
    "示例": "examples",
    "命令格式": "format",
}

# Lines inside an examples section that contain any of these are skipped
_EXAMPLE_SKIP_KEYWORDS = ("示例", "例子", "example")


def _finalize_command(command, param_lines, example_lines):
    """Attach the buffered parameter and example lines to *command* in place."""
    if param_lines:
        command['parameters'] = extract_parameters_from_lines(param_lines)
    if example_lines:
        command['examples'] = example_lines


def smart_parse_at_commands(text_content):
    """Parse AT-command documentation text into a list of command dicts.

    The text is processed line by line. A line matching the command-heading
    pattern starts a new command; a line that is exactly "参数", "示例" or
    "命令格式" switches the current section; every other line is collected
    into the current section of the current command.

    Args:
        text_content: Document text with lines separated by ``"\\n"``.
            ``None`` is treated like an empty string.

    Returns:
        A list of dicts with the keys ``section``, ``cmd``, ``desc``,
        ``parameters`` (as returned by ``extract_parameters_from_lines``),
        ``examples`` (list of lines) and ``format`` (all lines of the format
        section joined with newlines, or ``None`` when there were none).
    """
    lines = (text_content or "").split('\n')
    commands = []
    current_command = None
    current_section = None
    param_lines = []    # raw lines of the current "parameters" section
    example_lines = []  # raw lines of the current "examples" section

    print("开始智能解析文档...")

    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue

        # Debug aid: echo the first 30 lines as they are processed
        if i < 30:
            print(f"处理行 {i}: {repr(line)}")

        # A command heading closes the previous command and opens a new one
        command_match = _COMMAND_HEADING_RE.match(line)
        if command_match:
            if current_command:
                _finalize_command(current_command, param_lines, example_lines)
                commands.append(current_command)
                print(f"已解析命令: {current_command['cmd']}")
            # Fresh buffers so the next command never sees the previous lines
            param_lines, example_lines = [], []

            section, cmd, desc = command_match.groups()
            current_command = {
                'section': section.strip() if section else None,
                'cmd': cmd,
                'desc': desc.strip(),
                'parameters': [],
                'examples': [],
                'format': None
            }
            current_section = None
            print(f"检测到命令: {cmd}")
            continue

        # Section headings must match the whole line exactly
        heading = _SECTION_HEADINGS.get(line)
        if heading is not None:
            current_section = heading
            print(f"检测到章节: {current_section}")
            continue

        # Content lines outside any command/section are ignored
        if not (current_command and current_section):
            continue

        if current_section == "parameters":
            # Collected now, turned into parameter dicts when the command closes
            param_lines.append(line)
        elif current_section == "examples":
            if not any(keyword in line for keyword in _EXAMPLE_SKIP_KEYWORDS):
                example_lines.append(line)
        elif current_section == "format":
            if not line.startswith("命令格式"):
                # Accumulate instead of overwriting, so a multi-line format
                # section is no longer reduced to its last line.
                previous = current_command['format']
                current_command['format'] = (
                    line if previous is None else f"{previous}\n{line}"
                )

    # Flush the last command
    if current_command:
        _finalize_command(current_command, param_lines, example_lines)
        commands.append(current_command)
        print(f"已解析命令: {current_command['cmd']}")

    print(f"\n解析完成，共提取 {len(commands)} 个AT命令")

    # Parsing statistics
    params_count = sum(len(cmd.get('parameters', [])) for cmd in commands)
    examples_count = sum(len(cmd.get('examples', [])) for cmd in commands)

    print(f"参数总数: {params_count}")
    print(f"示例总数: {examples_count}")

    return commands

# 从多行文本中提取参数的辅助函数
def extract_parameters_from_lines(lines):
    """Turn the raw lines of a parameters section into parameter dicts.

    A line containing ``<name>`` opens a new parameter whose description is
    the text after the closing ``>`` up to the next ``<``. Any other
    non-blank line is appended, separated by a space, to the description of
    the parameter currently open; such lines are dropped when no parameter
    has been opened yet.

    Args:
        lines: Iterable of strings.

    Returns:
        A list of ``{'name': ..., 'desc': ...}`` dicts in order of appearance.
    """
    collected = []
    active = None

    for raw in lines:
        text = raw.strip()
        if not text:
            continue

        hit = re.search(r'<([^>]+)>\s*([^<]*)', text)
        if hit is None:
            # Continuation line: extends the open parameter, if there is one
            if active:
                active['desc'] += " " + text
            continue

        # A new "<name>" closes the parameter that was open so far
        if active:
            collected.append(active)
        active = {
            'name': hit.group(1),
            'desc': hit.group(2).strip()
        }

    # The parameter still open at the end has not been stored yet
    if active:
        collected.append(active)

    return collected

In [None]:
# 增强Section提取
def enhance_section_extraction(commands):
    """Fill in missing section numbers from the command descriptions.

    For every command that has a name but no section, the first ``N.N``
    number found in its description (if any) is stored as the section.

    Args:
        commands: List of command dicts; modified in place.

    Returns:
        The same ``commands`` list.
    """
    print("增强Section提取...")

    for entry in commands:
        # Only commands with a name and without a section are candidates
        if entry.get('section') or not entry.get('cmd'):
            continue

        found = re.search(r'(\d+\.\d+)', entry.get('desc', ''))
        if found is None:
            continue
        entry['section'] = found.group(1)

    return commands

# Run the parser, then the section enhancement
if not doc_content:
    commands = []
    print("没有文档内容可供解析")
else:
    commands = smart_parse_at_commands(doc_content)
    commands = enhance_section_extraction(commands)

In [None]:
# 智能填充缺失数据
def smart_fill_missing_data(commands):
    """Fill empty parameters/examples from a built-in table of known commands.

    Only commands whose name appears in the table are touched, and only the
    fields that are currently empty or missing. Each command receives its own
    copy of the table data, so two commands with the same name never share
    list or dict objects.

    Args:
        commands: List of command dicts; modified in place.

    Returns:
        The same ``commands`` list.
    """
    print("智能填充缺失数据...")

    # Known command patterns
    command_patterns = {
        "ATI": {
            "parameters": [
                {"name": "manufacturer", "desc": "模组厂商信息、产品名称、版本号"},
                {"name": "module_version", "desc": "模组型号"},
                {"name": "soft_version", "desc": "模组软件版本"}
            ],
            "examples": ["ATI", "查询厂商信息", "+CGMI: Neoway", "OK"]
        },
        "AT+GMR": {
            "parameters": [
                {"name": "re version", "desc": "模组软件版本信息"}
            ],
            "examples": ["AT+GMR", "+GMR:N706-R004-STD-B2-003", "OK"]
        },
        "AT+CSQ": {
            "parameters": [
                {"name": "signal", "desc": "信号强度"},
                {"name": "ber", "desc": "误码率"}
            ],
            "examples": ["AT+CSQ", "+CSQ:19,2", "OK"]
        },
        "AT+CREG": {
            "parameters": [
                {"name": "n", "desc": "网络注册主动提供结果代码设置"},
                {"name": "stat", "desc": "网络注册状态"}
            ],
            "examples": ["AT+CREG?", "+CREG: 0,1", "OK"]
        },
        # Add more known command patterns here...
    }

    for cmd in commands:
        cmd_name = cmd.get('cmd')
        if cmd_name not in command_patterns:
            continue
        pattern = command_patterns[cmd_name]

        # Only fill what is missing, and hand out copies: sharing the table's
        # objects would let an edit to one command leak into another and
        # would make PyYAML emit anchors/aliases for the repeated objects.
        if not cmd.get('parameters') and 'parameters' in pattern:
            cmd['parameters'] = [dict(param) for param in pattern['parameters']]
            print(f"为 {cmd_name} 填充参数")

        if not cmd.get('examples') and 'examples' in pattern:
            cmd['examples'] = list(pattern['examples'])
            print(f"为 {cmd_name} 填充示例")

    return commands

# Run the smart fill
if not commands:
    print("没有命令数据可供填充")
else:
    commands = smart_fill_missing_data(commands)

In [None]:
# 批量处理常见问题
def batch_process_common_issues(commands):
    """Normalise every command dict so later export steps can rely on it.

    For each command:
      * ``parameters`` and ``examples`` become ``[]`` when missing or ``None``;
      * ``format`` falls back to the command name when missing or ``None``
        (the parser in this file always creates the key with ``None``, so a
        plain "key not present" check would never trigger);
      * non-empty parameter descriptions are stripped;
      * examples are stripped and blank ones removed.

    Args:
        commands: List of command dicts; modified in place.

    Returns:
        The same ``commands`` list.
    """
    print("批量处理常见问题...")

    for cmd in commands:
        # Make sure the required fields exist and are usable
        if cmd.get('parameters') is None:
            cmd['parameters'] = []
        if cmd.get('examples') is None:
            cmd['examples'] = []
        if cmd.get('format') is None:
            cmd['format'] = cmd.get('cmd')

        # Clean up parameter descriptions
        for param in cmd['parameters']:
            if param.get('desc'):
                param['desc'] = param['desc'].strip()

        # Clean up examples
        cmd['examples'] = [ex.strip() for ex in cmd['examples'] if ex.strip()]

    return commands

# Run the batch clean-up
if not commands:
    print("没有命令数据可供处理")
else:
    commands = batch_process_common_issues(commands)

In [None]:
# 4. 数据导出函数
def commands_to_csv(commands, output_path):
    """Export the command list to a CSV file and return it as a DataFrame.

    Each command becomes one row with the columns ``section``, ``command``,
    ``description``, ``parameters``, ``examples`` and ``format``. Parameters
    are flattened to ``"name: desc"`` items and examples to plain items, both
    joined with ``"; "``. Missing or ``None`` fields are written as empty
    strings, and the column header is written even when ``commands`` is empty.

    Args:
        commands: List of command dicts.
        output_path: Destination path; written as UTF-8 with BOM
            (``utf-8-sig``).

    Returns:
        The ``pandas.DataFrame`` that was written.
    """
    columns = ['section', 'command', 'description', 'parameters', 'examples', 'format']

    csv_data = []
    for cmd in commands:
        # "or []" also covers keys that exist but hold None
        parameters = cmd.get('parameters') or []
        examples = cmd.get('examples') or []

        csv_data.append({
            'section': cmd.get('section') or '',
            'command': cmd.get('cmd') or '',
            'description': cmd.get('desc') or '',
            'parameters': "; ".join(
                f"{p.get('name', '')}: {p.get('desc', '')}" for p in parameters
            ),
            'examples': "; ".join(str(ex) for ex in examples),
            'format': cmd.get('format') or ''
        })

    # Explicit columns keep the header stable for an empty command list
    df = pd.DataFrame(csv_data, columns=columns)
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"CSV文件已保存: {output_path}")
    return df

def commands_to_yaml(commands, output_path):
    """Write the command list to a YAML file and print a short preview.

    The data is stored under the single top-level key ``at_commands``. After
    writing, the file is read back and its first 500 characters are printed,
    followed by "..." only when the file is longer than that.

    Args:
        commands: List of command dicts.
        output_path: Destination path; written and re-read as UTF-8.
    """
    payload = {'at_commands': commands}

    with open(output_path, 'w', encoding='utf-8') as sink:
        yaml.dump(
            payload,
            sink,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
        )

    print(f"YAML文件已保存: {output_path}")

    # Preview what actually landed on disk
    with open(output_path, 'r', encoding='utf-8') as source:
        written = source.read()

    print("\nYAML文件预览:")
    if len(written) > 500:
        print(written[:500] + "...")
    else:
        print(written)

In [None]:
# 5. Run the conversion and export
if not commands:
    print("没有命令数据可供导出")
else:
    # CSV serves as the intermediate format
    csv_df = commands_to_csv(commands, "at_commands.csv")

    # Show the head of the CSV data
    print("\nCSV数据预览:")
    display(csv_df.head())

    # YAML export
    commands_to_yaml(commands, "at_commands.yaml")

    # Optional: keep a JSON copy as well
    with open("at_commands.json", "w", encoding="utf-8") as f:
        json.dump({"at_commands": commands}, f, ensure_ascii=False, indent=2)
    print("JSON文件已保存: at_commands.json")

In [None]:
# 6. 数据验证和清理函数
def validate_commands(commands):
    """Check the command list for missing fields and duplicate names.

    Reports, in order: commands without a name or without a description (by
    index), then one summary entry listing every command name that occurs
    more than once, in order of first repetition. Commands without a name are
    reported as such and are not counted as duplicates of each other.

    Args:
        commands: List of command dicts.

    Returns:
        A list of human-readable issue strings (empty when nothing is wrong).
        The issues are also printed.
    """
    print("开始验证命令数据...")

    issues = []
    seen = set()
    duplicates = []  # list keeps the report order deterministic

    for i, cmd in enumerate(commands):
        name = cmd.get('cmd')

        # Required fields
        if not name:
            issues.append(f"命令 #{i}: 缺少命令名称")
        elif name in seen:
            # Single-pass duplicate detection instead of list.count per item
            if name not in duplicates:
                duplicates.append(name)
        else:
            seen.add(name)

        if not cmd.get('desc'):
            issues.append(f"命令 #{i} {cmd.get('cmd', '未知')}: 缺少描述")

    if duplicates:
        issues.append(f"发现重复命令: {', '.join(duplicates)}")

    if issues:
        print(f"发现 {len(issues)} 个问题:")
        for issue in issues:
            print(f"  - {issue}")
    else:
        print("数据验证通过，未发现问题")

    return issues

# Run the validation
if not commands:
    print("没有命令数据可供验证")
else:
    issues = validate_commands(commands)

In [None]:
# 7. 手动修正辅助函数
def manual_correction_helper(commands):
    """Print a numbered overview of the commands to support manual fixes.

    For each command the 1-based position, name, description and the number
    of parameters and examples are printed, followed by a hint on how to edit
    an entry through ``commands[index]``. Nothing is modified.

    Args:
        commands: List of command dicts.

    Returns:
        The same ``commands`` list.
    """
    print("手动修正辅助工具")
    print("=" * 50)

    for position, entry in enumerate(commands, start=1):
        name = entry.get('cmd', '未知命令')
        description = entry.get('desc', '无描述')
        param_total = len(entry.get('parameters', []))
        example_total = len(entry.get('examples', []))

        print(f"{position}. {name}: {description}")
        print(f"   参数: {param_total} 个")
        print(f"   示例: {example_total} 个")
        print()

    print("使用 commands[索引] 访问和修改特定命令")
    print("例如: commands[0]['desc'] = '新的描述'")

    return commands

# Launch the manual-correction overview
if not commands:
    print("没有命令数据可供修正")
else:
    commands = manual_correction_helper(commands)

In [None]:
# Final export and summary
if commands:
    # CSV export
    csv_df = commands_to_csv(commands, "at_commands_enhanced.csv")

    # Show the head of the CSV data
    print("\n增强版CSV数据预览:")
    display(csv_df.head())

    # YAML export
    commands_to_yaml(commands, "at_commands_enhanced.yaml")

    print("增强版转换完成！")

    # Statistics
    params_count = sum(len(cmd.get('parameters', [])) for cmd in commands)
    examples_count = sum(len(cmd.get('examples', [])) for cmd in commands)
    sections_count = sum(1 for cmd in commands if cmd.get('section'))

    print("\n最终统计:")
    print(f"命令总数: {len(commands)}")
    print(f"有Section的命令: {sections_count}")
    print(f"参数总数: {params_count}")
    print(f"示例总数: {examples_count}")

    # Details of the first few commands
    print("\n前3个命令的详情:")
    for i, cmd in enumerate(commands[:3]):
        # "or" rather than a .get default: the parser stores section=None, so
        # the key exists and a .get default would print "None" instead of the
        # placeholder.
        print(f"{i+1}. {cmd.get('section') or '无节号'} {cmd['cmd']}: {cmd['desc']}")
        if cmd.get('parameters'):
            print("   参数:")
            for param in cmd['parameters']:
                print(f"     - {param['name']}: {param['desc']}")
        if cmd.get('examples'):
            print("   示例:")
            for example in cmd['examples'][:3]:  # show at most 3 examples
                print(f"     - {example}")
        print()
else:
    print("没有命令数据可供导出")

In [None]:
# 创建命令模式数据库
def create_command_pattern_database(commands):
    """Build a name -> {parameters, examples} lookup and save it as JSON.

    Only commands that have a name and at least one parameter or example are
    included; a later command with the same name replaces an earlier one. The
    result is written to ``command_patterns.json`` in the current directory.

    Args:
        commands: List of command dicts.

    Returns:
        The pattern dictionary that was written.
    """
    print("创建命令模式数据库...")

    pattern_db = {}

    for entry in commands:
        name = entry.get('cmd')
        has_payload = entry.get('parameters') or entry.get('examples')
        # Skip unnamed commands and commands with nothing worth remembering
        if not (name and has_payload):
            continue

        pattern_db[name] = {
            'parameters': entry.get('parameters', []),
            'examples': entry.get('examples', [])
        }
        print(f"添加到模式数据库: {name}")

    # Persist the database
    with open("command_patterns.json", "w", encoding="utf-8") as f:
        json.dump(pattern_db, f, ensure_ascii=False, indent=2)

    print(f"模式数据库已保存，包含 {len(pattern_db)} 个命令模式")
    return pattern_db

# Build the pattern database
if not commands:
    print("没有命令数据可供创建模式数据库")
else:
    pattern_db = create_command_pattern_database(commands)

In [None]:
# 使用模式数据库填充新命令
def fill_new_commands_with_pattern_db(commands, pattern_db):
    """Fill empty parameters/examples of commands from a pattern database.

    Only commands whose name is a key of ``pattern_db`` are touched, and only
    fields that are currently empty or missing while the database entry has
    data for them. The data is deep-copied, so later edits to a command never
    change ``pattern_db`` or another command filled from the same entry.

    Args:
        commands: List of command dicts; modified in place.
        pattern_db: Mapping of command name to a dict with optional
            ``parameters`` and ``examples`` entries.

    Returns:
        The same ``commands`` list.
    """
    # Local import: the file's import block does not provide ``copy``
    from copy import deepcopy

    print("使用模式数据库填充新命令...")

    filled_count = 0

    for cmd in commands:
        cmd_name = cmd.get('cmd')
        if cmd_name not in pattern_db:
            continue
        pattern = pattern_db[cmd_name]

        # Only fill what is missing
        if not cmd.get('parameters') and pattern.get('parameters'):
            cmd['parameters'] = deepcopy(pattern['parameters'])
            filled_count += 1

        if not cmd.get('examples') and pattern.get('examples'):
            cmd['examples'] = deepcopy(pattern['examples'])
            filled_count += 1

    print(f"使用模式数据库填充了 {filled_count} 处缺失数据")
    return commands

# Fill from the pattern database, when one has been created
if not (commands and 'pattern_db' in globals()):
    print("没有模式数据库可供使用")
else:
    commands = fill_new_commands_with_pattern_db(commands, pattern_db)