
# 📘 从 Word（at-parameter-demo.docx）精准抽取 → CSV 模板（增强版）

本 Notebook 针对你的示例文档结构做了**定制化适配**：
- 识别形如 `ATI：获取模组厂商信息`、`AT+GMR：查询版本信息` 的命令段落；
- 绑定**紧随其后的「参数」表格**为该命令的参数表；
- 支持**无表头两列表格**（第1列=参数名，第2列=说明/取值）；若表格存在第3列，则优先将第3列作为取值映射的来源；
- 自动将 `a:b, c:d`、`a:b；c:d` 等形式的取值映射解析为 JSON；
- 输出符合流水线的 `data/extracted_commands.csv`。


In [None]:

# Step 0. 安装依赖（首次运行需要）
!pip install python-docx pandas -q


In [1]:

# Step 1. 读取文档并按顺序枚举段落与表格（用于调试）
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
import re

doc_path = "at-parameter-demo.docx"
doc = Document(doc_path)

# Walk the direct children of the document body so that paragraphs and
# tables are listed in the order they appear in the file.
body = doc._element.body
order = []
for node in body.iterchildren():
    if isinstance(node, CT_Tbl):
        # Tables are shown as a placeholder; their content is not dumped here.
        entry = ("T", "<table>")
    elif isinstance(node, CT_P):
        # Concatenate every non-empty w:t text node of the paragraph.
        pieces = (t.text for t in node.xpath('.//w:t'))
        entry = ("P", "".join(piece for piece in pieces if piece).strip())
    else:
        # Anything else is reported by its raw XML tag.
        entry = ("X", node.tag)
    order.append(entry)

print("文档块顺序预览：")
for idx, entry in enumerate(order):
    print(f"{idx:02d} [{entry[0]}] {entry[1]}")


文档块顺序预览：
00 [P] 通用AT指令
01 [P] ATI：获取模组厂商信息
02 [P] 参数
03 [T] <table>
04 [P] AT+GMR：查询版本信息
05 [P] 参数
06 [T] <table>
07 [P] AT+CSQ：获取信号强度
08 [P] 参数
09 [T] <table>
10 [P] 
11 [X] {http://schemas.openxmlformats.org/wordprocessingml/2006/main}sectPr


In [2]:

# Step 2. 抽取命令与“参数”表格的绑定，输出 CSV
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
import pandas as pd, json, os, re

# Fixed output schema, so the CSV always has a header row even when no
# command was extracted.
_CSV_COLUMNS = [
    "命令",
    "命令标题",
    "命令类型",
    "命令格式",
    "示例命令",
    "示例响应",
    "功能描述",
    "备注",
    "参数JSON",
]


def _parse_value_map(text):
    """Parse ``key:value`` pairs separated by commas/semicolons into a dict.

    Both ASCII and full-width separators and colons are accepted. Fragments
    without a colon are ignored, so free-form descriptions yield ``{}``.
    """
    values = {}
    for part in re.split(r'[，,;；]\s*', text):
        pieces = re.split(r'[:：]', part, maxsplit=1)
        if len(pieces) == 2:
            values[pieces[0].strip()] = pieces[1].strip()
    return values


def _parse_param_rows(rows):
    """Turn the data rows of a parameter table into a list of param dicts.

    Column 1 is the parameter name, column 2 the description, and an
    optional column 3 the value list. The value map is parsed from column 3
    when it is non-empty, otherwise from the description.
    """
    params = []
    for row in rows:
        if not any(row):  # skip rows whose cells are all empty
            continue
        # Pad short rows so that missing columns read as empty strings.
        name, desc, values_cell = (list(row) + ["", "", ""])[:3]
        params.append({
            "name": name,
            "desc": desc,
            # May be an empty dict; intended to be completed by hand later.
            "values": _parse_value_map(values_cell or desc),
        })
    return params


def extract_commands_to_csv(doc_path, csv_out="data/extracted_commands.csv"):
    """Extract AT commands and their parameter tables from a .docx into a CSV.

    The document body is scanned in order:

    * a paragraph shaped like ``AT+XXX：title`` (ASCII or full-width colon)
      starts a new command;
    * a later paragraph whose text is exactly ``参数`` arms the binding;
    * the next body-level table is parsed as that command's parameter table
      and one output row is emitted. Only the first table after ``参数`` is
      bound to the command.

    Commands that are never followed by a ``参数`` paragraph and a table
    produce no row.

    Args:
        doc_path: Path of the Word document to read.
        csv_out: Destination CSV path. Its parent directory is created when
            it does not exist yet.

    Returns:
        A pandas DataFrame with one row per extracted command. The same data
        is written to ``csv_out`` encoded as ``utf-8-sig``.
    """
    doc = Document(doc_path)
    body = doc._element.body

    # Build the table wrappers once; each access to doc.tables builds a new
    # list, so indexing it inside the loop would be quadratic.
    body_tables = iter(doc.tables)

    # Serialise the body in document order: paragraph text or table object.
    blocks = []
    for child in body.iterchildren():
        if isinstance(child, CT_P):
            text = "".join(t.text for t in child.xpath('.//w:t') if t.text)
            blocks.append(("p", text.strip()))
        elif isinstance(child, CT_Tbl):
            blocks.append(("tbl", next(body_tables)))

    # Command name = 'AT...' up to the first ASCII/full-width colon;
    # the remainder of the line is the command title.
    cmd_pattern = re.compile(r'^\s*(AT\S*?)\s*[：:]\s*(.*)$')
    header_pattern = re.compile(r'参数|描述|说明|取值')

    results = []
    current_cmd = None
    current_title = ""
    expect_param_table = False

    for kind, item in blocks:
        if kind == "p":
            match = cmd_pattern.match(item)
            if match:
                current_cmd = match.group(1)            # e.g. ATI, AT+GMR
                current_title = match.group(2).strip()
                expect_param_table = False
            elif item == "参数" and current_cmd:
                expect_param_table = True
            continue

        # From here on the block is a table; ignore it unless a command is
        # waiting for its parameter table.
        if not (current_cmd and expect_param_table):
            continue

        rows = [[cell.text.strip() for cell in row.cells] for row in item.rows]
        if not rows:
            continue

        # Treat the first row as a header when any cell contains a typical
        # header keyword; otherwise the table is header-less.
        has_header = any(header_pattern.search(cell) for cell in rows[0])
        params = _parse_param_rows(rows[1:] if has_header else rows)

        results.append({
            "命令": current_cmd,
            "命令标题": current_title,
            "命令类型": "执行;查询",
            "命令格式": current_cmd,
            "示例命令": current_cmd,
            "示例响应": "",
            "功能描述": current_title,
            "备注": "",
            "参数JSON": json.dumps(params, ensure_ascii=False),
        })

        # A command binds only the first table that follows "参数".
        expect_param_table = False

    # Create the directory that will actually hold csv_out (not a hard-coded
    # one); a bare filename has no directory part to create.
    out_dir = os.path.dirname(csv_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(results, columns=_CSV_COLUMNS)
    df.to_csv(csv_out, index=False, encoding="utf-8-sig")
    print(f"✅ 已提取 {len(df)} 条命令 → {csv_out}")
    return df

# Run the extraction on the document path defined in Step 1; the trailing
# bare expression makes the notebook display the first rows of the result.
df = extract_commands_to_csv(doc_path)
df.head()


✅ 已提取 3 条命令 → data/extracted_commands.csv


Unnamed: 0,命令,命令标题,命令类型,命令格式,示例命令,示例响应,功能描述,备注,参数JSON
0,ATI,获取模组厂商信息,执行;查询,ATI,ATI,,获取模组厂商信息,,"[{""name"": ""<manufacturer>"", ""desc"": ""模组厂商信息、产品..."
1,AT+GMR,查询版本信息,执行;查询,AT+GMR,AT+GMR,,查询版本信息,,"[{""name"": ""<reversion>"", ""desc"": ""模组软件版本信息"", ""..."
2,AT+CSQ,获取信号强度,执行;查询,AT+CSQ,AT+CSQ,,获取信号强度,,"[{""name"": ""<signal>"", ""desc"": ""以下为signal(CSQ)与..."
