# 📘 Word → CSV (Markdown 参数版)

此 Notebook 将 Word 命令手册（如 `SimpleAT.docx`）解析为可人工阅读的 CSV。

输出的 CSV 字段包括：
- 章节、命令、命令标题、命令类型、命令格式、响应、示例命令、功能描述、备注、表数量、顺序、参数

其中 `参数` 字段以 Markdown 格式展示，方便人工校对与后续结构化清洗。

In [1]:
!pip install -q python-docx pandas lxml
print("✅ 环境依赖检查完成")

✅ 环境依赖检查完成


In [2]:
import os, re, json, traceback
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from lxml import etree
import pandas as pd

## ⚙️ 全局配置

In [3]:
IN_WORD = "SimpleAT.docx"  # 输入 Word 文件名
CSV_OUT = "data/at_commands_markdown.csv"  # 输出 CSV

os.makedirs(os.path.dirname(CSV_OUT), exist_ok=True)
print(f"✅ 配置就绪。输入文件: {IN_WORD}")

✅ 配置就绪。输入文件: SimpleAT.docx


## 🧩 工具函数定义

In [4]:
CMD_LINE_PAT = re.compile(r'^\s*(AT[\+\w\-]+(?:\?[=\w<>,\s\-\+\.\:]*?)?)\s*(?::|：)?\s*(.*)$', re.I)
PARAM_HEADING_PAT = re.compile(r'^\s*参数(说明|表|信息)?\s*[:：]?\s*$', re.I)

def is_cmd_heading(text: str) -> bool:
    return bool(CMD_LINE_PAT.match(text or ""))

def is_param_heading(text: str) -> bool:
    return bool(PARAM_HEADING_PAT.match(text or ""))

def iter_ordered_blocks(doc):
    body = doc._element.body
    tbl_idx = 0
    for child in body.iterchildren():
        if isinstance(child, CT_P):
            text = "".join([t.text for t in child.xpath('.//w:t') if t.text]).strip()
            yield ("p", text)
        elif isinstance(child, CT_Tbl):
            table_obj = doc.tables[tbl_idx]
            tbl_idx += 1
            yield ("tbl", table_obj)

def cell_plain_text(cell):
    parts = [p.text.strip() for p in cell.paragraphs if p.text and p.text.strip()]
    return "\n".join(parts).strip()

def find_nested_tbls_in_cell(cell):
    xml_str = cell._tc.xml
    root = etree.fromstring(xml_str.encode("utf-8"))
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    return root.findall(".//w:tbl", ns), ns

def tbl_rows_as_text(tbl, ns):
    rows = []
    for r in tbl.findall(".//w:tr", ns):
        cells = r.findall(".//w:tc", ns)
        row = ["".join(tn.text for tn in c.iterfind(".//w:t", ns) if tn.text).strip() for c in cells]
        rows.append(row)
    return rows

def looks_like_header(row):
    hdr = " ".join(row[:3])
    return any(k in hdr for k in ["参数","名称","Name","描述","说明","含义","取值","值","value","meaning","备注","范围"])

def nested_table_to_valmap(rows):
    if not rows: return {}
    start = 1 if looks_like_header(rows[0]) else 0
    kv = {}
    for r in rows[start:]:
        if not r: continue
        key = (r[0] or "").strip()
        val = " | ".join([c for c in r[1:] if c and c.strip()]) if len(r) > 1 else ""
        if key: kv[key] = val
    return kv

def cell_valmap_from_nested_table(cell):
    tbls, ns = find_nested_tbls_in_cell(cell); mapping = {}
    for t in tbls:
        rows = tbl_rows_as_text(t, ns); mapping.update(nested_table_to_valmap(rows))
    return mapping

def parse_enum_map_fuzzy(text):
    if not text: return {}
    segs = re.split(r"[,，;；\n]+", text.strip())
    m = {}
    for s in segs:
        s = s.strip()
        if not s: continue
        if ":" in s or "：" in s:
            k, v = re.split(r"[:：]", s, 1); k, v = k.strip(), v.strip()
        else:
            m2 = re.match(r"^(\S+)\s*(?:->|→|=>|-|—|\s)\s*(.+)$", s)
            if m2: k, v = m2.group(1).strip(), m2.group(2).strip()
            else: continue
        if k: m[k] = v
    return m

def format_params_markdown(params):
    out = []
    for p in params:
        name = p.get("name", "").strip()
        desc = p.get("desc", "").strip()
        valmap = p.get("valmap", {}) or {}
        lines = [f"- `{name}`：{desc or '—'}"]
        if valmap:
            for k, v in valmap.items():
                lines.append(f"  - `{k}`: {v}")
        out.append("\n".join(lines))
    return "\n\n".join(out).strip()

## 🧠 核心函数：Word → CSV (Markdown参数)

In [5]:
def extract_word_to_csv(docx_path, csv_out):
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"未找到 Word 文件: {docx_path}")

    doc = Document(docx_path)
    seq = list(iter_ordered_blocks(doc))

    results = []
    i = 0
    cmd_order = 0

    while i < len(seq):
        typ, obj = seq[i]
        if typ == "p":
            m = CMD_LINE_PAT.match(obj)
            if m:
                cmd_order += 1
                current_cmd = m.group(1).strip()
                current_title = (m.group(2) or "").strip()

                desc_lines = []
                j = i + 1
                while j < len(seq):
                    t2, o2 = seq[j]
                    if t2 == "p":
                        if is_cmd_heading(o2) or is_param_heading(o2): break
                        if o2: desc_lines.append(o2)
                    elif t2 == "tbl": break
                    j += 1
                merged_desc = "\n".join(desc_lines).strip()

                params_all = []
                table_count = 0
                k = j
                while k < len(seq):
                    t3, o3 = seq[k]
                    if t3 == "p" and is_cmd_heading(o3): break
                    if t3 == "p" and is_param_heading(o3):
                        k += 1
                        while k < len(seq) and seq[k][0] == "tbl":
                            table = seq[k][1]
                            table_count += 1
                            for r in table.rows:
                                cols = r.cells
                                if not any(c.text.strip() for c in cols): continue
                                try:
                                    name = cell_plain_text(cols[0]) if len(cols) > 0 else ""
                                    desc = cell_plain_text(cols[1]) if len(cols) > 1 else ""
                                    valmap = {}
                                    if len(cols) > 2:
                                        valmap = cell_valmap_from_nested_table(cols[2]) or parse_enum_map_fuzzy(cell_plain_text(cols[2]))
                                    if not valmap and len(cols) > 1:
                                        valmap = cell_valmap_from_nested_table(cols[1]) or parse_enum_map_fuzzy(desc)
                                    if name in ("参数","参数名","Name") and any(x in desc for x in ["描述","说明","Description","Meaning"]): continue
                                    params_all.append({"name": name, "desc": desc, "valmap": valmap})
                                except Exception as e:
                                    print(f"⚠️ 参数解析异常 {current_cmd}: {e}")
                            k += 1
                        continue
                    k += 1

                results.append({
                    "章节": str(cmd_order),
                    "命令": current_cmd,
                    "命令标题": current_title,
                    "命令类型": "执行",
                    "命令格式": current_cmd,
                    "响应": "<CR><LF>OK<CR><LF>",
                    "示例命令": f"{current_cmd}\n\nOK",
                    "功能描述": merged_desc or current_title,
                    "备注": f"说明示例{cmd_order}",
                    "表数量": table_count,
                    "顺序": cmd_order,
                    "参数": format_params_markdown(params_all)
                })
                i = k; continue
        i += 1

    df = pd.DataFrame(results)
    df.to_csv(csv_out, index=False, encoding="utf-8-sig")
    print(f"✅ 提取 {len(df)} 条命令 → {csv_out}")
    return df

## 🚀 运行提取

In [6]:
df = extract_word_to_csv(IN_WORD, CSV_OUT)

✅ 提取 2 条命令 → data/at_commands_markdown.csv


  k, v = re.split(r"[:：]", s, 1); k, v = k.strip(), v.strip()


In [7]:
df.head()

Unnamed: 0,章节,命令,命令标题,命令类型,命令格式,响应,示例命令,功能描述,备注,表数量,顺序,参数
0,1,ATI,获取模组厂商信息,执行,ATI,<CR><LF>OK<CR><LF>,ATI\n\nOK,获取模组厂商信息，包括厂家、型号和版本。\n命令格式,说明示例1,1,1,- `<manufacturer>`：模组厂商信息、产品名称、版本号\n\n- `<modu...
1,2,AT+CSMS,选择短信服务,执行,AT+CSMS,<CR><LF>OK<CR><LF>,AT+CSMS\n\nOK,用于支持的短消息包括：发送（SMS-MO）、接收（SMS-MT）、小区广播（SMS-CB）。...,说明示例2,1,2,- `<service>`：0：GSM03.40 and GSM03.41；SMS相关AT指...
