
# 📘 Word → CSV → YAML → RST → Sphinx(HTML/PDF) 全自动流水线 v6
（针对复杂 `AT_Commands.docx`）
- 嵌套表优先、文本回退解析（valmap）
- 多参数表合并；命令标题/参数标题更宽松识别
- 解析日志 `parse_log.txt`；一键 `run_all()`


In [1]:

# Step 0 — 安装依赖
!pip install -q python-docx pandas pyyaml jinja2 sphinx sphinx_rtd_theme lxml


In [2]:

# Step 1 — Word → CSV（复杂解析）
import os, re, json, traceback, datetime
import pandas as pd
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from lxml import etree

IN_WORD = "AT_Commands.docx"
CSV_DIR = "data"
CSV_OUT = os.path.join(CSV_DIR, "at_extracted_commands.csv")
LOG_PATH = "parse_log.txt"
os.makedirs(CSV_DIR, exist_ok=True)

def log(msg):
    with open(LOG_PATH, "a", encoding="utf-8") as f:
        f.write(f"[{datetime.datetime.now().isoformat(timespec='seconds')}] {msg}\n")

CMD_LINE_PAT = re.compile(r'^\s*(AT[\+\w\-]+(?:\?[=\w<>,\s\-\+\.\:]*?)?)\s*(?::|：)?\s*(.*)$')
PARAM_HEADING_PAT = re.compile(r'^\s*参数(说明|表|信息)?\s*[:：]?\s*$', re.I)

def is_cmd_heading(text: str) -> bool: return bool(CMD_LINE_PAT.match(text or ""))
def is_param_heading(text: str) -> bool: return bool(PARAM_HEADING_PAT.match(text or ""))

def iter_ordered_blocks(doc):
    body = doc._element.body; tbl_idx = 0
    for child in body.iterchildren():
        if isinstance(child, CT_P):
            text = "".join([t.text for t in child.xpath('.//w:t') if t.text]).strip()
            yield ("p", text)
        elif isinstance(child, CT_Tbl):
            table_obj = doc.tables[tbl_idx]; tbl_idx += 1
            yield ("tbl", table_obj)

def cell_plain_text(cell):
    parts = [p.text.strip() for p in cell.paragraphs if p.text and p.text.strip()]
    return "\n".join(parts).strip()

def find_nested_tbls_in_cell(cell):
    xml_str = cell._tc.xml
    root = etree.fromstring(xml_str.encode("utf-8"))
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    return root.findall(".//w:tbl", ns), ns

def tbl_rows_as_text(tbl, ns):
    rows = []
    for r in tbl.findall(".//w:tr", ns):
        cells = r.findall(".//w:tc", ns)
        row = ["".join(tn.text for tn in c.iterfind(".//w:t", ns) if tn.text).strip() for c in cells]
        rows.append(row)
    return rows

def looks_like_header(row):
    hdr = " ".join(row[:3])
    return any(k in hdr for k in ["参数","名称","Name","描述","说明","含义","取值","值","value","meaning","备注","范围"])

def nested_table_to_valmap(rows):
    if not rows: return {}
    start = 1 if looks_like_header(rows[0]) else 0
    kv = {}
    for r in rows[start:]:
        if not r: continue
        key = (r[0] or "").strip()
        val = " | ".join([c for c in r[1:] if c and c.strip()]) if len(r) > 1 else ""
        if key: kv[key] = val
    return kv

def cell_valmap_from_nested_table(cell):
    tbls, ns = find_nested_tbls_in_cell(cell)
    mapping = {}
    for t in tbls:
        rows = tbl_rows_as_text(t, ns)
        mapping.update(nested_table_to_valmap(rows))
    return mapping

def parse_enum_map_fuzzy(text):
    if not text: return {}
    segs = re.split(r"[,\\uFF0C;\\uFF1B\\n]+", text.strip())
    m = {}
    for s in segs:
        s = s.strip()
        if not s: continue
        if ":" in s or "：" in s:
            k, v = re.split(r"[:：]", s, 1); k, v = k.strip(), v.strip()
        else:
            m2 = re.match(r"^(\\S+)\\s*(?:->|→|=>|-|—|\\s)\\s*(.+)$", s)
            if m2: k, v = m2.group(1).strip(), m2.group(2).strip()
            else:
                m3 = re.match(r"^([A-Za-z0-9\\+\\-\\.]+)\\s+(.+)$", s)
                if m3: k, v = m3.group(1).strip(), m3.group(2).strip()
                else: continue
        if k: m[k] = v
    return m

def extract_word_to_csv(docx_path, csv_out):
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"未找到 Word 文件: {docx_path}")
    open(LOG_PATH, "w", encoding="utf-8").write("")
    log(f"Start parsing: {docx_path}")
    doc = Document(docx_path)
    seq = list(iter_ordered_blocks(doc))

    results = []; i = 0; cmd_order = 0
    while i < len(seq):
        typ, obj = seq[i]
        if typ == "p":
            m = CMD_LINE_PAT.match(obj)
            if m:
                cmd_order += 1
                current_cmd = m.group(1).strip()
                current_title = (m.group(2) or "").strip()
                log(f"CMD[{cmd_order}] {current_cmd} — {current_title}")

                # 合并紧随其后的说明段落
                desc_lines = []; j = i + 1
                while j < len(seq):
                    t2, o2 = seq[j]
                    if t2 == "p":
                        if is_cmd_heading(o2) or is_param_heading(o2): break
                        if o2: desc_lines.append(o2)
                    elif t2 == "tbl":
                        break
                    j += 1
                merged_desc = "\\n".join(desc_lines).strip()

                # 参数表聚合
                params_all = []; table_count = 0; k = j
                while k < len(seq):
                    t3, o3 = seq[k]
                    if t3 == "p" and is_cmd_heading(o3): break
                    if t3 == "p" and is_param_heading(o3):
                        k += 1
                        while k < len(seq) and seq[k][0] == "tbl":
                            table = seq[k][1]; table_count += 1
                            for r in table.rows:
                                cols = r.cells
                                if not any(c.text.strip() for c in cols): continue
                                try:
                                    name = cell_plain_text(cols[0]) if len(cols) > 0 else ""
                                    desc = cell_plain_text(cols[1]) if len(cols) > 1 else ""
                                    valmap = {}
                                    if len(cols) > 2:
                                        valmap = cell_valmap_from_nested_table(cols[2]) or parse_enum_map_fuzzy(cell_plain_text(cols[2]))
                                    if not valmap and len(cols) > 1:
                                        valmap = cell_valmap_from_nested_table(cols[1]) or parse_enum_map_fuzzy(desc)
                                    if name in ("参数","参数名","Name") and any(x in desc for x in ["描述","说明","Description","Meaning"]): 
                                        continue
                                    params_all.append({"name": name, "desc": desc, "valmap": valmap})
                                except Exception as e:
                                    log(f"ROW-ERROR in {current_cmd}: {e}")
                                    log(traceback.format_exc())
                            k += 1
                        continue
                    k += 1

                if params_all or merged_desc:
                    results.append({
                        "命令": current_cmd, "命令标题": current_title, "命令类型": "执行;查询",
                        "命令格式": current_cmd, "示例命令": current_cmd, "示例响应": "",
                        "功能描述": merged_desc or current_title, "备注": "",
                        "表数量": table_count, "顺序": cmd_order,
                        "参数JSON": json.dumps(params_all, ensure_ascii=False)
                    })
                    log(f"CMD[{cmd_order}] tables={table_count} params={len(params_all)}")

                i = k; continue
        i += 1

    df = pd.DataFrame(results)
    df.to_csv(csv_out, index=False, encoding="utf-8-sig")
    print(f"✅ 提取 {len(df)} 条命令 → {csv_out}")
    print(f"📝 解析日志：{LOG_PATH}")
    return df

df_csv = extract_word_to_csv(IN_WORD, CSV_OUT)
df_csv.head()


✅ 提取 44 条命令 → data\at_extracted_commands.csv
📝 解析日志：parse_log.txt


  k, v = re.split(r"[:：]", s, 1); k, v = k.strip(), v.strip()


Unnamed: 0,命令,命令标题,命令类型,命令格式,示例命令,示例响应,功能描述,备注,表数量,顺序,参数JSON
0,ATI,获取模组厂商信息,执行;查询,ATI,ATI,,获取模组厂商信息，包括厂家、型号和版本。\n命令格式,,1,1,"[{""name"": ""<manufacturer>"", ""desc"": ""模组厂商信息、产品..."
1,AT+GMR,查询版本信息,执行;查询,AT+GMR,AT+GMR,,查询软件版本信息。\n命令格式,,1,2,"[{""name"": ""<reversion>"", ""desc"": ""模组软件版本信息"", ""..."
2,AT+CSQ,获取信号强度,执行;查询,AT+CSQ,AT+CSQ,,查询接收信号强度<rssi>。\n命令格式,,1,3,"[{""name"": ""<signal>"", ""desc"": ""以下为signal(CSQ)与..."
3,AT+CREG,查询网络注册状态,执行;查询,AT+CREG,AT+CREG,,查询模组的当前网络注册状态。\n命令格式,,1,4,"[{""name"": ""<n>"", ""desc"": ""0：禁止网络注册主动提供结果代码（默认设..."
4,AT+CEREG,获取EPS网络注册状态,执行;查询,AT+CEREG,AT+CEREG,,查询EPS网络注册状态。\n命令格式,,1,5,"[{""name"": ""<n>"", ""desc"": ""0：禁止网络注册主动提供结果代码（默认设..."


In [3]:

# Step 2 — CSV → YAML（保留 valmap + meta）
import yaml, json

YAML_OUT = os.path.join(CSV_DIR, "at_all_commands.yaml")

def csv_to_yaml(csv_path, yaml_path):
    df = pd.read_csv(csv_path, dtype=str).fillna("")
    objs = []
    for _, r in df.iterrows():
        params = json.loads(r["参数JSON"]) if r["参数JSON"] else []
        objs.append({
            "command": r["命令"],
            "title": r["命令标题"],
            "type": [t.strip() for t in r["命令类型"].split(";") if t.strip()],
            "formats": [f.strip() for f in r["命令格式"].split("|") if f.strip()] or [r["命令格式"]],
            "parameters": params,
            "examples": [],
            "description": r.get("功能描述",""),
            "notes": r.get("备注",""),
            "meta": {"order": int(r.get("顺序","0") or 0), "tables": int(r.get("表数量","0") or 0)}
        })
    objs.sort(key=lambda x: x["meta"]["order"])
    with open(yaml_path, "w", encoding="utf-8") as f:
        yaml.safe_dump({"commands": objs}, f, allow_unicode=True, sort_keys=False)
    print(f"✅ 已生成 YAML → {yaml_path}")

csv_to_yaml(CSV_OUT, YAML_OUT)


✅ 已生成 YAML → data\at_all_commands.yaml


In [4]:

# Step 3 — YAML → RST（valmap 渲染 + 分组索引）
from jinja2 import Template
import yaml, os, re
from collections import defaultdict

RST_DIR = os.path.join("data","rst_output")
os.makedirs(RST_DIR, exist_ok=True)

PAGE_TMPL = Template('''
{{ cmd.command }}
{{ '=' * cmd.command|length }}

**Title**: {{ cmd.title }}
**Types**: {{ cmd.type|join(', ') }}

Formats::
{%- for f in cmd.formats %}
   {{ f }}
{%- endfor %}

Parameters
----------
.. list-table::
   :header-rows: 1
   :widths: 18 34 48

   * - Name
     - Description
     - Values
{%- for p in cmd.parameters %}
   * - {{ p.name }}
     - {{ p.desc or '—' }}
     - {%- if p.valmap %}
       .. list-table::
          :header-rows: 1
          :widths: 20 40

          * - Key
            - Value
{%- for k, v in p.valmap.items() %}
          * - {{ k }}
            - {{ v }}
{%- endfor %}
       {%- else %} N/A {%- endif %}
{%- endfor %}

**Description**: {{ cmd.description or '' }}
''')

def group_key(cmd_str):
    m = re.match(r'^AT\+([A-Z]+)', cmd_str.upper())
    if not m: return "AT-OTHER"
    token = m.group(1)
    return f"AT-{token[:2]}" if len(token) >= 2 else "AT-OTHER"

def yaml_to_rst(yaml_path, rst_dir):
    with open(yaml_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    cmds = data.get("commands", [])

    groups = defaultdict(list)
    for cmd in cmds:
        rst_text = PAGE_TMPL.render(cmd=cmd)
        fname = f"{cmd['command']}.rst"
        with open(os.path.join(rst_dir, fname), "w", encoding="utf-8") as fo:
            fo.write(rst_text)
        groups[group_key(cmd["command"])].append(cmd["command"])

    index_lines = ["AT Manual", "=========", "", ".. toctree::", "   :maxdepth: 1", ""]
    for g in sorted(groups.keys()):
        grp_name = f"index_{g}.rst"
        index_lines.append(f"   {grp_name[:-4]}")
        glines = [g, "=" * len(g), "", ".. toctree::", "   :maxdepth: 1", ""]
        for c in groups[g]:
            glines.append(f"   {c}")
        with open(os.path.join(rst_dir, grp_name), "w", encoding="utf-8") as fo:
            fo.write("\n".join(glines))

    with open(os.path.join(rst_dir, "index.rst"), "w", encoding="utf-8") as fo:
        fo.write("\n".join(index_lines))

    print(f"✅ RST 已生成到 {rst_dir}（含分组索引）")

yaml_to_rst(YAML_OUT, RST_DIR)


✅ RST 已生成到 data\rst_output（含分组索引）


In [None]:
# 清理旧 Sphinx项目  执行这个就不执行下面的step4了
import shutil, os

if os.path.exists("docs"):
    shutil.rmtree("docs")
    print("✅ 已清理旧 docs/ 目录")

!sphinx-quickstart docs --sep --project "AT Command Manual" --author "Doc Team" --release "1.0" -q

# 确保主题被追加
with open("docs/source/conf.py", "a", encoding="utf-8") as f:
    f.write('\nhtml_theme = "sphinx_rtd_theme"\n')

print("✅ 已重新生成 conf.py")


✅ 已清理旧 docs/ 目录

[01mFinished: An initial directory structure has been created.[39;49;00m

You should now populate your master file c:\Users\txiab\Documents\Git-folder\Building-docs\docs-as-code-learning\pipeline-1008\docs\source\index.rst and create other documentation
source files. Use the Makefile to build the docs, like so:
   make builder
where "builder" is one of the supported builders, e.g. html, latex or linkcheck.

✅ 已重新生成 conf.py


In [None]:
# 编译构建HTML文档
import shutil, os
shutil.copytree("data/rst_output", "docs/source", dirs_exist_ok=True)
!sphinx-build -b html docs/source docs/build/html


[01mRunning Sphinx v8.2.3[39;49;00m
[01mloading translations [en]... [39;49;00mdone
[01mmaking output directory... [39;49;00mdone
[01mbuilding [mo]: [39;49;00mtargets for 0 po files that are out of date
[01mwriting output... [39;49;00m
[01mbuilding [html]: [39;49;00mtargets for 64 source files that are out of date
[01mupdating environment: [39;49;00m[new config] 64 added, 0 changed, 0 removed
[2K[01mreading sources... [39;49;00m[  2%] [35mAT+CCID[39;49;00m
[2K[01mreading sources... [39;49;00m[  3%] [35mAT+CCLK[39;49;00m
[2K[01mreading sources... [39;49;00m[  5%] [35mAT+CEREG[39;49;00m
[2K[01mreading sources... [39;49;00m[  6%] [35mAT+CESQ[39;49;00m
[2K[01mreading sources... [39;49;00m[  8%] [35mAT+CFUN[39;49;00m
[2K[01mreading sources... [39;49;00m[  9%] [35mAT+CGATT[39;49;00m
[2K[01mreading sources... [39;49;00m[ 11%] [35mAT+CGDCONT[39;49;00m
[2K[01mreading sources... [39;49;00m[ 12%] [35mAT+CGMM[39;49;00m
[2K[01mreading sources.

[31mC:\Users\txiab\Documents\Git-folder\Building-docs\docs-as-code-learning\pipeline-1008\docs\source\AT+CCID.rst:13: ERROR: Error parsing content block for the "list-table" directive: uniform two-level bullet list expected, but row 2 does not contain the same number of items as row 1 (2 vs 3).

.. list-table::
   :header-rows: 1
   :widths: 18 34 48

   * - Name
     - Description
     - Values
   * - <ICCID>
     - Integrate circuit card identity 集成电路卡识别码，即所插入卡的识别码 [docutils][39;49;00m
[31mC:\Users\txiab\Documents\Git-folder\Building-docs\docs-as-code-learning\pipeline-1008\docs\source\AT+CCLK.rst:13: ERROR: Error parsing content block for the "list-table" directive: uniform two-level bullet list expected, but row 3 does not contain the same number of items as row 1 (2 vs 3).

.. list-table::
   :header-rows: 1
   :widths: 18 34 48

   * - Name
     - Description
     - Values
   * - <time>
     - 字符串，格式为 “yy/MM/dd,hh:mm:ss[TZ]”，指示年、月、日、小时、分钟、秒
     -
       .. list-table::
      

In [17]:
import os

for root, dirs, files in os.walk("docs"):
    level = root.replace("docs", "").count(os.sep)
    indent = " " * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = " " * 2 * (level + 1)
    for f in files:
        print(f"{subindent}{f}")


docs/
  make.bat
  Makefile
  build/
    html/
  source/
    AT+CCID.rst
    AT+CCLK.rst
    AT+CEREG.rst
    AT+CESQ.rst
    AT+CFUN.rst
    AT+CGATT.rst
    AT+CGDCONT.rst
    AT+CGMM.rst
    AT+CGSN.rst
    AT+CIMI.rst
    AT+CLCK.rst
    AT+CMGD.rst
    AT+CMGF.rst
    AT+CMGL.rst
    AT+CMGR.rst
    AT+CMGS.rst
    AT+CMGW.rst
    AT+CMSS.rst
    AT+CMUX.rst
    AT+CNMI.rst
    AT+COPS.rst
    AT+CPIN.rst
    AT+CPMS.rst
    AT+CPWD.rst
    AT+CREG.rst
    AT+CSCA.rst
    AT+CSCS.rst
    AT+CSDH.rst
    AT+CSMP.rst
    AT+CSMS.rst
    AT+CSQ.rst
    AT+GMM.rst
    AT+GMR.rst
    AT+GSN.rst
    AT+IPR.rst
    AT+NSTGETRSSI.rst
    AT+NWDNS.rst
    AT+NWENPWRSAVE.rst
    AT+NWPWROFF.rst
    AT+NWRFTEST.rst
    AT+XGAUTH.rst
    ATD.rst
    ATE1.rst
    ATI.rst
    conf.py
    index.rst
    index_AT-CC.rst
    index_AT-CE.rst
    index_AT-CF.rst
    index_AT-CG.rst
    index_AT-CI.rst
    index_AT-CL.rst
    index_AT-CM.rst
    index_AT-CN.rst
    index_AT-CO.rst
    index_AT-CP.rst


In [7]:

# 🟢 一键执行：run_all()
def run_all():
    global df_csv
    df_csv = extract_word_to_csv(IN_WORD, CSV_OUT)
    csv_to_yaml(CSV_OUT, YAML_OUT)
    yaml_to_rst(YAML_OUT, RST_DIR)
    import shutil, os
    if not os.path.exists('docs'):
        get_ipython().run_cell_magic('bash', '', 'sphinx-quickstart docs --sep --project "AT Command Manual" --author "Doc Team" --release "1.0" -q')
    with open('docs/source/conf.py','a',encoding='utf-8') as f:
        f.write('\nhtml_theme = "sphinx_rtd_theme"\n')
    shutil.copytree('data/rst_output', 'docs/source', dirs_exist_ok=True)
    get_ipython().run_cell_magic('bash', '', 'make -C docs html')
    print('\\n✅ 完成：docs/build/html/index.html')
    print('📝 解析日志查看：parse_log.txt')

print("准备就绪。逐步运行 Step 0~5，或直接 run_all()。")


准备就绪。逐步运行 Step 0~5，或直接 run_all()。
