
# 📘 Word → CSV → YAML → RST → Sphinx 全自动流水线 v5
**主要改进：**
- 支持 Word 嵌套表格解析，生成 `valmap`
- CSV / YAML 保留完整参数映射结构
- RST 渲染使用 `p.valmap` 模板
- 兼容多行、冒号、空格等复杂取值表达


In [None]:

# Step 0 — Install dependencies (only needed on the first run).
# NOTE: the leading "!" is IPython/Jupyter shell syntax; this line is not valid in a plain .py script.
!pip install -q python-docx pandas pyyaml jinja2 sphinx sphinx_rtd_theme lxml


In [None]:

# Step 1 — Word → CSV（解析嵌套表格 → valmap）
import os, re, json
import pandas as pd
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from lxml import etree

# Step 1 paths: the Word document to read and the CSV file it is converted into.
IN_WORD = "at-parameter-demo.docx"
CSV_DIR = "data"
# Create the output directory before anything tries to write into it.
os.makedirs(CSV_DIR, exist_ok=True)
CSV_OUT = os.path.join(CSV_DIR, "extracted_commands.csv")

# A paragraph consisting only of "参数", "参数说明" or "参数表", optionally followed
# by an ASCII or full-width colon.
PARAM_HEADING_PAT = re.compile(r'^\s*参数(说明|表)?\s*[:：]?\s*$', re.IGNORECASE)
# A line of the form "AT<cmd>: <title>"; group 1 is the command, group 2 the title.
CMD_LINE_PAT = re.compile(r'^\s*(AT\S*?)\s*[:：]\s*(.*)$')


def is_param_heading(text):
    """Return True when *text* is a parameter-section heading; None is treated as empty."""
    return PARAM_HEADING_PAT.match(text or "") is not None


def is_cmd_heading(text):
    """Return True when *text* matches the ``AT...: title`` command-line pattern."""
    return CMD_LINE_PAT.match(text or "") is not None

def iter_ordered_blocks(doc):
    """Yield the body's top-level paragraphs and tables in document order.

    Args:
        doc: A python-docx ``Document``.

    Yields:
        ``('p', text)`` for every paragraph child of the body, where ``text``
        is the stripped concatenation of its ``w:t`` nodes, and
        ``('tbl', table)`` for every table child, where ``table`` is the
        corresponding entry of ``doc.tables``.
    """
    # Read the table list once instead of once per table element; python-docx
    # is expected to rebuild this list on every property access (confirm
    # against the installed version).
    tables = doc.tables
    tbl_idx = 0
    for child in doc._element.body.iterchildren():
        if isinstance(child, CT_P):
            text = ''.join(t.text for t in child.xpath('.//w:t') if t.text).strip()
            yield ('p', text)
        elif isinstance(child, CT_Tbl):
            # The n-th table element in the body is paired with the n-th
            # entry of doc.tables.
            yield ('tbl', tables[tbl_idx])
            tbl_idx += 1

def cell_plain_text(cell):
    """Return the cell's non-blank paragraph texts, each stripped, joined by newlines."""
    lines = []
    for para in cell.paragraphs:
        raw = para.text
        if not raw:
            continue
        cleaned = raw.strip()
        if cleaned:
            lines.append(cleaned)
    return '\n'.join(lines).strip()

def find_nested_tbls_in_cell(cell):
    """Re-parse the cell's XML and return ``(tables, ns)``.

    ``tables`` is the list of every ``w:tbl`` element found anywhere below the
    cell, and ``ns`` is the namespace map needed to query those elements.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    cell_root = etree.fromstring(cell._tc.xml.encode('utf-8'))
    nested = cell_root.findall('.//w:tbl', ns)
    return nested, ns

def tbl_rows_as_text(tbl, ns):
    """Return the table as a list of rows, each a list of stripped cell strings.

    A cell's string is the concatenation of all ``w:t`` text nodes below it.
    Rows and cells are located with descendant searches (``.//``).
    """
    def _cell_text(tc):
        pieces = [node.text for node in tc.iterfind('.//w:t', ns) if node.text]
        return ''.join(pieces).strip()

    return [
        [_cell_text(tc) for tc in tr.findall('.//w:tc', ns)]
        for tr in tbl.findall('.//w:tr', ns)
    ]

def looks_like_header(row):
    """Return True when the first three cells of *row* contain a header keyword.

    Matching is case-insensitive so that English headers such as
    "Value | Meaning" or "NAME" are recognised as well as the Chinese ones.

    Args:
        row: List of cell strings; only ``row[:3]`` is inspected.
    """
    keywords = ('参数', '名称', 'name', '描述', '说明', '取值', '值', 'value', 'meaning')
    hdr = ' '.join(row[:3]).casefold()
    return any(k in hdr for k in keywords)

def nested_table_to_valmap(rows):
    """Build a ``{key: value}`` mapping from the text rows of a nested table.

    The first row is dropped when it looks like a header. For every remaining
    row the first cell is the key (rows with a blank key are ignored) and the
    non-blank remaining cells are joined with ``' | '`` to form the value.
    """
    if not rows:
        return {}
    data_rows = rows[1:] if looks_like_header(rows[0]) else rows
    valmap = {}
    for row in data_rows:
        if not row:
            continue
        key = (row[0] or '').strip()
        if not key:
            continue
        extras = [cell for cell in row[1:] if cell and cell.strip()]
        valmap[key] = ' | '.join(extras)
    return valmap

def cell_valmap_from_nested_table(cell):
    """Merge the value maps of every table nested in *cell* into one dict.

    Tables are processed in the order they are found; a key that appears in a
    later table overwrites the same key from an earlier one. Returns an empty
    dict when the cell holds no nested table.
    """
    nested_tables, ns = find_nested_tbls_in_cell(cell)
    merged = {}
    for nested in nested_tables:
        text_rows = tbl_rows_as_text(nested, ns)
        merged.update(nested_table_to_valmap(text_rows))
    return merged

def parse_enum_map_fuzzy(text):
    """Best-effort parse of a free-text enumeration such as ``"0: off, 1: on"``.

    The text is split into segments on commas, semicolons (ASCII or
    full-width) and newlines. Each segment is then split into key and value:

    * on the first ASCII or full-width colon, if the segment contains one;
    * otherwise on an arrow (``->``, ``→``, ``=>``), a dash (``-``, ``—``) or
      whitespace following the leading run of non-space characters.

    Segments that fit neither form, or whose key is empty, are skipped.

    Args:
        text: Raw cell text; ``None`` or ``""`` yields an empty dict.

    Returns:
        dict mapping each stripped key to its stripped value. Later duplicates
        overwrite earlier ones.
    """
    if not text:
        return {}
    mapping = {}
    for seg in re.split(r'[，,;；\n]+', text.strip()):
        seg = seg.strip()
        if not seg:
            continue
        if ':' in seg or '：' in seg:
            # maxsplit is passed by keyword: the positional form is deprecated.
            key, value = re.split(r'[:：]', seg, maxsplit=1)
        else:
            # The former second fallback pattern (ASCII key + whitespace) was
            # unreachable: everything it accepts is already matched here via
            # the "\s" separator alternative.
            found = re.match(r'^(\S+)\s*(?:->|→|=>|-|—|\s)\s*(.+)$', seg)
            if not found:
                continue
            key, value = found.groups()
        key, value = key.strip(), value.strip()
        if key:
            mapping[key] = value
    return mapping

# Column order of the CSV written by extract_word_to_csv.
_CSV_COLUMNS = ['命令', '命令标题', '命令类型', '命令格式', '示例命令', '示例响应',
                '功能描述', '备注', '参数JSON']


def _params_from_table(table):
    """Turn one parameter table into a list of ``{'name', 'desc', 'valmap'}`` dicts."""
    params = []
    for row in table.rows:
        cols = row.cells
        # Ignore rows in which every cell is blank.
        if not any(c.text.strip() for c in cols):
            continue
        name = cell_plain_text(cols[0]) if len(cols) > 0 else ''
        desc = cell_plain_text(cols[1]) if len(cols) > 1 else ''
        # Ignore the table's own header row (e.g. "参数 | 描述").
        if name in ('参数', '参数名', 'Name') and any(x in desc for x in ['描述', '说明', 'Description']):
            continue
        valmap = {}
        if len(cols) > 2:
            # Prefer a nested table in the third column, else parse its text.
            valmap = (cell_valmap_from_nested_table(cols[2])
                      or parse_enum_map_fuzzy(cell_plain_text(cols[2])))
        if not valmap and len(cols) > 1:
            # Fall back to the description column (nested table, then text).
            valmap = cell_valmap_from_nested_table(cols[1]) or parse_enum_map_fuzzy(desc)
        params.append({'name': name, 'desc': desc, 'valmap': valmap})
    return params


def extract_word_to_csv(docx_path, csv_out):
    """Extract AT commands and their parameter tables from a Word file into a CSV.

    The document body is scanned in order. A paragraph matching
    ``CMD_LINE_PAT`` starts a command; everything up to the next command line
    belongs to it. Inside that span, each parameter heading paragraph is
    followed by zero or more directly adjacent tables whose rows become the
    command's parameters. Commands for which no parameter was found are not
    written.

    Args:
        docx_path: Path of the ``.docx`` file to read.
        csv_out: Path of the CSV file to write (UTF-8 with BOM).

    Returns:
        The ``pandas.DataFrame`` that was written, one row per command.

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
    """
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"未找到 Word 文件: {docx_path}")
    doc = Document(docx_path)
    seq = list(iter_ordered_blocks(doc))
    results = []
    i = 0
    while i < len(seq):
        typ, obj = seq[i]
        m = CMD_LINE_PAT.match(obj) if typ == 'p' else None
        if not m:
            i += 1
            continue
        cmd, title = m.group(1), (m.group(2) or '').strip()
        params_all = []
        j = i + 1
        while j < len(seq):
            t2, o2 = seq[j]
            if t2 == 'p' and is_cmd_heading(o2):
                break  # the next command starts here
            if t2 == 'p' and is_param_heading(o2):
                # Consume every table that directly follows the heading.
                k = j + 1
                while k < len(seq) and seq[k][0] == 'tbl':
                    params_all.extend(_params_from_table(seq[k][1]))
                    k += 1
                j = k
                continue
            j += 1
        if params_all:
            results.append({'命令': cmd, '命令标题': title, '命令类型': '执行;查询', '命令格式': cmd,
                            '示例命令': cmd, '示例响应': '', '功能描述': title, '备注': '',
                            '参数JSON': json.dumps(params_all, ensure_ascii=False)})
        # Resume the outer scan at the next command line (or the end).
        i = j
    # Explicit columns keep the header row even when nothing was extracted, so
    # the CSV stays readable by the CSV -> YAML step.
    df = pd.DataFrame(results, columns=_CSV_COLUMNS)
    df.to_csv(csv_out, index=False, encoding='utf-8-sig')
    print(f'✅ 提取 {len(df)} 条命令 → {csv_out}')
    return df

# Run Step 1 on the configured document. The bare trailing expression is what
# a notebook displays as the cell output (first rows of the result).
df_csv = extract_word_to_csv(IN_WORD, CSV_OUT)
df_csv.head()


In [None]:

# Step 2 — CSV → YAML（保留 valmap）
import yaml, json
# YAML file produced by Step 2; it lives in the same directory as the CSV.
YAML_OUT = os.path.join(CSV_DIR, "all_commands.yaml")

def csv_to_yaml(csv_path, yaml_path):
    """Convert the extracted-commands CSV into a YAML file.

    Each CSV row becomes one entry under the top-level ``commands`` key with
    the fields ``command``, ``title``, ``type`` (split on ``;``), ``formats``
    (split on ``|``), ``parameters`` (decoded from the JSON column),
    ``examples`` (command/response pairs, both split on ``|``),
    ``description`` and ``notes``.

    Args:
        csv_path: CSV file to read.
        yaml_path: YAML file to write (UTF-8).
    """
    from itertools import zip_longest

    # keep_default_na=False: otherwise literal cell texts such as "NA", "N/A"
    # or "null" are parsed as missing values and then blanked by fillna.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False).fillna("")
    cmds = []
    for _, r in df.iterrows():
        params = json.loads(r['参数JSON']) if r['参数JSON'] else []
        example_cmds = (r['示例命令'] or '').split('|')
        example_resps = (r['示例响应'] or '').split('|')
        # zip_longest keeps example commands that have no matching response
        # (plain zip would silently drop them); missing sides become ''.
        examples = [
            {'cmd': c.strip(), 'resp': e.strip()}
            for c, e in zip_longest(example_cmds, example_resps, fillvalue='')
            if c.strip() or e.strip()
        ]
        cmds.append({
            'command': r['命令'], 'title': r['命令标题'],
            'type': [t.strip() for t in r['命令类型'].split(';') if t.strip()],
            # Fall back to the raw cell when splitting leaves nothing.
            'formats': [f.strip() for f in r['命令格式'].split('|') if f.strip()] or [r['命令格式']],
            'parameters': params,
            'examples': examples,
            'description': r.get('功能描述', ''), 'notes': r.get('备注', ''),
        })
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.safe_dump({'commands': cmds}, f, allow_unicode=True, sort_keys=False)
    print(f'✅ 已生成 YAML → {yaml_path}')

# Run Step 2: read the CSV written by Step 1 and write the YAML file.
csv_to_yaml(CSV_OUT, YAML_OUT)


In [None]:

# Step 3 — YAML → RST（使用 p.valmap 模板）
from jinja2 import Template
import yaml, os

# Output directory for the generated .rst pages; created if it does not exist yet.
RST_DIR = os.path.join("data","rst_output")
os.makedirs(RST_DIR, exist_ok=True)

# Jinja2 template for one command page (reStructuredText).
# - The two blank lines around the "Formats::" loop are literal template text
#   (no "{%-" trimming there): RST needs a blank line between "::" and the
#   indented literal block, and one after the block.
# - Title and Types are separate paragraphs so they do not run together.
# - name/desc go through indent(7) so that multi-line cell text stays aligned
#   with the list-table item instead of falling back to column 0.
TEMPLATE_STR = '''
{{ cmd.command }}
{{ '=' * cmd.command|length }}

**Title**: {{ cmd.title }}

**Types**: {{ cmd.type|join(', ') }}

Formats::

{% for f in cmd.formats %}   {{ f }}
{% endfor %}
Parameters
----------
.. list-table::
   :header-rows: 1
   :widths: 18 34 48

   * - Name
     - Description
     - Values
{%- for p in cmd.parameters %}
   * - {{ p.name|string|indent(7) }}
     - {{ (p.desc or '—')|string|indent(7) }}
     - {%- if p.valmap %}
       .. list-table::
          :header-rows: 1
          :widths: 20 40

          * - Key
            - Value
{%- for k,v in p.valmap.items() %}
          * - {{ k }}
            - {{ v }}
{%- endfor %}
       {%- else %} N/A {%- endif %}
{%- endfor %}
'''

# Compile the template once here; yaml_to_rst renders it once per command.
RST_TMPL = Template(TEMPLATE_STR)

def yaml_to_rst(yaml_path, rst_dir):
    """Render one ``.rst`` page per command plus an ``index.rst`` toctree.

    Args:
        yaml_path: YAML file with a top-level ``commands`` list.
        rst_dir: Existing directory that receives ``<command>.rst`` for each
            entry and ``index.rst``.

    An empty YAML file or a missing/null ``commands`` key is treated as
    "no commands": only the index (with an empty toctree) is written.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}
    # A bare "commands:" key also loads as None, hence "or []".
    cmds = data.get('commands') or []
    for cmd in cmds:
        page = RST_TMPL.render(cmd=cmd)
        page_path = os.path.join(rst_dir, f"{cmd['command']}.rst")
        with open(page_path, 'w', encoding='utf-8') as fo:
            fo.write(page)
    index_lines = ['AT Manual', '=========', '', '.. toctree::', '   :maxdepth: 1', '']
    # Toctree entries are the page names without the .rst suffix.
    index_lines.extend(f"   {cmd['command']}" for cmd in cmds)
    with open(os.path.join(rst_dir, 'index.rst'), 'w', encoding='utf-8') as fo:
        fo.write('\n'.join(index_lines))
    print(f'✅ RST 已生成到 {rst_dir}')

# Run Step 3: render the YAML written by Step 2 into .rst files under RST_DIR.
yaml_to_rst(YAML_OUT, RST_DIR)
