# 📘 general csv
适配复杂 AT 命令手册（嵌套表、合并单元格、多参数表）。输入 Word 文档（由 `IN_WORD` 配置，默认 `SimpleAT.docx`），输出 HTML（可选 PDF）。


## Step 0 — 安装依赖（首次运行需要）

In [9]:
# Install the third-party packages listed below in quiet mode (-q).
# The leading "!" is IPython shell-escape syntax, so this line is only valid inside a notebook/IPython session.
!pip install -q python-docx pandas pyyaml jinja2 sphinx sphinx_rtd_theme lxml
print("✅ 依赖安装完成")

✅ 依赖安装完成


## Step 0.5 — 配置与通用工具（路径、日志、目录查看）

In [10]:
import os, re, json, traceback, datetime, subprocess, sys, shutil
import pandas as pd
from lxml import etree
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl

# Word document that the parsing step reads.
IN_WORD = "SimpleAT.docx"

# Root folder for generated artefacts and the outputs that live under it.
DATA_DIR = "data"
CSV_OUT = os.path.join(DATA_DIR, "demoAT.csv")
YAML_OUT = os.path.join(DATA_DIR, "at_all_commands.yaml")
RST_DIR = os.path.join(DATA_DIR, "rst_output")

# Documentation folder name and the file the parse log is written to.
DOCS_DIR = "docs"
LOG_PATH = "parse_log.txt"

# Create the output folders up front; folders that already exist are left alone.
for _out_dir in (DATA_DIR, RST_DIR):
    os.makedirs(_out_dir, exist_ok=True)

def log(msg: str):
    """Append one timestamped line to the file named by ``LOG_PATH``.

    Args:
        msg: Text to record. It is written after a ``[<ISO timestamp>]``
            prefix (second precision) and followed by a newline.
    """
    stamp = datetime.datetime.now().isoformat(timespec='seconds')
    entry = f"[{stamp}] {msg}\n"
    with open(LOG_PATH, "a", encoding="utf-8") as log_file:
        log_file.write(entry)

# Start each run with an empty log file. Opening in "w" mode truncates it;
# the context manager closes the handle immediately instead of leaving an
# unclosed file object behind.
with open(LOG_PATH, "w", encoding="utf-8"):
    pass

def print_tree(root="docs"):
    """Print an indented listing of the directory tree under ``root``.

    Every directory is printed as ``name/`` with its files one level deeper,
    using two spaces of indentation per nesting level. Directory and file
    names are emitted in sorted order so the output is stable between runs.

    Args:
        root: Directory to list. A trailing path separator is tolerated.

    Returns:
        None. When ``root`` does not exist, a "(不存在)" notice is printed
        instead of a listing.
    """
    if not os.path.exists(root):
        print(f"(不存在) {root}")
        return
    # Normalise first: with a trailing separator os.path.basename() would
    # return "" for the top directory and the depth would be off by one.
    top = os.path.normpath(root)
    for dirpath, dirnames, filenames in os.walk(top):
        # Sorting in place also fixes the order in which os.walk descends.
        dirnames.sort()
        # Depth is derived from the path relative to the top directory;
        # str.replace() would strip every occurrence of the root text.
        rel = os.path.relpath(dirpath, top)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = "  " * level
        print(f"{indent}{os.path.basename(dirpath)}/")
        subindent = "  " * (level + 1)
        for f in sorted(filenames):
            print(f"{subindent}{f}")
# Confirmation message shown once the configuration cell has finished running.
print("✅ 配置就绪；可用 print_tree('docs') 查看 Sphinx 目录结构")

✅ 配置就绪；可用 print_tree('docs') 查看 Sphinx 目录结构


## Step 1 — Word → CSV（复杂解析：嵌套表、多表合并、valmap）
- 识别命令标题；合并后续说明段；
- 「参数」标题之后紧邻的连续参数表自动合并为同一命令的参数列表；
- 优先解析嵌套表为 `valmap`，否则回退文本解析；
- 日志写入 `parse_log.txt`。

In [None]:
# Command heading line: optional leading whitespace, then a token that starts
# with "AT" followed by one or more "+", "-" or word characters, optionally
# extended by a "?"-led suffix (group 1); an optional ASCII or full-width
# colon; and the rest of the line as the title (group 2). Case-insensitive.
# NOTE(review): because "\w" is permissive and matching ignores case, a
# paragraph that merely begins with "at" plus another word character
# (e.g. "Attention ...") also matches — confirm the source documents cannot
# contain such lines.
# NOTE(review): "=" is not part of the leading token and the "?" suffix is
# lazy, so text such as "=<n>" appears to land in group 2 — confirm.
CMD_LINE_PAT = re.compile(r'^\s*(AT[\+\w\-]+(?:\?[=\w<>,\s\-\+\.\:]*?)?)\s*(?::|：)?\s*(.*)$', re.I)
# Parameter heading line: only "参数", optionally followed by "说明", "表" or
# "信息", plus an optional ASCII or full-width colon and surrounding whitespace.
PARAM_HEADING_PAT = re.compile(r'^\s*参数(说明|表|信息)?\s*[:：]?\s*$', re.I)

def is_cmd_heading(text: str) -> bool:
    """Return True when ``text`` matches ``CMD_LINE_PAT``; falsy input is treated as ""."""
    candidate = text if text else ""
    return CMD_LINE_PAT.match(candidate) is not None
def is_param_heading(text: str) -> bool:
    """Return True when ``text`` matches ``PARAM_HEADING_PAT``; falsy input is treated as ""."""
    candidate = text if text else ""
    return PARAM_HEADING_PAT.match(candidate) is not None

def iter_ordered_blocks(doc):
    """Yield the body's direct-child paragraphs and tables in document order.

    Args:
        doc: Document object exposing ``_element.body`` and ``tables``
            (the file passes a python-docx ``Document``).

    Yields:
        ``("p", text)`` for each paragraph element, where ``text`` is the
        stripped concatenation of its non-empty ``w:t`` nodes, or
        ``("tbl", table)`` for each table element, where ``table`` is the
        entry of ``doc.tables`` at the same ordinal position. Body children
        of any other type are skipped.
    """
    body = doc._element.body
    # Look the table list up once: python-docx builds a fresh list on every
    # access to ``doc.tables``, so indexing it per table is quadratic.
    tables = doc.tables
    tbl_idx = 0
    for child in body.iterchildren():
        if isinstance(child, CT_P):
            text = "".join(t.text for t in child.xpath('.//w:t') if t.text).strip()
            yield ("p", text)
        elif isinstance(child, CT_Tbl):
            # Tables are paired by position: the n-th table element of the
            # body is assumed to correspond to the n-th entry of doc.tables.
            table_obj = tables[tbl_idx]
            tbl_idx += 1
            yield ("tbl", table_obj)

def cell_plain_text(cell):
    """Return the cell's non-blank paragraph texts, each stripped, joined by newlines.

    Args:
        cell: Object exposing ``paragraphs``, each with a ``text`` attribute.

    Returns:
        The joined text, or "" when no paragraph has visible text.
    """
    lines = []
    for para in cell.paragraphs:
        stripped = (para.text or "").strip()
        if stripped:
            lines.append(stripped)
    return "\n".join(lines).strip()

def find_nested_tbls_in_cell(cell):
    """Find every ``w:tbl`` element nested anywhere inside a table cell.

    The cell's underlying XML element is searched directly, which avoids
    serialising the cell to a string and parsing it again for every cell.

    Args:
        cell: Table cell whose ``_tc`` attribute is the cell's XML element.

    Returns:
        A ``(tables, ns)`` tuple: the list of matching ``w:tbl`` elements in
        document order (all nesting depths), and the namespace map that
        callers must pass to further ``w:``-prefixed searches.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    return cell._tc.findall(".//w:tbl", ns), ns

def tbl_rows_as_text(tbl, ns):
    """Convert a ``w:tbl`` element into a list of rows of cell strings.

    Rows, cells and text nodes are all located with descendant searches, so
    rows and cells of any table nested deeper inside ``tbl`` are included.

    Args:
        tbl: Table XML element supporting ``findall``.
        ns: Namespace map that resolves the ``w`` prefix.

    Returns:
        One list per ``w:tr``; each entry is the stripped concatenation of
        the non-empty ``w:t`` texts found inside one ``w:tc``.
    """
    table_rows = []
    for row_el in tbl.findall(".//w:tr", ns):
        cell_texts = []
        for cell_el in row_el.findall(".//w:tc", ns):
            fragments = [node.text for node in cell_el.iterfind(".//w:t", ns) if node.text]
            cell_texts.append("".join(fragments).strip())
        table_rows.append(cell_texts)
    return table_rows

# Keywords that mark a header row; stored lower-case because the row text is
# case-folded before comparison.
_HEADER_KEYWORDS = (
    "参数", "名称", "name", "描述", "说明", "含义",
    "取值", "值", "value", "meaning", "备注", "范围",
)


def looks_like_header(row):
    """Report whether a table row looks like a header rather than data.

    Only the first three cells are inspected. Matching ignores case so that
    headers such as "Value", "MEANING" or "name" are recognised as well as
    the exact spellings in the keyword list.

    Args:
        row: List of cell strings.

    Returns:
        True when any header keyword occurs in the first three cells.
    """
    hdr = " ".join(row[:3]).casefold()
    return any(k in hdr for k in _HEADER_KEYWORDS)

def nested_table_to_valmap(rows):
    """Build a ``{first cell: remaining cells}`` mapping from table rows.

    The first row is dropped when ``looks_like_header`` flags it. For every
    other non-empty row the stripped first cell is the key and the remaining
    non-blank cells, joined with " | ", are the value. Rows with an empty key
    are ignored; a repeated key keeps the value of its last row.

    Args:
        rows: List of rows, each a list of cell strings.

    Returns:
        The mapping, or an empty dict when ``rows`` is empty.
    """
    if not rows:
        return {}
    data_rows = rows[1:] if looks_like_header(rows[0]) else rows
    mapping = {}
    for cells in data_rows:
        if not cells:
            continue
        key = (cells[0] or "").strip()
        if not key:
            continue
        extras = [c for c in cells[1:] if c and c.strip()]
        mapping[key] = " | ".join(extras)
    return mapping

def cell_valmap_from_nested_table(cell):
    """Merge the value maps of every table nested inside ``cell``.

    Args:
        cell: Table cell passed on to ``find_nested_tbls_in_cell``.

    Returns:
        A dict combining ``nested_table_to_valmap`` results for each nested
        table in order (later tables overwrite duplicate keys); empty when
        the cell holds no nested table.
    """
    nested_tables, ns = find_nested_tbls_in_cell(cell)
    merged = {}
    for tbl in nested_tables:
        merged.update(nested_table_to_valmap(tbl_rows_as_text(tbl, ns)))
    return merged

# Segment separators: ASCII/full-width comma and semicolon, or a newline.
_ENUM_SEGMENT_SPLIT = re.compile(r"[,\uFF0C;\uFF1B\n]+")
# Key/value separator when a segment contains an ASCII or full-width colon.
_ENUM_COLON_SPLIT = re.compile(r"[:：]")
# Fallback "key <sep> value" form; the separator is an arrow, a dash or
# plain whitespace.
_ENUM_PAIR = re.compile(r"^(\S+)\s*(?:->|→|=>|-|—|\s)\s*(.+)$")


def parse_enum_map_fuzzy(text):
    """Best-effort extraction of ``key -> meaning`` pairs from free text.

    The text is cut into segments at commas, semicolons (ASCII or
    full-width) and newlines. A segment containing a colon is split at its
    first colon; otherwise it must look like ``key<sep>value`` where the
    separator is ``->``, ``→``, ``=>``, ``-``, ``—`` or whitespace.
    Segments that fit neither form, or whose key is empty, are skipped.

    Args:
        text: Cell text to analyse; may be None or empty.

    Returns:
        Dict of stripped keys to stripped values (empty if nothing matched).
        A repeated key keeps the value of its last occurrence.
    """
    if not text:
        return {}
    mapping = {}
    for segment in _ENUM_SEGMENT_SPLIT.split(text.strip()):
        segment = segment.strip()
        if not segment:
            continue
        if ":" in segment or "：" in segment:
            # maxsplit is passed by keyword: the positional form is
            # deprecated since Python 3.13.
            key, value = _ENUM_COLON_SPLIT.split(segment, maxsplit=1)
        else:
            # A separate "token, whitespace, rest" pattern is unnecessary:
            # _ENUM_PAIR already accepts whitespace as the separator.
            pair = _ENUM_PAIR.match(segment)
            if pair is None:
                continue
            key, value = pair.groups()
        key, value = key.strip(), value.strip()
        if key:
            mapping[key] = value
    return mapping

# Column order of the generated CSV. Passing it to the DataFrame explicitly
# means a run that finds no command still writes a header row.
_CSV_COLUMNS = (
    "章节", "命令", "命令标题", "命令类型", "命令格式", "响应",
    "示例命令", "功能描述", "备注", "表数量", "顺序", "参数JSON",
)


def _parse_param_row(cols):
    """Turn the cells of one parameter-table row into a parameter dict, or None for a header row."""
    name = cell_plain_text(cols[0]) if len(cols) > 0 else ""
    desc = cell_plain_text(cols[1]) if len(cols) > 1 else ""
    # Header rows are recognised first so no value-map parsing is spent on them.
    if name in ("参数", "参数名", "Name") and any(x in desc for x in ["描述", "说明", "Description", "Meaning"]):
        return None
    valmap = {}
    if len(cols) > 2:
        # Prefer a nested table in the third column; fall back to its text.
        valmap = cell_valmap_from_nested_table(cols[2]) or parse_enum_map_fuzzy(cell_plain_text(cols[2]))
    if not valmap and len(cols) > 1:
        # Otherwise try the description column the same way.
        valmap = cell_valmap_from_nested_table(cols[1]) or parse_enum_map_fuzzy(desc)
    return {"name": name, "desc": desc, "valmap": valmap}


def extract_word_to_csv(docx_path, csv_out):
    """Parse AT-command sections from a Word document and write them to CSV.

    The document body is scanned in order. Each paragraph matching
    ``CMD_LINE_PAT`` starts a command; the paragraphs that follow (up to the
    next command heading, parameter heading or table) become its
    description, and the tables placed directly after each parameter heading
    supply its parameters. A command is emitted only when it has a
    description or at least one parameter. Progress and per-row errors are
    written through ``log``; a summary is printed at the end.

    Args:
        docx_path: Path of the Word document to read.
        csv_out: Path of the CSV file to write (UTF-8 with BOM).

    Returns:
        The pandas DataFrame that was written, one row per emitted command.

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
    """
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"未找到 Word 文件: {docx_path}")
    log(f"Start parsing: {docx_path}")
    doc = Document(docx_path)
    seq = list(iter_ordered_blocks(doc))

    results = []
    i = 0
    cmd_order = 0
    while i < len(seq):
        typ, obj = seq[i]
        m = CMD_LINE_PAT.match(obj) if typ == "p" else None
        if not m:
            i += 1
            continue

        # The counter advances for every heading, including commands that
        # end up not being emitted.
        cmd_order += 1
        current_cmd = m.group(1).strip()
        current_title = (m.group(2) or "").strip()
        log(f"CMD[{cmd_order}] {current_cmd} — {current_title}")

        # Description: non-empty paragraphs up to the next heading or table.
        desc_lines = []
        j = i + 1
        while j < len(seq):
            t2, o2 = seq[j]
            if t2 == "p":
                if is_cmd_heading(o2) or is_param_heading(o2):
                    break
                if o2:
                    desc_lines.append(o2)
            elif t2 == "tbl":
                break
            j += 1
        merged_desc = "\n".join(desc_lines).strip()

        # Parameters: scan on to the next command heading, reading the run
        # of tables that immediately follows each parameter heading.
        # NOTE(review): a table is only read when it directly follows a
        # parameter heading or another such table; a blank paragraph in
        # between ends the run — confirm this is intended.
        params_all = []
        table_count = 0
        k = j
        while k < len(seq):
            t3, o3 = seq[k]
            if t3 == "p" and is_cmd_heading(o3):
                break
            if t3 == "p" and is_param_heading(o3):
                k += 1
                while k < len(seq) and seq[k][0] == "tbl":
                    table = seq[k][1]
                    table_count += 1
                    for r in table.rows:
                        cols = r.cells
                        if not any(c.text.strip() for c in cols):
                            continue
                        try:
                            param = _parse_param_row(cols)
                            if param is not None:
                                params_all.append(param)
                        except Exception as e:
                            # Best effort: one bad row is logged and skipped
                            # so the rest of the document still parses.
                            log(f"ROW-ERROR in {current_cmd}: {e}")
                            log(traceback.format_exc())
                    k += 1
                continue
            k += 1

        if params_all or merged_desc:
            results.append({
                "章节": str(cmd_order), "命令": current_cmd, "命令标题": current_title,
                "命令类型": "执行",  # placeholder; could be inferred automatically or filled in later
                # NOTE(review): CMD_LINE_PAT's first group does not appear able
                # to capture "<", so the "=<param>" form likely never fires — confirm.
                "命令格式": f"{current_cmd}=<param>" if "<" in current_cmd else current_cmd,
                "响应": "<CR><LF>OK<CR><LF>",  # placeholder; or parse the response with custom logic
                "示例命令": f"{current_cmd}\n\nOK",  # placeholder; or extract the real example from the document
                "功能描述": merged_desc or current_title,
                "备注": f"说明示例{cmd_order}",
                "表数量": table_count,
                "顺序": cmd_order,
                "参数JSON": json.dumps(params_all, ensure_ascii=False)
            })

            log(f"CMD[{cmd_order}] tables={table_count} params={len(params_all)}")
        # Resume the outer scan where the parameter scan stopped.
        i = k

    df = pd.DataFrame(results, columns=list(_CSV_COLUMNS))
    df.to_csv(csv_out, index=False, encoding="utf-8-sig")
    print(f"✅ 提取 {len(df)} 条命令 → {csv_out}")
    print(f"📝 解析日志：{LOG_PATH}")
    return df

# Run the Word → CSV step on the configured input and output paths.
df_csv = extract_word_to_csv(IN_WORD, CSV_OUT)
# Trailing expression of the cell: the first rows of the result.
df_csv.head()