In [5]:
import pandas as pd
from openpyxl import load_workbook
import time

# ---------- 基础函数 ----------
def load_excel(file_path, sheet_name=0):
    """Read an Excel sheet with every column loaded as text.

    Args:
        file_path: Path of the workbook to read.
        sheet_name: Sheet selector forwarded to ``pandas.read_excel``; the
            default ``0`` selects the first sheet, so callers normally do
            not need to pass it.

    Returns:
        The result of ``pandas.read_excel`` for the requested sheet, read
        with ``dtype=str``.
    """
    read_options = {"sheet_name": sheet_name, "dtype": str}
    return pd.read_excel(file_path, **read_options)

def write_fields_to_row(ws, row_num, data, field_mapping):
    """Write the mapped fields of one record into a single worksheet row.

    Args:
        ws: Worksheet-like object that accepts ``ws["B3"] = value``.
        row_num: Row number used to build each cell reference.
        data: Mapping of field name to the value to write.
        field_mapping: Mapping of field name to column letter(s).

    Fields listed in ``field_mapping`` but absent from ``data`` are left
    untouched.
    """
    for field_name, column in field_mapping.items():
        if field_name not in data:
            continue
        cell_ref = f"{column}{row_num}"
        ws[cell_ref] = data[field_name]

def consolidate_source_data(df, fields_to_consolidate):
    """Aggregate the requested fields per ID into "; "-joined strings.

    Every row's ``ID`` is stripped and upper-cased; rows whose ID is missing
    or blank are skipped. For each ID the distinct, non-blank, stripped
    values of every requested field are collected and joined with "; " in
    sorted order, so the result is reproducible between runs.

    Column names are matched case-insensitively (an exact-case match is
    preferred when several spellings exist). A requested field that has no
    matching column in ``df`` yields an empty string instead of an error.

    Args:
        df: Source DataFrame; must contain an ``ID`` column (any casing).
        fields_to_consolidate: Iterable of field names to aggregate.

    Returns:
        dict mapping normalized ID -> {field name -> joined string}.

    Raises:
        KeyError: If ``df`` has rows but no ``ID`` column.
    """
    total = len(df)

    # Upper-cased name -> real column label, so a column that differs from
    # the requested field only by case is still read instead of being
    # silently treated as empty.
    column_by_upper = {}
    for col in df.columns:
        column_by_upper.setdefault(str(col).upper(), col)
    exact_columns = set(df.columns)

    def resolve(label):
        """Return the real column label for *label*, or None if absent."""
        if label in exact_columns:
            return label
        return column_by_upper.get(str(label).upper())

    id_column = resolve('ID')
    if id_column is None:
        id_column = 'ID'  # row lookup below raises KeyError, as before

    # Only fields that really exist in the source are read per row.
    source_column = {}
    for field in fields_to_consolidate:
        col = resolve(field)
        if col is not None:
            source_column[field] = col

    collected = {}
    print(f"开始整合源数据，共 {total} 行...")
    # Progress is based on row position, not the index label, so it also
    # works for non-integer or non-contiguous indexes.
    for position, (_, row) in enumerate(df.iterrows()):
        raw_id = row[id_column]
        name = str(raw_id).strip().upper() if pd.notna(raw_id) else None
        if not name:
            continue

        # First time this ID is seen: one empty set per requested field.
        if name not in collected:
            collected[name] = {f: set() for f in fields_to_consolidate}
        values = collected[name]

        for field, col in source_column.items():
            val = row[col]
            if pd.notna(val):
                text = str(val).strip()
                if text:
                    values[field].add(text)

        if position % 1000 == 0:
            print(f"整合进度: {position+1}/{total} ({(position+1)/total*100:.1f}%)")

    # Sets -> strings; sorting makes the joined text deterministic.
    consolidated = {
        name: {f: "; ".join(sorted(values[f])) for f in fields_to_consolidate}
        for name, values in collected.items()
    }

    print(f"整合完成，共 {len(consolidated)} 个唯一ID")
    return consolidated

def print_progress(current, total, start_time, operation="处理"):
    """Print one progress line with elapsed and estimated remaining time.

    Args:
        current: Number of items processed so far.
        total: Total number of items; ``0`` is reported as 0.0% instead of
            raising ``ZeroDivisionError``.
        start_time: ``time.time()`` value captured when the work started.
        operation: Label placed in front of the progress text.
    """
    elapsed = time.time() - start_time
    progress = (current / total) * 100 if total else 0.0
    # Linear extrapolation from the average time per item so far; with no
    # item processed yet there is nothing to extrapolate from.
    remaining = elapsed * total / current - elapsed if current else 0
    print(f"{operation}进度: {current}/{total} ({progress:.1f}%) - 已用: {elapsed:.1f}s, 剩余: {remaining:.1f}s")

# ---------- Field list (V00 baseline, in the order supplied by the user) ----------
# The order matters: field_mapping (built below from this list) assigns the
# target columns B, C, D, ... by position, so reordering entries changes
# which worksheet column each field is written to.
field_list = [
    'V00AGE', 'V00ARTDOC', 'V00BMI', 'V00BONFX', 'V00BP30', 'V00BPACTCV',
    'V00CESD', 'V00COXIBS', 'V00DOXYCYC', 'V00FALL', 'V00FALLCV', 'V00GLUC',
    'V00HYAINJL', 'V00HYAINJR', 'V00KGLRS', 'V00KPACT30', 'V00KPACDCV',
    'V00KPMED', 'V00KPMEDCV', 'V00KPL30CV', 'V00KPR30CV', 'V00KOOSKPL',
    'V00KOOSKPR', 'V00KOOSQOL', 'V00KOOSYML', 'V00KOOSYMR', 'V00NARCOT',
    'V00NSAIDS', 'V00NSAIDRX', 'V00PASE', 'V00PNMEDT', 'V00STRINJL',
    'V00STRINJR', 'V00TYLEN', 'V00WOMADLL', 'V00WOMADLR', 'V00WOMKPL',
    'V00WOMKPR', 'V00WOMSTFL', 'V00WOMSTFR', 'V00WOMTSL', 'V00WOMTSR'
]
def col_letter(n):
    """Convert a 1-based column index to Excel column letters.

    Examples: 1 -> "A", 26 -> "Z", 27 -> "AA", 703 -> "AAA".

    Args:
        n: Column index, starting at 1.

    Returns:
        The column letters as an upper-case string.

    Raises:
        ValueError: If ``n`` is smaller than 1. A negative index would
            otherwise never terminate the conversion loop, and 0 would
            produce an empty (invalid) column name.
    """
    if n < 1:
        raise ValueError(f"column index must be >= 1, got {n!r}")
    letters = []
    while n:
        # Column letters form a bijective base-26 numbering (no zero digit),
        # hence the shift by one before each division.
        n, rem = divmod(n - 1, 26)
        letters.append(chr(65 + rem))
    return "".join(reversed(letters))

# Pair each field with its target column by position: the first field goes
# to column B (index 2), the next to C, then D, and so on.
field_mapping = dict(zip(field_list, map(col_letter, range(2, len(field_list) + 2))))
fields_to_consolidate = [*field_mapping]

# ---------- Paths ----------
# src_file is read through load_excel (first sheet); tgt_file is opened with
# openpyxl, modified and saved back to the same path.
src_file = r"C:\Users\DXW\OAI data\OAIdatabase\Allclinical\AllClinical00.xlsx"
tgt_file = r"C:\Users\DXW\Desktop\新建 Microsoft Excel 工作表.xlsx"
tgt_sheet = '12m'   # <- name of the target sheet to write into

# ---------- Main flow ----------
# Reads the source workbook, aggregates the configured fields per ID, writes
# them into the rows of the target sheet whose column A holds the same ID,
# and saves the target workbook in place.
start_time = time.time()
print("=" * 50)
print("程序开始执行")
print("=" * 50)

# Read the source (no sheet given, so the first sheet is used).
df = load_excel(src_file)
consolidated_data = consolidate_source_data(df, fields_to_consolidate)

# Open the target workbook and select the configured sheet.
wb = load_workbook(tgt_file)
ws = wb[tgt_sheet]

# Build the ID -> row numbers mapping from column A. Row 1 is skipped
# (treated as the header row); an ID may occur on several rows.
acc_number_to_rows = {}
for row_num, (id_cell,) in enumerate(ws.iter_rows(min_row=2, min_col=1, max_col=1), start=2):
    if id_cell.value is None:
        continue
    acc = str(id_cell.value).strip().upper()
    # The ID is upper-cased above, so a cell holding the text "None" shows
    # up here as "NONE"; comparing against 'None' could never match.
    if acc and acc != 'NONE':
        acc_number_to_rows.setdefault(acc, []).append(row_num)

matched_names = [n for n in consolidated_data if n in acc_number_to_rows]
total_matched_rows = sum(len(acc_number_to_rows[n]) for n in matched_names)
print(f"\n匹配到 {len(matched_names)} 个ID，共 {total_matched_rows} 行待写入")

# Write: only the IDs that matched are visited, in source order.
write_start = time.time()
i = 0
for name in matched_names:
    data = consolidated_data[name]
    for row_num in acc_number_to_rows[name]:
        i += 1
        write_fields_to_row(ws, row_num, data, field_mapping)
        if i % 100 == 0:
            print_progress(i, total_matched_rows, write_start, "写入")

# Final progress line, unless the in-loop report already printed this count.
if i > 0 and i % 100 != 0:
    print_progress(i, total_matched_rows, write_start, "写入")

# Save back to the target file (overwrites it).
print("\n正在保存...")
wb.save(tgt_file)
print(f"保存完成！总耗时 {time.time() - start_time:.1f}秒")
print("=" * 50)

程序开始执行
开始整合源数据，共 4796 行...
整合进度: 1/4796 (0.0%)
整合进度: 1001/4796 (20.9%)
整合进度: 2001/4796 (41.7%)
整合进度: 3001/4796 (62.6%)
整合进度: 4001/4796 (83.4%)
整合完成，共 4796 个唯一ID

匹配到 47 个ID，共 47 行待写入
写入进度: 47/47 (100.0%) - 已用: 0.0s, 剩余: 0.0s

正在保存...
保存完成！总耗时 134.0秒
