In [1]:
import pandas as pd  
from openpyxl import load_workbook
import time
from datetime import datetime

def load_excel(file_path, sheet_name):  
    data = pd.read_excel(file_path, sheet_name=sheet_name, dtype=str)  
    return data  

def write_fields_to_row(ws, row_num, data, field_mapping):
    """将多个字段写入到指定行的指定列"""
    for field, column in field_mapping.items():
        if field in data:
            ws[f"{column}{row_num}"] = data[field]

def consolidate_source_data(df, fields_to_consolidate):
    """整合源数据中相同姓名的条目"""
    consolidated_data = {}
    total_rows = len(df)
    
    print(f"开始整合源数据，共 {total_rows} 行...")
    
    for index, row in df.iterrows():
        name = str(row['ID']) if pd.notna(row['ID']) else None
        if name:
            if name not in consolidated_data:
                # 初始化所有字段为空集合
                consolidated_data[name] = {field: set() for field in fields_to_consolidate}
            
            # 为每个字段添加数据
            for field in fields_to_consolidate:
                if pd.notna(row[field]) and str(row[field]).strip():
                    consolidated_data[name][field].add(str(row[field]).strip())
        
        # 进度显示
        if index % 1000 == 0:
            progress = (index + 1) / total_rows * 100
            print(f"整合进度: {index+1}/{total_rows} ({progress:.1f}%)")
    
    print(f"整合完成，共处理 {total_rows} 行，生成 {len(consolidated_data)} 个唯一ID")
    
    # 将集合转换为分号分隔的字符串
    print("正在格式化数据...")
    format_count = 0
    for name, data in consolidated_data.items():
        for field in fields_to_consolidate:
            if field in data and data[field]:
                consolidated_data[name][field] = '; '.join(data[field])
            else:
                consolidated_data[name][field] = ""
        format_count += 1
        if format_count % 1000 == 0:
            print(f"格式化进度: {format_count}/{len(consolidated_data)}")
    
    return consolidated_data

def print_progress(current, total, start_time, operation="处理"):
    """打印进度信息"""
    elapsed_time = time.time() - start_time
    progress = (current / total) * 100
    
    # 计算预计剩余时间
    if current > 0:
        estimated_total_time = elapsed_time * total / current
        remaining_time = estimated_total_time - elapsed_time
        time_str = f"已用: {elapsed_time:.1f}s, 剩余: {remaining_time:.1f}s"
    else:
        time_str = f"已用: {elapsed_time:.1f}s"
    
    print(f"{operation}进度: {current}/{total} ({progress:.1f}%) - {time_str}")

# 主程序
print("=" * 50)
print("程序开始执行")
print("=" * 50)

start_time = time.time()

excel_path = r"C:\Users\DXW\Downloads\AllClinical_ASCII_combined (2).xlsx"
df = load_excel(excel_path, sheet_name='AllClinical00') 

target_path = r"C:\Users\DXW\Desktop\进展预测\骨关节炎进展评估+性别.xlsx"
sheet_name = 'Sheet1'
wb = load_workbook(target_path)
ws = wb[sheet_name]
rows = ws.max_row 
acc_number_column_index = 2  # 列索引从 1 开始

# 定义字段到列字母的映射
field_mapping = {
    'P01BMI': 'D',
    'V00AGE': 'C',
}

# 需要整合的字段列表
fields_to_consolidate = list(field_mapping.keys())

# 存储Excel中'acc number'列的所有值和对应的行号
print("正在读取目标Excel中的ID映射...")
acc_number_to_rows = {}
target_rows_count = 0

for row_num, row in enumerate(ws.iter_rows(min_row=2, min_col=acc_number_column_index, max_col=acc_number_column_index), start=2):
    acc_number_cell = row[0]
    acc_number = str(acc_number_cell.value) if acc_number_cell.value is not None else None
    if acc_number and acc_number != 'None':
        if acc_number not in acc_number_to_rows:
            acc_number_to_rows[acc_number] = []
        acc_number_to_rows[acc_number].append(row_num)
        target_rows_count += 1

print(f"目标Excel中共有 {len(acc_number_to_rows)} 个唯一ID，对应 {target_rows_count} 个条目")

print("\n目标Excel中的ID映射示例:")
for name, rows in list(acc_number_to_rows.items())[:5]:
    print(f"  {name}: {len(rows)} 个条目")

# 整合源数据
consolidated_data = consolidate_source_data(df, fields_to_consolidate)
print(f"\n源数据整合后共有 {len(consolidated_data)} 个不同的ID")

# 计算匹配统计
matched_names = [name for name in consolidated_data if name in acc_number_to_rows]
unmatched_names = [name for name in consolidated_data if name not in acc_number_to_rows]

print(f"\n匹配统计:")
print(f"  匹配的ID数量: {len(matched_names)}")
print(f"  未匹配的ID数量: {len(unmatched_names)}")

if len(unmatched_names) > 0:
    print(f"  未匹配的ID示例: {list(unmatched_names)[:10]}")

# 计算总匹配行数
total_matched_rows = sum(len(acc_number_to_rows[name]) for name in matched_names)
print(f"  总匹配行数: {total_matched_rows}")

i = 0
write_start_time = time.time()

print(f"\n开始写入数据到目标Excel...")

# 使用整合后的数据写入目标Excel
for name_idx, (name, data) in enumerate(consolidated_data.items()):
    if name in acc_number_to_rows:
        row_list = acc_number_to_rows[name]
        
        # 为每个匹配的行写入整合后的数据
        for row_num in row_list:
            i = i + 1
            write_fields_to_row(ws, row_num, data, field_mapping)
            
            # 每处理100行显示一次进度
            if i % 100 == 0:
                print_progress(i, total_matched_rows, write_start_time, "写入")
        
        # 每处理100个ID显示一次匹配信息
        if len(matched_names) > 100 and (name_idx + 1) % 100 == 0:
            current_match_count = len([n for n in list(consolidated_data.keys())[:name_idx+1] if n in acc_number_to_rows])
            print(f"  已处理 {current_match_count}/{len(matched_names)} 个匹配ID")

# 最终进度显示
if i > 0:
    print_progress(i, total_matched_rows, write_start_time, "写入")

print(f"\n总共处理了 {i} 个匹配项")

# 保存文件
print("正在保存文件...")
save_start = time.time()
wb.save(target_path)
save_time = time.time() - save_start
print(f"文件保存完成，耗时: {save_time:.1f}秒")

total_time = time.time() - start_time
print(f"\n程序执行完成!")
print(f"总执行时间: {total_time:.1f}秒")
print("=" * 50)

程序开始执行
正在读取目标Excel中的ID映射...
目标Excel中共有 200 个唯一ID，对应 200 个条目

目标Excel中的ID映射示例:
  9819744: 1 个条目
  9821872: 1 个条目
  9821982: 1 个条目
  9824005: 1 个条目
  9824703: 1 个条目
开始整合源数据，共 4796 行...
整合进度: 1/4796 (0.0%)
整合进度: 1001/4796 (20.9%)
整合进度: 2001/4796 (41.7%)
整合进度: 3001/4796 (62.6%)
整合进度: 4001/4796 (83.4%)
整合完成，共处理 4796 行，生成 4796 个唯一ID
正在格式化数据...
格式化进度: 1000/4796
格式化进度: 2000/4796
格式化进度: 3000/4796
格式化进度: 4000/4796

源数据整合后共有 4796 个不同的ID

匹配统计:
  匹配的ID数量: 200
  未匹配的ID数量: 4596
  未匹配的ID示例: ['9000099', '9000296', '9000622', '9000798', '9001104', '9001400', '9001695', '9001897', '9002116', '9002316']
  总匹配行数: 200

开始写入数据到目标Excel...
写入进度: 100/200 (50.0%) - 已用: 0.0s, 剩余: 0.0s
  已处理 108/200 个匹配ID
写入进度: 200/200 (100.0%) - 已用: 0.0s, 剩余: 0.0s
写入进度: 200/200 (100.0%) - 已用: 0.0s, 剩余: 0.0s

总共处理了 200 个匹配项
正在保存文件...
文件保存完成，耗时: 0.0秒

程序执行完成!
总执行时间: 157.7秒
