# In [None]:
#!/usr/bin/env python3
"""
ORCID 女性姓名变化检测程序
"""

import os
import csv
import logging
import multiprocessing as mp
from multiprocessing import Pool, Manager, Lock
from concurrent.futures import ProcessPoolExecutor, as_completed
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import signal
import sys
from functools import partial

from bs4 import BeautifulSoup
import gender_guesser.detector as gender
import Levenshtein
from datetime import datetime

# Global configuration shared by the main process and the worker processes.
CONFIG = {
    'base_path': '/hy-tmp/ORCID_2024_10_summaries',
    'output_file': '/hy-tmp/female_name_changes.csv',
    'log_file': '/hy-tmp/orcid_processing.log',
    'max_workers': 20,  # size of the process pool; original note: leave 2 cores for the system (presumably a 22-core host — TODO confirm)
    'chunk_size': 1000,  # number of files bundled into one worker task
    'batch_size': 100,  # buffered result rows that trigger an append to the CSV
}

class ORCIDProcessor:
    """Extract name data from ORCID summary XML and flag likely name changes.

    A record is only considered when the first token of its given name is
    classified as ``female`` or ``mostly_female`` by ``gender_guesser``, it
    has a family name, and it carries at least one other-name that has both
    a creation date and content.
    """

    def __init__(self, base_path: str, output_file: str, max_workers: int = 38):
        """Store the settings, build the gender detector and set up logging.

        Args:
            base_path: root directory of the input data (kept as a ``Path``).
            output_file: path of the results file (stored only).
            max_workers: worker count (stored only; not used by this class).
        """
        self.base_path = Path(base_path)
        self.output_file = output_file
        self.max_workers = max_workers
        self.gender_detector = gender.Detector()
        self.setup_logging()

        # Run counters, all starting at zero.
        self.stats = {
            'total_files': 0,
            'processed_files': 0,
            'female_records': 0,
            'name_changes': 0,
            'errors': 0
        }

    def setup_logging(self):
        """Configure INFO-level logging to ``CONFIG['log_file']`` and stdout."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(CONFIG['log_file']),
                logging.StreamHandler(sys.stdout)
            ]
        )
        self.logger = logging.getLogger(__name__)

    def extract_name_info(self, soup: BeautifulSoup) -> Optional[Dict]:
        """Pull the primary name, other-names and guessed gender from a record.

        Args:
            soup: parsed XML document of one record.

        Returns:
            A dict with keys ``person_created_date``, ``given_name``,
            ``family_name``, ``base_name``, ``other_names_data`` (list of
            dicts with ``other_name_content`` and ``other_created_date``) and
            ``gender``; or ``None`` when the record is filtered out or any
            error occurs while reading it.
        """
        try:
            person_name_tag = soup.find("person:name")
            if not person_name_tag:
                return None

            # Basic fields of the person:name element.
            person_created_date_tag = person_name_tag.find("common:created-date")
            given_names_tag = person_name_tag.find("personal-details:given-names")
            family_name_tag = person_name_tag.find("personal-details:family-name")

            person_created_date = person_created_date_tag.string if person_created_date_tag else None
            given_name = given_names_tag.string if given_names_tag else None
            family_name = family_name_tag.string if family_name_tag else None

            # Gender detection: only female records are kept.
            if not given_name:
                return None

            # A whitespace-only given name has no token to classify; bail out
            # explicitly instead of relying on a swallowed IndexError.
            given_tokens = given_name.split()
            if not given_tokens:
                return None

            gender_result = self.gender_detector.get_gender(given_tokens[0])
            if gender_result not in ('female', 'mostly_female'):
                return None

            # Reference name used for comparison: "<given> <family>".
            if not family_name:
                return None
            base_name = f"{given_name} {family_name}".strip()

            # Collect every other-name that has both a date and content.
            other_names_data = []
            for other_name_tag in soup.find_all("other-name:other-name"):
                other_created_date_tag = other_name_tag.find("common:created-date")
                other_name_content_tag = other_name_tag.find("other-name:content")

                other_created_date = other_created_date_tag.string if other_created_date_tag else None
                other_name_content = other_name_content_tag.string if other_name_content_tag else None

                if other_created_date and other_name_content:
                    other_names_data.append({
                        'other_name_content': other_name_content,
                        'other_created_date': other_created_date
                    })

            if not other_names_data:
                return None

            return {
                'person_created_date': person_created_date,
                'given_name': given_name,
                'family_name': family_name,
                'base_name': base_name,
                'other_names_data': other_names_data,
                'gender': gender_result
            }

        except Exception as e:
            self.logger.debug(f"Error extracting name info: {e}")
            return None

    def process_single_file(self, file_path: Path, port_name: str) -> Optional[Dict]:
        """Parse one XML file and return a result row if it shows a name change.

        Args:
            file_path: path of the XML file to read (UTF-8).
            port_name: label copied into the ``port_name`` field of the row.

        Returns:
            A dict with keys ``port_name``, ``id``, ``person_name``,
            ``person_date``, ``other_name``, ``other_date`` and ``gender``
            when ``is_name_change_candidate`` accepts the record; ``None``
            otherwise, including when the file cannot be processed.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'xml')

            name_info = self.extract_name_info(soup)
            if not name_info:
                return None

            # is_name_change_candidate is a module-level function, not a
            # method; calling it through self raised AttributeError for every
            # file, which the handler below silently turned into "no result".
            is_change, change_data = is_name_change_candidate(name_info)
            if not (is_change and change_data):
                return None

            path_tag = soup.find("common:path")
            orcid_id = path_tag.string if path_tag else "unknown"

            return {
                'port_name': port_name,
                'id': orcid_id,
                'person_name': change_data[0],
                'person_date': change_data[1],
                'other_name': change_data[2],
                'other_date': change_data[3],
                'gender': name_info['gender']
            }

        except Exception as e:
            # Best effort: one unreadable file must not stop the chunk, but
            # the failure is logged at WARNING so it is visible at INFO level.
            self.logger.warning("Error processing file %s: %s", file_path, e)
            return None



def _extract_name_parts(full_name: str) -> Dict[str, str]:
    """从全名中提取姓、名、中间名"""
    parts = full_name.strip().split()
    if not parts:
        return {'given': '', 'middle': '', 'family': ''}
    
    given = parts[0]
    if len(parts) == 1:
        return {'given': given, 'middle': '', 'family': ''}
    
    family = parts[-1]
    middle = ' '.join(parts[1:-1])
    return {'given': given, 'middle': middle, 'family': family}


def _normalize_name(raw_name) -> str:
    """Lower-case a name, drop '.' and ',', turn '-' into a space, collapse whitespace."""
    text = str(raw_name).lower().strip()
    text = text.replace('.', '').replace(',', '').replace('-', ' ')
    return ' '.join(text.split())


def is_name_change_candidate(name_info: Dict) -> Tuple[bool, Optional[List]]:
    """Decide whether one of a record's other-names looks like a surname change.

    Every entry of ``name_info['other_names_data']`` is compared with the
    primary name; the first entry that survives all filters is reported.

    Args:
        name_info: mapping with the keys ``base_name``,
            ``person_created_date``, ``other_names_data``, ``family_name``
            and ``given_name``. Dates are ISO-8601 strings; a trailing ``Z``
            is accepted.

    Returns:
        ``(True, [older_name, older_date, newer_name, newer_date])`` for the
        first qualifying other-name, where the names are normalised and the
        dates are the original strings; ``(False, None)`` when nothing
        qualifies, a required field is missing, the primary date cannot be
        parsed, or an unexpected error occurs.
    """
    try:
        # --- 1. Read and validate the inputs ---
        base_name = name_info['base_name']
        person_created_date = name_info['person_created_date']
        other_names_data = name_info['other_names_data']
        family_name = name_info['family_name']
        given_name = name_info['given_name']

        if not all([base_name, person_created_date, other_names_data, family_name, given_name]):
            return False, None

        # Creation date of the primary name.
        try:
            person_date = datetime.fromisoformat(person_created_date.replace('Z', '+00:00'))
        except ValueError:
            return False, None

        # The primary name is the same for every comparison; normalise once.
        cleaned_base = _normalize_name(base_name)

        # --- 2. Check each other-name in turn ---
        for other_data in other_names_data:
            other_name_content = other_data.get('other_name_content')
            other_created_date = other_data.get('other_created_date')

            if not other_name_content or not other_created_date:
                continue

            try:
                other_date = datetime.fromisoformat(other_created_date.replace('Z', '+00:00'))
            except ValueError:
                continue

            # Skip names created less than 30 days apart. abs() is applied to
            # the timedelta itself: negative timedeltas floor their .days
            # (-29.5 days has days == -30), so abs(delta.days) let gaps of
            # 29-30 days through in one direction only.
            if abs(person_date - other_date).days < 30:
                continue

            # --- 3. Normalise and split the names ---
            cleaned_other = _normalize_name(other_name_content)

            # Near-identical full names are typos, not changes.
            if Levenshtein.distance(cleaned_base, cleaned_other) < 2:
                continue

            base_parts = _extract_name_parts(cleaned_base)
            other_parts = _extract_name_parts(cleaned_other)

            if not all([base_parts['given'], other_parts['given'], base_parts['family'], other_parts['family']]):
                continue

            # --- 4. Core rules ---

            # a) Same person: given names may differ by at most one edit.
            if Levenshtein.distance(base_parts['given'], other_parts['given']) >= 2:
                continue

            # b) The full surname (middle + family) must really differ.
            base_full_surname = f"{base_parts['middle']} {base_parts['family']}".strip()
            other_full_surname = f"{other_parts['middle']} {other_parts['family']}".strip()

            if Levenshtein.distance(base_full_surname, other_full_surname) < 2:
                continue

            # c) Ignore cases where only a middle name was added or removed.
            is_subset_change = (base_full_surname in other_full_surname or other_full_surname in base_full_surname)
            if is_subset_change and Levenshtein.distance(base_parts['family'], other_parts['family']) <= 1:
                continue

            # d) Ignore a family name abbreviated to its initial.
            base_family = base_parts['family']
            other_family = other_parts['family']
            base_is_initial = (len(base_family) == 1 and len(other_family) > 1
                               and base_family == other_family[0])
            other_is_initial = (len(other_family) == 1 and len(base_family) > 1
                                and other_family == base_family[0])
            if base_is_initial or other_is_initial:
                continue

            # --- 5. Candidate confirmed: order the two names by date ---
            if person_date < other_date:
                older_name, newer_name = cleaned_base, cleaned_other
                older_date_str, newer_date_str = person_created_date, other_created_date
            else:
                older_name, newer_name = cleaned_other, cleaned_base
                older_date_str, newer_date_str = other_created_date, person_created_date

            return True, [older_name, older_date_str, newer_name, newer_date_str]

        return False, None

    except Exception:
        # Best effort: a malformed record is reported as "no candidate".
        logging.getLogger(__name__).debug("is_name_change_candidate failed", exc_info=True)
        return False, None

def process_file_chunk(args):
    """Worker entry point: run every file of one chunk through a processor.

    Args:
        args: tuple ``(file_paths, port_name, process_id)``.

    Returns:
        ``(results, processed, found)``: the list of result rows, the number
        of files handled without an exception, and ``len(results)``.
    """
    paths, folder_name, worker_id = args
    worker = ORCIDProcessor(CONFIG['base_path'], CONFIG['output_file'])

    matches = []
    done = 0
    for path in paths:
        try:
            record = worker.process_single_file(path, folder_name)
            if record:
                matches.append(record)
            done += 1

            # Progress line once per 1,000,000 handled files.
            if done % 1000000 == 0:
                print(f"Process {worker_id}: Processed {done}/{len(paths)} files in {folder_name}")

        except Exception as exc:
            logging.error(f"Error in process {worker_id} processing {path}: {exc}")

    return matches, done, len(matches)

def get_file_chunks(base_path: str, chunk_size: int = 1000) -> List[Tuple]:
    """Split the files of every sub-folder of ``base_path`` into task tuples.

    Args:
        base_path: directory whose immediate sub-directories are scanned.
        chunk_size: maximum number of paths per chunk.

    Returns:
        A list of ``(paths, folder_name, chunk_id)`` tuples; ``chunk_id``
        counts up from 0 across all folders. Non-directory entries directly
        under ``base_path`` are ignored.
    """
    root = Path(base_path)
    tasks = []

    for folder in root.iterdir():
        if folder.is_dir():
            # Prefer *.xml files; if a folder has none, take every entry
            # (covers files stored without the .xml extension).
            entries = list(folder.glob("*.xml")) or list(folder.iterdir())

            for start in range(0, len(entries), chunk_size):
                # len(tasks) equals the number of chunks emitted so far,
                # i.e. the next sequential chunk id.
                tasks.append((entries[start:start + chunk_size], folder.name, len(tasks)))

    return tasks

def write_results_batch(results_batch: List[Dict], output_file: str, write_header: bool = False):
    """Write result rows to the CSV file.

    Args:
        results_batch: rows keyed by the seven result column names.
        output_file: path of the CSV file.
        write_header: when true the file is truncated and a header row is
            written first; otherwise the rows are appended.
    """
    columns = [
        'port_name', 'id', 'person_name', 'person_date',
        'other_name', 'other_date', 'gender',
    ]

    if write_header:
        open_mode = 'w'
    else:
        open_mode = 'a'

    with open(output_file, open_mode, newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        if write_header:
            csv_writer.writeheader()
        csv_writer.writerows(results_batch)

def signal_handler(signum, frame):
    """Announce the interruption on stdout and terminate with exit status 0.

    Args:
        signum: number of the delivered signal (unused).
        frame: interrupted stack frame (unused).

    Raises:
        SystemExit: always, with code 0.
    """
    farewell = "\n收到中断信号，正在安全退出..."
    print(farewell)
    raise SystemExit(0)

def main():
    """Run the whole pipeline: chunk the input, process in parallel, write CSV.

    Steps: install SIGINT/SIGTERM handlers, build the chunk list from
    ``CONFIG['base_path']``, create the output CSV with its header, fan the
    chunks out to a process pool, append results in batches of at least
    ``CONFIG['batch_size']`` rows, and print a summary. Buffered rows are
    flushed in a ``finally`` clause so they are written even when the run
    ends early.
    """
    # Install the signal handlers.
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    start_time = time.time()

    # Build the list of work chunks.
    print("正在分析文件结构...")
    chunks = get_file_chunks(CONFIG['base_path'], CONFIG['chunk_size'])
    total_chunks = len(chunks)

    print(f"发现 {total_chunks} 个文件块")
    if total_chunks == 0:
        print("没有找到需要处理的文件")
        return

    # Create/truncate the output file and write the header row.
    write_results_batch([], CONFIG['output_file'], write_header=True)

    # Running totals.
    total_processed = 0
    total_results = 0
    completed_chunks = 0
    results_buffer = []

    try:
        with ProcessPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
            print(f"启动 {CONFIG['max_workers']} 个工作进程...")

            # Submit every chunk up front; map each future to its chunk index.
            future_to_chunk = {executor.submit(process_file_chunk, chunk): i
                              for i, chunk in enumerate(chunks)}

            for future in as_completed(future_to_chunk):
                chunk_idx = future_to_chunk[future]
                # as_completed yields in completion order, so progress must
                # be counted here; the submission index of the finished
                # chunk says nothing about how much work is done.
                completed_chunks += 1
                try:
                    chunk_results, chunk_processed, chunk_found = future.result()

                    total_processed += chunk_processed
                    total_results += chunk_found
                    results_buffer.extend(chunk_results)

                    # Append to the CSV once enough rows are buffered.
                    if len(results_buffer) >= CONFIG['batch_size']:
                        write_results_batch(results_buffer, CONFIG['output_file'])
                        results_buffer = []

                    # Progress report.
                    progress = completed_chunks / total_chunks * 100
                    print(f"进度: {progress:.1f}% ({completed_chunks}/{total_chunks}), "
                          f"已处理: {total_processed:,}, 发现变化: {total_results:,}")

                except Exception as e:
                    print(f"处理块 {chunk_idx} 时出错: {e}")

    except KeyboardInterrupt:
        print("\n用户中断程序执行")
        return
    except Exception as e:
        print(f"程序执行出错: {e}")
        return
    finally:
        # Flush whatever is still buffered.
        if results_buffer:
            write_results_batch(results_buffer, CONFIG['output_file'])

    # Final summary.
    end_time = time.time()
    duration = end_time - start_time
    # Guard the throughput figure against a zero-length run.
    files_per_second = total_processed / duration if duration > 0 else 0

    print("\n" + "="*60)
    print("处理完成!")
    print(f"总处理文件数: {total_processed:,}")
    print(f"发现姓名变化: {total_results:,}")
    print(f"处理时间: {duration/3600:.1f} 小时 ({duration/60:.1f} 分钟)")
    print(f"平均速度: {files_per_second:.0f} 文件/秒")
    print(f"结果已保存至: {CONFIG['output_file']}")
    print("="*60)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":   
    main()