In [None]:
from pyecospold import parse_file_v2
import pandas as pd
import os
from datetime import datetime
import logging
import csv

# Global mapping from activity_id to a (shortname, activityName) tuple.
# It starts empty and is filled while the input folder is scanned.
global_activity_mapping = {}

# Run-wide counters.
total_files = 0
converted_files = 0
failed_files = 0
failed_file_names = []

# Per-file count of exchange rows that were dropped.
deleted_rows_per_file = {}

# Per-file record of the flow string that replaced process_name.
replaced_process_name = {}

# Per-file count of rows with no activityLinkId and amount != 1.
non1_amount_per_file = {}

# Per-file count of rows with no activityLinkId and amount == -1.
neg1_amount_per_file = {}

# Files whose flows still contain "[Unknown Location]Unknown Activity Name".
unknown_activity_files = []

# Input and output locations.
input_folder = "C:\\Users\\WasteWang\\LCA\\DATA\\3.11_APOS\\datasets"
output_folder_base = "C:\\Users\\WasteWang\\LCA\\OUTPUT"
lookup_file_path = "C:\\Users\\WasteWang\\LCA\\DATA\\3.11_APOS\\FilenameToActivityLookup.csv"
batch_file_path = "C:\\Users\\WasteWang\\LCA\\batch_number.txt"  # file that stores the batch number

# Create the batch number file with an initial value of 1 if it is missing.
if not os.path.exists(batch_file_path):
    try:
        with open(batch_file_path, "w") as f:
            f.write("1")
        print("Batch number file created with initial value 1.")
    except Exception as e:
        print(f"Error creating batch number file: {e}")

# Read the batch number; fall back to 1 when the file is unreadable or malformed.
try:
    with open(batch_file_path, "r") as f:
        batch_number = int(f.read().strip())
    print(f"Current batch number: {batch_number}")
except Exception as e:
    print(f"Error reading batch number file: {e}")
    batch_number = 1  # default value

# Current date as MMDD, used in the output folder name.
current_date = datetime.now().strftime("%m%d")
print(f"Current date: {current_date}")

# Create the output folder for this batch.
batch_folder = os.path.join(output_folder_base, f"{current_date}_{batch_number}")
os.makedirs(batch_folder, exist_ok=True)
print(f"Output will be saved to: {batch_folder}")

# Configure logging: a file handler only, at INFO level.
logger = logging.getLogger('spold_processor')
logger.setLevel(logging.INFO)

# logging.getLogger returns the same object for the same name, so re-running
# this code in one interpreter session (e.g. a notebook cell) would otherwise
# stack a new FileHandler on top of the old ones: every message would be
# written several times and earlier batch logs would stay open. Drop and close
# any handlers left over from a previous run before attaching the new one.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# File handler writing to processing_debug.txt inside the batch folder.
file_handler = logging.FileHandler(os.path.join(batch_folder, 'processing_debug.txt'), encoding='utf-8')
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s %(levelname)s:%(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Load the filename -> (ActivityName, Location) lookup table (semicolon separated).
# The key is the part of the Filename column before the first "_"; when several
# rows share a key the last row wins. Any failure leaves an empty lookup.
try:
    lookup_df = pd.read_csv(lookup_file_path, sep=';')
    lookup_dict = {
        lookup_filename.split('_')[0]: (activity_name, location)
        for lookup_filename, activity_name, location in zip(
            lookup_df['Filename'], lookup_df['ActivityName'], lookup_df['Location']
        )
    }
    logger.info(f"Loaded lookup dictionary with {len(lookup_dict)} entries.")
except Exception as e:
    logger.error(f"Error loading lookup file: {e}")
    lookup_dict = {}

# Check that lookup_dict covers every .spold file in the input folder.
def verify_lookup_dict():
    """
    Log which .spold filename prefixes have no entry in lookup_dict.

    The prefix is the part of the filename before the first "_", or the
    filename without its extension when it contains no "_". A warning listing
    the missing prefixes is logged, or an info message when none are missing.
    """
    def prefix_of(name):
        head, separator, _ = name.partition('_')
        return head if separator else os.path.splitext(name)[0]

    spold_names = (name for name in os.listdir(input_folder) if name.endswith(".spold"))
    absent = {
        prefix
        for prefix in map(prefix_of, spold_names)
        if prefix not in lookup_dict
    }

    if not absent:
        logger.info("All prefixes are present in lookup_dict.")
    else:
        logger.warning(f"The following prefixes are missing in lookup_dict: {', '.join(absent)}")

verify_lookup_dict()

def extract_activity_mapping(file_path):
    """
    Collect every activityDescription in one .spold file into a mapping.

    Returns a dict of activity id -> (geography shortname, activity name).
    Missing names/locations become "Unknown Activity Name" / "Unknown Location";
    activities without an id are left out. Any error is logged and whatever
    was collected up to that point is returned.
    """
    ns = {'eco': 'http://www.EcoInvent.org/EcoSpold02'}

    def text_or_default(node, default):
        # Stripped element text, or the default for a missing/empty element.
        if node is not None and node.text:
            return node.text.strip()
        return default

    found = {}
    try:
        logger.info(f"Parsing file: {file_path}")
        document = parse_file_v2(file_path)

        # Walk all activityDescription elements, nested ones included.
        for description in document.findall('.//eco:activityDescription', ns):
            activity = description.find('eco:activity', ns)
            if activity is None:
                continue

            activity_id = activity.attrib.get('id')
            activity_name = text_or_default(activity.find('eco:activityName', ns), "Unknown Activity Name")

            geography = description.find('eco:geography', ns)
            shortname_node = None if geography is None else geography.find('eco:shortname', ns)
            location = text_or_default(shortname_node, "Unknown Location")

            if activity_id:
                found[activity_id] = (location, activity_name)
                logger.info(f"Mapped activity_id {activity_id} to ({location}, {activity_name})")
    except Exception as e:
        logger.error(f"Error extracting activity mapping from file {file_path}: {e}")
    return found

def process_activity_description(activityDescription, current_activity_name, namespaces):
    """
    Build the metadata records for an activityDescription element.

    One record is produced for each of includedActivitiesStart,
    includedActivitiesEnd and generalComment, looked up as direct children of
    activityDescription; a missing or empty element yields the value 'N/A'.
    Returns a list of record dicts (empty when activityDescription is None or
    an error occurs; errors are logged).
    """
    # Field name -> category, in the order the records are emitted.
    categories = {
        'includedActivitiesStart': 'Included Activities',
        'includedActivitiesEnd': 'Included Activities',
        'generalComment': 'General Comment',
    }
    rows = []
    try:
        if activityDescription is None:
            return rows
        for tag, category in categories.items():
            node = activityDescription.find(f'eco:{tag}', namespaces)
            if node is not None and node.text:
                text = node.text.strip()
            else:
                text = 'N/A'
            rows.append({
                'process_name': current_activity_name,
                'flow': '',  # no flow data in this section
                'unit': '',
                'amount': '',
                'category': category,
                'field': tag,
                'value': text,
                'compartment': '',
                'subcompartment': '',
                'comment': '',
                'outputGroup': '',
                'section': 'activityDescription',
                'activityLinkId': '',
                'intermediateExchangeId': ''
            })
    except Exception as e:
        logger.error(f"Error processing activityDescription: {e}")
    return rows

def process_intermediate_exchange(exchange, current_activity_name, current_location, filename, file_activity_mapping, namespaces):
    """
    Turn one intermediateExchange element into a flat record dictionary.

    The flow is formatted as "<name>//[<location>]<activity>", where location
    and activity come from the activityLinkId lookup (global mapping first,
    then the per-file mapping) or, without an activityLinkId, from the current
    file's location and activity name.

    Returns the record dict, None when the amount is exactly 0 (row dropped),
    or {} when an unexpected error occurs (logged).
    """
    ns_prefix = '{http://www.EcoInvent.org/EcoSpold02}'

    def child_text(tag, fallback):
        # Stripped text of a direct child, or the fallback if missing/empty.
        node = exchange.find(ns_prefix + tag)
        if node is not None and node.text:
            return node.text.strip()
        return fallback

    try:
        attributes = exchange.attrib
        exchange_id = attributes.get('intermediateExchangeId', '').strip()
        raw_amount = attributes.get('amount', 'N/A').strip()
        try:
            quantity = float(raw_amount)
        except ValueError:
            quantity = None
            logger.warning(f"{filename}: Invalid amount value '{raw_amount}' in intermediateExchangeId '{exchange_id}'.")

        name_text = child_text('name', "Unknown Flow")
        unit_text = child_text('unitName', "Unknown Unit")
        comment_text = child_text('comment', 'N/A')
        output_group_text = child_text('outputGroup', 'N/A')
        link_id = attributes.get('activityLinkId', '').strip()

        # Rule: an intermediateExchange with amount == 0 is dropped.
        if quantity == 0.0:
            logger.info(f"{filename}: Skipping intermediateExchange with amount=0.0.")
            return None  # no record for this row

        if not link_id:
            # No activityLinkId: fall back to this file's own location and activity.
            resolved = (current_location, current_activity_name)
        else:
            # Resolve through the global mapping, then the per-file mapping.
            resolved = global_activity_mapping.get(link_id) or file_activity_mapping.get(link_id)
            if not resolved:
                resolved = ("Unknown Location", "Unknown Activity Name")
                logger.warning(f"{filename}: activityLinkId '{link_id}' not found in global or file mapping.")

        location, activity = resolved

        return {
            'process_name': current_activity_name,
            'flow': f"{name_text}//[{location}]{activity}",
            'unit': unit_text,
            'amount': quantity,
            'category': 'Flow Data',
            'field': 'intermediateExchange',
            'value': '',
            'compartment': '',
            'subcompartment': '',
            'comment': comment_text,
            'outputGroup': output_group_text,
            'section': 'intermediateExchange',
            'activityLinkId': link_id,
            'intermediateExchangeId': exchange_id
        }
    except Exception as e:
        logger.error(f"{filename}: Error processing intermediateExchange - {e}")
        return {}

def process_elementary_exchange(exchange, current_activity_name, filename, file_activity_mapping, namespaces, current_location=None):
    """
    Turn one elementaryExchange element into a flat record dictionary.

    The flow is formatted as
    "<name>//[<location>]<activity>_<compartment>_<subcompartment>".

    Parameters:
        exchange: the elementaryExchange XML element.
        current_activity_name: activity name of the file being processed.
        filename: name of the source file, used only in log messages.
        file_activity_mapping: per-file activity id -> (shortname, name)
            mapping, consulted after the global mapping.
        namespaces: accepted for signature compatibility; not used here.
        current_location: location of the file being processed, used in the
            flow label when the exchange has no activityLinkId. When omitted
            (None) the activity name is used in the location slot as well,
            which is what this function did before the parameter existed.

    Returns the record dict, None when the amount is exactly 0 (row dropped),
    or {} when an unexpected error occurs (logged).
    """
    try:
        flow_elem = exchange.find('{http://www.EcoInvent.org/EcoSpold02}name')
        flow = flow_elem.text.strip() if flow_elem is not None and flow_elem.text else "Unknown Flow"
        unit_elem = exchange.find('{http://www.EcoInvent.org/EcoSpold02}unitName')
        unit = unit_elem.text.strip() if unit_elem is not None and unit_elem.text else "Unknown Unit"
        amount_str = exchange.attrib.get('amount', 'N/A').strip()
        try:
            amount = float(amount_str)
        except ValueError:
            amount = None
            # Report the exchange's own id attribute; the previous message read
            # 'intermediateExchangeId', which is kept only as a fallback.
            exchange_id = exchange.attrib.get(
                'elementaryExchangeId',
                exchange.attrib.get('intermediateExchangeId', '')
            ).strip()
            logger.warning(f"{filename}: Invalid amount value '{amount_str}' in elementaryExchangeId '{exchange_id}'.")

        comment_field = exchange.find('{http://www.EcoInvent.org/EcoSpold02}comment')
        comment = comment_field.text.strip() if comment_field is not None and comment_field.text else 'N/A'

        # Rule: an elementaryExchange with amount == 0 is dropped.
        if amount == 0.0:
            logger.info(f"{filename}: Skipping elementaryExchange with amount=0.0.")
            return None  # no record for this row

        # Compartment and subcompartment are nested inside the outer <compartment>.
        compartment = exchange.find('{http://www.EcoInvent.org/EcoSpold02}compartment')
        if compartment is not None:
            compartment_main = compartment.find('{http://www.EcoInvent.org/EcoSpold02}compartment')
            compartment_text = compartment_main.text.strip() if compartment_main is not None and compartment_main.text else 'Unknown Compartment'
            subcompartment_elem = compartment.find('{http://www.EcoInvent.org/EcoSpold02}subcompartment')
            subcompartment = subcompartment_elem.text.strip() if subcompartment_elem is not None and subcompartment_elem.text else 'Unknown Subcompartment'
        else:
            compartment_text = 'Unknown Compartment'
            subcompartment = 'Unknown Subcompartment'

        outputGroup_elem = exchange.find('{http://www.EcoInvent.org/EcoSpold02}outputGroup')
        outputGroup = outputGroup_elem.text.strip() if outputGroup_elem is not None and outputGroup_elem.text else 'N/A'
        activityLinkId = exchange.attrib.get('activityLinkId', '').strip()
        intermediateExchangeId = exchange.attrib.get('intermediateExchangeId', '').strip()

        # Resolve the linked activity: global mapping first, then this file's mapping.
        if activityLinkId:
            related_info = global_activity_mapping.get(activityLinkId)
            if not related_info:
                related_info = file_activity_mapping.get(activityLinkId)
            if not related_info:
                related_info = ("Unknown Location", "Unknown Activity Name")
                logger.warning(f"{filename}: activityLinkId '{activityLinkId}' not found in global or file mapping.")
        else:
            # No activityLinkId: label the flow with the current file's own
            # location and activity. Without an explicit current_location the
            # activity name fills both slots (legacy behaviour).
            fallback_location = current_location if current_location is not None else current_activity_name
            related_info = (fallback_location, current_activity_name)

        related_shortname, related_activity_name = related_info

        # Base flow label, then the compartment/subcompartment suffix.
        formatted_flow = f"{flow}//[{related_shortname}]{related_activity_name}"
        formatted_flow = f"{formatted_flow}_{compartment_text}_{subcompartment}"

        record = {
            'process_name': current_activity_name,
            'flow': formatted_flow,
            'unit': unit,
            'amount': amount,
            'category': 'Flow Data',
            'field': 'elementaryExchange',
            'value': '',
            'compartment': compartment_text,
            'subcompartment': subcompartment,
            'comment': comment,
            'outputGroup': outputGroup,
            'section': 'elementaryExchange',
            'activityLinkId': activityLinkId,
            'intermediateExchangeId': intermediateExchangeId
        }
        return record
    except Exception as e:
        logger.error(f"{filename}: Error processing elementaryExchange - {e}")
        return {}

def process_file(filename):
    """
    Convert one .spold file from input_folder into a CSV in batch_folder.

    Steps, in order:
      1. Split the filename at the first "_" into prefix and suffix and look
         the prefix up in lookup_dict for the activity name and location.
      2. Parse the file and build records for activityDescription, every
         intermediateExchange and every elementaryExchange.
      3. Before building intermediate records, set activityLinkId = prefix on
         the intermediateExchange(s) whose intermediateExchangeId equals the
         suffix (see the rules inline).
      4. If a record's "activityLinkId_intermediateExchangeId" equals
         "prefix_suffix", use that record's flow as process_name for all rows.
      5. Keep only 'Flow Data' rows, shorten elementary flow labels, drop
         'N/A'-outputGroup duplicates and write the CSV.

    Side effects: updates the module-level counters and per-file statistics
    (converted_files, failed_files, failed_file_names, deleted_rows_per_file,
    replaced_process_name, non1_amount_per_file, neg1_amount_per_file,
    unknown_activity_files). All exceptions are caught, logged and counted as
    a failed file; nothing is returned.
    """
    global converted_files, failed_files, failed_file_names
    global deleted_rows_per_file, replaced_process_name, non1_amount_per_file, neg1_amount_per_file, global_activity_mapping, unknown_activity_files

    file_path = os.path.join(input_folder, filename)
    logger.info(f"Processing file {filename}")

    try:
        # Split the filename into prefix (before the first "_") and suffix
        # (the rest, without the extension).
        if "_" in filename:
            prefix, suffix_with_ext = filename.split("_", 1)
            suffix = os.path.splitext(suffix_with_ext)[0].strip()
            logger.info(f"{filename}: Filename split into prefix: '{prefix}' and suffix: '{suffix}'")
        else:
            prefix = os.path.splitext(filename)[0].strip()
            suffix = ""
            logger.warning(f"{filename}: Filename does not contain '_'. Using entire name as prefix.")

        # Look up the activity name and location for this file.
        file_prefix = prefix  # prefix was extracted above
        process_info = lookup_dict.get(file_prefix, ("Unknown Activity", "Unknown Location"))
        current_activity_name, current_location = process_info
        logger.info(f"{filename}: Current activity: '{current_activity_name}', Location: '{current_location}'")

        # All records of this file are collected here.
        records = []

        # Reset the dropped-row counter for this file.
        deleted_rows_per_file[filename] = 0

        # Reset the "amount != 1 without activityLinkId" counter.
        non1_amount_per_file[filename] = 0

        # Reset the "amount == -1 without activityLinkId" counter.
        neg1_amount_per_file[filename] = 0

        # Parse the .spold file.
        ecoSpold = parse_file_v2(file_path)
        namespaces = {'eco': 'http://www.EcoInvent.org/EcoSpold02'}

        # Activity mapping of this file only (extract_activity_mapping parses
        # the file again from disk).
        file_activity_mapping = extract_activity_mapping(file_path)

        # Records from the first activityDescription element.
        activityDescription = ecoSpold.find('.//eco:activityDescription', namespaces)
        activity_records = process_activity_description(activityDescription, current_activity_name, namespaces)
        records.extend(activity_records)
        logger.info(f"{filename}: Extracted {len(activity_records)} activityDescription records.")

        # Collect all intermediateExchange elements.
        intermediate_exchanges = ecoSpold.findall('.//eco:intermediateExchange', namespaces)
        logger.info(f"{filename}: Found {len(intermediate_exchanges)} intermediateExchange elements.")

        # Find the exchanges whose intermediateExchangeId equals the filename suffix.
        if suffix:
            matching_exchanges = [exchange for exchange in intermediate_exchanges if exchange.attrib.get('intermediateExchangeId', '').strip() == suffix]
            logger.info(f"{filename}: Found {len(matching_exchanges)} intermediateExchange elements matching suffix '{suffix}'.")
        else:
            matching_exchanges = []
            logger.warning(f"{filename}: Suffix is empty. No intermediateExchangeId to match.")

        # Overwrite activityLinkId on the parsed elements (in memory only):
        #   - exactly one match: always set it to the prefix;
        #   - several matches: set it on those with amount "1"/"1.0", or, if
        #     there are none, on those with amount "-1"/"-1.0"; otherwise
        #     leave everything untouched.
        # The amount comparison is on the raw attribute string.
        if len(matching_exchanges) == 1:
            exchange_to_update = matching_exchanges[0]
            exchange_to_update.attrib['activityLinkId'] = prefix
            logger.info(f"{filename}: Only one matching intermediateExchangeId. Updated activityLinkId to '{prefix}'.")
        elif len(matching_exchanges) > 1:
            # Update every exchange with amount=1, else every one with amount=-1.
            # First look for amount=1 or 1.0.
            amount_1_exchanges = [ex for ex in matching_exchanges if ex.attrib.get('amount', '').strip() in ['1', '1.0']]
            if amount_1_exchanges:
                for ex in amount_1_exchanges:
                    ex.attrib['activityLinkId'] = prefix
                    logger.info(f"{filename}: Updated activityLinkId to '{prefix}' for intermediateExchangeId '{ex.attrib.get('intermediateExchangeId')}' with amount={ex.attrib.get('amount')}.")
            else:
                # No amount=1: look for amount=-1 or -1.0.
                amount_neg1_exchanges = [ex for ex in matching_exchanges if ex.attrib.get('amount', '').strip() in ['-1', '-1.0']]
                if amount_neg1_exchanges:
                    for ex in amount_neg1_exchanges:
                        ex.attrib['activityLinkId'] = prefix
                        logger.info(f"{filename}: Updated activityLinkId to '{prefix}' for intermediateExchangeId '{ex.attrib.get('intermediateExchangeId')}' with amount={ex.attrib.get('amount')}.")
                else:
                    logger.info(f"{filename}: Multiple matches. No amount=1 or amount=-1. No changes made.")

        # Build a record for every intermediateExchange.
        for exchange in intermediate_exchanges:
            record = process_intermediate_exchange(exchange, current_activity_name, current_location, filename, file_activity_mapping, namespaces)
            if record:
                records.append(record)
            else:
                # A falsy result is either None (amount == 0) or {} (error);
                # both are counted as a dropped row.
                deleted_rows_per_file[filename] += 1

            # Statistics: exchanges without activityLinkId whose amount != 1.
            # This reads the attributes after the activityLinkId update above,
            # and re-parses the amount (an invalid amount is warned about again).
            activityLinkId = exchange.attrib.get('activityLinkId', '').strip()
            amount_str = exchange.attrib.get('amount', 'N/A').strip()
            try:
                amount = float(amount_str)
            except ValueError:
                amount = None
                logger.warning(f"{filename}: Invalid amount value '{amount_str}' in intermediateExchangeId '{exchange.attrib.get('intermediateExchangeId', '').strip()}'.")

            if not activityLinkId and amount is not None and amount != 1.0:
                non1_amount_per_file[filename] = non1_amount_per_file.get(filename, 0) + 1
                logger.info(f"{filename}: Found intermediateExchange with amount={amount} and no activityLinkId.")
                if amount == -1.0:
                    neg1_amount_per_file[filename] = neg1_amount_per_file.get(filename, 0) + 1
                    logger.info(f"{filename}: Found intermediateExchange with amount=-1.0 and no activityLinkId.")

        # Build a record for every elementaryExchange.
        elementary_exchanges = ecoSpold.findall('.//eco:elementaryExchange', namespaces)
        logger.info(f"{filename}: Found {len(elementary_exchanges)} elementaryExchange elements.")
        for exchange in elementary_exchanges:
            record = process_elementary_exchange(exchange, current_activity_name, filename, file_activity_mapping, namespaces)
            if record:  # only keep rows that were processed successfully
                records.append(record)
            else:
                deleted_rows_per_file[filename] += 1

            # Statistics: exchanges without activityLinkId whose amount != 1
            # (same counters as for intermediate exchanges).
            activityLinkId = exchange.attrib.get('activityLinkId', '').strip()
            amount_str = exchange.attrib.get('amount', 'N/A').strip()
            try:
                amount = float(amount_str)
            except ValueError:
                amount = None
                logger.warning(f"{filename}: Invalid amount value '{amount_str}' in elementaryExchangeId '{exchange.attrib.get('intermediateExchangeId', '').strip()}'.")

            if not activityLinkId and amount is not None and amount != 1.0:
                non1_amount_per_file[filename] = non1_amount_per_file.get(filename, 0) + 1
                logger.info(f"{filename}: Found elementaryExchange with amount={amount} and no activityLinkId.")
                if amount == -1.0:
                    neg1_amount_per_file[filename] = neg1_amount_per_file.get(filename, 0) + 1
                    logger.info(f"{filename}: Found elementaryExchange with amount=-1.0 and no activityLinkId.")

        # Put all records into a DataFrame with a fixed column order.
        all_data = pd.DataFrame(records, columns=[
            'process_name', 'flow', 'unit', 'amount', 'category', 'field', 'value',
            'compartment', 'subcompartment', 'comment', 'outputGroup', 'section',
            'activityLinkId', 'intermediateExchangeId'
        ])

        # Build "activityLinkId_intermediateExchangeId" per row and compare it
        # with "prefix_suffix".
        matched_flow = None
        target_combined = f"{prefix}_{suffix}"
        logger.info(f"{filename}: Target combined string: '{target_combined}'")

        # Boolean mask of the rows whose combined key equals the target.
        condition = (all_data['activityLinkId'].fillna('') + '_' + all_data['intermediateExchangeId'].fillna('')) == target_combined

        if any(condition):
            matched_flow = all_data.loc[condition, 'flow'].iloc[0]  # assumes a single match; the first one is used
            logger.info(f"{filename}: Matched combined string '{target_combined}'. Setting process_name to '{matched_flow}' for all records.")

        # When a match was found, use its flow as process_name for the whole file.
        if matched_flow:
            all_data['process_name'] = matched_flow  # overwrite the entire column
            replaced_process_name[filename] = matched_flow
            logger.info(f"{filename}: process_name has been set to '{matched_flow}' for all records based on combination '{target_combined}'.")

        # Remember files in which some flow still carries the unresolved
        # "[Unknown Location]Unknown Activity Name" label.
        if any(all_data['flow'].str.contains(r'\[Unknown Location\]Unknown Activity Name')):
            unknown_activity_files.append(filename)
            logger.warning(f"{filename}: Contains '[Unknown Location]Unknown Activity Name' in flow fields after processing.")

        # Keep only the 'Flow Data' rows, as an independent copy.
        flow_data = all_data[all_data['category'] == 'Flow Data'].copy()
        logger.info(f"{filename}: 'Flow Data' records count: {len(flow_data)}")

        # --- Added feature: rewrite matching 'flow' values ---
        # Helper that rewrites a single flow string.
        def modify_flow(flow):
            """
            Shorten a flow label.

            If the flow contains "//", everything from "//" up to and including
            the first "_" that follows it is removed and a single "_" is put in
            its place, joining the parts before and after. A flow without "//"
            is returned unchanged; so is one with "//" but no later "_" (a
            warning is logged in that case).
            """
            if '//' in flow:
                try:
                    start = flow.index('//')
                    # Position of the first '_' at or after start.
                    after_start = flow[start:].index('_')
                    end = start + after_start + 1  # one past the '_'
                    # Cut "//..._" out and join the remaining parts with "_".
                    modified_flow = flow[:start] + '_' + flow[end:]
                    logger.debug(f"Original flow: '{flow}' | Modified flow: '{modified_flow}'")
                    return modified_flow
                except ValueError:
                    # No '_' found after "//": return the flow unchanged.
                    logger.warning(f"Flow '{flow}' contains '//' but no '_'. No modification applied.")
                    return flow
            else:
                return flow

        # Apply modify_flow to the 'flow' column of elementaryExchange rows only.
        try:
            flow_data.loc[flow_data['field'] == 'elementaryExchange', 'flow'] = flow_data.loc[flow_data['field'] == 'elementaryExchange', 'flow'].apply(modify_flow)
            logger.info(f"{filename}: Modified 'flow' column for 'elementaryExchange' records.")
        except Exception as e:
            logger.error(f"{filename}: Error modifying 'flow' column for 'elementaryExchange' - {e}")

        # --- End of added feature ---

        # --- Added feature: drop duplicate rows whose outputGroup is 'N/A' ---
        # Rows count as duplicates when they are identical in every column
        # except 'outputGroup'.
        subset_cols = [col for col in flow_data.columns if col != 'outputGroup']
        duplicates = flow_data.duplicated(subset=subset_cols, keep=False)

        duplicated_flow_data = flow_data[duplicates]

        # Index labels of the rows to remove.
        indexes_to_drop = []

        # Group the duplicated rows by all columns except outputGroup.
        # NOTE(review): pandas groupby drops groups with NaN keys by default,
        # so rows with a missing amount may never be grouped here - confirm.
        grouped = duplicated_flow_data.groupby(subset_cols)

        for group_keys, group in grouped:
            # Only groups whose outputGroup values are exactly '0' and 'N/A'.
            if set(group['outputGroup']) == {'0', 'N/A'}:
                # Mark the rows with outputGroup 'N/A' for removal.
                n_a_rows = group[group['outputGroup'] == 'N/A']
                indexes_to_drop.extend(n_a_rows.index.tolist())

        if indexes_to_drop:
            flow_data = flow_data.drop(indexes_to_drop)
            logger.info(f"{filename}: Removed {len(indexes_to_drop)} duplicate rows with outputGroup 'N/A'.")

        # --- End of added feature ---

        # Output path: same base name with a .csv extension (os.path.splitext
        # replaces the extension reliably).
        output_filename = f"{os.path.splitext(filename)[0]}.csv"
        output_path = os.path.join(batch_folder, output_filename)

        # Write the 'Flow Data' rows to the CSV file.
        try:
            # Even an empty flow_data still produces a CSV with the header row.
            flow_data.to_csv(output_path, index=False, encoding='utf-8')
            converted_files += 1
            logger.info(f"{filename}: Flow data processed and saved to '{output_path}'")
        except Exception as e:
            failed_files += 1
            failed_file_names.append(filename)
            logger.error(f"{filename}: Error saving CSV file - {e}")

    except Exception as e:
        # Any other failure marks the whole file as failed.
        logger.error(f"{filename}: Error processing file - {e}")
        failed_files += 1
        failed_file_names.append(filename)

def _summary_sections():
    """Return the per-file statistics as (title, lines) pairs, shared by summary.txt and the log."""
    return [
        ("=== Deleted Rows Per File ===",
         [f"{fname}: {count} rows deleted" for fname, count in deleted_rows_per_file.items()]),
        ("=== Replaced Process Name Per File ===",
         [f"{fname}: process_name replaced with flow '{flow}'" for fname, flow in replaced_process_name.items()]),
        ("=== Non-1 Amount Without activityLinkId Per File ===",
         [f"{fname}: {count} rows with amount != 1 and no activityLinkId" for fname, count in non1_amount_per_file.items()]),
        ("=== Amount=-1 Without activityLinkId Per File ===",
         [f"{fname}: {count} rows with amount=-1 and no activityLinkId" for fname, count in neg1_amount_per_file.items()]),
        ("=== Files with Unknown Activity Name After Deletion ===",
         [f"{fname}" for fname in unknown_activity_files]),
    ]


def _write_positive_counts(csv_path, header, counts):
    """Write a two-column CSV with the entries of counts whose count is greater than zero."""
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows([fname, count] for fname, count in counts.items() if count > 0)


def main():
    """
    Run the whole conversion over every .spold file in input_folder.

    Pass 1 builds global_activity_mapping from all files and exports it to
    global_activity_mapping.csv. Pass 2 calls process_file on each file. Then
    failed_files.txt, summary.txt, non1_amount_files.csv and
    neg1_amount_files.csv are written to batch_folder, the same summary is
    logged, and the batch number file is incremented for the next run.

    Each reporting step is best-effort: a failure is logged and the run
    continues. Nothing is returned.
    """
    global total_files, converted_files, failed_files, failed_file_names

    # Pass 1: build the global mapping.
    logger.info("Starting first pass to build global activity mapping.")

    # List the folder once and sort it: os.listdir returns entries in
    # arbitrary order, and both passes must see the same files in a
    # reproducible order (it also fixes which file wins when two files define
    # the same activity id).
    spold_files = sorted(name for name in os.listdir(input_folder) if name.endswith(".spold"))

    for filename in spold_files:
        logger.info(f"Building mapping from file {filename}")
        global_activity_mapping.update(extract_activity_mapping(os.path.join(input_folder, filename)))

    logger.info(f"Global activity mapping built with {len(global_activity_mapping)} entries.")

    # Export the global mapping to CSV so it can be inspected.
    mapping_output_path = os.path.join(batch_folder, "global_activity_mapping.csv")
    try:
        with open(mapping_output_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['activity_id', 'shortname', 'activityName'])
            for activity_id, (shortname, activityName) in global_activity_mapping.items():
                writer.writerow([activity_id, shortname, activityName])
        logger.info(f"Global activity mapping exported to {mapping_output_path}")
    except Exception as e:
        logger.error(f"Error exporting global activity mapping to CSV: {e}")

    # Pass 2: convert every file.
    logger.info("Starting second pass to process all files.")

    for filename in spold_files:
        total_files += 1
        logger.info(f"Processing file {filename}")

        try:
            process_file(filename)
        except Exception as e:
            # process_file handles its own errors; this is a last line of defence.
            failed_files += 1
            failed_file_names.append(filename)
            logger.error(f"{filename}: Error processing file - {e}")

    # Save the list of failed files.
    failed_files_log_path = os.path.join(batch_folder, "failed_files.txt")
    try:
        with open(failed_files_log_path, 'w', encoding='utf-8') as f:
            for fname in failed_file_names:
                f.write(f"{fname}\n")
        logger.info(f"Failed file names saved to {failed_files_log_path}")
    except Exception as e:
        logger.error(f"Error writing failed files log: {e}")

    # The statistics do not change from here on, so build the report once and
    # reuse it for summary.txt and for the log.
    sections = _summary_sections()

    # Write summary.txt (totals, failed files, then one block per statistic).
    summary_log_path = os.path.join(batch_folder, "summary.txt")
    try:
        with open(summary_log_path, 'w', encoding='utf-8') as f:
            f.write("=== Processing Summary ===\n")
            f.write(f"Total .spold files found: {total_files}\n")
            f.write(f"Successfully converted to CSV: {converted_files}\n")
            f.write(f"Failed to convert: {failed_files}\n\n")
            if failed_files > 0:
                f.write("List of failed files:\n")
                for fname in failed_file_names:
                    f.write(f"- {fname}\n")
            for title, lines in sections:
                f.write(f"\n{title}\n")
                for line in lines:
                    f.write(f"{line}\n")
        logger.info(f"Processing summary saved to {summary_log_path}")
    except Exception as e:
        logger.error(f"Error writing summary log: {e}")

    # Save the files with a positive non-1 amount count to CSV.
    output_non1_path = os.path.join(batch_folder, "non1_amount_files.csv")
    try:
        _write_positive_counts(output_non1_path, ['Filename', 'Non-1 Amount Count'], non1_amount_per_file)
        logger.info(f"Non-1 amount files saved to {output_non1_path}")
    except Exception as e:
        logger.error(f"Error exporting non1 amount files to CSV: {e}")

    # Save the files with a positive amount=-1 count to CSV.
    output_neg1_path = os.path.join(batch_folder, "neg1_amount_files.csv")
    try:
        _write_positive_counts(output_neg1_path, ['Filename', 'Amount=-1 Count'], neg1_amount_per_file)
        logger.info(f"Amount=-1 files saved to {output_neg1_path}")
    except Exception as e:
        logger.error(f"Error exporting amount=-1 files to CSV: {e}")

    # Repeat the summary in the log file.
    logger.info("=== Processing Summary ===")
    logger.info(f"Total .spold files found: {total_files}")
    logger.info(f"Successfully converted to CSV: {converted_files}")
    logger.info(f"Failed to convert: {failed_files}")

    if failed_files > 0:
        logger.info("List of failed files:")
        for fname in failed_file_names:
            logger.info(f"- {fname}")

    for title, lines in sections:
        logger.info(title)
        for line in lines:
            logger.info(line)

    # Increment the batch number for the next run.
    try:
        with open(batch_file_path, "w") as f:
            f.write(str(batch_number + 1))
        logger.info(f"Batch number incremented to {batch_number + 1}")
    except Exception as e:
        logger.error(f"Error updating batch number: {e}")

    # Only claim success when nothing failed.
    if failed_files > 0:
        logger.warning(f"Processing finished with {failed_files} failed file(s); see {failed_files_log_path}.")
    else:
        logger.info("All files processed successfully.")

if __name__ == "__main__":
    main()


In [2]:
from pyecospold import parse_file_v2
import pandas as pd
import os
from datetime import datetime
import logging
import csv

# Global mapping from activity_id to (shortname, activityName)
global_activity_mapping = {}

# Run-wide counters and the names of the files that failed
total_files = 0
converted_files = 0
failed_files = 0
failed_file_names = []

# Number of rows dropped per file
deleted_rows_per_file = {}

# Replacement process_name per file (only files where a replacement happened)
replaced_process_name = {}

# Per file: rows that have no activityLinkId and an amount other than 1
non1_amount_per_file = {}

# Per file: rows that have no activityLinkId and an amount of -1
neg1_amount_per_file = {}

# Files that still contain "[Unknown Location]Unknown Activity Name" in a flow
unknown_activity_files = []

# Input and output locations
input_folder = "C:\\Users\\WasteWang\\LCA\\DATA\\3.11_APOS\\datasets"
output_folder_base = "C:\\Users\\WasteWang\\LCA\\OUTPUT"
lookup_file_path = "C:\\Users\\WasteWang\\LCA\\DATA\\3.11_APOS\\FilenameToActivityLookup.csv"
batch_file_path = "C:\\Users\\WasteWang\\LCA\\batch_number.txt"  # file that stores the batch number

# Seed the batch counter file with "1" the first time the pipeline runs.
batch_file_missing = not os.path.exists(batch_file_path)
if batch_file_missing:
    try:
        with open(batch_file_path, "w") as f:
            f.write("1")
        print("Batch number file created with initial value 1.")
    except Exception as e:
        print(f"Error creating batch number file: {e}")

# Load the current batch number; any read or parse problem falls back to 1.
try:
    with open(batch_file_path, "r") as f:
        raw_batch_text = f.read()
    batch_number = int(raw_batch_text.strip())
    print(f"Current batch number: {batch_number}")
except Exception as e:
    print(f"Error reading batch number file: {e}")
    batch_number = 1

# Month+day stamp (MMDD) used as the prefix of the batch folder name.
current_date = f"{datetime.now():%m%d}"
print(f"Current date: {current_date}")

# Output folder of this batch: <output_folder_base>/<MMDD>_<batch number>.
batch_folder_name = "_".join((current_date, str(batch_number)))
batch_folder = os.path.join(output_folder_base, batch_folder_name)
os.makedirs(batch_folder, exist_ok=True)
print(f"Output will be saved to: {batch_folder}")

# Configure the pipeline logger: INFO level, written only to a file.
logger = logging.getLogger('spold_processor')
logger.setLevel(logging.INFO)

# logging.getLogger returns the same logger object on every call, so re-running
# this code in one interpreter session (e.g. a notebook kernel) would stack one
# more FileHandler per run: duplicated log lines and file handles left open on
# earlier batch folders. Detach and close whatever is already attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Do not hand records to the root logger, so the output really is file-only
# even if something else has configured root handlers.
logger.propagate = False

# File handler writing to processing_debug.txt inside the batch output folder.
file_handler = logging.FileHandler(os.path.join(batch_folder, 'processing_debug.txt'), encoding='utf-8')
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s %(levelname)s:%(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Load the filename -> activity lookup table (semicolon-separated CSV).
# Keys are the part of 'Filename' before the first '_'; values are
# (ActivityName, Location) tuples. Any failure leaves an empty dictionary.
try:
    lookup_df = pd.read_csv(lookup_file_path, sep=';')
    lookup_dict = {}
    for _, lookup_row in lookup_df.iterrows():
        file_key = lookup_row['Filename'].split('_')[0]
        lookup_dict[file_key] = (lookup_row['ActivityName'], lookup_row['Location'])
    logger.info(f"Loaded lookup dictionary with {len(lookup_dict)} entries.")
except Exception as e:
    logger.error(f"Error loading lookup file: {e}")
    lookup_dict = {}

# Check that lookup_dict covers every input file.
def verify_lookup_dict():
    """
    Log which .spold file prefixes in input_folder have no entry in lookup_dict.

    The prefix of a file is the text before its first '_', or the file name
    without extension when it contains no '_'. Nothing is returned: a warning
    listing the missing prefixes is logged, or an info line when none is missing.
    """
    missing_prefixes = set()
    for filename in os.listdir(input_folder):
        if not filename.endswith(".spold"):
            continue
        prefix = filename.split('_')[0] if '_' in filename else os.path.splitext(filename)[0]
        if prefix not in lookup_dict:
            missing_prefixes.add(prefix)

    if missing_prefixes:
        # Sorted so the message is reproducible between runs; plain set
        # iteration order is not.
        logger.warning(f"The following prefixes are missing in lookup_dict: {', '.join(sorted(missing_prefixes))}")
    else:
        logger.info("All prefixes are present in lookup_dict.")


# Run the check once when this code is executed.
verify_lookup_dict()

def extract_activity_mapping(file_path):
    """
    Parse one .spold file and map each activity id found in it to a
    (shortname, activityName) tuple.

    Every activityDescription element in the document is visited. A missing or
    empty activity name becomes "Unknown Activity Name"; a missing geography or
    shortname becomes "Unknown Location". Activities without an id are ignored.
    Any exception is logged and the entries collected so far are returned.
    """
    def _stripped_text(node, fallback):
        # Text of `node` without surrounding whitespace, or `fallback` when the
        # node is absent or carries no text.
        if node is None or not node.text:
            return fallback
        return node.text.strip()

    found = {}
    try:
        logger.info(f"Parsing file: {file_path}")
        document = parse_file_v2(file_path)
        ns = {'eco': 'http://www.EcoInvent.org/EcoSpold02'}

        # './/' searches the whole tree, so nested descriptions are included too.
        for description in document.findall('.//eco:activityDescription', ns):
            activity = description.find('eco:activity', ns)
            if activity is None:
                continue

            activity_id = activity.attrib.get('id')
            activity_name_text = _stripped_text(activity.find('eco:activityName', ns), "Unknown Activity Name")

            geography = description.find('eco:geography', ns)
            if geography is None:
                shortname = "Unknown Location"
            else:
                shortname = _stripped_text(geography.find('eco:shortname', ns), "Unknown Location")

            if activity_id:
                found[activity_id] = (shortname, activity_name_text)
                logger.info(f"Mapped activity_id {activity_id} to ({shortname}, {activity_name_text})")
    except Exception as e:
        logger.error(f"Error extracting activity mapping from file {file_path}: {e}")
    return found

def process_activity_description(activityDescription, current_activity_name, namespaces):
    """
    Turn the descriptive children of an activityDescription element into records.

    One record is produced for each of includedActivitiesStart,
    includedActivitiesEnd and generalComment, in that order. The child's stripped
    text goes into 'value' ('N/A' when the child is missing or empty). Returns an
    empty list when activityDescription is None. Errors are logged and the records
    gathered so far are returned.
    """
    # (child tag, category) pairs; the tag doubles as the record's 'field' value.
    field_categories = (
        ('includedActivitiesStart', 'Included Activities'),
        ('includedActivitiesEnd', 'Included Activities'),
        ('generalComment', 'General Comment'),
    )
    collected = []
    try:
        if activityDescription is None:
            return collected
        for tag, category in field_categories:
            node = activityDescription.find(f'eco:{tag}', namespaces)
            if node is not None and node.text:
                value_text = node.text.strip()
            else:
                value_text = 'N/A'
            collected.append({
                'process_name': current_activity_name,
                'flow': '',  # flow columns carry no data for this section
                'unit': '',
                'amount': '',
                'category': category,
                'field': tag,
                'value': value_text,
                'compartment': '',
                'subcompartment': '',
                'comment': '',
                'outputGroup': '',
                'section': 'activityDescription',
                'activityLinkId': '',
                'intermediateExchangeId': ''
            })
    except Exception as e:
        logger.error(f"Error processing activityDescription: {e}")
    return collected

def process_intermediate_exchange(exchange, current_activity_name, current_location, filename, file_activity_mapping, namespaces):
    """
    Build the output record for one intermediateExchange element.

    Returns the record dict, None when the amount is exactly 0 (the row is
    dropped), or an empty dict when an unexpected error occurs (it is logged).

    The flow label has the form "<flow name>//[<location>]<activity name>".
    Location and activity come from the activityLinkId lookup (global mapping
    first, then this file's mapping, then "Unknown ..." placeholders); an
    exchange without activityLinkId uses the current location and activity.
    The `namespaces` argument is not used; child tags are looked up with the
    fully qualified EcoSpold02 namespace.
    """
    ns_prefix = '{http://www.EcoInvent.org/EcoSpold02}'

    def _child_text(tag, fallback):
        # Stripped text of the named child, or `fallback` if absent or empty.
        child = exchange.find(ns_prefix + tag)
        if child is None or not child.text:
            return fallback
        return child.text.strip()

    try:
        attrs = exchange.attrib
        intermediateExchangeId = attrs.get('intermediateExchangeId', '').strip()
        amount_str = attrs.get('amount', 'N/A').strip()
        try:
            amount = float(amount_str)
        except ValueError:
            amount = None
            logger.warning(f"{filename}: Invalid amount value '{amount_str}' in intermediateExchangeId '{intermediateExchangeId}'.")

        flow_name = _child_text('name', "Unknown Flow")
        unit_name = _child_text('unitName', "Unknown Unit")
        comment = _child_text('comment', 'N/A')
        outputGroup = _child_text('outputGroup', 'N/A')
        activityLinkId = attrs.get('activityLinkId', '').strip()

        # Zero-amount exchanges are dropped from the output.
        if amount == 0.0:
            logger.info(f"{filename}: Skipping intermediateExchange with amount=0.0.")
            return None

        if not activityLinkId:
            # No link: label the flow with the activity of the current file.
            related_shortname, related_activity_name = current_location, current_activity_name
        else:
            # Global mapping first, then the mapping of the current file.
            related_info = global_activity_mapping.get(activityLinkId) or file_activity_mapping.get(activityLinkId)
            if not related_info:
                related_info = ("Unknown Location", "Unknown Activity Name")
                logger.warning(f"{filename}: activityLinkId '{activityLinkId}' not found in global or file mapping.")
            related_shortname, related_activity_name = related_info

        formatted_flow = f"{flow_name}//[{related_shortname}]{related_activity_name}"

        return {
            'process_name': current_activity_name,
            'flow': formatted_flow,
            'unit': unit_name,
            'amount': amount,
            'category': 'Flow Data',
            'field': 'intermediateExchange',
            'value': '',
            'compartment': '',
            'subcompartment': '',
            'comment': comment,
            'outputGroup': outputGroup,
            'section': 'intermediateExchange',
            'activityLinkId': activityLinkId,
            'intermediateExchangeId': intermediateExchangeId
        }
    except Exception as e:
        logger.error(f"{filename}: Error processing intermediateExchange - {e}")
        return {}

def process_elementary_exchange(exchange, current_activity_name, filename, file_activity_mapping, namespaces, current_location=None):
    """
    Build the output record for one elementaryExchange element.

    Args:
        exchange: the elementaryExchange element; attributes are read through
            `.attrib`, children through `.find` with the EcoSpold02 namespace.
        current_activity_name: activity name of the file being processed.
        filename: file name, used only as a prefix in log messages.
        file_activity_mapping: activity_id -> (shortname, activityName) of the
            current file, consulted when the global mapping has no entry.
        namespaces: not used; kept so existing callers keep working.
        current_location: optional location of the current activity. When given,
            it fills the "[location]" slot of the flow label for exchanges
            without an activityLinkId. When omitted, the activity name is used
            in that slot, which is what this function did before the parameter
            existed.

    Returns:
        The record dict; None when the amount is exactly 0 (row dropped); an
        empty dict when an unexpected error occurs (the error is logged).

    The flow label has the form
    "<flow>//[<location>]<activity name>_<compartment>_<subcompartment>".
    """
    ns_prefix = '{http://www.EcoInvent.org/EcoSpold02}'

    def _text_or(node, fallback):
        # Stripped text of `node`, or `fallback` when it is absent or empty.
        if node is None or not node.text:
            return fallback
        return node.text.strip()

    try:
        attrs = exchange.attrib
        flow = _text_or(exchange.find(ns_prefix + 'name'), "Unknown Flow")
        unit = _text_or(exchange.find(ns_prefix + 'unitName'), "Unknown Unit")
        amount_str = attrs.get('amount', 'N/A').strip()
        try:
            amount = float(amount_str)
        except ValueError:
            amount = None
            # Report the elementary exchange's own id; fall back to the attribute
            # the message used to read so nothing is lost if only that one exists.
            exchange_id = (attrs.get('elementaryExchangeId') or attrs.get('intermediateExchangeId', '')).strip()
            logger.warning(f"{filename}: Invalid amount value '{amount_str}' in elementaryExchangeId '{exchange_id}'.")

        comment = _text_or(exchange.find(ns_prefix + 'comment'), 'N/A')

        # Zero-amount exchanges are dropped from the output.
        if amount == 0.0:
            logger.info(f"{filename}: Skipping elementaryExchange with amount=0.0.")
            return None

        # The outer <compartment> wraps an inner <compartment> and <subcompartment>.
        compartment = exchange.find(ns_prefix + 'compartment')
        if compartment is not None:
            compartment_text = _text_or(compartment.find(ns_prefix + 'compartment'), 'Unknown Compartment')
            subcompartment = _text_or(compartment.find(ns_prefix + 'subcompartment'), 'Unknown Subcompartment')
        else:
            compartment_text = 'Unknown Compartment'
            subcompartment = 'Unknown Subcompartment'

        outputGroup = _text_or(exchange.find(ns_prefix + 'outputGroup'), 'N/A')
        activityLinkId = attrs.get('activityLinkId', '').strip()
        # NOTE(review): this column is read from 'intermediateExchangeId', which an
        # elementaryExchange presumably does not carry, so it is likely always ''
        # for these rows. Left unchanged because the CSV columns depend on it.
        intermediateExchangeId = attrs.get('intermediateExchangeId', '').strip()

        if activityLinkId:
            # Global mapping first, then the mapping of the current file.
            related_info = global_activity_mapping.get(activityLinkId) or file_activity_mapping.get(activityLinkId)
            if not related_info:
                related_info = ("Unknown Location", "Unknown Activity Name")
                logger.warning(f"{filename}: activityLinkId '{activityLinkId}' not found in global or file mapping.")
        else:
            # No link: describe the flow with the current activity. Without an
            # explicit location the activity name fills the location slot.
            location = current_activity_name if current_location is None else current_location
            related_info = (location, current_activity_name)

        related_shortname, related_activity_name = related_info

        formatted_flow = f"{flow}//[{related_shortname}]{related_activity_name}"
        formatted_flow = f"{formatted_flow}_{compartment_text}_{subcompartment}"

        return {
            'process_name': current_activity_name,
            'flow': formatted_flow,
            'unit': unit,
            'amount': amount,
            'category': 'Flow Data',
            'field': 'elementaryExchange',
            'value': '',
            'compartment': compartment_text,
            'subcompartment': subcompartment,
            'comment': comment,
            'outputGroup': outputGroup,
            'section': 'elementaryExchange',
            'activityLinkId': activityLinkId,
            'intermediateExchangeId': intermediateExchangeId
        }
    except Exception as e:
        logger.error(f"{filename}: Error processing elementaryExchange - {e}")
        return {}

def process_file(filename):
    """
    Convert one .spold file from input_folder into a CSV of its 'Flow Data' rows.

    Steps, in order:
      1. Split the file name into prefix and suffix at the first '_' and look up
         the activity name and location for the prefix in lookup_dict.
      2. Parse the file and collect records from activityDescription, every
         intermediateExchange and every elementaryExchange.
      3. Before the intermediate exchanges are turned into records, set
         activityLinkId = prefix on the exchange(s) whose intermediateExchangeId
         equals the suffix (rules documented inline).
      4. If a record's "activityLinkId_intermediateExchangeId" equals
         "prefix_suffix", use that record's flow as process_name for all rows.
      5. Keep only 'Flow Data' rows, shorten the flow label of elementaryExchange
         rows, drop 'N/A'-outputGroup duplicates and write the CSV to batch_folder.

    Nothing is returned. The module-level counters and per-file dictionaries are
    updated as a side effect. Every exception is caught, logged and counted as a
    failed file, so this function does not raise.
    """
    global converted_files, failed_files, failed_file_names
    global deleted_rows_per_file, replaced_process_name, non1_amount_per_file, neg1_amount_per_file, global_activity_mapping, unknown_activity_files

    file_path = os.path.join(input_folder, filename)
    logger.info(f"Processing file {filename}")

    try:
        # Split the file name into prefix and suffix at the first '_'
        if "_" in filename:
            prefix, suffix_with_ext = filename.split("_", 1)
            suffix = os.path.splitext(suffix_with_ext)[0].strip()
            logger.info(f"{filename}: Filename split into prefix: '{prefix}' and suffix: '{suffix}'")
        else:
            prefix = os.path.splitext(filename)[0].strip()
            suffix = ""
            logger.warning(f"{filename}: Filename does not contain '_'. Using entire name as prefix.")

        # Look up activityName and location for this file
        file_prefix = prefix  # prefix was extracted above
        process_info = lookup_dict.get(file_prefix, ("Unknown Activity", "Unknown Location"))
        current_activity_name, current_location = process_info
        logger.info(f"{filename}: Current activity: '{current_activity_name}', Location: '{current_location}'")

        # All records of this file are collected here
        records = []

        # Reset the dropped-row counter
        deleted_rows_per_file[filename] = 0

        # Reset the "amount != 1 without activityLinkId" counter
        non1_amount_per_file[filename] = 0

        # Reset the "amount == -1 without activityLinkId" counter
        neg1_amount_per_file[filename] = 0

        # Parse the .spold file
        ecoSpold = parse_file_v2(file_path)
        namespaces = {'eco': 'http://www.EcoInvent.org/EcoSpold02'}

        # Activity mapping of this file only.
        # NOTE(review): extract_activity_mapping parses file_path again, so every
        # file is parsed twice in this function.
        file_activity_mapping = extract_activity_mapping(file_path)

        # Records from the activityDescription section (first match in the tree)
        activityDescription = ecoSpold.find('.//eco:activityDescription', namespaces)
        activity_records = process_activity_description(activityDescription, current_activity_name, namespaces)
        records.extend(activity_records)
        logger.info(f"{filename}: Extracted {len(activity_records)} activityDescription records.")

        # All intermediateExchange elements
        intermediate_exchanges = ecoSpold.findall('.//eco:intermediateExchange', namespaces)
        logger.info(f"{filename}: Found {len(intermediate_exchanges)} intermediateExchange elements.")

        # Exchanges whose intermediateExchangeId equals the file name suffix
        if suffix:
            matching_exchanges = [exchange for exchange in intermediate_exchanges if exchange.attrib.get('intermediateExchangeId', '').strip() == suffix]
            logger.info(f"{filename}: Found {len(matching_exchanges)} intermediateExchange elements matching suffix '{suffix}'.")
        else:
            matching_exchanges = []
            logger.warning(f"{filename}: Suffix is empty. No intermediateExchangeId to match.")

        # Set activityLinkId on the matching exchange(s). This mutates the parsed
        # tree in memory, so the record building below sees the new value.
        if len(matching_exchanges) == 1:
            exchange_to_update = matching_exchanges[0]
            exchange_to_update.attrib['activityLinkId'] = prefix
            logger.info(f"{filename}: Only one matching intermediateExchangeId. Updated activityLinkId to '{prefix}'.")
        elif len(matching_exchanges) > 1:
            # Several matches: prefer those with amount 1, otherwise those with amount -1.
            # The comparison is on the attribute text, so only '1'/'1.0' (and
            # '-1'/'-1.0' below) are recognised.
            amount_1_exchanges = [ex for ex in matching_exchanges if ex.attrib.get('amount', '').strip() in ['1', '1.0']]
            if amount_1_exchanges:
                for ex in amount_1_exchanges:
                    ex.attrib['activityLinkId'] = prefix
                    logger.info(f"{filename}: Updated activityLinkId to '{prefix}' for intermediateExchangeId '{ex.attrib.get('intermediateExchangeId')}' with amount={ex.attrib.get('amount')}.")
            else:
                # No amount=1 match: fall back to amount=-1 / -1.0
                amount_neg1_exchanges = [ex for ex in matching_exchanges if ex.attrib.get('amount', '').strip() in ['-1', '-1.0']]
                if amount_neg1_exchanges:
                    for ex in amount_neg1_exchanges:
                        ex.attrib['activityLinkId'] = prefix
                        logger.info(f"{filename}: Updated activityLinkId to '{prefix}' for intermediateExchangeId '{ex.attrib.get('intermediateExchangeId')}' with amount={ex.attrib.get('amount')}.")
                else:
                    logger.info(f"{filename}: Multiple matches. No amount=1 or amount=-1. No changes made.")

        # Build a record for every intermediateExchange
        for exchange in intermediate_exchanges:
            record = process_intermediate_exchange(exchange, current_activity_name, current_location, filename, file_activity_mapping, namespaces)
            if record:
                records.append(record)
            else:
                # Falsy result: either None (amount == 0, row dropped) or {}
                # (processing error); both are counted as deleted rows.
                deleted_rows_per_file[filename] += 1

            # Count exchanges without activityLinkId whose amount is not 1.
            # The amount is parsed a second time here, so an invalid value is
            # warned about both in process_intermediate_exchange and below.
            activityLinkId = exchange.attrib.get('activityLinkId', '').strip()
            amount_str = exchange.attrib.get('amount', 'N/A').strip()
            try:
                amount = float(amount_str)
            except ValueError:
                amount = None
                logger.warning(f"{filename}: Invalid amount value '{amount_str}' in intermediateExchangeId '{exchange.attrib.get('intermediateExchangeId', '').strip()}'.")

            if not activityLinkId and amount is not None and amount != 1.0:
                non1_amount_per_file[filename] = non1_amount_per_file.get(filename, 0) + 1
                logger.info(f"{filename}: Found intermediateExchange with amount={amount} and no activityLinkId.")
                if amount == -1.0:
                    neg1_amount_per_file[filename] = neg1_amount_per_file.get(filename, 0) + 1
                    logger.info(f"{filename}: Found intermediateExchange with amount=-1.0 and no activityLinkId.")

        # Build a record for every elementaryExchange.
        # No location is passed to process_elementary_exchange; for exchanges
        # without activityLinkId it puts the activity name into the "[...]" slot
        # of the flow label. modify_flow below removes that segment again.
        elementary_exchanges = ecoSpold.findall('.//eco:elementaryExchange', namespaces)
        logger.info(f"{filename}: Found {len(elementary_exchanges)} elementaryExchange elements.")
        for exchange in elementary_exchanges:
            record = process_elementary_exchange(exchange, current_activity_name, filename, file_activity_mapping, namespaces)
            if record:  # only keep records that were produced successfully
                records.append(record)
            else:
                deleted_rows_per_file[filename] += 1

            # Count exchanges without activityLinkId whose amount is not 1
            activityLinkId = exchange.attrib.get('activityLinkId', '').strip()
            amount_str = exchange.attrib.get('amount', 'N/A').strip()
            try:
                amount = float(amount_str)
            except ValueError:
                amount = None
                logger.warning(f"{filename}: Invalid amount value '{amount_str}' in elementaryExchangeId '{exchange.attrib.get('intermediateExchangeId', '').strip()}'.")

            if not activityLinkId and amount is not None and amount != 1.0:
                non1_amount_per_file[filename] = non1_amount_per_file.get(filename, 0) + 1
                logger.info(f"{filename}: Found elementaryExchange with amount={amount} and no activityLinkId.")
                if amount == -1.0:
                    neg1_amount_per_file[filename] = neg1_amount_per_file.get(filename, 0) + 1
                    logger.info(f"{filename}: Found elementaryExchange with amount=-1.0 and no activityLinkId.")

        # Build the DataFrame with a fixed column order
        all_data = pd.DataFrame(records, columns=[
            'process_name', 'flow', 'unit', 'amount', 'category', 'field', 'value',
            'compartment', 'subcompartment', 'comment', 'outputGroup', 'section',
            'activityLinkId', 'intermediateExchangeId'
        ])

        # Look for the row whose "activityLinkId_intermediateExchangeId" equals "prefix_suffix"
        matched_flow = None
        target_combined = f"{prefix}_{suffix}"
        logger.info(f"{filename}: Target combined string: '{target_combined}'")

        # Boolean mask of matching rows
        condition = (all_data['activityLinkId'].fillna('') + '_' + all_data['intermediateExchangeId'].fillna('')) == target_combined

        if any(condition):
            matched_flow = all_data.loc[condition, 'flow'].iloc[0]  # assumes a single match; the first one is used
            logger.info(f"{filename}: Matched combined string '{target_combined}'. Setting process_name to '{matched_flow}' for all records.")

        # On a match, replace process_name for the whole file
        if matched_flow:
            all_data['process_name'] = matched_flow  # overwrite the entire column
            replaced_process_name[filename] = matched_flow
            logger.info(f"{filename}: process_name has been set to '{matched_flow}' for all records based on combination '{target_combined}'.")

        # Flag files where a flow still contains "[Unknown Location]Unknown Activity Name"
        if any(all_data['flow'].str.contains(r'\[Unknown Location\]Unknown Activity Name', na=False)):
            unknown_activity_files.append(filename)
            logger.warning(f"{filename}: Contains '[Unknown Location]Unknown Activity Name' in flow fields after processing.")

        # Keep only the 'Flow Data' rows, as an independent copy
        flow_data = all_data[all_data['category'] == 'Flow Data'].copy()
        logger.info(f"{filename}: 'Flow Data' records count: {len(flow_data)}")

        # --- Added feature: shorten the 'flow' label of selected rows ---
        # Helper that rewrites one flow label
        def modify_flow(flow):
            """
            Rewrite a flow label:
            if it contains "//", remove everything from "//" up to and including
            the first "_" that follows, and join the two remaining parts with a
            single "_". Labels without "//" are returned unchanged.
            """
            if '//' in flow:
                try:
                    start = flow.index('//')
                    # Position of the first '_' at or after `start`
                    after_start = flow[start:].index('_')
                    end = start + after_start + 1  # one past that '_'
                    # Drop "//..._" and put a single "_" in its place
                    modified_flow = flow[:start] + '_' + flow[end:]
                    logger.debug(f"Original flow: '{flow}' | Modified flow: '{modified_flow}'")
                    return modified_flow
                except ValueError:
                    # No '_' after '//': return the label unchanged
                    logger.warning(f"Flow '{flow}' contains '//' but no '_'. No modification applied.")
                    return flow
            else:
                return flow

        # Apply modify_flow to the selected rows of the 'flow' column
        try:
            # Only 'elementaryExchange' rows are rewritten
            condition_modify = flow_data['field'] == 'elementaryExchange'
            original_count = flow_data[condition_modify].shape[0]
            flow_data.loc[condition_modify, 'flow'] = flow_data.loc[condition_modify, 'flow'].apply(modify_flow)
            modified_count = flow_data[condition_modify]['flow'].str.contains('_').sum()  # rough check: counts rows whose flow contains '_', not rows actually changed
            logger.info(f"{filename}: Modified 'flow' column for 'elementaryExchange' records. {modified_count}/{original_count} records modified.")
        except Exception as e:
            logger.error(f"{filename}: Error modifying 'flow' column for 'elementaryExchange' - {e}")

        # --- End of added feature ---

        # --- Added feature: drop duplicate rows whose outputGroup is 'N/A' ---
        # Rows that are identical in every column except 'outputGroup'
        subset_cols = [col for col in flow_data.columns if col != 'outputGroup']
        duplicates = flow_data.duplicated(subset=subset_cols, keep=False)

        duplicated_flow_data = flow_data[duplicates]

        # Index labels of the rows to drop
        indexes_to_drop = []

        # Group the duplicated rows by all other columns.
        # NOTE(review): groupby presumably skips groups whose key contains NaN
        # (pandas default dropna=True), so rows with an unparseable amount would
        # never be examined here — confirm this is acceptable.
        grouped = duplicated_flow_data.groupby(subset_cols)

        for group_keys, group in grouped:
            if set(group['outputGroup']) == {'0', 'N/A'}:
                # The group holds exactly the values '0' and 'N/A': drop the 'N/A' rows
                n_a_rows = group[group['outputGroup'] == 'N/A']
                indexes_to_drop.extend(n_a_rows.index.tolist())

        if indexes_to_drop:
            flow_data = flow_data.drop(indexes_to_drop)
            logger.info(f"{filename}: Removed {len(indexes_to_drop)} duplicate rows with outputGroup 'N/A'.")

        # --- End of added feature ---

        # Output path: same base name with a .csv extension, inside the batch folder
        output_filename = f"{os.path.splitext(filename)[0]}.csv"
        output_path = os.path.join(batch_folder, output_filename)

        # Write the 'Flow Data' rows to the CSV file
        try:
            # An empty flow_data still produces a CSV containing only the header
            flow_data.to_csv(output_path, index=False, encoding='utf-8')
            converted_files += 1
            logger.info(f"{filename}: Flow data processed and saved to '{output_path}'")
        except Exception as e:
            failed_files += 1
            failed_file_names.append(filename)
            logger.error(f"{filename}: Error saving CSV file - {e}")

    except Exception as e:
        # Any other failure: log it and count the file as failed
        logger.error(f"{filename}: Error processing file - {e}")
        failed_files += 1
        failed_file_names.append(filename)

def _spold_filenames():
    """Return the names of the .spold files in input_folder, sorted by name."""
    return sorted(name for name in os.listdir(input_folder) if name.endswith(".spold"))


def _summary_headline():
    """Return the overall-count lines shared by summary.txt and the log."""
    return [
        "=== Processing Summary ===",
        f"Total .spold files found: {total_files}",
        f"Successfully converted to CSV: {converted_files}",
        f"Failed to convert: {failed_files}",
    ]


def _summary_sections():
    """Return the per-file statistics as (section title, lines) pairs."""
    return [
        ("=== Deleted Rows Per File ===",
         [f"{fname}: {count} rows deleted" for fname, count in deleted_rows_per_file.items()]),
        ("=== Replaced Process Name Per File ===",
         [f"{fname}: process_name replaced with flow '{flow}'" for fname, flow in replaced_process_name.items()]),
        ("=== Non-1 Amount Without activityLinkId Per File ===",
         [f"{fname}: {count} rows with amount != 1 and no activityLinkId" for fname, count in non1_amount_per_file.items()]),
        ("=== Amount=-1 Without activityLinkId Per File ===",
         [f"{fname}: {count} rows with amount=-1 and no activityLinkId" for fname, count in neg1_amount_per_file.items()]),
        ("=== Files with Unknown Activity Name After Deletion ===",
         [f"{fname}" for fname in unknown_activity_files]),
    ]


def _export_positive_counts(output_path, count_header, counts, saved_label, error_label):
    """Write the (filename, count) pairs with count > 0 to a two-column CSV; errors are logged."""
    try:
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Filename', count_header])
            writer.writerows([fname, count] for fname, count in counts.items() if count > 0)
        logger.info(f"{saved_label} files saved to {output_path}")
    except Exception as e:
        logger.error(f"Error exporting {error_label} files to CSV: {e}")


def main():
    """
    Run the whole conversion: build the global activity mapping, convert every
    .spold file in input_folder, then write the reports into batch_folder.

    Reports written: global_activity_mapping.csv, failed_files.txt, summary.txt,
    non1_amount_files.csv and neg1_amount_files.csv. The same summary is also
    sent to the log. Finally the number stored in batch_file_path is increased
    by one for the next run. Report and bookkeeping errors are logged and do
    not stop the run.
    """
    global total_files, failed_files

    # List the inputs once so both passes see the same files, in a fixed order.
    spold_files = _spold_filenames()

    # Pass 1: collect activity_id -> (shortname, activityName) from every file.
    logger.info("Starting first pass to build global activity mapping.")
    for filename in spold_files:
        logger.info(f"Building mapping from file {filename}")
        global_activity_mapping.update(extract_activity_mapping(os.path.join(input_folder, filename)))

    logger.info(f"Global activity mapping built with {len(global_activity_mapping)} entries.")

    # Dump the global mapping to CSV so it can be inspected.
    mapping_output_path = os.path.join(batch_folder, "global_activity_mapping.csv")
    try:
        with open(mapping_output_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['activity_id', 'shortname', 'activityName'])
            writer.writerows(
                [activity_id, shortname, activityName]
                for activity_id, (shortname, activityName) in global_activity_mapping.items()
            )
        logger.info(f"Global activity mapping exported to {mapping_output_path}")
    except Exception as e:
        logger.error(f"Error exporting global activity mapping to CSV: {e}")

    # Pass 2: convert every file.
    logger.info("Starting second pass to process all files.")
    for filename in spold_files:
        total_files += 1
        logger.info(f"Processing file {filename}")
        try:
            process_file(filename)
        except Exception as e:
            # process_file handles its own errors; this is a last line of defence.
            failed_files += 1
            failed_file_names.append(filename)
            logger.error(f"{filename}: Error processing file - {e}")

    # One failed file name per line.
    failed_files_log_path = os.path.join(batch_folder, "failed_files.txt")
    try:
        with open(failed_files_log_path, 'w', encoding='utf-8') as f:
            f.writelines(f"{fname}\n" for fname in failed_file_names)
        logger.info(f"Failed file names saved to {failed_files_log_path}")
    except Exception as e:
        logger.error(f"Error writing failed files log: {e}")

    # Build the summary text once; it is used for summary.txt and for the log.
    headline = _summary_headline()
    sections = _summary_sections()

    summary_log_path = os.path.join(batch_folder, "summary.txt")
    try:
        with open(summary_log_path, 'w', encoding='utf-8') as f:
            f.write("\n".join(headline) + "\n\n")
            if failed_files > 0:
                f.write("List of failed files:\n")
                f.writelines(f"- {fname}\n" for fname in failed_file_names)
            for title, lines in sections:
                f.write(f"\n{title}\n")
                f.writelines(f"{line}\n" for line in lines)
        logger.info(f"Processing summary saved to {summary_log_path}")
    except Exception as e:
        logger.error(f"Error writing summary log: {e}")

    # Per-file counters as CSV (only files with a count above zero).
    _export_positive_counts(
        os.path.join(batch_folder, "non1_amount_files.csv"),
        'Non-1 Amount Count', non1_amount_per_file, "Non-1 amount", "non1 amount",
    )
    _export_positive_counts(
        os.path.join(batch_folder, "neg1_amount_files.csv"),
        'Amount=-1 Count', neg1_amount_per_file, "Amount=-1", "amount=-1",
    )

    # Mirror the summary into the log file.
    for line in headline:
        logger.info(line)
    if failed_files > 0:
        logger.info("List of failed files:")
        for fname in failed_file_names:
            logger.info(f"- {fname}")
    for title, lines in sections:
        logger.info(title)
        for line in lines:
            logger.info(line)

    # Advance the batch number for the next run.
    try:
        with open(batch_file_path, "w") as f:
            f.write(str(batch_number + 1))
        logger.info(f"Batch number incremented to {batch_number + 1}")
    except Exception as e:
        logger.error(f"Error updating batch number: {e}")

    # Only claim success when nothing failed.
    if failed_files > 0:
        logger.warning(f"Processing finished with {failed_files} failed file(s); see failed_files.txt.")
    else:
        logger.info("All files processed successfully.")

# Run the conversion pipeline only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()



Current batch number: 97
Current date: 0428
Output will be saved to: C:\Users\WasteWang\LCA\OUTPUT\0428_97


In [3]:
import os

def compare_directories(dir1: str, dir2: str) -> list:
    """
    Find entries that exist in dir2 but are missing from dir1.

    Only the immediate entry names returned by ``os.listdir`` are compared
    (no recursion, no content comparison).

    Args:
        dir1 (str): The path to the directory being checked.
        dir2 (str): The path to the reference directory.

    Returns:
        list: Names present in dir2 but missing in dir1, sorted alphabetically
        so the result is deterministic across runs.

    Raises:
        OSError: Propagated from ``os.listdir`` if either path cannot be listed.
    """
    names_in_dir1 = set(os.listdir(dir1))
    names_in_dir2 = set(os.listdir(dir2))

    # A set difference has no defined order; sort so repeated runs (and any
    # diff of the printed report) give the same sequence.
    return sorted(names_in_dir2 - names_in_dir1)

if __name__ == "__main__":
    # dir1 is the batch being checked; dir2 is the batch it is compared against.
    dir1 = r"C:\Users\WasteWang\LCA\OUTPUT\1213_93"
    dir2 = r"C:\Users\WasteWang\LCA\OUTPUT\1211_88"

    missing_files = compare_directories(dir1, dir2)

    # Report: either the "nothing missing" line, or a header followed by one name per line.
    if not missing_files:
        print("No files are missing in dir1 compared to dir2.")
    else:
        print("Files present in dir2 but missing in dir1:")
        for file in missing_files:
            print(file)


No files are missing in dir1 compared to dir2.


In [None]:
import os
import pandas as pd
from collections import OrderedDict

def _read_csv_with_fallback(file_path):
    """Read a CSV as UTF-8, retrying once as GBK if UTF-8 decoding fails."""
    try:
        return pd.read_csv(file_path, encoding='utf-8')
    except UnicodeDecodeError:
        return pd.read_csv(file_path, encoding='gbk')


def _collect_exchanges(df):
    """Return (flow, amount) pairs, intermediate exchanges first, then elementary; missing amounts become 0."""
    pairs = []
    for field in ('intermediateExchange', 'elementaryExchange'):
        rows = df[df['field'] == field]
        pairs.extend(
            (flow, amount if pd.notnull(amount) else 0)
            for flow, amount in zip(rows['flow'], rows['amount'])
        )
    return pairs


def _load_process(file_path, file_name):
    """Return (process_name, exchange pairs) for one CSV, or None (after printing why) if it is unusable."""
    try:
        df = _read_csv_with_fallback(file_path)
    except Exception as e:
        print(f"错误: 无法读取文件 {file_name}。原因: {e}")
        return None

    if not {'process_name', 'field', 'flow', 'amount'}.issubset(df.columns):
        print(f"警告: 文件 {file_name} 缺少必要的列。")
        return None

    process_names = df['process_name'].unique()
    if len(process_names) < 1:
        print(f"警告: 文件 {file_name} 无法找到process_name。")
        return None

    process_name = process_names[0]
    if len(process_names) > 1:
        print(f"警告: 文件 {file_name} 中存在多个process_name，将使用第一个: {process_name}")

    return process_name, _collect_exchanges(df)


def _diagonal_row_order(flows, columns):
    """Reorder flows so that a flow named like column i sits at row i whenever such a row exists."""
    row_order = list(flows)
    # Label -> current row position, kept in sync with every swap so each
    # lookup is O(1) instead of a linear scan of the row list.
    position = {flow: pos for pos, flow in enumerate(row_order)}
    for i, col in enumerate(columns):
        if i >= len(row_order):
            # More columns than rows: there is no row i to swap into.
            break
        current_pos = position.get(col)
        if current_pos is None or current_pos == i:
            continue
        displaced = row_order[i]
        row_order[i], row_order[current_pos] = row_order[current_pos], row_order[i]
        position[col] = i
        position[displaced] = current_pos
    return row_order


def _report_file_list(report_path, header, console_message, file_names):
    """Write file_names under header to report_path, then echo them to stdout."""
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(header)
        f.writelines(f"{name}\n" for name in file_names)
    print(console_message)
    for name in file_names:
        print(f" - {name}")


def build_lca_matrix(folder_path, output_file):
    """
    Build a flow-by-process matrix from per-process CSV files and save it as CSV.

    Every ``.csv`` file in ``folder_path`` must have the columns
    ``process_name``, ``field``, ``flow`` and ``amount``. Rows whose ``field``
    is ``intermediateExchange`` or ``elementaryExchange`` become entries of the
    column named after the file's first ``process_name``; each distinct
    ``flow`` becomes a row. Cells without data are 0. If a flow occurs more
    than once in a file, the last occurrence wins (elementary exchanges are
    applied after intermediate ones). Rows are then reordered so that a flow
    sharing its name with column *i* is placed at row *i* where possible.

    Files are processed in sorted name order so that column order, and which
    of several files with the same process_name is kept, are reproducible.
    A file only claims its process_name (and a column) once it is known to
    contain exchange rows, so skipped files never leave empty columns behind.

    Side effects: writes the matrix to ``output_file`` and, when applicable,
    ``failed_files.txt`` / ``duplicate_process_files.txt`` next to it; prints
    progress and summary statistics to stdout.

    Args:
        folder_path: Directory containing the per-process CSV files.
        output_file: Path of the matrix CSV to write.

    Returns:
        None.
    """
    # Column order: process names in the order they were accepted.
    process_name_list = []
    process_name_set = set()
    # flow -> {process_name: amount}; insertion order defines the initial row order.
    data_dict = OrderedDict()

    # os.listdir order is arbitrary; sort for reproducible results.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    total_files = len(csv_files)
    print(f"总共有 {total_files} 个CSV文件需要处理。")

    # Files that did not contribute a column, and the subset skipped as duplicates.
    failed_files = []
    duplicate_process_files = []

    for idx, file_name in enumerate(csv_files, 1):
        loaded = _load_process(os.path.join(folder_path, file_name), file_name)
        if loaded is None:
            failed_files.append(file_name)
        else:
            process_name, exchanges = loaded
            if process_name in process_name_set:
                print(f"警告: 重复的process_name '{process_name}' 出现在文件 {file_name} 中，跳过该文件。")
                duplicate_process_files.append(file_name)
                failed_files.append(file_name)
            elif not exchanges:
                print(f"警告: 文件 {file_name} 过滤后没有有效的行。")
                failed_files.append(file_name)
            else:
                # Register the process only now that it is known to produce a column.
                process_name_set.add(process_name)
                process_name_list.append(process_name)
                for flow, amount in exchanges:
                    data_dict.setdefault(flow, {})[process_name] = amount

        # Report progress for every file, including skipped ones.
        if idx % 1000 == 0 or idx == total_files:
            print(f"已处理 {idx}/{total_files} 个文件。")

    # Assemble the matrix.
    print("构建DataFrame中...")
    row_order = _diagonal_row_order(data_dict.keys(), process_name_list)
    matrix_df = pd.DataFrame.from_dict(data_dict, orient='index')
    # One reindex fixes both row and column order; fill afterwards so that
    # nothing introduced by the reindex can remain NaN.
    matrix_df = matrix_df.reindex(index=row_order, columns=process_name_list).fillna(0)
    matrix_df.index.name = 'flow'

    # Save to CSV.
    print(f"保存矩阵到 {output_file} 中...")
    matrix_df.to_csv(output_file, encoding='utf-8')
    print("完成矩阵构建！")

    # Summary statistics.
    print("\n===== 矩阵统计信息 =====")
    num_rows, num_cols = matrix_df.shape
    total_elements = num_rows * num_cols
    non_zero = (matrix_df != 0).sum().sum()
    non_zero_ratio = non_zero / total_elements if total_elements != 0 else 0
    negative = (matrix_df < 0).sum().sum()
    negative_ratio = negative / total_elements if total_elements != 0 else 0
    negative_over_non_zero = (negative / non_zero) if non_zero != 0 else 0

    print(f"行数: {num_rows}")
    print(f"列数: {num_cols}")
    print(f"非零值数量: {non_zero}")
    print(f"非零值占比: {non_zero_ratio:.2%}")
    print(f"负值数量: {negative}")
    print(f"负值占比: {negative_ratio:.2%}")
    print(f"负值占非零值比重: {negative_over_non_zero:.2%}")
    print("========================\n")

    # Count rows in which every cell is 0. Testing each cell (rather than the
    # row sum) avoids counting rows whose positive and negative values cancel.
    all_zero_rows = (matrix_df == 0).all(axis=1).sum()
    print(f"全零行的数量: {all_zero_rows}")

    report_dir = os.path.dirname(output_file)

    # Report files that did not produce a column.
    if failed_files:
        failed_file_path = os.path.join(report_dir, "failed_files.txt")
        _report_file_list(
            failed_file_path,
            "未生成新列的CSV文件列表:\n",
            f"\n以下CSV文件未生成新列，详情请查看 '{failed_file_path}' 文件：",
            failed_files,
        )
    else:
        print("所有CSV文件均已成功生成新列。")

    # Report files skipped because their process_name was already taken.
    if duplicate_process_files:
        duplicate_file_path = os.path.join(report_dir, "duplicate_process_files.txt")
        _report_file_list(
            duplicate_file_path,
            "具有重复 process_name 的CSV文件列表:\n",
            f"\n以下CSV文件具有重复的 process_name，未生成新列，详情请查看 '{duplicate_file_path}' 文件：",
            duplicate_process_files,
        )
    else:
        print("没有发现重复的 process_name。")

# Entry point: build the matrix from one batch output folder.
if __name__ == "__main__":
    folder_path = r"C:\Users\WasteWang\LCA\OUTPUT\1213_93"
    output_file = r"C:\Users\WasteWang\LCA\OUTPUT\NEW_3.11_1213_93_LCA_matrix.csv"
    # The output file name above has to be changed for every run.
    build_lca_matrix(folder_path, output_file)


总共有 26037 个CSV文件需要处理。
已处理 1000/26037 个文件。
已处理 2000/26037 个文件。
已处理 3000/26037 个文件。
已处理 4000/26037 个文件。
已处理 5000/26037 个文件。
已处理 6000/26037 个文件。
已处理 7000/26037 个文件。
已处理 8000/26037 个文件。
已处理 9000/26037 个文件。
已处理 10000/26037 个文件。
已处理 11000/26037 个文件。
已处理 12000/26037 个文件。
已处理 13000/26037 个文件。
已处理 14000/26037 个文件。
已处理 15000/26037 个文件。
已处理 16000/26037 个文件。
已处理 17000/26037 个文件。
已处理 18000/26037 个文件。
已处理 19000/26037 个文件。
已处理 20000/26037 个文件。
已处理 21000/26037 个文件。
已处理 22000/26037 个文件。
已处理 23000/26037 个文件。
已处理 24000/26037 个文件。
已处理 25000/26037 个文件。
已处理 26000/26037 个文件。
警告: 文件 global_activity_mapping.csv 缺少必要的列。
警告: 文件 neg1_amount_files.csv 缺少必要的列。
警告: 文件 non1_amount_files.csv 缺少必要的列。
构建DataFrame中...
保存矩阵到 C:\Users\WasteWang\LCA\OUTPUT\NEW_3.11_1213_93_LCA_matrix.csv 中...
完成矩阵构建！

===== 矩阵统计信息 =====
行数: 29271
列数: 26034
非零值数量: 1645996
非零值占比: 0.22%
负值数量: 183324
负值占比: 0.02%
负值占非零值比重: 11.14%

全零行的数量: 0

以下CSV文件未生成新列，详情请查看 'C:\Users\WasteWang\LCA\OUTPUT\failed_files.txt' 文件：
 - global_activity_mapping.csv
 -