In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DataCleaner – 按行缺失率删行版
双击即可运行，参数已写死在 main 区域
"""
import pandas as pd
import re
import logging
import numpy as np
import warnings
from pathlib import Path
from typing import List

# Install a blanket "ignore" filter: every warning raised after this point in
# the process is suppressed, not only the ones triggered by this module.
warnings.filterwarnings('ignore')
# Configure the root logger: INFO and above, rendered as "LEVEL | message".
logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(message)s")
# Logger used for all progress and error messages in this file.
logger = logging.getLogger("data_cleaner")

# ==================== Regex patterns for number extraction ====================
# An optionally negative integer or decimal number ("-12", "3.5").
# It knows nothing about thousands separators or exponent notation, so only
# the leading digits of "1,234" or "1e-05" are matched.
NUM_PATTERN = re.compile(r"-?\d+(?:\.\d+)?")
# Case-insensitive placeholder texts: "missing" followed somewhere later by
# "form", or "incomplete" followed somewhere later by "workbook".
MISSING_PATTERN = re.compile(r".*missing.*form.*|.*incomplete.*workbook.*", re.IGNORECASE)


class DataCleaner:
    # ---------- 1. 正则提取 ----------
    def extract_number(self, text) -> float | int | str | None:
        if pd.isna(text):
            return np.nan
        text_str = str(text).strip()
        if MISSING_PATTERN.match(text_str):
            return np.nan
        match = NUM_PATTERN.search(text_str)
        if match:
            num_str = match.group(0)
            try:
                return float(num_str) if '.' in num_str else int(num_str)
            except Exception:
                return np.nan
        if not text_str or text_str.isspace():
            return np.nan
        return text_str

    def batch_extract_numbers(self, df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
        logger.info(f"批量正则提取数字，共 {len(cols)} 列...")
        for i, col in enumerate(cols, 1):
            try:
                df[col] = df[col].apply(self.extract_number)
                if i % 20 == 0 or i == len(cols):
                    logger.info(f"进度: {i}/{len(cols)}  ({i/len(cols)*100:.1f}%)")
            except Exception as e:
                logger.warning(f"列 {col} 处理失败: {e}")
        return df

    # ---------- 2. 按行缺失率删行 ----------
    def remove_high_missing_rows(self, df: pd.DataFrame, id_col: str, threshold: float) -> pd.DataFrame:
        """删除缺失率 > threshold 的行（只看非 ID 列）"""
        cols2check = [c for c in df.columns if c != id_col]
        missing_rate = df[cols2check].isna().mean(axis=1)  # 每行缺失比例
        before_rows = len(df)
        df = df[missing_rate <= threshold].copy()
        after_rows = len(df)
        removed = before_rows - after_rows
        logger.info(f"行缺失率 > {threshold*100:.0f}% 的行数: {removed}；剩余: {after_rows}")
        return df

    # ---------- 3. 主入口 ----------
    def clean_excel_inplace(self,
                            in_file: str | Path,
                            sheet_name: str | int = 0,
                            id_col: str = None,
                            missing_threshold: float = 0.6,
                            backup: bool = True) -> None:
        in_file = Path(in_file)
        if not in_file.exists():
            raise FileNotFoundError(in_file)

        if backup:
            backup_file = in_file.with_name(
                f"{in_file.stem}_backup_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
            )
            import shutil
            shutil.copy2(in_file, backup_file)
            logger.info(f"已备份 -> {backup_file}")

        logger.info(f"读取文件: {in_file}  sheet: {sheet_name}")
        try:
            df = pd.read_excel(in_file, sheet_name=sheet_name)
        except Exception as e:
            logger.error(f"读取失败: {e}")
            raise

        original_rows = len(df)
        logger.info(f"原始数据: {original_rows} 行 × {df.shape[1]} 列")

        # 确定 ID 列
        if id_col is None or id_col not in df.columns:
            id_col = df.columns[0]
            logger.info(f"使用第一列作为 ID 列: {id_col}")
        else:
            logger.info(f"使用指定 ID 列: {id_col}")

        # ① 按行缺失率删行
        df = self.remove_high_missing_rows(df, id_col, missing_threshold)

        # ② 正则提取数字（保留 ID 列原样）
        id_data = df[id_col].copy()
        other_cols = [c for c in df.columns if c != id_col]
        df = self.batch_extract_numbers(df, other_cols)
        df[id_col] = id_data

        # ③ 写回 Excel
        if isinstance(sheet_name, int):
            with pd.ExcelFile(in_file) as xls:
                actual_sheet = xls.sheet_names[sheet_name]
        else:
            actual_sheet = sheet_name

        logger.info(f"覆盖原 sheet: {actual_sheet}")
        with pd.ExcelWriter(in_file, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
            df.to_excel(writer, sheet_name=actual_sheet, index=False)

        logger.info("✅ 清洗完成！")
        self._print_summary(df, original_rows)

    # ---------- 4. 摘要 ----------
    def _print_summary(self, df: pd.DataFrame, original_rows: int):
        print("\n" + "="*60)
        print("数据清洗摘要")
        print("="*60)
        total = df.shape[0] * df.shape[1]
        missing = df.isna().sum().sum()
        print(f"缺失统计:")
        print(f"  总单元格: {total:,}")
        print(f"  缺失单元格: {missing:,}")
        print(f"  缺失率: {missing/total*100:.2f}%")
        removed = original_rows - df.shape[0]
        print(f"行数变化:")
        print(f"  原始行数: {original_rows}")
        print(f"  清洗后行数: {df.shape[0]}")
        print(f"  删除行数: {removed}")
        print(f"数据维度: {df.shape[0]} 行 × {df.shape[1]} 列")


# ==================== 一键执行 ====================
if __name__ == "__main__":
    # Run settings are hard-coded here so the script can be launched without
    # any command-line arguments.
    INPUT_FILE = r"C:\Users\DXW\Desktop\matched_result.xlsx"
    SHEET_NAME = "out"  # a sheet name, or an integer sheet position such as 0
    ID_COL = None       # None -> the first column is used as the ID column
    THRESHOLD = 0.7     # per-row missing-rate threshold
    BACKUP = True       # back up the original file before overwriting it

    run_options = {
        "in_file": INPUT_FILE,
        "sheet_name": SHEET_NAME,
        "id_col": ID_COL,
        "missing_threshold": THRESHOLD,
        "backup": BACKUP,
    }

    cleaner = DataCleaner()
    try:
        cleaner.clean_excel_inplace(**run_options)
        print(f"\n✅ 清洗完成！原文件 {INPUT_FILE} 的 {SHEET_NAME} sheet 已被更新")
    except Exception as e:
        # Top-level boundary: report the failure with a full traceback
        # instead of letting the exception escape.
        logger.error(f"清洗失败: {e}")
        import traceback
        traceback.print_exc()

INFO | 已备份 -> C:\Users\DXW\Desktop\matched_result_backup_20260103_223823.xlsx
INFO | 读取文件: C:\Users\DXW\Desktop\matched_result.xlsx  sheet: out
INFO | 原始数据: 163 行 × 125 列
INFO | 使用第一列作为 ID 列: MONTHS
INFO | 行缺失率 > 70% 的行数: 90；剩余: 73
INFO | 批量正则提取数字，共 124 列...
INFO | 进度: 20/124  (16.1%)
INFO | 进度: 40/124  (32.3%)
INFO | 进度: 60/124  (48.4%)
INFO | 进度: 80/124  (64.5%)
INFO | 进度: 100/124  (80.6%)
INFO | 进度: 120/124  (96.8%)
INFO | 进度: 124/124  (100.0%)
INFO | 覆盖原 sheet: out
INFO | ✅ 清洗完成！



数据清洗摘要
缺失统计:
  总单元格: 9,125
  缺失单元格: 2,878
  缺失率: 31.54%
行数变化:
  原始行数: 163
  清洗后行数: 73
  删除行数: 90
数据维度: 73 行 × 125 列

✅ 清洗完成！原文件 C:\Users\DXW\Desktop\matched_result.xlsx 的 out sheet 已被更新
