In [2]:
# step_filter.py
import os
import pandas as pd
from typing import Dict, List, Tuple, Union

# A leaf condition: a dict read with the keys 'file' (workbook path, joined
# onto the condition root directory), 'var' (column to test) and 'cond'
# (a callable applied to every value of that column).
Condition = Dict[str, object]
# A rule tree: a (logic, items) pair where logic is '&' or '|' and each item
# is either a Condition dict or another nested Rule.
Rule = Tuple[str, List[Union[Condition, 'Rule']]]


class OAIPatientFilter:
    """
    Step-wise patient filter with the master workbook kept separate from the
    condition workbooks.

    IDs are compared case-insensitively: they are normalised to trimmed,
    upper-cased strings on both sides before matching.

    Usage:
        filter = OAIPatientFilter(
            master_file="总表.xlsx",          # in the current directory
            cond_root="OAIdatabase"           # directory holding condition files
        )
        filter.filter("初始", "step1", rule_tree)
    """

    def __init__(self, master_file: str, cond_root: str = "OAIdatabase"):
        """
        Args:
            master_file: path of the master workbook; must be an existing file.
            cond_root: directory that condition file paths are relative to.

        Raises:
            FileNotFoundError: if the workbook or the directory does not exist.
        """
        # Master workbook path
        if not os.path.isfile(master_file):
            raise FileNotFoundError(master_file)
        self.master = os.path.abspath(master_file)

        # Root directory of the condition workbooks
        self.cond_root = os.path.abspath(cond_root)
        if not os.path.isdir(self.cond_root):
            raise FileNotFoundError(self.cond_root)

    # ---------------- Public API ----------------
    def filter(self,
               from_sheet: str,
               to_sheet: str,
               conditions: "Rule") -> int:
        """
        Run one filtering step: read `from_sheet` of the master workbook, keep
        the rows whose ID satisfies `conditions`, and write them to `to_sheet`
        (an existing sheet of that name is replaced).

        Returns:
            The number of rows kept.
        """
        df = pd.read_excel(self.master, sheet_name=from_sheet)
        # IDs come back already trimmed and upper-cased, so they can be
        # compared directly against the normalised IDs of the rule result.
        df = self._normalize_id_col(df)
        keep_ids = self._eval_rules(conditions)
        new_df = df[df['ID'].isin(keep_ids)].copy()

        with pd.ExcelWriter(self.master, mode='a', engine='openpyxl',
                            if_sheet_exists='replace') as w:
            new_df.to_excel(w, sheet_name=to_sheet, index=False)

        print(f"→ {from_sheet} ➜ {to_sheet}  保留 {len(new_df)} 人")
        return len(new_df)

    # ---------------- Internals ----------------
    @staticmethod
    def _id_to_text(value: object) -> str:
        """Turn one non-missing ID cell into its canonical text form."""
        # A numeric column containing blanks is read as float, which would
        # otherwise stringify 9000099 as "9000099.0" and never match the
        # same ID read from an integer column.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip().upper()

    def _normalize_id_col(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Locate the ID column (header compared case-insensitively), rename it
        to 'ID' and normalise its values to trimmed upper-case strings.

        Missing IDs stay missing instead of becoming the text "NAN", so blank
        rows in two different files can never match each other.

        Raises:
            ValueError: if no column is named ID.
        """
        id_col = None
        for col in df.columns:
            # Headers are not guaranteed to be strings (e.g. numeric headers).
            if str(col).strip().upper() == 'ID':
                id_col = col
                break
        if id_col is None:
            raise ValueError("找不到 ID 列（不区分大小写）")
        df = df.rename(columns={id_col: 'ID'})
        df['ID'] = df['ID'].map(self._id_to_text, na_action='ignore')
        return df

    def _single_file_ids(self, cond: "Condition") -> set:
        """
        Evaluate one leaf condition and return the set of matching IDs.

        `cond['file']` is resolved relative to `cond_root`; `cond['cond']` is
        applied to every value of column `cond['var']`.

        Raises:
            FileNotFoundError: if the condition workbook does not exist.
            KeyError: if the requested column is not in the workbook.
        """
        path = os.path.join(self.cond_root, cond['file'])
        if not os.path.isfile(path):
            raise FileNotFoundError(path)
        # Same case-insensitive ID handling as for the master workbook
        df = self._normalize_id_col(pd.read_excel(path))
        var = cond['var']
        if var not in df.columns:
            raise KeyError(f"{path} 中找不到列 {var!r}")
        # Force a boolean mask so .loc never interprets the result as labels.
        mask = df[var].apply(cond['cond']).astype(bool)
        return set(df.loc[mask, 'ID'].dropna())

    def _eval_rules(self, rule: "Rule") -> set:
        """
        Recursively evaluate a rule tree into a set of IDs.

        '&' intersects and '|' unions the ID sets of the items; an empty item
        list yields an empty set.

        Raises:
            ValueError: if the logic operator is neither '&' nor '|'.
        """
        logic, items = rule
        # Reject a bad operator before any workbook is opened.
        if logic not in ('&', '|'):
            raise ValueError("logic 只支持 & 或 |")
        # Nested rules are sequences (tuple or list); leaf conditions are dicts.
        sets = [self._eval_rules(item) if isinstance(item, (tuple, list))
                else self._single_file_ids(item) for item in items]
        if not sets:
            return set()
        if logic == '&':
            return set.intersection(*sets)
        return set.union(*sets)


# ---------------- Jupyter usage example ----------------
if __name__ == '__main__':
    # NOTE: `filter` shadows the builtin, but the later cells call
    # `filter.filter(...)`, so the name is kept as-is.
    filter = OAIPatientFilter(master_file="总表.xlsx", cond_root="OAIdatabase")


In [4]:
    # Keep the IDs for which V01MENR12 or V01MENL12 equals 1 (union of the two
    # conditions, both read from the same workbook).
    filter.filter(
        "初始",
        "step1_左右关节镜手术",
        ('|', [
            {'file': 'Allclinical/AllClinical01.xlsx', 'var': column, 'cond': (lambda value: value == 1)}
            for column in ('V01MENR12', 'V01MENL12')
        ]),
    )

→ 初始 ➜ step1_左右关节镜手术  保留 0 人


0

In [None]:
    # Keep the IDs for which V01MENR12 or V01MENL12 equals 1 (union of the two
    # conditions, both read from the same workbook).
    filter.filter(
        "初始",
        "step1_左右关节镜手术",
        ('|', [
            {'file': 'Allclinical/AllClinical01.xlsx', 'var': column, 'cond': (lambda value: value == 1)}
            for column in ('V01MENR12', 'V01MENL12')
        ]),
    )

In [21]:
    # Keep the IDs whose V99EDDCF value equals the exact label string below.
    filter.filter(
        "step1_左右半月板切除",
        "step_2无死亡或置换",
        ('&', [
            {
                'file': 'General/Outcomes99.xlsx',
                'var': 'V99EDDCF',
                'cond': (lambda value: value == '.: Missing Form/Incomplete Workbook'),
            },
        ]),
    )

→ step1_左右半月板切除 ➜ step_2无死亡或置换  保留 28 人


28