In [None]:
import pandas as pd

from load_raw_data import load_secms_data, get_disambiguated_dfs

In [None]:
# Relative path handed to load_secms_data() further down in this notebook.
# NOTE(review): presumably resolved against the notebook's working directory — confirm.
base_path = "../data/raw"

In [None]:
def _n_total_unique(s: pd.Series) -> tuple[int, int]:
    return len(s), s.nunique()

In [None]:
def print_secms_report(df: pd.DataFrame, df_ambig: pd.DataFrame | None = None) -> None:
    COL_UNIPROT = "UniProtIds"
    COL_CELL    = "cell_line"
    COL_CAT     = "category"
    COL_REP     = "expnum"

    ambig_by_cell_cat_rep = {}
    ambig_by_rep = {}

    if df_ambig is not None and len(df_ambig) > 0:
        if all(c in df_ambig.columns for c in [COL_CELL, COL_CAT, COL_REP]):
            ambig_by_cell_cat_rep = (
                df_ambig.groupby([COL_CELL, COL_CAT, COL_REP], dropna=False)
                        .size()
                        .to_dict()
            )
        elif COL_REP in df_ambig.columns:
            # fallback if ambig df doesn't have cell_line/category
            ambig_by_rep = (
                df_ambig.groupby(COL_REP, dropna=False)
                        .size()
                        .to_dict()
            )

    # --- overall ---
    total, unique = _n_total_unique(df[COL_UNIPROT])
    cell_lines = sorted(df[COL_CELL].dropna().unique())
    n_cell_lines = len(cell_lines)

    print("=== Overall ===")
    print(f"Cell lines:           {n_cell_lines:,}")
    print(f"Cell line list:       {', '.join(cell_lines)}")
    print(f"Total UniProtIds rows: {total:,}")
    print(f"Unique UniProtIds:     {unique:,}")
    print(f"Total ambiguous rows: {len(df_ambig):,}")
    print()

    # --- per cell line ---
    for cell_line, dcell in df.groupby(COL_CELL, dropna=False, sort=True):
        n_cats = dcell[COL_CAT].nunique(dropna=True)
        t_cell, u_cell = _n_total_unique(dcell[COL_UNIPROT])

        print(f"=== Cell line: {cell_line} ===")
        print(f"Categories: {n_cats:,}")
        print(f"Total UniProtIds rows: {t_cell:,}")
        print(f"Unique UniProtIds:     {u_cell:,}")
        print()

        # --- per category (within cell line) ---
        for cat, dcat in dcell.groupby(COL_CAT, dropna=False, sort=True):
            t_cat, u_cat = _n_total_unique(dcat[COL_UNIPROT])
            n_reps_cat = dcat[COL_REP].nunique(dropna=True)

            print(f"  - Category: {cat}")
            print(f"    Total UniProtIds rows: {t_cat:,}")
            print(f"    Unique UniProtIds:     {u_cat:,}")
            print(f"    Replicas: {n_reps_cat:,}")

            # --- per replicate (within category) ---
            for rep, drep in dcat.groupby(COL_REP, dropna=False, sort=True):
                if df_ambig is None:
                    ambig_rep = "UNKNOWN"
                else:
                    ambig_rep = ambig_by_cell_cat_rep.get((cell_line, cat, rep), None)
                    if ambig_rep is None:
                        ambig_rep = ambig_by_rep.get(rep, "UNKNOWN")

                t_rep, u_rep = _n_total_unique(drep[COL_UNIPROT])
                print(f"      * Replica {rep} UniProtIds: total {t_rep:,} (unique {u_rep:,})")
                print(f"         * Ambiguous rows: {ambig_rep}")
            print()

In [None]:
# Load the SEC-MS data via the project helper, passing the raw-data path configured above.
secms_data = load_secms_data(base_path)
# get_disambiguated_dfs returns two objects, bound here as the "clean" and "ambig" tables.
# NOTE(review): presumably a partition of secms_data into unambiguous and ambiguous
# rows — confirm against load_raw_data.get_disambiguated_dfs.
secms_data_clean, secms_data_ambig = get_disambiguated_dfs(secms_data)

In [None]:
# Print the count report for the clean table; the ambiguous table supplies the
# "Ambiguous rows" figures shown in the report.
print_secms_report(secms_data_clean, secms_data_ambig)