# Comparison of HPO Binary Matrices (2022 vs 2025)

## Objectives

This notebook compares two binary HPO matrices generated with the same extraction method
on the same set of patient files, but using two different HPO versions (2022 vs 2025).

The objectives are to:
- quantify global differences between the two matrices
- identify new and lost HPO terms
- measure how many patient profiles are impacted
- describe changes at both patient and HPO-term levels

Matrices are assumed to be binary (0/1), with:
- rows = patient keys
- columns = HPO terms


## Imports and parameters

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Raise pandas' display limits so wide HPO matrices and long cell values
# are not truncated in notebook outputs.
pd.options.display.max_columns = 200
pd.options.display.max_colwidth = 120

# === PARAMETERS ===
# Input matrices; accepted formats: csv / tsv / parquet.
MATRIX_2022_PATH, MATRIX_2025_PATH = "matrix_2022.csv", "matrix_2025.csv"
# Labels identifying the two HPO versions being compared.
LABEL_2022, LABEL_2025 = "2022", "2025"

## Helper functions

In [None]:
def read_matrix(path: "str | Path") -> pd.DataFrame:
    """Read a binary HPO matrix (rows=keys, columns=HPO).

    The reader is selected from the file extension (case-insensitive):
    ``.parquet`` uses ``pd.read_parquet``, ``.tsv``/``.txt`` are read as
    tab-separated text, and anything else is read as comma-separated text.
    For text files the first column is used as the row index.

    Args:
        path: Location of the matrix file, given as a string or a ``Path``.

    Returns:
        An integer DataFrame whose row labels are strings, whose column
        names are stripped strings, and whose values are all 0 or 1.
        Cells that are missing or cannot be parsed as numbers become 0,
        fractional values are truncated by the integer cast, and anything
        outside [0, 1] is clipped into that range.
    """
    # Accept plain strings as well as Path objects: ``.suffix`` only exists
    # on Path, and the notebook's path parameters are declared as strings.
    path = Path(path)

    suf = path.suffix.lower()
    if suf == ".parquet":
        df = pd.read_parquet(path)
    elif suf in (".tsv", ".txt"):
        df = pd.read_csv(path, sep="\t", index_col=0)
    else:
        df = pd.read_csv(path, sep=",", index_col=0)

    # Normalise labels so keys and HPO identifiers compare equal across files.
    df.index = df.index.astype(str)
    df.columns = [str(c).strip() for c in df.columns]

    # Force a strict 0/1 integer matrix: unparseable cells -> NaN -> 0.
    df = df.apply(pd.to_numeric, errors="coerce").fillna(0).astype(int)
    df = df.clip(lower=0, upper=1)
    return df


def align_matrices(a: pd.DataFrame, b: pd.DataFrame):
    """Put two matrices on the same rows and columns.

    Rows are restricted to the keys found in both inputs and columns are
    extended to every HPO term found in either input; both axes are sorted.
    Cells created by the column extension are filled with 0.

    Returns:
        A 4-tuple ``(a_aligned, b_aligned, keys_only_a, keys_only_b)`` where
        the last two items are the sets of keys present in only one input.
    """
    rows_a, rows_b = set(a.index), set(b.index)
    shared_rows = sorted(rows_a & rows_b)
    all_terms = sorted(set(a.columns).union(b.columns))

    aligned_a, aligned_b = (
        frame.reindex(index=shared_rows, columns=all_terms, fill_value=0)
        for frame in (a, b)
    )

    return aligned_a, aligned_b, rows_a - rows_b, rows_b - rows_a


## Loading matrices

In [None]:
# Load the 2022 and 2025 matrices (in that order) with the shared reader.
m22, m25 = (
    read_matrix(Path(location))
    for location in (MATRIX_2022_PATH, MATRIX_2025_PATH)
)

# Quick sanity check on the dimensions of each matrix.
print("Matrix 2022 shape:", m22.shape)
print("Matrix 2025 shape:", m25.shape)

# Preview the first rows of the 2022 matrix.
m22.head(5)


## Aligning matrices

In [None]:
# Restrict both matrices to shared patient keys and give them the same columns.
m22a, m25a, keys_only_22, keys_only_25 = align_matrices(a=m22, b=m25)

# Report the size of the aligned frame and how many keys were left out.
print("Common keys:", len(m22a.index))
print("Union HPO columns:", len(m22a.columns))
print("Keys only in 2022:", len(keys_only_22))
print("Keys only in 2025:", len(keys_only_25))


## Global descriptive metrics

In [None]:
# One-row overview: key and column counts come from the raw matrices
# (m22 / m25) and from the aligned one (m22a); cell totals use the raw matrices.
summary = dict(
    keys_2022=len(m22.index),
    keys_2025=len(m25.index),
    keys_common=len(m22a.index),
    hpo_columns_2022=len(m22.columns),
    hpo_columns_2025=len(m25.columns),
    hpo_columns_union=len(m22a.columns),
    total_present_cells_2022=int(m22.to_numpy().sum()),
    total_present_cells_2025=int(m25.to_numpy().sum()),
)

summary_df = pd.DataFrame([summary])
summary_df


## HPO terms present (at least once)

In [None]:
# A term is "present" in a version when at least one patient of the raw
# matrix carries it (column sum > 0).
terms_22 = {term for term, n_patients in m22.sum(axis=0).items() if n_patients > 0}
terms_25 = {term for term, n_patients in m25.sum(axis=0).items() if n_patients > 0}

# Set algebra between the two vocabularies, sorted for stable output.
new_terms_25 = sorted(terms_25.difference(terms_22))
lost_terms_25 = sorted(terms_22.difference(terms_25))
common_terms = sorted(terms_22.intersection(terms_25))

pd.DataFrame(
    [
        ("Terms present in 2022", len(terms_22)),
        ("Terms present in 2025", len(terms_25)),
        ("Common terms", len(common_terms)),
        ("New terms in 2025", len(new_terms_25)),
        ("Lost terms in 2025", len(lost_terms_25)),
    ],
    columns=["metric", "count"],
)


## Cell-by-cell modifications (0→1, 1→0)

In [None]:
# Boolean masks over the aligned matrices: a cell is "added" when it goes
# from 0 (2022) to 1 (2025) and "removed" when it goes from 1 to 0.
added = m25a.eq(1) & m22a.eq(0)
removed = m25a.eq(0) & m22a.eq(1)

# Total number of flipped cells in each direction.
n_added = int(added.to_numpy().sum())
n_removed = int(removed.to_numpy().sum())

pd.DataFrame(
    [
        ("Added (0→1)", n_added),
        ("Removed (1→0)", n_removed),
    ],
    columns=["change_type", "n_cells"],
)


## Impacted phenotypic profiles

In [None]:
# A profile is impacted as soon as one of its cells was added or removed.
changed_profiles = (added | removed).any(axis=1)

pd.DataFrame({
    "metric": [
        "Profiles with any change",
        "Profiles unchanged",
        "% profiles with change",
    ],
    "value": [
        int(changed_profiles.sum()),
        int((~changed_profiles).sum()),
        # The mean of a boolean Series is the share of True values.
        round(100 * changed_profiles.mean(), 2),
    ],
})


## Per-patient modifications

In [None]:
# Row-wise totals: terms carried per patient in each version, plus the
# number of terms gained and lost between the two.
per_key_changes = pd.DataFrame({
    column: frame.sum(axis=1).astype(int)
    for column, frame in (
        ("n_terms_2022", m22a),
        ("n_terms_2025", m25a),
        ("n_added_terms", added),
        ("n_removed_terms", removed),
    )
})

# Overall churn per patient = gains + losses.
per_key_changes["n_changed_terms"] = per_key_changes["n_added_terms"].add(
    per_key_changes["n_removed_terms"]
)

# Ten most affected patients.
per_key_changes.sort_values(by="n_changed_terms", ascending=False).iloc[:10]


## Per-HPO-term modifications

In [None]:
# Column-wise totals: how many patients carry each term in each version,
# plus how many patients gained or lost it.
per_term_changes = pd.DataFrame({
    column: frame.sum(axis=0).astype(int)
    for column, frame in (
        ("freq_2022", m22a),
        ("freq_2025", m25a),
        ("n_patients_added", added),
        ("n_patients_removed", removed),
    )
})

# Net frequency change; positive means the term is more frequent in 2025.
per_term_changes["delta_freq_2025_minus_2022"] = per_term_changes["freq_2025"].sub(
    per_term_changes["freq_2022"]
)

# Ten terms gained by the most patients.
per_term_changes.sort_values(by="n_patients_added", ascending=False).iloc[:10]


## Top new / lost terms

In [None]:
# A term is "new" when no aligned 2022 profile carries it AND at least one
# aligned 2025 profile does; "lost" is the mirror image. The non-zero check on
# the other side matters: the aligned matrices keep the union of columns but
# only the common patient keys, so a column can be empty in BOTH versions.
# Without the check such a term would be listed as both new and lost.
top_new_terms = (
    per_term_changes[
        (per_term_changes["freq_2022"] == 0) & (per_term_changes["freq_2025"] > 0)
    ]
    .sort_values("freq_2025", ascending=False)
)

top_lost_terms = (
    per_term_changes[
        (per_term_changes["freq_2025"] == 0) & (per_term_changes["freq_2022"] > 0)
    ]
    .sort_values("freq_2022", ascending=False)
)

# Show the ten most frequent terms of each list.
display(top_new_terms.head(10))
display(top_lost_terms.head(10))


## To add

- Per-patient Jaccard similarity between 2022 and 2025
- Distribution plots (histograms) of additions/removals
- Comparison restricted to the HPO terms common to both versions
- Link between HPO changes and rank changes (in the run-comparison notebook)