# General description of \<matrix\>

## Loading Matrix

In [None]:
import pandas as pd
import numpy as np

# Read the EHR × HPO one-hot matrix from CSV; the first CSV column is used
# as the row index.
df_ehr_hpo = pd.read_csv(
    "../data/ohe_20kRennes_EHR_2025_03_10.csv",
    index_col=0,
)

# Preview the first rows (rendered as the cell output)
df_ehr_hpo.head()

## General Stats

In [None]:
# Matrix dimensions: rows are reported as EHRs, columns as HPO terms
n_ehrs = df_ehr_hpo.shape[0]
n_terms = df_ehr_hpo.shape[1]

print("=== Matrix Shape ===")
print(f"Number of EHRs (rows)        : {n_ehrs}")
print(f"Number of HPO terms (columns): {n_terms}\n")

# Number of columns holding each dtype
dtype_counts = df_ehr_hpo.dtypes.value_counts()
print("=== Column Types ===")
print(dtype_counts)


## Counting terms (by columns)

In [None]:
# Column-wise sum: one total per term
term_counts = df_ehr_hpo.sum(axis=0)

# Number of terms whose total is strictly positive
n_terms_observed = term_counts.gt(0).sum()

# Grand total over every term
total_term_occurrences = term_counts.sum()

# n_terms (total column count) comes from the matrix-shape cell
term_report = [
    "=== Term Statistics ===",
    f"Total terms (columns)                : {n_terms}",
    f"Observed terms (>0)                  : {n_terms_observed}",
    f"Unobserved terms (=0)                : {n_terms - n_terms_observed}",
    f"Total term occurrences (global sum)  : {int(total_term_occurrences)}",
]
print("\n".join(term_report))


## Terms per EHR (row-wise)

In [None]:
# Row-wise sum: one total per EHR
terms_per_ehr = df_ehr_hpo.sum(axis=1)

# Descriptive statistics (count, mean, std, quantiles, ...) of the per-EHR totals
print("=== HPO Terms per EHR ===")
per_ehr_summary = terms_per_ehr.describe()
display(per_ehr_summary)

# Totals for the first five rows
print("\nExamples (first 5 EHRs):")
first_ehrs = terms_per_ehr.head(5)
display(first_ehrs)


## Top frequent terms

In [None]:
TOP_N = 50

# Rank terms from highest to lowest total, then keep the leading TOP_N
ranked_terms = term_counts.sort_values(ascending=False)
top_terms = ranked_terms.head(TOP_N)

print(f"=== Top {TOP_N} Most Frequent Terms ===")
# Show as a one-column table whose column is labelled "count"
top_terms_table = top_terms.to_frame(name="count")
display(top_terms_table)


## Key (index) information

In [None]:
# Report how many row keys (index labels) the matrix has, how many are
# distinct, and list a sample of the keys that occur more than once.
print("=== Key / Index Information ===")
n_keys = len(df_ehr_hpo.index)
# dropna=False: Index.nunique() ignores NaN by default, so a single missing
# key would otherwise be reported as a duplicate even though the duplicate
# listing below would come back empty.
n_unique_keys = df_ehr_hpo.index.nunique(dropna=False)

print(f"Number of keys (rows)        : {n_keys}")
print(f"Unique keys                  : {n_unique_keys}")
print(f"Duplicated keys              : {n_keys - n_unique_keys}")

if n_keys != n_unique_keys:
    # duplicated() flags every occurrence after the first; unique() collapses
    # the flagged labels so each duplicated key is listed once.
    duplicates = df_ehr_hpo.index[df_ehr_hpo.index.duplicated()].unique()
    print("\nDuplicated keys (sample):")
    print(duplicates[:20])


## Simple quality checks / detection

#### EHRs without any terms

In [None]:
# Keep only the EHRs whose row total is exactly zero
has_no_terms = terms_per_ehr.eq(0)
ehr_zero_terms = terms_per_ehr.loc[has_no_terms]

print("=== EHRs With No HPO Terms ===")
print(f"Count: {len(ehr_zero_terms)}")

# Show a few of them when there are any
if not ehr_zero_terms.empty:
    print("\nSample:")
    print(ehr_zero_terms.head())


#### Terms never observed

In [None]:
# Keep only the terms whose column total is exactly zero
never_observed = term_counts.eq(0)
terms_zero = term_counts.loc[never_observed]

print("=== Terms Never Observed ===")
print(f"Count: {len(terms_zero)}")

# Show a few of them when there are any
if not terms_zero.empty:
    print("\nSample:")
    print(terms_zero.head())


#### Rare term detection

In [None]:
RARE_THRESHOLD = 5  # adjustable

# Rare terms: observed at least once, but no more than RARE_THRESHOLD times.
# Both conditions are combined into a single boolean mask built on the full
# Series, instead of chaining two [] selections, which would apply a
# full-length mask to an already-filtered Series and depend on implicit
# index re-alignment.
is_rare = (term_counts > 0) & (term_counts <= RARE_THRESHOLD)
rare_terms = term_counts[is_rare]

print(f"=== Rare Terms (≤ {RARE_THRESHOLD} occurrences) ===")
print(f"Count: {len(rare_terms)}")

# Show a few of them when there are any
if len(rare_terms) > 0:
    print("\nSample:")
    print(rare_terms.head())
