## Transformation: document / variant level -> patient level

In [None]:
####### 
# Per-document average sentence length, delegated to avg_sentence_length (defined elsewhere).
# NOTE(review): presumably CLEAN_FR_SPLIT holds the sentence-split text of each document — confirm.
df_doc["avg_sentence_len_words"] = df_doc["CLEAN_FR_SPLIT"].apply(avg_sentence_length)

# HPO terms per sentence.
# Column-wise division replaces the former row-wise apply(axis=1): same values,
# but no Python call per row and it still yields a Series on an empty frame.
# Rows whose sentence count is not strictly positive (0 or missing) are set to 0.
df_doc["hpo_per_sentence"] = (
    df_doc["n_hpo"] / df_doc["n_sentences"]
).where(df_doc["n_sentences"] > 0, 0)

# HPO terms per 1000 words, with the same "no positive denominator -> 0" rule.
df_doc["hpo_per_1k_words"] = (
    df_doc["n_hpo"] / df_doc["n_words"] * 1000
).where(df_doc["n_words"] > 0, 0)


# Collapse the document-level table to one row per patient (index = PATIENT_ID).
# Named aggregation binds each output column to its source column and reducer,
# so the column names cannot drift out of sync with the aggregation spec the way
# a positional `columns = [...]` rename can.
patient_agg = df_doc.groupby("PATIENT_ID").agg(
    n_docs=("DOCUMENT_ID", "nunique"),
    t_start=("CREATED_AT", "min"),
    t_end=("CREATED_AT", "max"),
    n_words_total=("n_words", "sum"),
    n_sent_total=("n_sentences", "sum"),
    n_hpo_total=("n_hpo", "sum"),
    # NOTE(review): "first" keeps a single value per patient; this presumes
    # n_variants is constant across a patient's documents — confirm.
    n_variants=("n_variants", "first"),
)

# Whole days between the earliest and latest CREATED_AT of each patient.
# The .dt accessor requires CREATED_AT to be a datetime column.
patient_agg["span_days"] = (patient_agg["t_end"] - patient_agg["t_start"]).dt.days



###### 
from collections.abc import Iterable

def flatten_hpo(series, unique: bool = False):
    """Flatten a collection of HPO-code iterables into a single list.

    Args:
        series: Iterable (typically a pd.Series) whose elements are iterables
            of HPO codes. Elements that are not iterable, and str/bytes
            elements, are skipped.
        unique: If True, return the distinct codes in sorted order; otherwise
            return every occurrence in the order encountered.

    Returns:
        A list of HPO codes.
    """
    flat = []
    for entry in series:
        # str/bytes are iterable but would be split into characters; anything
        # non-iterable (e.g. a missing value) has no codes to contribute.
        if isinstance(entry, (str, bytes)) or not isinstance(entry, Iterable):
            continue
        flat.extend(entry)

    if not unique:
        return flat
    return sorted(set(flat))


# Per-patient groups of the document-level HPO code lists
grouped = df_doc.groupby("PATIENT_ID")["HPO_code"]

# Every HPO occurrence across a patient's documents (repetitions kept)
hpo_full_list = grouped.apply(flatten_hpo)

# Distinct HPO codes per patient, sorted
hpo_unique_list = grouped.apply(flatten_hpo, unique=True)

# Attach both lists to patient_agg; the join aligns on the PATIENT_ID index
patient_agg = (
    patient_agg
    .join(hpo_full_list.rename("HPO_full_list"))
    .join(hpo_unique_list.rename("HPO_unique_list"))
)

# List lengths via .str.len(), which also works on list-valued cells
patient_agg["n_hpo_full_list"] = patient_agg["HPO_full_list"].str.len()
patient_agg["n_hpo_unique"] = patient_agg["HPO_unique_list"].str.len()

# In every ratio below a zero denominator is replaced by NaN first, so the
# result is NaN rather than inf.

# HPO occurrences per document
patient_agg["hpo_total_per_doc"] = patient_agg["n_hpo_full_list"].div(
    patient_agg["n_docs"].replace(0, np.nan)
)

# Distinct HPO codes per document
patient_agg["hpo_unique_per_doc"] = patient_agg["n_hpo_unique"].div(
    patient_agg["n_docs"].replace(0, np.nan)
)

# HPO occurrences per sentence
patient_agg["hpo_per_sentence"] = patient_agg["n_hpo_full_list"].div(
    patient_agg["n_sent_total"].replace(0, np.nan)
)

# HPO occurrences per 1000 words
patient_agg["hpo_per_1k_words"] = patient_agg["n_hpo_full_list"].div(
    patient_agg["n_words_total"].replace(0, np.nan)
).mul(1000)


##### 
def variant_status(n):
    """Map a variant count to a categorical label.

    Args:
        n: Number of variants; a missing value (as detected by pd.isna) is
            treated as 0. Non-missing values are truncated with int().

    Returns:
        "none" for 0, "mono" for 1, "poly" for any other value.
    """
    if pd.isna(n):
        return "none"
    labels = {0: "none", 1: "mono"}
    return labels.get(int(n), "poly")

# Label every patient by variant count ("none" / "mono" / "poly")
patient_agg["variant_status"] = patient_agg["n_variants"].apply(variant_status)

# Move PATIENT_ID from the index back to a regular column.
# (A stray trailing character after reset_index() made this line a SyntaxError.)
df_patient = patient_agg.reset_index()

In [None]:
### Agg

In [None]:
def _flatten_lists(nested):
    """Concatenate a group's per-row lists into one flat list (order and repeats kept)."""
    return [item for sub in nested for item in sub]


# One row per patient: every HPO code / name found across the patient's rows
agg_hpo = df.groupby('PATIENT_ID').agg({
    'HPO_unique_code': _flatten_lists,   # full list of codes
    'HPO_unique_name': _flatten_lists,   # full list of names
})

## De-duplicated versions
# dict.fromkeys keeps first-occurrence order, so the output is reproducible from
# run to run, whereas list(set(...)) returns the elements in arbitrary order.
# NOTE(review): if codes and names are in one-to-one correspondence, the two
# unique lists also stay positionally aligned — confirm against the data.
agg_hpo['hpo_unique_list'] = agg_hpo['HPO_unique_code'].apply(lambda x: list(dict.fromkeys(x)))
agg_hpo['hpo_unique_list_name'] = agg_hpo['HPO_unique_name'].apply(lambda x: list(dict.fromkeys(x)))

# Rename the flattened source columns to their final names
agg_hpo = agg_hpo.rename(columns={
    'HPO_unique_code': 'hpo_full_list',
    'HPO_unique_name': 'hpo_full_list_name',
})


## Aggregation of the document-level data (one row per patient)
# NOTE(review): n_documents uses 'count' (non-null rows), which equals the number
# of distinct documents only if df has exactly one row per document — confirm.
agg_doc = df.groupby('PATIENT_ID').agg(
    n_documents=pd.NamedAgg(column='DOCUMENT_ID', aggfunc='count'),
    first_doc_date=pd.NamedAgg(column='CREATED_AT', aggfunc='min'),
    last_doc_date=pd.NamedAgg(column='CREATED_AT', aggfunc='max'),
    total_words=pd.NamedAgg(column='n_words', aggfunc='sum'),
    mean_words_per_doc=pd.NamedAgg(column='n_words', aggfunc='mean'),
    max_words_doc=pd.NamedAgg(column='n_words', aggfunc='max'),
    total_sentences=pd.NamedAgg(column='n_sentences', aggfunc='sum'),
    mean_avg_sentence_len_words=pd.NamedAgg(column='avg_sentence_len_words', aggfunc='mean'),
    # mean of the per-row HPO_unique_len values
    hpo_total_per_doc=pd.NamedAgg(column='HPO_unique_len', aggfunc='mean'),
)

## Feature engineering
# Combine the document-level aggregates with the per-patient HPO lists
# (both frames are indexed by PATIENT_ID).
agg = agg_doc.join(agg_hpo)

# Number of HPO terms (all occurrences / distinct)
agg['n_hpo_full_list'] = agg['hpo_full_list'].map(len)
agg['n_hpo_unique_list'] = agg['hpo_unique_list'].map(len)

# Ratios. Zero denominators are masked to NaN first, so an undefined ratio shows
# up as NaN instead of inf (inf would silently distort means and plots).
agg['hpo_unique_per_doc'] = (
    agg['n_hpo_unique_list'] / agg['n_documents'].mask(agg['n_documents'] == 0)
)
agg['hpo_per_sentence'] = (
    agg['n_hpo_full_list'] / agg['total_sentences'].mask(agg['total_sentences'] == 0)
)
agg['hpo_per_1k_words'] = (
    agg['n_hpo_full_list'] / (agg['total_words'].mask(agg['total_words'] == 0) / 1000)
)

# Temporal information
agg['followup_days'] = (agg['last_doc_date'] - agg['first_doc_date']).dt.days
# A patient whose first and last documents share the same date has a follow-up
# of 0 days; the yearly rate is undefined there and is left as NaN.
agg['docs_per_year'] = agg['n_documents'] / (
    agg['followup_days'].mask(agg['followup_days'] == 0) / 365.25
)

# Patient age: smallest and largest 'PATIENT AGE' recorded for the patient
agg_age = df.groupby('PATIENT_ID').agg(
    patient_age_first = ('PATIENT AGE', 'min'),
    patient_age_last = ('PATIENT AGE', 'max'),
)
agg = agg.join(agg_age)



In [None]:
import matplotlib.pyplot as plt

# Two side-by-side histograms over patients: follow-up duration and document count
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (axis, column of agg, bar colour, title, x-axis label) for each panel
panels = [
    (axes[0], "followup_days", "steelblue", "Follow-up distribution", "Follow-up (days)"),
    (axes[1], "n_documents", "darkorange", "Number of documents per patient", "Number of documents"),
]

for ax, column, color, title, xlabel in panels:
    ax.hist(agg[column], bins=30, color=color, edgecolor="black", alpha=0.8)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Number of patients")

plt.tight_layout()
plt.show()
