In [1]:
import random
import pandas as pd

In [40]:
# Number of augmentations and size of each HCP subset.
n_augmentations = 10
subset_size = 17

# Single root for every file used in this notebook: change it here only
# when the project folder moves.
data_utils_dir = '/Users/emmatosato/Documents/PhD/ANM_Verona/data_utils'
debug_dir = f"{data_utils_dir}/debugging"

# List of HCP IDs (173 entries according to the original note)
hcp_list_path = f"{debug_dir}/list_HCP.txt"

# Output text reports
path_missing_report = f"{debug_dir}/report_hcp_mancanti.txt"
path_final_report = f"{debug_dir}/final_report.txt"

# CSV with the missing SCA files
path_sca_missing = f"{debug_dir}/missing_SCA_files.csv"

# CSV summarising the augmentations
csv_aug_path = f"{data_utils_dir}/metadata/aug_tracking.csv"

## HCP List

Controllo se il metodo utilizzato nella funzione augmentation.py funziona (ma anche là ci sono dei controlli)

In [35]:
# Read the HCP list, one ID per line. Blank lines are skipped so that an
# empty string can never end up in the pool (and then in a subset).
with open(hcp_list_path, "r") as f:
    stripped_lines = (line.strip() for line in f)
    hcp_pool = [hcp_id for hcp_id in stripped_lines if hcp_id]

# Fixed seed so the draw below is reproducible.
random.seed(42)

# Sample n_augmentations * subset_size pool entries without replacement;
# random.sample raises ValueError if the pool is smaller than that.
shuffled_hcp = random.sample(hcp_pool, n_augmentations * subset_size)

# Cut the sample into consecutive, non-overlapping slices of subset_size.
hcp_subsets = [
    shuffled_hcp[i * subset_size : (i + 1) * subset_size]
    for i in range(n_augmentations)
]

Controlla che ogni subset sia di 17

In [55]:
# Report, subset by subset (numbered from 1), whether it holds exactly
# subset_size entries.
for position, group in enumerate(hcp_subsets, start=1):
    group_size = len(group)
    if group_size == subset_size:
        print(f"Subset {position} OK ({group_size} HCP)")
    else:
        print(f"Subset {position} ha una lunghezza errata: {group_size} elementi")

Subset 1 OK (17 HCP)
Subset 2 OK (17 HCP)
Subset 3 OK (17 HCP)
Subset 4 OK (17 HCP)
Subset 5 OK (17 HCP)
Subset 6 OK (17 HCP)
Subset 7 OK (17 HCP)
Subset 8 OK (17 HCP)
Subset 9 OK (17 HCP)
Subset 10 OK (17 HCP)


Controlla quali sono quelli che rimangono fuori

In [59]:
# IDs in the pool that were not drawn into any subset.
hcp_pool_set = set(hcp_pool)
used_hcp = set(shuffled_hcp)
excluded_hcp = hcp_pool_set.difference(used_hcp)

print("Lista HCP esclusi:", sorted(excluded_hcp))

Lista HCP esclusi: ['146432', '178647', '826353']


Controlla overlap

In [37]:
# Concatenate every subset into one flat list.
all_hcps = []
for subset in hcp_subsets:
    all_hcps.extend(subset)

# A repeated ID makes the set smaller than the list.
has_overlap = len(set(all_hcps)) != len(all_hcps)
if has_overlap:
    print("Warning: there are overlapping HCPs between subsets.")
else:
    print("Subsets are disjoint: no overlaps.")

Subsets are disjoint: no overlaps.


## Augmentation Check

In [14]:
# Load the augmentation tracking table; later cells read its "subject",
# "augmentation" and "hcp_subset" columns.
aug_track = pd.read_csv(csv_aug_path)

In [15]:
# Preview: first five rows, at most the first five columns.
aug_track.iloc[:5, :5]

Unnamed: 0,subject,augmentation,hcp_subset
0,002_S_4654,1,"901139,145834,109123,191336,180533,177645,1559..."
1,002_S_4654,2,"601127,833249,943862,757764,169747,901442,3800..."
2,002_S_4654,3,"205220,905147,872764,257845,135124,221319,2124..."
3,002_S_4654,4,"199655,397760,389357,167440,192641,671855,2145..."
4,002_S_4654,5,"910241,177746,200311,436845,131722,169040,1670..."


Controllo che ogni soggetto abbia 10 augmentation

In [16]:
# Number of tracked augmentation rows for each subject.
augment_per_subject = aug_track.groupby("subject")["augmentation"].count()

# Subjects whose count differs from the configured n_augmentations
# (instead of a hard-coded 10, so the check follows the config cell).
subjects_missing = augment_per_subject[augment_per_subject != n_augmentations]

if subjects_missing.empty:
    print(f"Tutti i soggetti hanno esattamente {n_augmentations} augmentation.")
else:
    print(f"Alcuni soggetti NON hanno {n_augmentations} augmentation:")
    display(subjects_missing)

Tutti i soggetti hanno esattamente 10 augmentation.


Controllo che ogni augmentation abbia lo stesso numero di soggetti HCP per tutti i pazienti

In [28]:
# Number of comma-separated HCP IDs recorded on each row of aug_track.
# The vectorised .str accessor yields NaN for a missing cell instead of
# raising, as calling .split on a NaN inside a lambda would.
aug_track["hcp_count"] = aug_track["hcp_subset"].str.split(",").str.len()

# Rows whose HCP count differs from the configured subset_size
# (a NaN count compares as different, so such rows are flagged too).
invalid_hcp_counts = aug_track[aug_track["hcp_count"] != subset_size]

# Output
if invalid_hcp_counts.empty:
    print(f"Tutte le augmentation hanno esattamente {subset_size} HCP.")
else:
    print(f"Alcune augmentation non hanno {subset_size} HCP")

Alcune augmentation non hanno 17 HCP:


Escludendo i soggetti sopra, controllo che quelli con 17 HCP siano coerenti tra loro, ovvero che per ogni soggetto gli HCP usati nelle varie augmentation siano uguali. Quindi:
- **Sub1**
    - *aug 1*: hcp1, hcp2, hcp3
    - *aug 2*: hcp4, hcp5, hcp6
    -  ...
...
- **Sub2**
    - *aug 1*: hcp1, hcp2, hcp3
    - *aug 2*: hcp4, hcp5, hcp6
    -  ...

In [21]:
# Keep only the rows (not whole subjects) whose HCP count matches the
# configured subset_size; work on a copy so aug_track is left untouched.
valid_aug_track = aug_track[aug_track["hcp_count"] == subset_size].copy()

# For each augmentation, take the most frequent hcp_subset string as the
# reference. The comparison below is on the raw string, so the same IDs in
# a different order count as a different subset.
ref_hcp_by_aug = valid_aug_track.groupby("augmentation")["hcp_subset"].agg(lambda x: x.mode().iloc[0])

# Attach the reference subset to every row.
valid_aug_track["expected_hcp_subset"] = valid_aug_track["augmentation"].map(ref_hcp_by_aug)

# Rows whose subset differs from the reference of their augmentation.
df_inconsistent = valid_aug_track[valid_aug_track["hcp_subset"] != valid_aug_track["expected_hcp_subset"]]

# Final output
if df_inconsistent.empty:
    print("Gli stessi HCP sono stati usati per ogni augmentation nei soggetti validi.")
else:
    print("Alcune augmentation hanno gruppi HCP diversi tra soggetti (solo tra quelli validi):")
    display(df_inconsistent[["subject", "augmentation", "hcp_subset", "expected_hcp_subset"]].sort_values(["augmentation", "subject"]))


Gli stessi HCP sono stati usati per ogni augmentation nei soggetti validi.


Soggetti che contengono incoerenze

In [26]:
# Distinct subjects that own at least one row flagged in invalid_hcp_counts.
flagged_subjects = invalid_hcp_counts["subject"].unique()
print("Soggetti con inconsistenze negli HCP subset:")
print(flagged_subjects)

Soggetti con inconsistenze negli HCP subset:
['3_S_5003' '4_S_5003' '4_S_5005' '4_S_5007' '4_S_5008']


Vedi file per controllare le mancanze

In [30]:
def get_missing_hcps(expected, actual):
    """Return the sorted IDs listed in `expected` but absent from `actual`.

    Both arguments are comma-separated strings; tokens are compared exactly
    as written (no whitespace stripping), and duplicates are collapsed.
    """
    present = frozenset(actual.split(","))
    absent = {hcp for hcp in expected.split(",") if hcp not in present}
    return sorted(absent)

# Expected subset for each augmentation: the most frequent hcp_subset
# string among all rows that share that augmentation value.
aug_track["expected_hcp_subset"] = aug_track["augmentation"].map(
    aug_track.groupby("augmentation")["hcp_subset"].agg(lambda x: x.mode().iloc[0])
)

# Rows whose HCP count differs from the configured subset_size
# (instead of a hard-coded 17, so the filter follows the config cell).
incomplete_rows = aug_track[aug_track["hcp_count"] != subset_size]

# Write one section per subject: a "Soggetto <id>" header, then for every
# incomplete augmentation the expected HCPs that are missing, then a
# separator line. Augmentations with nothing missing are not listed.
with open(path_missing_report, "w") as f:
    for subject in incomplete_rows["subject"].unique():
        f.write(f"Soggetto {subject}\n")
        subj_df = incomplete_rows[incomplete_rows["subject"] == subject]
        for _, row in subj_df.iterrows():
            missing = get_missing_hcps(row["expected_hcp_subset"], row["hcp_subset"])
            if missing:
                f.write(f"Augmentation {int(row['augmentation'])}\n")
                f.write(f"File HCP mancanti: {', '.join(missing)}\n\n")
        f.write("-" * 40 + "\n\n")

## Comparing HCPs and SCA files missing

HCP missing as SCA files in the dataset (Lorenzo folder) and here

In [51]:
import pandas as pd

# Load the CSV that lists, per subject, the missing SCA files.
sca_missing_df = pd.read_csv(path_sca_missing)

# Read back the missing-HCP report: "Soggetto <id>" headers followed by
# "File HCP mancanti: a, b, c" lines.
with open(path_missing_report, "r") as file:
    report_lines = file.readlines()


def _split_ids(raw, sep=","):
    """Split a delimited string of IDs into a set of stripped, non-empty tokens.

    A non-string value (e.g. the NaN pandas yields for an empty CSV cell) is
    treated as "no IDs" instead of raising AttributeError on `.split`.
    """
    if not isinstance(raw, str):
        return set()
    tokens = (token.strip() for token in raw.split(sep))
    return {token for token in tokens if token}


def _summarise_subject(subject, report_missing, sca_df):
    """Compare one subject's report-missing HCPs with its missing SCA files.

    Parameters:
        subject: subject ID as written in the report.
        report_missing: set of HCP IDs the report lists as missing.
        sca_df: DataFrame with "subject" and "SCA_files_missing" columns.

    Returns:
        (lines, unexpected): the text lines for the final report and the set
        of IDs missing as SCA but not listed in the report. Both are empty
        when the subject has no row in `sca_df`.
    """
    sca_row = sca_df[sca_df["subject"] == subject]
    if sca_row.empty:
        return [], set()

    sca_missing = _split_ids(sca_row.iloc[0]["SCA_files_missing"])
    expected_and_missing = report_missing & sca_missing
    unexpected_and_missing = sca_missing - report_missing

    lines = [
        f"Soggetto {subject}\n",
        f"File attesi e mancanti come SCA: {', '.join(sorted(expected_and_missing))}\n",
        f"File NON attesi e mancanti come SCA: {', '.join(sorted(unexpected_and_missing))}\n\n",
    ]
    return lines, unexpected_and_missing


results = []
current_subject = None
all_missing_hcp = set()
unexpected_missing_total = set()

for line in report_lines:
    line = line.strip()
    if line.startswith("Soggetto"):
        # A new header closes the previous subject's section.
        if current_subject:
            subject_lines, unexpected = _summarise_subject(
                current_subject, all_missing_hcp, sca_missing_df
            )
            results.extend(subject_lines)
            unexpected_missing_total.update(unexpected)
            all_missing_hcp.clear()

        current_subject = line.split()[1]

    elif line.startswith("File HCP mancanti"):
        all_missing_hcp.update(_split_ids(line.replace("File HCP mancanti: ", "")))

# The last subject has no following header to close it, so summarise it here.
if current_subject:
    subject_lines, unexpected = _summarise_subject(
        current_subject, all_missing_hcp, sca_missing_df
    )
    results.extend(subject_lines)
    unexpected_missing_total.update(unexpected)

# Append the overall list of IDs missing as SCA but not listed in the report.
results.append("Lista totale dei file NON attesi e mancanti come SCA:\n")
results.append(f"{', '.join(sorted(unexpected_missing_total))}\n")

# Write the final report.
with open(path_final_report, "w") as file:
    file.writelines(results)

In [54]:
# IDs found in SCA_files_missing but not listed as missing in the HCP report,
# accumulated over all subjects.
unexpected_missing_total

{'146432', '178647', '826353'}

Coincidono con quelli esclusi dalla divisione pre-augmentation?

In [60]:
# True when the IDs left out of every HCP subset are exactly the IDs that
# are missing as SCA files without being listed in the HCP report.
excluded_hcp == unexpected_missing_total

True