In [1]:
import random
import pandas as pd

In [2]:
# --- Inputs ---
hcp_list_path = '/data_utils/debugging/list_HCP.txt'    # text file listing the HCP IDs
csv_aug_path = '/data_utils/metadata/aug_tracking.csv'  # augmentation tracking table

# --- Sampling parameters ---
n_augmentations = 10  # number of HCP subsets to draw
subset_size = 17      # number of HCP IDs per subset

# --- Outputs ---
output_txt = "/Users/emmatosato/Documents/PhD/ANM_Verona/data_utils/hcp_missing_report.txt"
# NOTE(review): the saved output of the CSV-export cell shows a different
# location than this value -- confirm which path is the current one.
csv_missing_path = "/data_utils/debugging/missing_hcp.csv"

## HCP List

In [3]:
# Load the pool of HCP IDs: one entry per line, surrounding whitespace removed
with open(hcp_list_path, "r") as f:
    hcp_pool = [raw_line.strip() for raw_line in f]

# Fixed seed so the same draw is reproduced on every run
random.seed(42)

# Draw every ID needed in a single sample without replacement, then cut that
# draw into consecutive chunks so that the subsets do NOT overlap
shuffled_hcp = random.sample(hcp_pool, n_augmentations * subset_size)

hcp_subsets = []
for aug_idx in range(n_augmentations):
    chunk_start = aug_idx * subset_size
    hcp_subsets.append(shuffled_hcp[chunk_start : chunk_start + subset_size])

In [5]:
# Show the generated subsets: one list of HCP IDs per augmentation
hcp_subsets

[['901139',
  '145834',
  '109123',
  '191336',
  '180533',
  '177645',
  '155938',
  '140117',
  '725751',
  '134627',
  '818859',
  '381038',
  '114823',
  '111514',
  '134829',
  '176542',
  '178142'],
 ['601127',
  '833249',
  '943862',
  '757764',
  '169747',
  '901442',
  '380036',
  '177140',
  '401422',
  '191841',
  '102311',
  '159239',
  '898176',
  '209228',
  '771354',
  '158136',
  '871762'],
 ['205220',
  '905147',
  '872764',
  '257845',
  '135124',
  '221319',
  '212419',
  '186949',
  '116726',
  '429040',
  '146735',
  '251833',
  '115825',
  '942658',
  '157336',
  '365343',
  '200210'],
 ['199655',
  '397760',
  '389357',
  '167440',
  '192641',
  '671855',
  '214524',
  '878877',
  '108323',
  '203418',
  '146129',
  '263436',
  '550439',
  '572045',
  '393247',
  '118225',
  '169343'],
 ['910241',
  '177746',
  '200311',
  '436845',
  '131722',
  '169040',
  '167036',
  '706040',
  '204521',
  '150423',
  '214019',
  '751550',
  '200614',
  '115017',
  '197348',


In [4]:
# Concatenate every subset into one flat list of IDs
all_hcps = sum(hcp_subsets, [])

# An ID shared by two subsets collapses inside the set, which then ends up
# shorter than the flat list
has_overlap = len(set(all_hcps)) < len(all_hcps)
print(
    "Warning: there are overlapping HCPs between subsets."
    if has_overlap
    else "Subsets are disjoint: no overlaps."
)

Subsets are disjoint: no overlaps.


## Augmentation Check

In [3]:
# Load the augmentation tracking table
# (presumably one row per subject/augmentation pair -- confirm against the CSV)
df = pd.read_csv(csv_aug_path)

In [10]:
# Preview the top-left corner of the table (at most 5 rows and 5 columns)
df.iloc[:5,:5]

Unnamed: 0,subject,augmentation,hcp_subset,expected_hcp_subset
0,002_S_4654,1,"901139,145834,109123,191336,180533,177645,1559...","901139,145834,109123,191336,180533,177645,1559..."
1,002_S_4654,2,"601127,833249,943862,757764,169747,901442,3800...","601127,833249,943862,757764,169747,901442,3800..."
2,002_S_4654,3,"205220,905147,872764,257845,135124,221319,2124...","205220,905147,872764,257845,135124,221319,2124..."
3,002_S_4654,4,"199655,397760,389357,167440,192641,671855,2145...","199655,397760,389357,167440,192641,671855,2145..."
4,002_S_4654,5,"910241,177746,200311,436845,131722,169040,1670...","910241,177746,200311,436845,131722,169040,1670..."


In [11]:
# 1. Check: every subject must have exactly `n_augmentations` augmentations.
# Both the number of rows and the number of distinct augmentation indices are
# compared with the configured value: counting rows alone would accept a
# subject that has one index repeated and another one missing.
aug_by_subject = df.groupby("subject")["augmentation"]
augment_per_subject = aug_by_subject.count()
distinct_per_subject = aug_by_subject.nunique()

is_complete = (augment_per_subject == n_augmentations) & (distinct_per_subject == n_augmentations)
subjects_missing = augment_per_subject[~is_complete]

if subjects_missing.empty:
    print(f"Tutti i soggetti hanno esattamente {n_augmentations} augmentation.")
else:
    print(f"Alcuni soggetti NON hanno {n_augmentations} augmentation:")
    # Show rows and distinct indices side by side so that duplicates are visible
    display(
        pd.DataFrame({"rows": augment_per_subject, "distinct": distinct_per_subject})[~is_complete]
    )

Tutti i soggetti hanno esattamente 10 augmentation.


In [14]:
# 2. Check: within each augmentation, every subject must have been given the same HCP subset.
# The reference subset of an augmentation is its most frequent `hcp_subset`
# string (first entry returned by `mode()`).
expected_hcp_by_aug = (
    df.groupby("augmentation")["hcp_subset"]
    .agg(lambda subset_strings: subset_strings.mode().iloc[0])
)
df["expected_hcp_subset"] = df["augmentation"].map(expected_hcp_by_aug)

# Rows whose recorded subset differs from the reference of their augmentation
differs_from_expected = df["hcp_subset"].ne(df["expected_hcp_subset"])
df_inconsistent = df[differs_from_expected]

if not df_inconsistent.empty:
    print("Alcune augmentation hanno gruppi HCP diversi tra soggetti")
else:
    print("Gli stessi HCP sono stati usati per ogni augmentation in tutti i soggetti.")

Alcune augmentation hanno gruppi HCP diversi tra soggetti


In [15]:
# Distinct subjects that own at least one inconsistent row
incosistent_subject = df_inconsistent["subject"].unique()
print("Soggetti con inconsistenze negli HCP subset:", incosistent_subject, sep="\n")

Soggetti con inconsistenze negli HCP subset:
['3_S_5003' '4_S_5003' '4_S_5005' '4_S_5007' '4_S_5008']


In [16]:
# Recorded HCP subset of the first inconsistent row
df_inconsistent["hcp_subset"].iloc[0]

'901139,109123,725751,134627,381038,111514,134829,176542'

In [17]:
# Reference HCP subset of the first inconsistent row
df_inconsistent["expected_hcp_subset"].iloc[0]

'901139,145834,109123,191336,180533,177645,155938,140117,725751,134627,818859,381038,114823,111514,134829,176542,178142'

In [24]:
def _split_ids(value):
    """Parse a comma-separated ID cell into a set of non-empty, stripped ID strings.

    A missing cell (None or NaN, which is how pandas reads an empty CSV field)
    yields an empty set instead of raising on `.split`.
    """
    if pd.isna(value):
        return set()
    return {token.strip() for token in str(value).split(",") if token.strip()}


def get_missing_hcps(expected, actual):
    """Return the sorted IDs listed in `expected` that are absent from `actual`.

    Args:
        expected: comma-separated string of the IDs that should be present.
        actual: comma-separated string of the IDs actually recorded.
            Either argument may be None/NaN or an empty string, in which
            case it contributes no IDs.

    Returns:
        A sorted list of ID strings (possibly empty).
    """
    return sorted(_split_ids(expected) - _split_ids(actual))

# Report: for every subject with at least one inconsistent augmentation, write
# the HCP IDs expected for that augmentation but absent from the recorded subset.
# The mismatch filter does not depend on the subject, so it is evaluated once
# and the mismatching rows are grouped by subject up front, instead of
# re-filtering the whole table on every loop iteration.
mismatch_rows = df[df["hcp_subset"] != df["expected_hcp_subset"]]
mismatch_by_subject = {
    subj: subj_rows for subj, subj_rows in mismatch_rows.groupby("subject", sort=False)
}

with open(output_txt, "w") as f:
    # Subjects are visited in their order of first appearance in the full table
    for subject in df["subject"].unique():
        subj_df = mismatch_by_subject.get(subject)
        if subj_df is None:
            # No inconsistent augmentation for this subject
            continue
        f.write(f"soggetto {subject}\n")
        for _, row in subj_df.iterrows():
            missing = get_missing_hcps(row["expected_hcp_subset"], row["hcp_subset"])
            f.write(f"Augmentation {int(row['augmentation'])}\n")
            f.write(f"File HCP mancanti: {','.join(missing)}\n\n")
        f.write("-" * 30 + "\n\n")

print(f"Report corretto salvato in: {output_txt}")

Report corretto salvato in: /Users/emmatosato/Documents/PhD/ANM_Verona/data_utils/hcp_missing_report.txt


In [32]:
# Dictionary: subject -> set of missing HCP IDs, accumulated over all of the
# subject's inconsistent augmentations
missing_dict = {}

inconsistent_triples = zip(
    df_inconsistent["subject"],
    df_inconsistent["expected_hcp_subset"],
    df_inconsistent["hcp_subset"],
)
for subject, expected_ids, actual_ids in inconsistent_triples:
    missing_dict.setdefault(subject, set()).update(get_missing_hcps(expected_ids, actual_ids))

# Build the DataFrame (one row per subject, IDs sorted and comma-joined) and save it
rows = [
    {"subject": subj, "missing_hcp": ",".join(sorted(subj_missing))}
    for subj, subj_missing in missing_dict.items()
]
df_missing = pd.DataFrame(rows)
df_missing.to_csv(csv_missing_path, index=False)

print(f"CSV con HCP mancanti salvato in: {csv_missing_path}")

CSV con HCP mancanti salvato in: /Users/emmatosato/Documents/PhD/ANM_Verona/data_utils/missing_hcp.csv


## Comparing missing HCP and SCA files

In [4]:
# File paths
missing_sca_path = "/Users/emmatosato/Documents/PhD/ANM_Verona/data_utils/debugging/missing_SCA_files.csv"
missing_hcp_path = "/Users/emmatosato/Documents/PhD/ANM_Verona/data_utils/debugging/missing_hcp.csv"
output_txt_path = "/Users/emmatosato/Documents/PhD/ANM_Verona/data_utils/debugging/final_comparation.txt"


def _parse_id_set(value):
    """Parse a comma-separated ID cell into a set of non-empty, stripped ID strings.

    A missing cell (None or NaN, which is how pandas reads an empty CSV field)
    yields an empty set instead of raising on `.split`.
    """
    if pd.isna(value):
        return set()
    return {token.strip() for token in str(value).split(",") if token.strip()}


# Load the CSVs. dtype=str keeps every ID as text, even when a column happens
# to contain only single numeric IDs (which type inference would turn into ints).
df_sca = pd.read_csv(missing_sca_path, dtype=str)  # missing SCA files per subject
df_hcp = pd.read_csv(missing_hcp_path, dtype=str)  # missing HCPs per subject, after augmentation

# First missing-HCP cell recorded for each subject, looked up in O(1) below
hcp_missing_by_subject = {}
for hcp_subject, raw_hcp_ids in zip(df_hcp["subject"], df_hcp["missing_hcp"]):
    hcp_missing_by_subject.setdefault(hcp_subject, raw_hcp_ids)

# Global set accumulating every SCA ID that is not among the missing HCPs
all_sca_not_in_hcp = set()

# Open the output file
with open(output_txt_path, "w") as f:
    # Walk the SCA table row by row (numbered from 1), so that each row is
    # compared with its own cell even if a subject appears more than once
    subject_rows = zip(df_sca["subject"], df_sca["SCA_files_missing"])
    for idx, (subject, raw_sca_ids) in enumerate(subject_rows, start=1):
        # Parse the comma-separated cells into sets; a subject without an
        # entry in the HCP table gets an empty set
        sca_missing = _parse_id_set(raw_sca_ids)
        hcp_missing = _parse_id_set(hcp_missing_by_subject.get(subject))

        f.write(f"{idx}) Subject: {subject}\n")

        # IDs missing among the SCA files but not among the missing HCPs
        sca_not_in_hcp = []

        # One-to-one check of every SCA ID against the subject's missing HCPs
        for sca_id in sorted(sca_missing):
            if sca_id in hcp_missing:
                f.write(f"- Missing SCA {sca_id} --> Missing hcp after augmentation\n")
            else:
                f.write(f"- Missing SCA {sca_id} --> NOT Missing hcp after augmentation\n")
                sca_not_in_hcp.append(sca_id)
                all_sca_not_in_hcp.add(sca_id)

        # HCPs missing after augmentation whose SCA file is not missing, if any
        extra_hcp = sorted(hcp_missing - sca_missing)
        if extra_hcp:
            f.write("\n------------------\n")
            f.write("HCP missing after augmentation BUT not in SCA missing:\n")
            f.write(", ".join(extra_hcp) + "\n")

        # Per-subject summary of the IDs collected above
        if sca_not_in_hcp:
            f.write("\nSoggetti che NON mancano dopo augmentation ma di cui mancano gli SCA files:\n")
            f.write(", ".join(sca_not_in_hcp) + "\n")

        # Separator between subject blocks
        f.write("-" * 50 + "\n\n")

print(f"Comparazione completata e salvata in: {output_txt_path}")

Comparazione completata e salvata in: /Users/emmatosato/Documents/PhD/ANM_Verona/data_utils/debugging/final_comparation.txt


HCP IDs whose SCA files are missing but which are NOT reported as missing after augmentation

In [5]:
# IDs with a missing SCA file that were not among the same subject's missing
# HCPs, pooled over all subjects
all_sca_not_in_hcp

{'146432', '178647', '826353'}

## HCP subsets

In [6]:
# Hard-coded HCP ID subsets, one list per augmentation (aug1 ... aug10).
# NOTE(review): these look like a manual copy of the sampled subsets built
# earlier in the notebook -- confirm they are kept in sync with that sampling.
aug1 = ['901139', '145834', '109123', '191336', '180533', '177645', '155938', '140117', '725751', '134627', '818859', '381038', '114823', '111514', '134829', '176542', '178142']
aug2 = ['601127', '833249', '943862', '757764', '169747', '901442', '380036', '177140', '401422', '191841', '102311', '159239', '898176', '209228', '771354', '158136', '871762']
aug3 = ['205220', '905147', '872764', '257845', '135124', '221319', '212419', '186949', '116726', '429040', '146735', '251833', '115825', '942658', '157336', '365343', '200210']
aug4 = ['199655', '397760', '389357', '167440', '192641', '671855', '214524', '878877', '108323', '203418', '146129', '263436', '550439', '572045', '393247', '118225', '169343']
aug5 = ['910241', '177746', '200311', '436845', '131722', '169040', '167036', '706040', '204521', '150423', '214019', '751550', '200614', '115017', '197348', '330324', '132118']
aug6 = ['187345', '581450', '318637', '861456', '360030', '249947', '246133', '745555', '951457', '162935', '878776', '789373', '105923', '765864', '171633', '233326', '782561']
aug7 = ['144226', '193845', '562345', '181232', '899885', '346137', '130518', '148133', '541943', '126931', '283543', '352738', '192439', '128935', '178243', '385046', '239136']
aug8 = ['814649', '137128', '690152', '126426', '927359', '146937', '724446', '467351', '104416', '406836', '654552', '191033', '825048', '175237', '158035', '195041', '412528']
aug9 = ['173334', '196144', '627549', '926862', '100610', '165436', '644246', '525541', '130114', '156334', '169444', '164131', '573249', '198653', '463040', '164636', '125525']
aug10 = ['320826', '185442', '201515', '680957', '131217', '182739', '770352', '859671', '732243', '547046', '182436', '958976', '102816', '783462', '172130', '617748', '395756']

In [7]:
# Size of every hard-coded subset, in augmentation order
lengths = [
    len(subset_ids)
    for subset_ids in (aug1, aug2, aug3, aug4, aug5, aug6, aug7, aug8, aug9, aug10)
]

for aug_number, n_ids in enumerate(lengths, start=1):
    print(f"aug{aug_number} length: {n_ids}")


aug1 length: 17
aug2 length: 17
aug3 length: 17
aug4 length: 17
aug5 length: 17
aug6 length: 17
aug7 length: 17
aug8 length: 17
aug9 length: 17
aug10 length: 17


In [8]:
# All subsets, in augmentation order
augmentations = [aug1, aug2, aug3, aug4, aug5, aug6, aug7, aug8, aug9, aug10]

# Look for IDs that occur more than once inside a single subset
for i, aug in enumerate(augmentations, start=1):
    duplicates = []
    for candidate in set(aug):
        if aug.count(candidate) > 1:
            duplicates.append(candidate)

    report_line = (
        f"Duplicati trovati in aug{i}: {duplicates}"
        if duplicates
        else f"Nessun duplicato in aug{i}"
    )
    print(report_line)


Nessun duplicato in aug1
Nessun duplicato in aug2
Nessun duplicato in aug3
Nessun duplicato in aug4
Nessun duplicato in aug5
Nessun duplicato in aug6
Nessun duplicato in aug7
Nessun duplicato in aug8
Nessun duplicato in aug9
Nessun duplicato in aug10


In [10]:
# 1. Pool every ID used by any of the 10 subsets into a single set
all_aug_hcp = set().union(aug1, aug2, aug3, aug4, aug5, aug6, aug7, aug8, aug9, aug10)

# 2. Read the complete ID list from file (one entry per line, whitespace stripped)
with open("/Users/emmatosato/Documents/PhD/ANM_Verona/data_utils/debugging/list_HCP.txt", "r") as f:
    full_hcp_list = {entry.strip() for entry in f}

# 3. IDs listed in the file that never appear in any subset
missing_from_aug = sorted(full_hcp_list.difference(all_aug_hcp))

print("HCP presenti nel file ma ASSENTI nei 10 subset:")
for hcp in missing_from_aug:
    print(hcp)

print(f"\nTotale mancanti: {len(missing_from_aug)}")

HCP presenti nel file ma ASSENTI nei 10 subset:
146432
178647
826353

Totale mancanti: 3
