# Plot creation

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

def plot_regression_clean(x, y, xlabel='Functional Disconnection', ylabel='Cognition'):
    """Show a minimalist scatter plot of ``y`` against ``x`` with a fitted line.

    An ordinary least-squares line (``sklearn`` ``LinearRegression``) is fitted
    on the data and drawn dashed over the points. Tick marks, the grid and the
    top/right spines are removed so only the two labelled axes remain.

    Parameters
    ----------
    x, y : array-like
        Predictor and response values. Any 1-D array-like is accepted
        (list, pandas Series, ndarray); both are converted with ``np.asarray``.
    xlabel, ylabel : str
        Axis labels.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Convert up front so that lists / Series (which lack ndarray.reshape) work too
    x = np.asarray(x)
    y = np.asarray(y)

    # Fit the regression; scikit-learn expects a 2-D feature matrix
    model = LinearRegression()
    model.fit(x.reshape(-1, 1), y)

    # Evaluate the fitted line on 100 evenly spaced points spanning the data range
    x_vals = np.linspace(x.min(), x.max(), 100)
    y_vals = model.predict(x_vals.reshape(-1, 1))

    # Marker fill colour (#6FE6FC is a light cyan)
    dot_color = '#6FE6FC'

    fig, ax = plt.subplots(figsize=(6, 4))

    # Scatter: large markers with a thin black outline
    ax.scatter(
        x, y,
        s=70,
        color=dot_color,
        edgecolor='black',
        linewidth=0.6,
        alpha=0.9
    )

    # Dashed regression line
    ax.plot(x_vals, y_vals, linestyle='--', color='black', linewidth=1.5)

    # Keep only the bottom and left spines, drawn in black with an explicit width
    for side in ('bottom', 'left'):
        ax.spines[side].set_linewidth(1)
        ax.spines[side].set_edgecolor('black')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Axis labels; tick values are removed on both axes
    ax.set_xlabel(xlabel, fontsize=12, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    ax.set_xticks([])
    ax.set_yticks([])

    # No grid
    ax.grid(False)
    fig.tight_layout()
    plt.show()

In [None]:
# Example data: 60 synthetic points with a negative linear trend (slope -0.6,
# offset 5) plus Gaussian noise with standard deviation 2
np.random.seed(1)  # fixed seed so the example figure is reproducible
x = np.random.rand(60) * 10
y = -0.6 * x + np.random.normal(0, 2, 60) + 5

plot_regression_clean(x, y)

# Augmentation check

In [1]:
import random
import pandas as pd
import os

In [None]:
# Number of augmentations per subject and number of HCP subjects per augmentation
n_augmentations = 10
subset_size = 17

# Path to augmented FC maps directory
fcmaps_augmented_dir = '/data/OLD_FCmaps_augmented'

# Single place to change the machine-specific project location; every path
# below is derived from it and resolves to the same value as before
project_dir = '/Users/emmatosato/Documents/PhD/ANM_Verona'
debug_dir = f'{project_dir}/data_utils/debugging'

# List of 173 HCP subjects
hcp_list_path = f'{debug_dir}/list_HCP.txt'

# Output txt report
path_final_report = f"{debug_dir}/final_report.txt"

# Path to CSV file with missing SCA files
path_sca_missing = f"{debug_dir}/missing_SCA_files.csv"

# Summary CSV path of augmentation info
csv_aug_path = f'{project_dir}/data_utils/metadata/aug_tracking.csv'

## HCP List and Subsets

Check if the method used in augmentation.py works (note: it already has checks inside)

In [None]:
# Load the HCP subject list, one stripped entry per line of the file
with open(hcp_list_path, "r") as hcp_file:
    hcp_pool = [raw_line.strip() for raw_line in hcp_file]

# Fixed seed so the draw below is reproducible
random.seed(42)

# Draw n_augmentations * subset_size entries without replacement, then cut
# the draw into consecutive, non-overlapping chunks of subset_size
shuffled_hcp = random.sample(hcp_pool, n_augmentations * subset_size)

hcp_subsets = [
    shuffled_hcp[start : start + subset_size]
    for start in (k * subset_size for k in range(n_augmentations))
]

Check that each subset has exactly 17 elements

In [None]:
# Report, subset by subset, whether it holds exactly subset_size entries
for idx, subset in enumerate(hcp_subsets, start=1):
    n_in_subset = len(subset)
    if n_in_subset == subset_size:
        print(f"Subset {idx} OK ({n_in_subset} HCP)")
    else:
        print(f"Subset {idx} has incorrect length: {n_in_subset} elements")

Check which HCPs were left out

In [None]:
# Pool entries that were not drawn into any subset
hcp_pool_set, used_hcp = set(hcp_pool), set(shuffled_hcp)
excluded_hcp = hcp_pool_set.difference(used_hcp)

print("List of excluded HCPs:", sorted(excluded_hcp))

Check for overlaps between subsets

In [None]:
# Concatenate every subset into one flat list
all_hcps = []
for group in hcp_subsets:
    all_hcps.extend(group)

# A duplicate exists exactly when de-duplicating shrinks the list
has_overlap = len(set(all_hcps)) < len(all_hcps)
if has_overlap:
    print("Warning: there are overlapping HCPs between subsets.")
else:
    print("Subsets are disjoint: no overlaps.")

## Augmentation Check

#### CSV

In [None]:
# Load the augmentation tracking table; later cells read its "subject",
# "augmentation", "hcp_subset" and "missing_hcps" columns
# (presumably one row per subject/augmentation pair — TODO confirm)
aug_track = pd.read_csv(csv_aug_path)

In [None]:
# Preview the first 5 rows and first 5 columns of the tracking table
aug_track.iloc[:5, :5]

Check that there are exactly 177 unique subjects

In [None]:
# Distinct values of the "subject" column, in order of first appearance
subject_column = aug_track['subject']
unique_subjects = subject_column.unique()
n_unique_subjects = len(unique_subjects)
print(f"Number of unique subjects: {n_unique_subjects}")

Check that each subject has 10 augmentations

In [None]:
# Number of non-null "augmentation" entries recorded for each subject
augment_per_subject = aug_track.groupby("subject")["augmentation"].count()

# Compare against the configured n_augmentations instead of a literal 10 so
# this check cannot drift from the configuration cell
subjects_missing = augment_per_subject[augment_per_subject != n_augmentations]

if subjects_missing.empty:
    print(f"All subjects have exactly {n_augmentations} augmentations.")
else:
    print(f"Some subjects DO NOT have {n_augmentations} augmentations")
    print(subjects_missing)

Check that each augmentation has exactly 17 HCPs for all patients

In [None]:
# Count the comma-separated HCP IDs listed in each row. The vectorised .str
# methods yield NaN for missing cells (where a plain x.split would raise on a
# float), so those rows get a count of 0 and are reported as invalid below.
aug_track["hcp_count"] = (
    aug_track["hcp_subset"].str.split(",").str.len().fillna(0).astype(int)
)

# Rows whose count differs from the configured subset_size (not a literal 17)
invalid_hcp_counts = aug_track[aug_track["hcp_count"] != subset_size]

# Output
if invalid_hcp_counts.empty:
    print(f"All augmentations have exactly {subset_size} HCPs.")
else:
    print(f"Some augmentations do NOT have {subset_size} HCPs")

Excluding inconsistent subjects, check that valid ones use the same HCPs per augmentation index across all subjects.
That is:
- **Sub1**
    - *aug 1*: hcp1, hcp2, hcp3
    - *aug 2*: hcp4, hcp5, hcp6
    -  ...
...
- **Sub2**
    - *aug 1*: hcp1, hcp2, hcp3
    - *aug 2*: hcp4, hcp5, hcp6
    -  ...

In [None]:
# Keep only rows with a complete HCP subset; compare against the configured
# subset_size rather than a literal 17. Copy so the column added below does
# not touch aug_track.
valid_aug_track = aug_track[aug_track["hcp_count"] == subset_size].copy()

# Reference subset per augmentation index: the most frequent hcp_subset string
# (mode() returns its values sorted, so ties resolve to the first one)
ref_hcp_by_aug = valid_aug_track.groupby("augmentation")["hcp_subset"].agg(lambda x: x.mode().iloc[0])

# Attach the reference subset to every row
valid_aug_track["expected_hcp_subset"] = valid_aug_track["augmentation"].map(ref_hcp_by_aug)

# Rows whose subset string differs from the reference for their augmentation
df_inconsistent = valid_aug_track[valid_aug_track["hcp_subset"] != valid_aug_track["expected_hcp_subset"]]

# Final output
if df_inconsistent.empty:
    print("Same HCPs used for each augmentation across all VALID subjects.")
else:
    print("Some augmentations have different HCPs between subjects (only among valid ones):")
    display(df_inconsistent[["subject", "augmentation", "hcp_subset", "expected_hcp_subset"]].sort_values(["augmentation", "subject"]))


Subjects with inconsistencies

In [None]:
# List the subjects that own at least one row with an invalid HCP count
print("Subjects with inconsistencies in HCP subset:")
subjects_with_invalid_counts = invalid_hcp_counts["subject"].unique()
print(subjects_with_invalid_counts)

Example: inspect one subject with inconsistencies

In [None]:
# Keep only the tracking rows of one subject, chosen by hand for inspection
filtered = aug_track[aug_track["subject"] == "4_S_5005"]

In [None]:
# Show the subject's missing HCPs ordered by augmentation; reset_index() keeps
# the original row labels as an extra "index" column
filtered[["subject", "augmentation", "missing_hcps"]].sort_values("augmentation").reset_index()

#### Folder

Check if augmentations were actually created

In [None]:
# List all subfolders. The with-block closes the scandir iterator, and sorting
# makes the order of the warnings below deterministic across runs.
with os.scandir(fcmaps_augmented_dir) as dir_entries:
    subfolders = sorted(entry.path for entry in dir_entries if entry.is_dir())
print(f"Total subfolders (Patients): {len(subfolders)}\n")

count_problems = 0
# Check each subfolder
for folder in subfolders:
    # List only files (ignore subdirectories)
    files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
    num_files = len(files)

    # Only print when the count differs from the configured n_augmentations
    if num_files != n_augmentations:
        print(f"[WARNING] {folder} --> {num_files} files (expected {n_augmentations})")
        count_problems += 1

if count_problems == 0:
    print(f"All folders contain exactly {n_augmentations} files.")
else:
    print(f"Total folders with problems: {count_problems}")

Check that each subject has 10 unique augmentations

In [None]:
problems = []

# Indices every subject folder must contain, derived from the configured
# n_augmentations instead of a hard-coded range(1, 11)
expected = list(range(1, n_augmentations + 1))

# Loop over subject folders (sorted so the report order is deterministic)
for subject_dir in sorted(os.listdir(fcmaps_augmented_dir)):
    subject_path = os.path.join(fcmaps_augmented_dir, subject_dir)
    if not os.path.isdir(subject_path):
        continue

    files = [f for f in os.listdir(subject_path) if f.endswith(".nii.gz")]

    # The index is the text between the last "aug" and the following "."
    # (assumes names shaped like "<prefix>aug<N>.nii.gz" — TODO confirm);
    # files without "aug" or with a non-numeric token are ignored
    found_aug = []
    for fname in files:
        if "aug" not in fname:
            continue
        token = fname.split("aug")[-1].split(".")[0]
        if token.isdigit():
            found_aug.append(int(token))
    found_aug.sort()

    # A sorted list equal to `expected` means every index occurs exactly once
    if found_aug != expected:
        problems.append((subject_dir, found_aug))

# Final report
if problems:
    print("Problems found in the following subjects:")
    for subj, augs in problems:
        print(f"[PROBLEM] {subj} has augmentations: {augs}")
else:
    print(f"All folders contain augmentations from 1 to {n_augmentations} correctly.")


## Comparing missing HCPs and missing SCA files

Compare the HCPs that are missing as SCA files in the dataset (Lorenzo folder) with the HCPs recorded as missing here.

In [None]:
import pandas as pd

# Load the CSVs
sca_missing_df = pd.read_csv(path_sca_missing)
aug_track = pd.read_csv(csv_aug_path)


def _split_ids(value):
    """Return the set of non-empty IDs held in one comma-separated CSV cell.

    Missing cells (NaN/None) give an empty set instead of raising, surrounding
    double quotes are removed, each ID is whitespace-trimmed, and empty tokens
    are dropped so a blank cell does not contribute a phantom '' ID.
    """
    if pd.isna(value):
        return set()
    text = str(value).strip().strip('"')
    return {token.strip() for token in text.split(",") if token.strip()}


# Variables
n_total_hcp = 173
results = []
unexpected_missing_total = set()

# Loop through subjects (rows without a subject ID are skipped)
for subject in sca_missing_df["subject"].dropna().unique():
    # From missing_SCA_files: only the subject's first row is read
    row_sca = sca_missing_df[sca_missing_df["subject"] == subject]
    sca_missing = _split_ids(row_sca.iloc[0]["SCA_files_missing"])
    count_missing = len(sca_missing)

    # From aug_tracking: union of the HCPs recorded as missing in any of the
    # subject's augmentation rows
    sub_aug = aug_track[aug_track["subject"] == subject]
    expected_hcp = set()
    for row in sub_aug["missing_hcps"]:
        expected_hcp.update(_split_ids(row))

    # Missing SCA files that the tracking table does / does not account for
    expected_and_missing = sca_missing.intersection(expected_hcp)
    unexpected_and_missing = sca_missing - expected_hcp
    unexpected_missing_total.update(unexpected_and_missing)

    # Subject output
    results.append(f"Soggetto {subject}:\n")
    results.append(f"- Missing SCA files: {count_missing}\n")
    results.append(f"- Used SCA files:  {n_total_hcp - count_missing}\n")
    results.append(f"- File attesi nell'augmentation e mancanti come SCA: {', '.join(sorted(expected_and_missing))}\n")
    results.append(f"- File NON attesi nell'augmentation e mancanti come SCA: {', '.join(sorted(unexpected_and_missing))}\n\n")

# Final section
results.append("Unique list of NON expected SCA-missing files:\n")
results.append(f"{', '.join(sorted(unexpected_missing_total))}\n")

# excluded_hcp comes from the "Check which HCPs were left out" cell, which
# must have been run before this one
if unexpected_missing_total == excluded_hcp:
    results.append("----> The NON expected and missing SCA files match those excluded from augmentation.\n")

# Write report to file
with open(path_final_report, "w") as f:
    f.writelines(results)


In [7]:
import pandas as pd


# Load the spreadsheet; header=1 takes the column names from the second row
# NOTE(review): this reads from an absolute path in a personal Downloads folder
# and looks unrelated to the augmentation checks — confirm it belongs here
mam = pd.read_excel("/Users/emmatosato/Downloads/5AS calcolo_valutazione_esame.xlsx", header=1)

  for idx, row in parser.parse():


In [8]:
# Display the loaded table
# NOTE(review): the rendered output appears to contain personal names and
# grades — consider clearing this cell's output before sharing the notebook
mam

Unnamed: 0.1,Unnamed: 0,Numero progressivo,Cognome e Nome,Media,Credito,Prima prova,Seconda prova,Colloquio,Totale prove,Totale crediti + scritti + colloquio,Bonus attribuibile?,BONUS,Voto finale,LODE ASSEGNABILE?
0,,1,AMBROSI MATTEO,9.5,39.0,18.0,20.0,,38.0,77.0,No,0.0,77.0,No
1,,2,BOJA LIVIA,7.2,30.0,13.0,8.0,,21.0,51.0,No,0.0,51.0,No
2,,3,CIONCA TABITA,8.7,36.0,10.0,18.0,,28.0,64.0,No,0.0,64.0,No
3,,4,COSTANTINI IRENE,8.5,36.0,11.0,19.0,,30.0,66.0,No,0.0,66.0,No
4,,5,ED DAOUDI YASMINE,7.9,32.0,10.0,14.0,,24.0,56.0,No,0.0,56.0,No
5,,6,GOBBI ALBERTO,8.5,37.0,10.0,16.0,,26.0,63.0,No,0.0,63.0,No
6,,7,GUZZO GIORGIA,9.1,39.0,18.0,19.0,,37.0,76.0,No,0.0,76.0,No
7,,8,ISALBERTI MARTA,8.0,34.0,17.0,10.0,,27.0,61.0,No,0.0,61.0,No
8,,9,LOVATO SARA,8.5,37.0,14.0,18.0,,32.0,69.0,No,0.0,69.0,No
9,,10,MARANGONI FEDERICO,7.8,33.0,12.0,11.0,,23.0,56.0,No,0.0,56.0,No


In [15]:
# Mean of each column, rounded to two decimals
media, media1, media2 = (
    round(mam[column].mean(), 2)
    for column in ('Media', 'Prima prova', 'Seconda prova')
)

print("Media della media:", media)
print("Media della Prima prova:", media1)
print("Media della Seconda prova:", media2)

Media della media: 8.51
Media della Prima prova: 13.81
Media della Seconda prova: 16.0
