In [None]:
edaic_dir = "../datasets/EDAIC-WOZ"

# Purge the participant folders numbered 300 through 492 (inclusive).
# IDs that have no folder on disk are skipped silently.
for i in range(300, 493):
    folder_name = f"{i}_P"
    folder_path = os.path.join(edaic_dir, folder_name)
    if not os.path.exists(folder_path):
        continue
    shutil.rmtree(folder_path)

# Flatten the duplicated nesting: XXX_P/XXX_P/<entries> -> XXX_P/<entries>
for dir_name in os.listdir(edaic_dir):
    outer_path = os.path.join(edaic_dir, dir_name)
    if not os.path.isdir(outer_path):
        # Plain files directly under the dataset root are left alone.
        continue

    inner_path = os.path.join(outer_path, dir_name)
    if not os.path.isdir(inner_path):
        # No same-named subfolder, so there is nothing to flatten here.
        continue

    # Move every entry from the second level up to the first level.
    for filename in os.listdir(inner_path):
        shutil.move(
            os.path.join(inner_path, filename),
            os.path.join(outer_path, filename),
        )

    # Remove the inner folder, which is now empty.
    os.rmdir(inner_path)

# Split files to concatenate into a single table.
csv_files = ['dev_split.csv', 'test_split.csv', 'train_split.csv']
dfs = []

for csv_file in csv_files:
    path = os.path.join(edaic_dir, csv_file)
    df = pd.read_csv(path)
    dfs.append(df)

# ignore_index=True renumbers the rows 0..n-1 across the three splits.
all_data = pd.concat(dfs, ignore_index=True)

# Drop participants 300 to 492, the same ID range whose folders are removed above.
# .copy() turns the filtered selection into an independent DataFrame, so the
# later in-place label fix (all_data.loc[mask, 'PHQ_Binary'] = 1) does not
# raise SettingWithCopyWarning.
all_data = all_data[~all_data['Participant_ID'].between(300, 492)].copy()

# Save the merged, filtered table next to the split files.
output_path = os.path.join(edaic_dir, "all_data.csv")
all_data.to_csv(output_path, index=False)

In [None]:
# Fix inconsistent PHQ labels
# A row is inconsistent when PHQ_Score >= 10 while PHQ_Binary is still 0.
inconsistent_mask = (all_data['PHQ_Score'] >= 10) & (all_data['PHQ_Binary'] == 0)

# Take ID and score from the flagged rows themselves. Looking the score up
# again by Participant_ID would re-scan the whole frame for every participant
# and, if an ID occurs more than once, could report a different row's score.
inconsistent_rows = all_data.loc[inconsistent_mask, ['Participant_ID', 'PHQ_Score']]
inconsistent_participants = inconsistent_rows['Participant_ID'].tolist()

print(f"Found {len(inconsistent_participants)} participants with inconsistent PHQ labels:")
for participant_id, phq_score in zip(inconsistent_participants, inconsistent_rows['PHQ_Score']):
    print(f"  Participant {participant_id}: PHQ_Score={phq_score}, PHQ_Binary=0 -> fixing to PHQ_Binary=1")

# Fix the inconsistent labels in place.
all_data.loc[inconsistent_mask, 'PHQ_Binary'] = 1

print(f"\nFixed {len(inconsistent_participants)} inconsistent labels")

# Save the corrected data, overwriting the previously written CSV.
all_data.to_csv(output_path, index=False)

In [None]:
# Remove the on-disk folders of non-depressed participants (PHQ_Binary == 0).
is_non_depressed = all_data['PHQ_Binary'] == 0
non_depressed_participants = all_data.loc[is_non_depressed, 'Participant_ID'].unique()

print(f"Found {len(non_depressed_participants)} non-depressed participants to remove:")

# Count only the folders that actually existed and were deleted.
deleted_count = 0
for participant_id in non_depressed_participants:
    folder_name = f"{participant_id}_P"
    folder_path = os.path.join(edaic_dir, folder_name)
    if not os.path.exists(folder_path):
        continue
    shutil.rmtree(folder_path)
    deleted_count += 1

print(f"Deleted {deleted_count} directories for non-depressed participants")

# Keep only rows labelled PHQ_Binary == 1 in the table, then overwrite the CSV.
# NOTE(review): rows whose PHQ_Binary is neither 0 nor 1 (e.g. missing) are
# dropped here while their folders stay on disk. Confirm this is intended.
is_depressed = all_data['PHQ_Binary'] == 1
all_data = all_data[is_depressed]
all_data.to_csv(output_path, index=False)