In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# --- Configuration ---------------------------------------------------------
# Root folder scanned for dataset sub-folders, and the folder that receives
# the merged outputs.
input_directory = "../00_data/"
output_directory = "../00_data/final_folder/"
# Sub-folder (inside each dataset folder) whose CSV files get merged.
mri_folder = "deconfounded_but_age"
# Demographic CSV expected inside each dataset folder.
demographic_file = "all_ages_all_ids.csv"

# Demographic columns that are kept. Per the original author's note the
# demographic files offer: ID, label_Age, Sex, Site, label_age_group
# (BMI possibly as well) -- TODO confirm against the actual files.
data_to_keep_in_demographic = ["label_age_group", "ID", "label_Age"]

# Ensure the output directory exists.
os.makedirs(output_directory, exist_ok=True)


def canonical_csv_name(file_name):
    """Return an order-independent key for a CSV file name.

    The ".csv" text is removed, the remaining name is split on "_", the
    tokens are sorted and re-joined, and ".csv" is appended. Two files whose
    names contain the same tokens in a different order (for example
    "thickness_lh.csv" and "lh_thickness.csv") therefore share one key.
    """
    tokens = sorted(file_name.replace(".csv", "").split("_"))
    return "_".join(tokens) + ".csv"


# --- Discovery -------------------------------------------------------------
# All sub-directories of the input directory. The listing is sorted because
# os.listdir returns entries in arbitrary order, and later steps keep the
# *first* row per ID -- sorting makes that choice reproducible.
subfolders = [
    os.path.join(input_directory, entry)
    for entry in sorted(os.listdir(input_directory))
    if os.path.isdir(os.path.join(input_directory, entry))
]

# file_map: canonical file name -> list of paths of the matching file in each
# dataset folder. demographic_files: one demographic CSV path per dataset folder.
file_map = {}
demographic_files = []
for folder in subfolders:
    # Test the folder's own name, not the whole path: otherwise a 'ds' that
    # happens to occur in input_directory would make every sub-folder match.
    if 'ds' not in os.path.basename(folder):
        continue
    demographic_files.append(os.path.join(folder, demographic_file))
    for file in sorted(os.listdir(os.path.join(folder, mri_folder))):
        # Skip anything that is not a CSV (hidden files, checkpoint folders, ...)
        # so it is never handed to pd.read_csv later on.
        if not file.endswith(".csv"):
            continue
        file_map.setdefault(canonical_csv_name(file), []).append(
            os.path.join(folder, mri_folder, file)
        )

# --- Merge and save --------------------------------------------------------
# For every file type collected in `file_map`: concatenate the per-dataset
# CSVs, attach the demographic columns by 'ID', and write two files to
# `output_directory`: <name>.csv (ID + MRI columns) and <name>_label.csv
# (the columns listed in `data_to_keep_in_demographic`).

# The demographic table is identical for every file type, so it is loaded,
# trimmed and de-duplicated once instead of once per iteration. A distinct
# loop name is used so the `demographic_file` setting is not overwritten.
if file_map:
    all_dfs_demographic = [pd.read_csv(demographic_path) for demographic_path in demographic_files]
    merged_df_demographic = pd.concat(all_dfs_demographic, ignore_index=True)
    merged_df_demographic = merged_df_demographic[data_to_keep_in_demographic]
    merged_df_demographic = merged_df_demographic.drop_duplicates(subset='ID', keep='first')

# Demographic columns other than the ID; removed from the MRI output.
col_to_drop = [col for col in data_to_keep_in_demographic if 'ID' not in col]

for file_name, file_paths in file_map.items():
    all_dfs = []

    # Each path is read exactly once (file_paths already holds one entry per
    # dataset folder, so no additional loop over the folders is needed).
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        # Drop every column whose lower-cased name contains 'age'.
        # NOTE(review): this substring test also matches names such as
        # "average", "image" or "percentage" -- confirm that no feature
        # column is removed unintentionally.
        age_columns = [col for col in df.columns if 'age' in col.lower()]
        if age_columns:
            print(f"Found age-related column in {file_path}")
            df = df.drop(columns=age_columns)
        all_dfs.append(df)

    if not all_dfs:
        continue

    # Stack the datasets and keep the first row seen for each ID.
    merged_df = pd.concat(all_dfs, ignore_index=True)
    merged_df = merged_df.drop_duplicates(subset='ID', keep='first')
    # Inner join: rows whose ID has no demographic entry are dropped.
    merged_df = pd.merge(merged_df, merged_df_demographic, on='ID', how='inner')

    mri_data = merged_df.drop(columns=col_to_drop)
    label_data = merged_df[data_to_keep_in_demographic]

    # Build the label path from the extension only, so a ".csv" elsewhere in
    # the path can never be rewritten.
    output_path = os.path.join(output_directory, file_name)
    label_path = os.path.splitext(output_path)[0] + "_label.csv"
    mri_data.to_csv(output_path, index=False)
    label_data.to_csv(label_path, index=False)
    print(f"Merged and saved: {output_path}")

print("All matching files have been merged and saved.")

In [None]:
# Sanity checks on the IDs of `merged_df` and `merged_df_demographic`.
# Both names come from the merge cell; `merged_df` is reassigned on every
# loop iteration there, so it holds the last file type that was processed.

# Keep the complete ID columns separately: the duplicate report and the
# coverage check below need different views of the same data.
all_ids = merged_df['ID']
all_ids_demo = merged_df_demographic['ID']

# IDs that occur more than once (second and later occurrences).
ids = all_ids[all_ids.duplicated()]
ids_demo = all_ids_demo[all_ids_demo.duplicated()]

print(f"Double IDs in merged_df: {ids}")
print(f"Double IDs in merged_df_demographic: {ids_demo}")

# Coverage check: every demographic ID that is absent from merged_df.
# This must compare the full ID sets, not only the duplicated IDs.
diff = set(all_ids_demo) - set(all_ids)
print(f"IDs in demo but not in merged_df: {diff}")
