In [6]:
import subprocess
import pandas as pd
import numpy as np
import os
from scipy.stats import ttest_ind
from statsmodels.api import OLS, add_constant

In [7]:
import os
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from statsmodels.api import OLS, add_constant

# ---------------------------------------------------------------------------
# Sex/site deconfounding of FreeSurfer tables (age is NOT regressed out).
#
# For every dataset directory under DATA_ROOT:
#   1. load the covariates (ID, Sex, optional Site/Age),
#   2. merge them onto each brain-measure CSV in `freesurfer_finished/`,
#   3. Welch-t-test every brain measure between the two sex groups,
#   4. for measures with p < ALPHA, replace the values by OLS residuals
#      (regressed on the available confounds) plus the measure's mean,
#   5. write the table, without the covariate columns, to
#      `deconfounded_but_age/` under the same file name.
# ---------------------------------------------------------------------------

DATA_ROOT = "../00_data/"

# Source column name -> harmonised covariate name.
COVARIATE_RENAMES = {"ID": "ID", "basis_sex": "Sex", "basis_uort": "Site", "label_Age": "Age"}
COVARIATE_COLS = ["ID", "Sex", "Site", "Age"]

# Sex labels are matched case-insensitively; "Male"/"Female" keep their
# original codes (0/1). Anything else is reported and becomes NaN.
SEX_CODES = {"male": 0, "female": 1, "m": 0, "f": 1}

# Confounds regressed out when (and only when) the column is present.
# BMI is never part of COVARIATE_COLS, so it is used only if a brain table
# itself provides it.
CANDIDATE_PREDICTORS = ["Sex", "BMI", "Site"]

# Columns that are never tested, imputed or residualised. Age is listed
# because it is dropped before saving, so correcting it would be wasted work.
NON_FEATURE_COLS = {"Sex", "Site", "Age", "BMI"}
DROP_BEFORE_SAVE = ["Sex", "Site", "Age", "BMI", "label_Age_group"]

ALPHA = 0.05  # significance threshold of the per-measure sex t-test


def _strip_subject_prefix(ids):
    """Return the IDs as strings with every "sub-" marker removed."""
    return ids.astype(str).str.replace("sub-", "", regex=False)


def _load_covariates(path):
    """Read a covariate CSV and return the harmonised covariate table.

    The result holds `ID` (str, "sub-" removed), `Sex` (0 = male,
    1 = female, NaN = unrecognised/missing) and, when available, the raw
    `Site` labels and `Age`.

    Raises:
        KeyError: if the file provides no `ID` or no sex column.
    """
    covariates = pd.read_csv(path).rename(columns=COVARIATE_RENAMES)

    missing = [name for name in ("ID", "Sex") if name not in covariates.columns]
    if missing:
        raise KeyError(f"{path}: missing required covariate column(s) {missing}")

    # .copy() so the assignments below act on an independent frame instead
    # of a slice of the frame returned by read_csv.
    covariates = covariates[[name for name in COVARIATE_COLS if name in covariates.columns]].copy()
    covariates["ID"] = _strip_subject_prefix(covariates["ID"])

    raw_sex = covariates["Sex"].copy()
    covariates["Sex"] = raw_sex.astype(str).str.strip().str.lower().map(SEX_CODES)

    # An unrecognised encoding would otherwise empty both sex groups and
    # silently turn the whole correction into a no-op.
    unmapped = sorted(set(raw_sex[covariates["Sex"].isna() & raw_sex.notna()].astype(str)))
    if unmapped:
        print(f"WARNING: {path}: unrecognised Sex value(s) {unmapped} were set to missing")

    return covariates


def _sex_sensitive_features(merged, feature_cols):
    """Return the features whose means differ between sexes (Welch t-test, p < ALPHA)."""
    if not feature_cols:
        return []

    group_male = merged.loc[merged["Sex"] == 0, feature_cols].to_numpy(dtype=float)
    group_female = merged.loc[merged["Sex"] == 1, feature_cols].to_numpy(dtype=float)
    if len(group_male) < 2 or len(group_female) < 2:
        print(
            "WARNING: fewer than two subjects in a sex group "
            f"(male={len(group_male)}, female={len(group_female)}); nothing will be corrected"
        )
        return []

    # One vectorised call tests every column; NaN p-values (e.g. constant
    # columns) compare False against ALPHA and are therefore not selected.
    _, p_values = ttest_ind(group_male, group_female, axis=0, equal_var=False)
    return [name for name, p_value in zip(feature_cols, p_values) if p_value < ALPHA]


def _design_matrix(merged):
    """Build the confound design matrix (without intercept).

    Returns:
        (design, complete): `design` has one float column per numeric
        confound and one dummy column per site except the first; `complete`
        is a boolean Series marking rows whose confounds are all observed.
        Both are None when no candidate predictor is present.
    """
    present = [name for name in CANDIDATE_PREDICTORS if name in merged.columns]
    if not present:
        return None, None

    complete = merged[present].notna().all(axis=1)

    parts = []
    for name in present:
        if name == "Site":
            # Site is nominal: one-hot encode it rather than fitting a single
            # slope to arbitrary integer codes.
            parts.append(pd.get_dummies(merged["Site"], prefix="Site", drop_first=True, dtype=float))
        else:
            parts.append(merged[[name]].astype(float))
    return pd.concat(parts, axis=1), complete


def _residualize(merged, columns):
    """Return a copy of `merged` with the confound effects removed from `columns`.

    Each column is replaced by its OLS residuals plus the column mean, so the
    corrected values stay on the original scale. The model is fitted on rows
    with complete confounds only; rows with a missing confound keep their
    original values.
    """
    corrected = merged.copy()
    design, complete = _design_matrix(merged)
    if not columns or design is None:
        return corrected

    X = add_constant(design.loc[complete])
    if len(X) <= X.shape[1]:
        print("WARNING: too few rows with complete covariates to fit the confound model; nothing corrected")
        return corrected

    # Residuals are floats; cast first so integer-typed measures are not
    # upcast implicitly during the row-wise assignment below.
    corrected[columns] = corrected[columns].astype(float)
    for col in columns:
        y = merged.loc[complete, col].astype(float)
        fit = OLS(y, X).fit()
        corrected.loc[complete, col] = fit.resid + y.mean()
    return corrected


def _deconfound_file(brain_path, covariates):
    """Deconfound one brain-measure CSV.

    Returns the corrected table without covariate columns, or None when no
    subject of the file matches the covariate table.
    """
    brain = pd.read_csv(brain_path).rename(columns={"eid": "ID"})
    brain["ID"] = _strip_subject_prefix(brain["ID"])

    merged = pd.merge(brain, covariates, on="ID", how="inner")
    if merged.empty:
        return None

    feature_cols = [
        name
        for name in merged.select_dtypes(include=[np.number]).columns
        if name not in NON_FEATURE_COLS
    ]

    # Mean-impute brain measures only; covariate codes must not be averaged.
    if feature_cols and merged[feature_cols].isnull().values.any():
        merged[feature_cols] = merged[feature_cols].fillna(merged[feature_cols].mean())

    significant_vars = _sex_sensitive_features(merged, feature_cols)
    corrected = _residualize(merged, significant_vars)
    return corrected.drop(columns=DROP_BEFORE_SAVE, errors="ignore")


# sorted() makes the processing (and log) order independent of the file system.
for folder in sorted(os.listdir(DATA_ROOT)):
    input_folder = f"{DATA_ROOT}{folder}/freesurfer_finished/"  # Folder containing the brain data files
    output_folder = f"{DATA_ROOT}{folder}/deconfounded_but_age/"  # Folder to save the corrected files
    covariates_file = f"{DATA_ROOT}{folder}/all_ages_all_ids.csv"

    # Stray files or incomplete dataset folders are skipped instead of
    # aborting the run (and no output folder is created for them).
    if not (os.path.isdir(input_folder) and os.path.isfile(covariates_file)):
        print(f"Skipping {folder}: missing {input_folder} or {covariates_file}")
        continue

    os.makedirs(output_folder, exist_ok=True)
    covariates_data = _load_covariates(covariates_file)

    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith(".csv"):
            continue

        corrected_data = _deconfound_file(os.path.join(input_folder, file_name), covariates_data)
        if corrected_data is None:
            print(f"Skipping {file_name} in {folder}: no IDs match the covariate table")
            continue

        full_file_path = os.path.join(output_folder, file_name)
        corrected_data.to_csv(full_file_path, index=False)

        print(f"Processed and saved: {full_file_path}")


Processed and saved: ../00_data/00_ds000030/deconfounded_but_age/aparc.volume.csv
Processed and saved: ../00_data/00_ds000030/deconfounded_but_age/aparc.thickness.csv
Processed and saved: ../00_data/00_ds000030/deconfounded_but_age/aseg.volume.csv
Processed and saved: ../00_data/02_ds002790/deconfounded_but_age/aseg.volume.csv
Processed and saved: ../00_data/02_ds002790/deconfounded_but_age/aparc.volume.csv
Processed and saved: ../00_data/02_ds002790/deconfounded_but_age/aparc.thickness.csv
Processed and saved: ../00_data/01_ds002785/deconfounded_but_age/aparc.thickness.csv
Processed and saved: ../00_data/01_ds002785/deconfounded_but_age/aparc.volume.csv
Processed and saved: ../00_data/01_ds002785/deconfounded_but_age/aseg.volume.csv


In [8]:
# Disabled earlier version of the deconfounding pipeline.
# The whole cell body is a single triple-quoted string, so none of the code
# inside it is executed; the notebook only echoes the string as the cell's
# output. It is kept for reference only.
# The text reads from hard-coded absolute paths, takes its covariates from one
# shared file, and lists "Sex" and "Site" as the regression predictors.
# NOTE(review): consider deleting this cell (version control keeps the
# history) or, if it is ever revived, replacing the absolute paths first.
""" # Define folder paths
input_folder = "/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/01_Validation_data_set/00_data/freesurfer_finished/"  # Folder containing the brain data files
output_folder = "/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/01_Validation_data_set/00_data/deconfounded_but_age/"  # Folder to save the corrected files
#make sure the output folder exists

# Load the covariates data (this file is shared across all analyses)
covariates_file = "../00_data/Val_demographics_covariates.csv"  # Adjust the path as needed
covariates_data = pd.read_csv(covariates_file)
#covariates_data = covariates_data.drop(columns=["p20252_i2", "p34"])
covariates_data.rename(
    columns={"ID": "ID", "basis_sex": "Sex", "basis_uort": "Site", "basis_age": "Age"},
    inplace=True,
)
site_mapping = {site: idx for idx, site in enumerate(covariates_data["Site"].unique())}
covariates_data["Site"] = covariates_data["Site"].map(site_mapping)
sex_mapping = {"Male": 0, "Female": 1}
covariates_data["Sex"] = covariates_data["Sex"].map(sex_mapping)
covariates_data["ID"] = covariates_data["ID"].astype(str)

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Process each brain data file in the input folder
for file_name in os.listdir(input_folder):
    if file_name.endswith(".csv"):
        brain_volume_file = os.path.join(input_folder, file_name)
        brain_data_filename = os.path.splitext(file_name)[0]

        # Load the brain volume data
        brain_data = pd.read_csv(brain_volume_file)
        brain_data.rename(columns={"eid": "ID"}, inplace=True)
        brain_data["ID"] = brain_data["ID"].astype(str)
        if "sub-" in brain_data["ID"].iloc[0]:
            brain_data["ID"] = brain_data["ID"].str.replace("sub-", "")


        # Merge brain data with covariates
        merged_data = pd.merge(brain_data, covariates_data, on="ID", how="inner")

        # Define numerical columns for analysis
        numerical_cols = brain_data.select_dtypes(include=[np.number]).columns.tolist()

        # Handle NaN values
        nan_counts = merged_data.isnull().sum()
        if nan_counts.sum() > 0:
            merged_data[numerical_cols] = merged_data[numerical_cols].apply(
                lambda col: col.fillna(col.mean())
            )

        # Step 1: Pre-Correction Analysis
        pre_correction_results = []
        for col in numerical_cols:
            if col not in ["Sex", "Site"]:
                group1 = merged_data[merged_data["Sex"] == 0][col]
                group2 = merged_data[merged_data["Sex"] == 1][col]
                t_stat, p_value = ttest_ind(group1, group2, equal_var=False)
                pre_correction_results.append({"Variable": col, "t-statistic": t_stat, "p-value": p_value})
        pre_correction_df = pd.DataFrame(pre_correction_results)

        # Identify significant variables
        significant_vars = pre_correction_df[pre_correction_df["p-value"] < 0.05]["Variable"].tolist()

        # Step 2: Residualization
        corrected_data = merged_data.copy()
        for col in significant_vars:
            predictors = ["Sex", "Site"]
            X = add_constant(merged_data[predictors])
            y = merged_data[col]
            model = OLS(y, X).fit()
            residuals = y - model.predict(X)
            corrected_data[col] = residuals + y.mean()

        # Save corrected data
        corrected_data = corrected_data.drop(columns=["Sex", "Site", "Age"])
        corrected_file_name = f"{brain_data_filename}.csv"
        os.makedirs(output_folder, exist_ok=True)
        full_file_path = os.path.join(output_folder, corrected_file_name)
        corrected_data.to_csv(full_file_path, index=False)

        print(f"Processed and saved: {full_file_path}")
 """

' # Define folder paths\ninput_folder = "/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/01_Validation_data_set/00_data/freesurfer_finished/"  # Folder containing the brain data files\noutput_folder = "/zi/home/esra.lenz/Documents/00_HITKIP/09_TABPFN/01_Validation_data_set/00_data/deconfounded_but_age/"  # Folder to save the corrected files\n#make sure the output folder exists\n\n# Load the covariates data (this file is shared across all analyses)\ncovariates_file = "../00_data/Val_demographics_covariates.csv"  # Adjust the path as needed\ncovariates_data = pd.read_csv(covariates_file)\n#covariates_data = covariates_data.drop(columns=["p20252_i2", "p34"])\ncovariates_data.rename(\n    columns={"ID": "ID", "basis_sex": "Sex", "basis_uort": "Site", "basis_age": "Age"},\n    inplace=True,\n)\nsite_mapping = {site: idx for idx, site in enumerate(covariates_data["Site"].unique())}\ncovariates_data["Site"] = covariates_data["Site"].map(site_mapping)\nsex_mapping = {"Male": 0, "Female": 1}\n