In [1]:
import subprocess
import pandas as pd
import numpy as np
import os
from scipy.stats import ttest_ind
from statsmodels.api import OLS, add_constant
import glob

In [3]:
# Load the UKB demographics/covariates CSV (healthy subset) into `df`.
df = pd.read_csv("../00_data/UKB_demographics_covariates_subset_healthy.csv")

In [4]:
# Display the column names of the loaded table.
df.columns

Index(['ID', 'Sex', 'Age', 'Site'], dtype='object')

In [5]:
def remove_old_csv_files(output_folder):
    """Delete the entries of ``output_folder`` whose name matches ``*.csv``.

    Every successful deletion is reported on stdout. A deletion that fails is
    reported on stdout as well and does not interrupt the remaining ones.

    Args:
        output_folder: Path of the folder to clean.

    Returns:
        None.
    """
    csv_pattern = os.path.join(output_folder, "*.csv")
    for stale_path in glob.glob(csv_pattern):
        try:
            os.remove(stale_path)
            print(f"Deleted: {stale_path}")
        except Exception as err:
            # Best effort: report the problem and carry on with the next file.
            print(f"Error deleting {stale_path}: {err}")

In [12]:
# Remove sex/site effects from every brain-measure CSV in `input_folder` and
# write the corrected tables, under the same file names, to `output_folder`.
#
# Per file: merge with the covariates on "ID", mean-impute missing brain
# measures, run a Welch t-test between the two sexes for every numeric brain
# column, and replace each column with p < `alpha` by its OLS residuals (plus
# the column mean, so the values keep their original scale).

# --- Configuration -----------------------------------------------------------
input_folder = "../00_data/freesurfer_finished/"  # Folder containing the brain data files
output_folder = "../00_data/deconfounded_but_age/"  # Folder to save the corrected files
covariates_file = "../00_data/UKB_demographics_covariates_subset_healthy.csv"  # Adjust the path as needed
filter_file = "../00_data/age_label/all_ages_healthy.csv"  # Table whose "ID" column lists the subjects to keep

# Covariates regressed out of the brain measures. A covariate that is missing
# from the merged table is skipped instead of raising a KeyError. "Age" is
# dropped from the covariates below, so it is never regressed out.
confound_cols = ["Sex", "Site", "BMI"]
alpha = 0.05  # Significance threshold of the pre-correction sex t-test
sex_mapping = {"Male": 0, "Female": 1}

# --- Covariates (shared across all brain data files) -------------------------
covariates_data = pd.read_csv(covariates_file).drop(columns=["Age"])
filter_df = pd.read_csv(filter_file)
# .copy(): the assignments below must modify an independent frame, not a slice.
covariates_data = covariates_data[covariates_data["ID"].isin(filter_df["ID"])].copy()
covariates_data = covariates_data.rename(
    columns={"ID": "ID", "basis_sex": "Sex", "basis_uort": "Site", "basis_age": "Age"}
)

# Integer site codes are kept for bookkeeping; the regression below one-hot
# encodes them so that the arbitrary code order is never treated as a scale.
site_mapping = {site: idx for idx, site in enumerate(covariates_data["Site"].unique())}
covariates_data["Site"] = covariates_data["Site"].map(site_mapping)

# Encode sex as 0/1. A column that is already numeric is kept as-is; mapping it
# through `sex_mapping` would turn every value into NaN and silently disable
# the whole correction (both t-test groups would be empty).
raw_sex = covariates_data["Sex"]
if pd.api.types.is_numeric_dtype(raw_sex):
    encoded_sex = raw_sex
else:
    encoded_sex = raw_sex.map(sex_mapping)
unknown_sex = raw_sex[raw_sex.notna() & ~encoded_sex.isin([0, 1])].unique()
if len(unknown_sex) > 0:
    raise ValueError(
        f"Unrecognised Sex values {list(unknown_sex)}; expected 'Male'/'Female' or 0/1."
    )
covariates_data["Sex"] = encoded_sex
covariates_data["ID"] = covariates_data["ID"].astype(str)

# --- Output folder -----------------------------------------------------------
os.makedirs(output_folder, exist_ok=True)
remove_old_csv_files(output_folder)  # Drop the results of previous runs

# --- Process each brain data file in the input folder ------------------------
for file_name in sorted(os.listdir(input_folder)):  # sorted: reproducible order
    if not file_name.endswith(".csv"):
        continue
    brain_volume_file = os.path.join(input_folder, file_name)
    brain_data_filename = os.path.splitext(file_name)[0]

    # Load the brain data and normalise the subject identifier: always a
    # string, without a leading "sub-" (stripped on every row).
    brain_data = pd.read_csv(brain_volume_file).rename(columns={"eid": "ID"})
    brain_data["ID"] = brain_data["ID"].astype(str).str.replace(r"^sub-", "", regex=True)

    # Merge brain data with covariates
    merged_data = pd.merge(brain_data, covariates_data, on="ID", how="inner")

    # Numeric brain measures, and the confounds that are actually available
    numerical_cols = brain_data.select_dtypes(include=[np.number]).columns.tolist()
    predictors = [col for col in confound_cols if col in merged_data.columns]

    # OLS cannot be fitted on rows with a missing covariate, so drop them.
    covariate_cols = [col for col in predictors if col in covariates_data.columns]
    incomplete = merged_data[covariate_cols].isna().any(axis=1)
    if incomplete.any():
        print(f"Dropping {int(incomplete.sum())} subjects with missing covariates from {file_name}")
        merged_data = merged_data.loc[~incomplete].reset_index(drop=True)

    if merged_data.empty:
        print(f"Skipped {file_name}: no subject left after merging with the covariates")
        continue

    # Handle NaN values: impute missing brain measures with the column mean
    if merged_data[numerical_cols].isna().any().any():
        merged_data[numerical_cols] = merged_data[numerical_cols].apply(
            lambda col: col.fillna(col.mean())
        )

    # Step 1: Pre-Correction Analysis (Welch t-test between the two sexes)
    is_sex0 = merged_data["Sex"] == 0
    is_sex1 = merged_data["Sex"] == 1
    pre_correction_results = []
    for col in numerical_cols:
        if col in confound_cols:
            continue
        t_stat, p_value = ttest_ind(
            merged_data.loc[is_sex0, col], merged_data.loc[is_sex1, col], equal_var=False
        )
        pre_correction_results.append({"Variable": col, "t-statistic": t_stat, "p-value": p_value})
    pre_correction_df = pd.DataFrame(
        pre_correction_results, columns=["Variable", "t-statistic", "p-value"]
    )

    # Identify significant variables (a NaN p-value never counts as significant)
    significant_vars = [
        row["Variable"] for row in pre_correction_results if row["p-value"] < alpha
    ]

    # Step 2: Residualization
    corrected_data = merged_data.copy()
    if significant_vars:
        # The design matrix does not depend on the target column: build it once.
        design_parts = []
        for predictor in predictors:
            if predictor == "Site":
                # Site is categorical: one indicator column per site, minus a
                # reference level that is absorbed by the intercept.
                design_parts.append(
                    pd.get_dummies(merged_data["Site"], prefix="Site", drop_first=True, dtype=float)
                )
            else:
                design_parts.append(merged_data[[predictor]].astype(float))
        X = add_constant(pd.concat(design_parts, axis=1), has_constant="add")

        for col in significant_vars:
            y = merged_data[col]
            model = OLS(y, X).fit()
            residuals = model.resid
            # Add the mean back so the corrected values keep the original scale
            corrected_data[col] = residuals + y.mean()

    # Save corrected data: keep only "ID" and the numeric brain measures
    columns_to_keep = set(numerical_cols) | {"ID"}
    columns_to_drop = [col for col in corrected_data.columns if col not in columns_to_keep]
    print("columns to drop", columns_to_drop)
    corrected_data = corrected_data.drop(columns=columns_to_drop)
    corrected_file_name = f"{brain_data_filename}.csv"
    full_file_path = os.path.join(output_folder, corrected_file_name)
    corrected_data.to_csv(full_file_path, index=False)

    print(f"Processed and saved: {corrected_file_name}")


Deleted: ../00_data/deconfounded_but_age/aseg.volume.csv
Deleted: ../00_data/deconfounded_but_age/aparc.volume.csv
Deleted: ../00_data/deconfounded_but_age/aparc.thickness.csv


  return f(*args, **kwargs)


columns to drop ['Sex', 'Site']
Processed and saved: aseg.volume.csv


  return f(*args, **kwargs)


columns to drop ['Sex', 'Site']
Processed and saved: aparc.volume.csv


  return f(*args, **kwargs)


columns to drop ['Sex', 'Site']
Processed and saved: aparc.thickness.csv
