In [None]:
# Specify the path to your CSV file containing NIFTI paths.
# NOTE(review): this is an absolute, machine-specific path; edit it before running elsewhere.
input_csv_path = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/studies/ccm_memory/results/notebook_00/master_list_working_v4.csv'
# Passed as `sheet=` to CalvinStatsmodelsPalm below; None here.
# The trailing comment records a candidate sheet name the author was unsure about.
sheet = None #'master_list_proper_subjects' ?

In [None]:
from calvin_utils.permutation_analysis_utils.statsmodels_palm import CalvinStatsmodelsPalm
# Instantiate the CalvinStatsmodelsPalm class on the CSV configured above (no output directory is set)
cal_palm = CalvinStatsmodelsPalm(input_csv_path=input_csv_path, output_dir=None, sheet=sheet)
# Call the read_and_display_data method and keep its result as the working table `data_df`
data_df = cal_palm.read_and_display_data()
# Trailing expression so the notebook renders the table
data_df


# 01 - Preprocess Your Data

**Handle NANs**
- Set drop_nans=True if you would like to remove NaNs from the data
- Provide a column name or a list of column names to remove NaNs from

In [None]:
# Display the available column names to help choose the columns for NaN handling
data_df.columns

In [None]:
# Columns passed to cal_palm.drop_nans_from_columns as `columns_to_drop_from`
drop_list = ['Memory_Score', 'Dataset']

In [None]:
# Apply the NaN handling to the columns in drop_list; the returned table replaces data_df.
# NOTE(review): presumably rows with NaN in any listed column are removed — confirm in calvin_utils.
data_df = cal_palm.drop_nans_from_columns(columns_to_drop_from=drop_list)
# Trailing expression so the notebook renders the result
data_df

**Drop Row Based on Value of Column**

Define the column, condition, and value for dropping rows
- column = 'your_column_name'
- condition = 'above'  # Options: 'equal', 'above', 'below', 'not'

Set the parameters for dropping rows

In [None]:
# Display the distinct values of the 'Dataset' column (candidate values for the row filter)
data_df['Dataset'].unique()

In [None]:
# Display the column names again to pick the column used for the row filter
data_df.columns

In [None]:
# Parameters passed positionally to cal_palm.drop_rows_based_on_value
column = 'Dataset'  # The column you'd like to evaluate
condition = 'equal'  # The condition to check ('equal', 'above', 'below', 'not')
value = 'manitoba_memory' # The value to compare against

In [None]:
# Apply the row filter; the call returns two objects, bound here as data_df and other_df.
# NOTE(review): which of the two holds the rows matching the condition is not visible
# in this notebook — confirm against calvin_utils before relying on either.
data_df, other_df = cal_palm.drop_rows_based_on_value(column, condition, value)
# Trailing expression so the notebook renders the retained table
data_df

Regress out Covariates

In [None]:
# Display the column names to choose dependent variables and covariates
data_df.columns

In [None]:
# Outcome column(s) passed to RegressOutCovariates.run as `dependent_variable_list`
dependent_variable_list = ['Memory_Score']
# Covariate columns passed as `covariates_list` (also the columns imputed in the next cell)
regressors = ['Source', 'Age']

In [None]:
# Impute missing covariate values, if desired:
#   - text-like columns (object, pandas string, categorical) -> most frequent value
#   - all other columns (numeric)                            -> column mean
# A plain `dtype == 'O'` test misses categorical and pandas string dtypes, which
# would then reach `.mean()` and raise, so the dtype is classified explicitly.
import pandas as pd  # local import: this cell runs before the notebook's later pandas import

for col in regressors:
    series = data_df[col]
    is_text_like = (
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
        or isinstance(series.dtype, pd.CategoricalDtype)
    )
    if is_text_like:
        modes = series.mode()
        # An all-NaN column has no mode; leave it untouched instead of failing on modes[0].
        if not modes.empty:
            data_df[col] = series.fillna(modes.iloc[0])
    else:
        data_df[col] = series.fillna(series.mean())
# Remaining NaN count per covariate (non-zero only for columns that could not be imputed)
data_df[regressors].isna().sum()

In [None]:
from calvin_utils.statistical_utils.regression_utils import RegressOutCovariates

# Run the covariate regression on the working table. The call returns a pair:
# the updated table and the list of adjusted dependent-variable names.
regression_output = RegressOutCovariates.run(
    df=data_df,
    dependent_variable_list=dependent_variable_list,
    covariates_list=regressors,
)
data_df, adjusted_dep_vars_list = regression_output
print(adjusted_dep_vars_list)

**Standardize Data**
- List the columns you don't want to standardize

In [None]:
# Display the column names to decide which ones to exclude from standardization
data_df.columns

In [None]:
# Remove anything you don't want to standardize
cols_not_to_standardize = []

In [None]:
# Standardize the data, passing the exclusion list; the returned table replaces data_df.
# NOTE(review): group_col='Dataset' presumably standardizes within each dataset — confirm in calvin_utils.
data_df = cal_palm.standardize_columns(cols_not_to_standardize, group_col='Dataset')

Do Miscellaneous Data Cleaning

In [None]:
# data_df = data_df[(data_df['Pre_Post_Memory_Effect_Size__Cohen_s_D_'] >= -1) & (data_df['Pre_Post_Memory_Effect_Size__Cohen_s_D_'] <= 1)]

Test Similarity Across Distributions

In [None]:
import numpy as np
import pandas as pd
from itertools import combinations
from scipy.stats import ks_2samp
from tqdm import tqdm


class KSSimilarityTester:
    """
    Mean pair-wise Kolmogorov–Smirnov distance across categories + permutation test.

    Parameters
    ----------
    df : pd.DataFrame
    dv : str
        Name of the numeric dependent-variable column.
    cat : str
        Categorical column that defines the cohorts to compare.
    block : str | None, default None
        Optional column whose values define *exchangeability blocks*.
        Permutations are performed **within** each block only.
        (Typical use-case: Age_Group, Site, Batch, etc.)
    n_perms : int, default 1000
    random_state : int | None, default None
    """

    def __init__(
        self,
        df: pd.DataFrame,
        dv: str,
        cat: str,
        block: str | None = None,
        n_perms: int = 1000,
        random_state: int | None = None,
    ):
        cols = [dv, cat] + ([block] if block else [])
        self.df = df[cols].dropna().reset_index(drop=True)

        self.dv = dv
        self.cat = cat
        self.block = block
        self.n_perms = n_perms
        self.rng = np.random.default_rng(random_state)

        self.categories = self.df[cat].unique()
        self.obs_avg_ks: float | None = None
        self.perm_avgs: np.ndarray | None = None
        self.sim_p: float | None = None
        self.dissim_p: float | None = None

    # Internal #
    def _pairwise_mean_ks(self, labels: np.ndarray) -> float:
        """Mean KS statistic over all category pairs for the given label vector."""
        groups = {c: self.df[self.dv].values[labels == c] for c in self.categories}
        ks_vals = [
            ks_2samp(groups[a], groups[b], method="asymp").statistic
            for a, b in combinations(self.categories, 2)
        ]
        return float(np.mean(ks_vals))

    def _permute_labels(self) -> np.ndarray:
        """
        Return a permutation of cohort labels.
        • If no block column → shuffle all labels.
        • If block column present → shuffle labels **within** each block.
        """
        labels = self.df[self.cat].values.copy()

        if self.block is None:
            return self.rng.permutation(labels)

        # stratified permutation
        block_vals = self.df[self.block].values
        for b in np.unique(block_vals):
            idx = np.where(block_vals == b)[0]
            labels[idx] = self.rng.permutation(labels[idx])
        return labels

    # Public #
    def run(self) -> dict:
        """Compute observed statistic, permutation null, and p-values."""
        original_labels = self.df[self.cat].values
        self.obs_avg_ks = self._pairwise_mean_ks(original_labels)

        self.perm_avgs = np.empty(self.n_perms, dtype=float)
        for i in tqdm(range(self.n_perms), desc="Permuting", leave=False):
            permuted = self._permute_labels()
            self.perm_avgs[i] = self._pairwise_mean_ks(permuted)

        self.sim_p = (self.perm_avgs <= self.obs_avg_ks).mean()
        self.dissim_p = (self.perm_avgs >= self.obs_avg_ks).mean()

        return {
            "observed_mean_ks": self.obs_avg_ks,
            "permuted_mean_ks": self.perm_avgs,
            "similarity_p": self.sim_p,
            "dissimilarity_p": self.dissim_p,
        }

    def report(self) -> None:
        if self.obs_avg_ks is None:
            raise RuntimeError("Call .run() first.")
        print(f"Observed mean KS        : {self.obs_avg_ks:.4f}")
        print(f"Mean permuted KS        : {self.perm_avgs.mean():.4f}")
        print(f"Similarity p-value      : {self.sim_p:.4g}")
        print(f"Dissimilarity p-value   : {self.dissim_p:.4g}")


In [None]:
# Display the column names to choose the dv / cat / block columns for the KS test
data_df.columns

In [None]:
# Display the distinct values of 'Cause_of_Change' (the cohorts compared by the KS test below)
data_df['Cause_of_Change'].unique()

In [None]:
# Settings for the similarity test: compare 'Memory_Score' distributions between
# 'Cause_of_Change' cohorts, shuffling labels only within each 'Group' block.
ks_settings = {
    "dv": 'Memory_Score',
    "cat": "Cause_of_Change",
    "block": "Group",
    "n_perms": 1000,
    "random_state": 42,  # fixed seed so the permutation null is reproducible
}
tester = KSSimilarityTester(data_df, **ks_settings)

# Run the permutation test, keep the result dict, then print the summary.
result = tester.run()
tester.report()
