In [1]:
out_dir = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/studies/ccm_memory/results/notebook_02'

# 01 - Get Datasets


**Data will be Imported from a CSV which is expected to be in this format**
- sub column contents MUST match the names of the neuroimaging files above. 
    - ID column 
```
+-----+----------------------------+--------------+--------------+--------------+
| sub | Nifti_File_Path            | Indep. Var.  | Covariate_N  | Dataset      |
+-----+----------------------------+--------------+--------------+--------------+
| 1   | /path/to/file1.nii.gz      | 0.5          | 1.2          | 1            |
| 2   | /path/to/file2.nii.gz      | 0.7          | 1.4          | 1            |
| 3   | /path/to/file3.nii.gz      | 0.6          | 1.5          | 2            |
| 4   | /path/to/file4.nii.gz      | 0.9          | 1.1          | 3            |
| ... | ...                        | ...          | ...          | ...          |
+-----+----------------------------+--------------+--------------+--------------+
```

In [2]:
input_csv_path = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/studies/ccm_memory/metadata/master_list_v3.csv'
sheet = None # Set to None if CSV

In [None]:
from calvin_utils.permutation_analysis_utils.statsmodels_palm import CalvinStatsmodelsPalm
# Instantiate the PalmPrepararation class
cal_palm = CalvinStatsmodelsPalm(input_csv_path=input_csv_path, output_dir=out_dir, sheet=sheet)
# Call the process_nifti_paths method
data_df = cal_palm.read_and_display_data()
display(data_df)

**Handle NANs**
- Set drop_nans=True is you would like to remove NaNs from data
- Provide a column name or a list of column names to remove NaNs from

In [11]:
drop_list = ['Nifti_File_Path', 'diagnosis']

In [None]:
data_df = cal_palm.drop_nans_from_columns(columns_to_drop_from=drop_list)
display(data_df)

**Drop Row Based on Value of Column**

Define the column, condition, and value for dropping rows
- column = 'your_column_name'
- condition = 'above'  # Options: 'equal', 'above', 'below'

In [17]:
column = 'Dataset'
condition = 'equal'
value = 'adni_memory'

In [None]:
data_df, other_df = cal_palm.drop_rows_based_on_value(column, condition, value)
data_df

**Standardize Data**
- Enter Columns you Don't want to standardize into a list

In [19]:
# Remove anything you don't want to standardize
cols_not_to_standardize = ['Dataset', 'Subject'] # ['Z_Scored_Percent_Cognitive_Improvement_By_Origin_Group', 'Z_Scored_Subiculum_T_By_Origin_Group_'] #['Age']
group_col = 'Dataset' #Set to none if there are no specific groups

In [None]:
data_df = cal_palm.standardize_columns(cols_not_to_standardize, group_col)
data_df

# 02 - Import the Data into DataFrames, Control them, and Save them

In [20]:
dataset_col = 'Dataset'
nifti_path_col = 'Nifti_File_Path'
indep_var_col = 'percent_memory_improvement'
covariate_cols = ['diagnosis']

In [None]:
from calvin_utils.file_utils.import_functions import DatasetNiftiImporter
data_importer = DatasetNiftiImporter(df=data_df, dataset_col=dataset_col, nifti_col=nifti_path_col, indep_var_col=indep_var_col, covariate_cols=covariate_cols, out_dir=out_dir, regression_method='ols')

# 03 - Begin Analysis

In [7]:
import numpy as np
from scipy.stats import spearmanr
import json
from tqdm import tqdm

class DataLoader:
    def __init__(self, data_dict_path):
        with open(data_dict_path, 'r') as f:
            self.dataset_paths_dict = json.load(f)
    
    def load_dataset(self, dataset_name):
        paths = self.dataset_paths_dict[dataset_name]
        data = {
            'niftis': np.load(paths['niftis']),
            'indep_var': np.load(paths['indep_var']),
            'covariates': np.load(paths['covariates'])
        }
        return data
    
    @staticmethod
    def load_dataset_static(data_paths_dict, dataset_name):
        paths = data_paths_dict[dataset_name]

        data_dict = {
            'niftis': np.load(paths['niftis']),
            'indep_var': np.load(paths['indep_var']),
            'covariates': np.load(paths['covariates'])
        }
        return data_dict

class CorrelationCalculator:
    def __init__(self, method='pearson', verbose=False):
        self.method = method
        self.verbose = verbose

    def _calculate_pearson_r_map(self, niftis, indep_var):
        X = indep_var
        Y = niftis
        X_BAR = X.mean(axis=0)[:, np.newaxis]
        Y_BAR = Y.mean(axis=0)[np.newaxis, :]
        X_C = X - X_BAR
        Y_C = Y - Y_BAR
        NUMERATOR = np.dot(X_C.T, Y_C)
        SST_X = np.sum((X - X_BAR)**2, axis=0)
        SST_Y = np.sum((Y - Y_BAR)**2, axis=0)
        DENOMINATOR = np.sqrt(SST_X * SST_Y)
        r = NUMERATOR / DENOMINATOR
        
        if self.verbose:
            print(f"Shape of X: {X.shape}")
            print(f"Shape of Y: {Y.shape}")
            print(f"Shape of X_BAR: {X_BAR.shape}")
            print(f"Shape of Y_BAR: {Y_BAR.shape}")
            print(f"Shape of X_C: {X_C.shape}")
            print(f"Shape of Y_C: {Y_C.shape}")
            print(f"Shape of NUMERATOR: {NUMERATOR.shape}")
            print(f"Shape of DENOMINATOR: {DENOMINATOR.shape}")
        return r

    def _calculate_spearman_r_map(self, niftis, indep_var):
        '''Not easily broadcast, sorry!'''
        n_voxels = niftis.shape[1]
        rho = np.zeros(n_voxels)
        for i in tqdm(range(n_voxels), desc='Running Spearman Rho'):
            rho[i], _ = spearmanr(indep_var, niftis[:, i])
            
        if self.verbose:
            print(f"Shape of niftis: {niftis.shape}")
            print(f"Shape of rho: {rho.shape}")
        return rho
    
    def _process_data(self, data):
        if self.method == 'pearson':
            self.correlation_map = self._calculate_pearson_r_map(data['niftis'], data['indep_var'])
        elif self.method == 'spearman':
            self.correlation_map = self._calculate_spearman_r_map(data['niftis'], data['indep_var'])
    
    def process_all_datasets(self, data_dict):
        correlation_maps = {}
        for dataset_name in data_dict.keys():
            data = DataLoader.load_dataset_static(data_dict, dataset_name)
            self._process_data(data)
            correlation_maps[dataset_name] = self.correlation_map
        return correlation_maps


Enter the path to the dictionary generated by Step 2

In [8]:
dict_path = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/studies/ccm_memory/results/notebook_02/tmp/dataset_dict.json'

What correlation method would you like to use?
- Pearson is very fast, but depends on linearity and is sensitive to outliers
- Spearman is slower, but much more robust

In [9]:
correlation = 'pearson'

In [10]:
data_loader = DataLoader(dict_path)
correlation_calculator = CorrelationCalculator(method=correlation, verbose=False)
corr_map_dict = correlation_calculator.process_all_datasets(data_loader.dataset_paths_dict)

  r = NUMERATOR / DENOMINATOR


In [14]:
from nilearn import plotting
import nibabel as nib
import os
import numpy as np

class ConvergentMapGenerator:
    def __init__(self, corr_map_dict, data_loader, mask_path=None, out_dir=None, weight=False):
        self.corr_map_dict = corr_map_dict
        self.data_loader = data_loader
        self.mask_path = mask_path
        self.out_dir = out_dir
        self.weight = weight
        self._handle_nans()
        
    def _handle_nans(self):
        drop_list = []
        for key in self.corr_map_dict.keys():
            if np.isnan(self.corr_map_dict[key]).all():
                print(f"Warning: The correlation map for {key} contains only NaNs and will be excluded from the analysis.")
                drop_list.append(key)
            elif np.isnan(self.corr_map_dict[key]).any():
                self.corr_map_dict[key] = np.nan_to_num(self.corr_map_dict[key], nan=0, posinf=1, neginf=-1)
            else:
                continue
        
        for key in drop_list:
            del self.corr_map_dict[key]
            
    def generate_weighted_average_r_map(self):
        r_maps = np.array(list(self.corr_map_dict.values()))
        if self.weight:
            weights = []
            for dataset_name in self.corr_map_dict.keys():
                data = self.data_loader.load_dataset(dataset_name)
                weights.append(data['niftis'].shape[0])
            weights = np.array(weights)
            return np.average(r_maps, axis=0, weights=weights)
        else:
            return np.mean(r_maps, axis=0)

    def generate_agreement_map(self):
        r_maps = np.array(list(self.corr_map_dict.values()))
        signs = np.sign(r_maps)
        agreement = np.all(signs == signs[0], axis=0)
        return agreement.astype(int)
    
    def _unmask_array(self, data_array, threshold=0):
        if self.mask_path is None:
            from nimlab import datasets as nimds
            mask = nimds.get_img("mni_icbm152")
        else:
            mask = nib.load(self.mask_path)

        mask_data = mask.get_fdata()
        mask_indices = mask_data.flatten() > threshold
        
        unmasked_array = np.zeros(mask_indices.shape)
        unmasked_array[mask_indices] = data_array.flatten()
        return unmasked_array.reshape(mask_data.shape), mask.affine

    def _save_map(self, map_data, file_name):
        unmasked_map, mask_affine = self._unmask_array(map_data)
        img = nib.Nifti1Image(unmasked_map, affine=mask_affine)
        if self.out_dir is not None:
            file_path = os.path.join(out_dir, 'convergence_map', file_name)
            nib.save(img, file_path)
        return img

    def _visualize_map(self, img, title):
        plotting.view_img(img, title=title).open_in_browser()
        
    def generate_and_save_maps(self):
        # Generate weighted average r map
        weighted_avg_map = self.generate_weighted_average_r_map()
        try:
            weighted_avg_img = self._save_map(weighted_avg_map, 'weighted_average_r_map.nii.gz')
            self._visualize_map(weighted_avg_img, 'Weighted Average R Map')
        except:
            pass

        # Generate agreement map
        agreement_map = self.generate_agreement_map()
        try:
            agreement_img = self._save_map(agreement_map, 'agreement_map.nii.gz')
            self._visualize_map(agreement_img, 'Agreement Map')
        except:
            pass
    
    def save_individual_r_maps(self):
        for dataset_name, r_map in self.corr_map_dict.items():
            r_img = self._save_map(r_map, f'{dataset_name}_correlation_map.nii.gz')
            self._visualize_map(r_img, f'{dataset_name} Correlation Map')

Generate the Convergent Maps and Save the Individual Correlation Maps

In [16]:
convergent_map_generator = ConvergentMapGenerator(corr_map_dict, data_loader, weight=False)
convergent_map_generator.generate_and_save_maps()
convergent_map_generator.save_individual_r_maps()

  a.partition(kth, axis=axis, kind=kind, order=order)
  a.partition(kth, axis=axis, kind=kind, order=order)
  a.partition(kth, axis=axis, kind=kind, order=order)
  a.partition(kth, axis=axis, kind=kind, order=order)
  a.partition(kth, axis=axis, kind=kind, order=order)
  a.partition(kth, axis=axis, kind=kind, order=order)
  a.partition(kth, axis=axis, kind=kind, order=order)
  a.partition(kth, axis=axis, kind=kind, order=order)


In [13]:
import numpy as np
from scipy.stats import spearmanr, pearsonr
import pandas as pd
from tqdm import tqdm

class CorrelationCalculator:
    def __init__(self, method='pearson', verbose=False):
        self.method = method
        self.verbose = verbose

    def _calculate_pearson_r_map(self, niftis, indep_var):
        X = indep_var
        Y = niftis
        X_BAR = X.mean(axis=0)[:, np.newaxis]
        Y_BAR = Y.mean(axis=0)[np.newaxis, :]
        X_C = X - X_BAR
        Y_C = Y - Y_BAR
        NUMERATOR = np.dot(X_C.T, Y_C)
        SST_X = np.sum((X - X_BAR)**2, axis=0)
        SST_Y = np.sum((Y - Y_BAR)**2, axis=0)
        DENOMINATOR = np.sqrt(SST_X * SST_Y)
        r = NUMERATOR / DENOMINATOR
        
        if self.verbose:
            print(f"Shape of X: {X.shape}")
            print(f"Shape of Y: {Y.shape}")
            print(f"Shape of X_BAR: {X_BAR.shape}")
            print(f"Shape of Y_BAR: {Y_BAR.shape}")
            print(f"Shape of X_C: {X_C.shape}")
            print(f"Shape of Y_C: {Y_C.shape}")
            print(f"Shape of NUMERATOR: {NUMERATOR.shape}")
            print(f"Shape of DENOMINATOR: {DENOMINATOR.shape}")
        return r

    def _calculate_spearman_r_map(self, niftis, indep_var):
        '''Not easily broadcast, sorry!'''
        n_voxels = niftis.shape[1]
        rho = np.zeros(n_voxels)
        for i in tqdm(range(n_voxels), desc='Running Spearman Rho'):
            rho[i], _ = spearmanr(indep_var, niftis[:, i])
            
        if self.verbose:
            print(f"Shape of niftis: {niftis.shape}")
            print(f"Shape of rho: {rho.shape}")
        return rho
    
    def _process_data(self, data):
        if self.method == 'pearson':
            self.correlation_map = self._calculate_pearson_r_map(data['niftis'], data['indep_var'])
        elif self.method == 'spearman':
            self.correlation_map = self._calculate_spearman_r_map(data['niftis'], data['indep_var'])
    
    def process_all_datasets(self, data_dict):
        correlation_maps = {}
        for dataset_name in data_dict.keys():
            data = DataLoader.load_dataset_static(data_dict, dataset_name)
            self._process_data(data)
            correlation_maps[dataset_name] = self.correlation_map
        return correlation_maps

class LOOCVAnalyzer(ConvergentMapGenerator):
    def __init__(self, corr_map_dict, data_loader, mask_path=None, out_dir=None, weight=False, method='spearman', convergence_type='agreement', similarity='cos', n_bootstrap=1000):
        """
        Initialize the LOOCVAnalyzer.

        Parameters:
        -----------
        corr_map_dict : dict
            Dictionary containing correlation maps for each dataset.
        data_loader : DataLoader
            Instance of DataLoader to load datasets.
        mask_path : str, optional
            Path to the mask file.
        out_dir : str, optional
            Output directory to save maps.
        weight : bool, optional
            Whether to weight the datasets.
        method : str, optional
            Correlation method to use ('spearman' or 'pearson').
        n_bootstrap : int, optional
            Number of bootstrap samples to generate.
        convergence_type : str, optional
            Type of convergence to use ('agreement' or other types). Default is 'agreement'.
        similarity : str, optional
            Similarity measure to use ('cos' for cosine similarity or other measures). Default is 'cos'.
            Number of bootstrap samples to generate. Default is 1000.
        """
        super().__init__(corr_map_dict, data_loader, mask_path, out_dir, weight)
        self.method = method
        self.n_bootstrap = n_bootstrap
        self.similarity = similarity
        self.convergence_type = convergence_type
        self.correlation_calculator = CorrelationCalculator(method=method)
        self.results = self.perform_loocv()
        self.results_df = self.results_to_dataframe()
    
    def results_to_dataframe(self):
        """
        Convert the LOOCV results to a pandas DataFrame.

        Returns:
        --------
        pd.DataFrame
            DataFrame containing the R-value, lower confidence interval, upper confidence interval, and mean R-value for each dataset.
        """
        columns = ['Dataset', 'Observed R', 'CI Lower', 'CI Upper', 'Mean R']
        data = []
        for i, (observed_r, ci_lower, ci_upper, mean_r) in enumerate(self.results):
            dataset_name = list(self.corr_map_dict.keys())[i]
            data.append([dataset_name, observed_r, ci_lower, ci_upper, mean_r])
        return pd.DataFrame(data, columns=columns)

    def perform_loocv(self):
        """
        Perform Leave-One-Out Cross-Validation (LOOCV) analysis.

        Returns:
        --------
        list of tuple
            List of tuples containing the R-value and confidence intervals for each dataset.
        """
        results = []
        dataset_names = list(self.corr_map_dict.keys())
        for i, test_dataset_name in enumerate(dataset_names):
            print("Evaluating dataset:", test_dataset_name)
            # Load the test dataset
            test_data = self.data_loader.load_dataset(test_dataset_name)
            test_niftis = test_data['niftis']
            test_indep_var = test_data['indep_var']

            # TRAIN - Generate the convergent map using the training datasets
            train_dataset_names = dataset_names[:i] + dataset_names[i+1:]
            self.corr_map_dict = self.generate_correlation_maps(train_dataset_names)
            self._handle_nans()
            if self.convergence_type == 'average':
                convergent_map = self.generate_weighted_average_r_map()
            elif self.convergence_type == 'agreement':
                convergent_map = self.generate_agreement_map()
            else:
                raise ValueError("Invalid convergence type (self.convergence_type). Please choose 'average' or 'agreement'.")
            
            # TEST - use the convergent map pm the test dataset
            ## Calculate similarity
            similarities = self.calculate_similarity(test_niftis, convergent_map)
            ## Correlate similarity values with independent variables
            observed_r, ci_lower, ci_upper, mean_r = self.correlate_similarity_with_outcomes(similarities, test_indep_var)
            results.append((observed_r, ci_lower, ci_upper, mean_r))
        return results

    def generate_correlation_maps(self, dataset_names):
        """
        Generate correlation maps for the given dataset names.

        Parameters:
        -----------
        dataset_names : list of str
            List of dataset names.

        Returns:
        --------
        dict
            Dictionary containing correlation maps for each dataset.
        """
        correlation_maps = {}
        for dataset_name in dataset_names:
            data = self.data_loader.load_dataset(dataset_name)
            self.correlation_calculator._process_data(data)
            correlation_maps[dataset_name] = self.correlation_calculator.correlation_map
        return correlation_maps

    def calculate_similarity(self, patient_maps, convergent_map):
        """
        Calculate cosine similarity between patient maps and the convergent map.

        Parameters:
        -----------
        patient_maps : np.array
            Array of patient maps.
        convergent_map : np.array
            Convergent map.

        Returns:
        --------
        list of float
            List of cosine similarity values.
        """
        if self.similarity == 'cos':
            similarities = [self.cosine_similarity(patient_map, convergent_map) for patient_map in patient_maps]
        elif self.similarity == 'spcorr':
            similarities = [pearsonr(patient_map, convergent_map)[0] for patient_map in patient_maps]
        else:
            raise ValueError("Invalid similarity measure (self.similarity). Please choose 'cos' or 'spcorr'.")
        return similarities
    
    def cosine_similarity(self, a, b):
        """
        Calculate the cosine similarity between two vectors.

        Parameters:
        -----------
        a : np.array
            First vector.
        b : np.array
            Second vector.

        Returns:
        --------
        float
            Cosine similarity value.
        """
        numerator = np.dot(a, b)
        denominator = np.sqrt(np.sum(a**2)) * np.sqrt(np.sum(b**2))
        return numerator / denominator
    
    def correlate_similarity_with_outcomes(self, similarities, indep_var):
        """
        Correlate similarity values with independent variables and calculate confidence intervals.

        Parameters:
        -----------
        similarities : list of float
            List of cosine similarity values.
        indep_var : np.array
            Array of independent variable values.

        Returns:
        --------
        tuple
            R-value, lower confidence interval, and upper confidence interval.
        """
        resampled_r = []
        observed_r = 0
        for _ in tqdm(range(self.n_bootstrap), 'Running bootstraps'):
            if _ == 0:
                resampled_indices = np.arange(len(similarities)) # No replacement for the first iteration
            else:
                resampled_indices = np.random.choice(len(similarities), len(similarities), replace=True)
            resampled_similarities = np.array(similarities)[resampled_indices]
            resampled_indep_var = np.array(indep_var)[resampled_indices]
            if self.method == 'spearman':
                resampled_r.append(spearmanr(resampled_similarities, resampled_indep_var)[0])
            else:
                resampled_r.append(pearsonr(resampled_similarities, resampled_indep_var)[0])
                
            if _ == 0:
                observed_r = resampled_r
                
        ci_lower = np.percentile(resampled_r, 2.5)
        ci_upper = np.percentile(resampled_r, 97.5)
        mean_r = np.mean(resampled_r)
        return observed_r, ci_lower, ci_upper, mean_r

In [None]:
# Instantiate the LOOCVAnalyzer class
loocv_analyzer = LOOCVAnalyzer(corr_map_dict, data_loader, method='spearman', n_bootstrap=1000, out_dir=None)
results = loocv_analyzer.perform_loocv()
display(loocv_analyzer.results_df)