In [4]:
import numpy as np
from scipy.stats import pearsonr

def correlate_npy_files(file1_path, file2_path):
    # Load the .npy files
    data1 = np.load(file1_path)
    data2 = np.load(file2_path)

    # Ensure the data shapes are compatible for correlation
    if data1.shape != data2.shape:
        raise ValueError("The shapes of the two datasets do not match.")
    # 
    #nan check
    if np.isnan(data1).any():
        print('nan in data1')
        data1 = np.nan_to_num(data1, nan=0, posinf=0, neginf=0)
        print(np.max(data1), np.min(data1), np.median(data1))
    if np.isnan(data2).any():
        print('nan in data2')
        data2 = np.nan_to_num(data2, nan=0, posinf=0, neginf=0)
        print(np.max(data2), np.min(data2), np.median(data2))

    # Flatten the data to 1D arrays
    data1_flat = data1.flatten()
    data2_flat = data2.flatten()

    # Calculate the Pearson correlation coefficient
    correlation, p_value = pearsonr(data1_flat, data2_flat)

    return correlation, p_value

# Example usage
file1_path = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/studies/ccm_memory/results/notebook_02/tmp_tnsr/sante_memory/niftis.npy'
file2_path = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/studies/ccm_memory/results/notebook_02/tmp_ols/sante_memory/niftis.npy'

correlation, p_value = correlate_npy_files(file1_path, file2_path)
print(f"Correlation: {correlation}, p-value: {p_value}")

Correlation: 0.9707757310837901, p-value: 0.0


In [2]:
import numpy as np
from scipy.stats import spearmanr
import time

def vectorized_rankdata(a):
    """
    Vectorized ranking function using NumPy.

    Parameters:
    -----------
    a : np.array
        Input array to be ranked.

    Returns:
    --------
    ranks : np.array
        Ranked array.
    """
    ranks = np.empty_like(a, dtype=float)
    ranks[np.argsort(a)] = np.arange(len(a)) + 1
    return ranks

def vectorized_spearmanr(niftis, indep_var):
    """
    Calculate the Spearman rank-order correlation coefficient for each voxel
    in a fully vectorized manner.

    Parameters:
    -----------
    niftis : np.array
        2D array where each row represents a subject and each column represents a voxel.
    indep_var : np.array
        1D array representing the independent variable for each subject.

    Returns:
    --------
    rho : np.array
        1D array of Spearman's rank correlation coefficients for each voxel.
    """
    # Rank the data
    ranked_niftis = np.apply_along_axis(vectorized_rankdata, 0, niftis)
    ranked_indep_var = vectorized_rankdata(indep_var)

    # Calculate the differences between the ranks
    d = ranked_niftis - ranked_indep_var[:, np.newaxis]

    # Square the differences
    d_squared = d ** 2

    # Sum the squared differences
    sum_d_squared = np.sum(d_squared, axis=0)

    # Number of observations
    n = niftis.shape[0]

    # Apply the Spearman's Rho formula
    rho = 1 - (6 * sum_d_squared) / (n * (n**2 - 1))

    return rho

# Generate synthetic data
np.random.seed(42)
n_observations = 100
n_voxels = 200000
niftis = np.random.rand(n_observations, n_voxels)
indep_var = np.random.rand(n_observations)

# Vectorized Spearman's Rho
start_time = time.time()
vectorized_rho = vectorized_spearmanr(niftis, indep_var)
vectorized_time = time.time() - start_time

# Looped Spearman's Rho using scipy.stats
start_time = time.time()
looped_rho = np.zeros(n_voxels)
for i in range(n_voxels):
    looped_rho[i] = spearmanr(niftis[:, i], indep_var)[0]
looped_time = time.time() - start_time

# Compare the results
print(f"Vectorized Spearman's Rho Time: {vectorized_time:.4f} seconds")
print(f"Looped Spearman's Rho Time: {looped_time:.4f} seconds")
print(f"Difference in Rho values: {np.mean(np.abs(vectorized_rho - looped_rho)):.6f}")

# Check if the results are close
print(f"Are the results close? {np.allclose(vectorized_rho, looped_rho)}")

Vectorized Spearman's Rho Time: 2.5860 seconds
Looped Spearman's Rho Time: 140.3134 seconds
Difference in Rho values: 0.000000
Are the results close? True


In [9]:
import json
import numpy as np
import time
from scipy.stats import spearmanr
from calvin_utils.ccm_utils.npy_utils import DataLoader
from calvin_utils.ccm_utils.stat_utils import CorrelationCalculator

def vectorized_rankdata(a):
    """
    Vectorized ranking function using NumPy.

    Parameters:
    -----------
    a : np.array
        Input array to be ranked.

    Returns:
    --------
    ranks : np.array
        Ranked array.
    """
    a = a.flatten()
    ranks = np.empty_like(a, dtype=float)
    ranks[np.argsort(a)] = np.arange(len(a)) + 1
    return ranks

def _calculate_pearson_r_map(self, niftis, indep_var):
        X = indep_var
        Y = niftis
        X_BAR = X.mean(axis=0)[:, np.newaxis]
        Y_BAR = Y.mean(axis=0)[np.newaxis, :]
        X_C = X - X_BAR
        Y_C = Y - Y_BAR
        NUMERATOR = np.dot(X_C.T, Y_C)
        SST_X = np.sum((X - X_BAR)**2, axis=0)
        SST_Y = np.sum((Y - Y_BAR)**2, axis=0)
        DENOMINATOR = np.sqrt(SST_X * SST_Y)
        r = NUMERATOR / DENOMINATOR
        
        if self.verbose:
            print(f"Shape of X: {X.shape}")
            print(f"Shape of Y: {Y.shape}")
            print(f"Shape of X_BAR: {X_BAR.shape}")
            print(f"Shape of Y_BAR: {Y_BAR.shape}")
            print(f"Shape of X_C: {X_C.shape}")
            print(f"Shape of Y_C: {Y_C.shape}")
            print(f"Shape of NUMERATOR: {NUMERATOR.shape}")
            print(f"Shape of DENOMINATOR: {DENOMINATOR.shape}")
        return r
    
def vectorized_spearmanr(niftis, indep_var):
    """
    Calculate the Spearman rank-order correlation coefficient for each voxel
    in a fully vectorized manner.

    Parameters:
    -----------
    niftis : np.array
        2D array where each row represents a subject and each column represents a voxel.
    indep_var : np.array
        1D array representing the independent variable for each subject.

    Returns:
    --------
    rho : np.array
        1D array of Spearman's rank correlation coefficients for each voxel.
    """
    # Rank the data
    ranked_niftis = np.apply_along_axis(vectorized_rankdata, 0, niftis)
    ranked_indep_var = vectorized_rankdata(indep_var)

    # Calculate the differences between the ranks
    d = ranked_niftis - ranked_indep_var[:, np.newaxis]

    # Square the differences
    d_squared = d ** 2

    # Sum the squared differences
    sum_d_squared = np.sum(d_squared, axis=0)

    # Number of observations
    n = niftis.shape[0]

    # Apply the Spearman's Rho formula
    rho = 1 - (6 * sum_d_squared) / (n * (n**2 - 1))

    return rho

# Load the dataset
data_dict_path = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/studies/ccm_memory/results/notebook_02/tmp/dataset_dict.json'
data_loader = DataLoader(data_dict_path)

# Get the first dataset
with open(data_dict_path, 'r') as f:
    dataset_paths = json.load(f)
first_dataset_name = list(dataset_paths.keys())[1]
first_dataset = data_loader.load_dataset(first_dataset_name)

niftis = first_dataset['niftis']
indep_var = first_dataset['indep_var']

# Vectorized Spearman's Rho
start_time = time.time()
vectorized_rho = vectorized_spearmanr(niftis, indep_var)
vectorized_time = time.time() - start_time

# Looped Spearman's Rho using scipy.stats
start_time = time.time()
looped_rho = np.zeros(niftis.shape[1])
for i in range(niftis.shape[1]):
    looped_rho[i] = spearmanr(niftis[:, i], indep_var)[0]
looped_time = time.time() - start_time

# Compare the results
print(f"Vectorized Spearman's Rho Time: {vectorized_time:.4f} seconds")
print(f"Looped Spearman's Rho Time: {looped_time:.4f} seconds")
print(f"Difference in Rho values: {np.mean(np.abs(vectorized_rho - looped_rho)):.6f}")


Vectorized Spearman's Rho Time: 4.0655 seconds
Looped Spearman's Rho Time: 156.7029 seconds
Difference in Rho values: nan
Are the results close? False


In [19]:
# Check if the results are close
print(f"Are the results close? {np.allclose(np.nan_to_num(vectorized_rho), np.nan_to_num(looped_rho))}")

Are the results close? False


In [18]:
import numpy as np
from scipy.stats import pearsonr
pearsonr(np.nan_to_num(vectorized_rho), np.nan_to_num(looped_rho))

PearsonRResult(statistic=0.890446300992941, pvalue=0.0)

In [None]:
import json
import numpy as np
import time
from scipy.stats import spearmanr, rankdata, pearsonr
from calvin_utils.ccm_utils.npy_utils import DataLoader
from calvin_utils.ccm_utils.stat_utils import CorrelationCalculator

def vectorized_spearmanr(niftis, indep_var):
    """
    Calculate the Spearman rank-order correlation coefficient for each voxel
    in a fully vectorized manner.

    Parameters:
    -----------
    niftis : np.array
        2D array where each row represents a subject and each column represents a voxel.
    indep_var : np.array
        1D array representing the independent variable for each subject.

    Returns:
    --------
    rho : np.array
        1D array of Spearman's rank correlation coefficients for each voxel.
    """
    # Rank the data using scipy.stats.rankdata to handle ties
    ranked_niftis = np.apply_along_axis(rankdata, 0, niftis)
    ranked_indep_var = rankdata(indep_var)
    print(ranked_indep_var.shape, ranked_niftis.shape)
    # Calculate the Pearson correlation coefficient on the ranked data
    X = ranked_indep_var[:, np.newaxis]
    Y = ranked_niftis[:, np.newaxis]
    X_BAR = X.mean(axis=0)[:, np.newaxis]
    Y_BAR = Y.mean(axis=0)[np.newaxis, :]
    X_C = X - X_BAR
    Y_C = Y - Y_BAR
    NUMERATOR = np.dot(X_C.T, Y_C)
    SST_X = np.sum((X - X_BAR)**2, axis=0)
    SST_Y = np.sum((Y - Y_BAR)**2, axis=0)
    DENOMINATOR = np.sqrt(SST_X * SST_Y)
    rho = NUMERATOR / DENOMINATOR

    return rho

# Vectorized Spearman's Rho
start_time2 = time.time()
vectorized_rho2 = vectorized_spearmanr(niftis, indep_var)
vectorized_time2 = time.time() - start_time2

# # Looped Spearman's Rho using scipy.stats
# start_time = time.time()
# looped_rho = np.zeros(niftis.shape[1])
# for i in range(niftis.shape[1]):
#     looped_rho[i] = spearmanr(niftis[:, i], indep_var)[1]
# looped_time = time.time() - start_time

# Compare the results
print(f"Vectorized Spearman's Rho Time: {vectorized_time2:.4f} seconds")
print(f"Looped Spearman's Rho Time: {looped_time:.4f} seconds")
print(f"Difference in Rho values: {np.mean(np.abs(vectorized_rho2 - looped_rho)):.6f}")

# Check if the results are close
print(f"Are the results close? {np.allclose(np.nan_to_num(vectorized_rho2), np.nan_to_num(looped_rho))}")

(196,)


IndexError: invalid index to scalar variable.