# Run Correlation-Based Analyses

### Authors: Calvin Howard.

#### Last updated: July 6, 2023

Use this to assess if a correlation between a dependent variable and an independent variable is statistically significant using permutation analysis. 

Then follow this up with a contrast analysis, which tests which categorical variables have significantly different correlations from each other. 

Notes:
- To best use this notebook, you should be familiar with mixed effects models

# 00 - Import CSV with All Data
**The CSV is expected to be in this format**
- ID and absolute paths to niftis are critical
```
+-----+----------------------------+--------------+--------------+--------------+
| ID  | Nifti_File_Path            | Covariate_1  | Covariate_2  | Covariate_3  |
+-----+----------------------------+--------------+--------------+--------------+
| 1   | /path/to/file1.nii.gz      | 0.5          | 1.2          | 3.4          |
| 2   | /path/to/file2.nii.gz      | 0.7          | 1.4          | 3.1          |
| 3   | /path/to/file3.nii.gz      | 0.6          | 1.5          | 3.5          |
| 4   | /path/to/file4.nii.gz      | 0.9          | 1.1          | 3.2          |
| ... | ...                        | ...          | ...          | ...          |
+-----+----------------------------+--------------+--------------+--------------+
```

In [None]:
# Specify the path to your CSV file containing NIFTI paths
input_csv_path = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/resources/datasets/TMS_studies_influencing_memory/metadata/master_list.csv'
# `sheet` is forwarded to CalvinStatsmodelsPalm together with the path above.
# Presumably only relevant when the input is a multi-sheet spreadsheet — TODO confirm.
sheet = None  # candidate sheet name (unconfirmed): 'master_list_proper_subjects'

In [None]:
# Specify where you want to save your results to.
# This value is passed as `output_dir` to CalvinStatsmodelsPalm.
# NOTE: `out_dir` is reassigned further down, before the scatter plot is drawn.
out_dir = '/Users/cu135/Partners HealthCare Dropbox/Calvin Howard/studies/ccm_memory/results/notebook_10/tms_meta_analysis/test'

In [None]:
from calvin_utils.permutation_analysis_utils.statsmodels_palm import CalvinStatsmodelsPalm
# Instantiate CalvinStatsmodelsPalm with the input file, output directory and sheet
cal_palm = CalvinStatsmodelsPalm(input_csv_path=input_csv_path, output_dir=out_dir, sheet=sheet)
# Load the data via read_and_display_data(); the result is kept as data_df
data_df = cal_palm.read_and_display_data()
# Show the loaded table as the cell output
data_df


# 01 - Preprocess Your Data

**Handle NANs**
- Set drop_nans=True if you would like to remove NaNs from data
- Provide a column name or a list of column names to remove NaNs from

In [None]:
# Number of distinct values in the 'study' column (a NaN entry, if present, counts as one value)
len(data_df['study'].unique())

In [None]:
# List the available column names to help choose which columns to drop NaNs from
data_df.columns

In [None]:
# Columns handed to cal_palm.drop_nans_from_columns() in the next cell
drop_list = ['all_datasets_weighted_avg_r_map_pcc_inverse_r_map_average_target_in_subject', 'Pre_Post_Memory_Effect_Size__Cohen_s_D_']

In [None]:
# Replace data_df with the result of the NaN-dropping helper applied to drop_list.
# Presumably rows with a NaN in any listed column are removed — verify against the helper.
data_df = cal_palm.drop_nans_from_columns(columns_to_drop_from=drop_list)
# Show the resulting table as the cell output
data_df

**Drop Row Based on Value of Column**

Define the column, condition, and value for dropping rows
- column = 'your_column_name'
- condition = 'above'  # Options: 'equal', 'above', 'below'

Set the parameters for dropping rows

In [None]:
# List the available column names to help choose the column used for row dropping
data_df.columns

In [None]:
# Parameters for the (currently commented-out) drop_rows_based_on_value call in the next cell
column = 'Disease'  # The column you'd like to evaluate
condition = 'equal'  # The condition to check ('equal', 'above', 'below', 'not')
value = 'Parkinson' # The value to compare against

In [None]:
# Row dropping is currently disabled; uncomment the next line to apply the
# column / condition / value settings defined in the previous cell.
# data_df, other_df = cal_palm.drop_rows_based_on_value(column, condition, value)
# Show the (unchanged) table as the cell output
data_df

Regress out a Covariate

In [None]:
# List the available column names to help choose covariates to regress out
data_df.columns

In [None]:
# Impute missing covariate values before the covariates are regressed out.
#
# `regressors` is defined here as well so that this cell works on a fresh
# top-to-bottom run (it was previously only assigned in the regression cell
# that follows). Keep this list in sync with that cell.
from pandas.api.types import is_numeric_dtype

regressors = ['Disease', 'Years_Between_Measurements', 'Mean_Age', 'Frequency__Hz_']

for col in regressors:
    if is_numeric_dtype(data_df[col]):
        # Numeric covariate: fill gaps with the column mean.
        fill_value = data_df[col].mean()
    else:
        # Non-numeric covariate (object / string / categorical): fill gaps
        # with the most frequent value.
        modes = data_df[col].mode()
        if modes.empty:
            # The column holds no non-missing values, so there is nothing to
            # impute from; leave it untouched instead of failing on modes[0].
            continue
        fill_value = modes.iloc[0]
    data_df[col] = data_df[col].fillna(fill_value)

# Remaining NaN count per covariate (shown as the cell output)
data_df[regressors].isna().sum()


In [None]:
from calvin_utils.statistical_utils.regression_utils import RegressOutCovariates
# Outcome column(s) to adjust and the covariates to regress out of them
dependent_variable_list = ['Pre_Post_Memory_Effect_Size__Cohen_s_D_']
regressors = ['Disease', 'Years_Between_Measurements', 'Mean_Age', 'Frequency__Hz_']

# run() returns the updated DataFrame and a list describing the adjusted variables.
# Presumably it adds a '<name>_residual' column per dependent variable (the next
# cell reads 'Pre_Post_Memory_Effect_Size__Cohen_s_D__residual') — TODO confirm.
data_df, adjusted_dep_vars_list = RegressOutCovariates.run(df=data_df, dependent_variable_list=dependent_variable_list, covariates_list=regressors)
print(adjusted_dep_vars_list)

In [None]:
import numpy as np

# Sanity-check the residual column: how many entries are non-zero and how many
# are not NaN. Note that NaN entries are counted as non-zero by count_nonzero.
residual_values = data_df['Pre_Post_Memory_Effect_Size__Cohen_s_D__residual']
nonzero_count = np.count_nonzero(residual_values)
not_nan_count = residual_values.notna().sum()
for message in (f"Nonzero count: {nonzero_count}", f"Non-NaN count: {not_nan_count}"):
    print(message)

**Standardize Data**
- Enter Columns you Don't want to standardize into a list

In [None]:
# List the available column names to help choose which columns to exclude from standardization
data_df.columns

In [None]:
# Remove anything you don't want to standardize
# Columns to exclude from standardization; passed to cal_palm.standardize_columns() in the next cell
cols_not_to_standardize = ['TOTAL11']

In [None]:
# Replace data_df with the helper's output, excluding the columns listed above.
# Presumably every other numeric column is rescaled — verify against the helper,
# since later cells compare some of these columns against fixed values (0, ±1).
data_df = cal_palm.standardize_columns(cols_not_to_standardize)
# Show the resulting table as the cell output
data_df

# 02 - Perform Basic Correlation

In [None]:
# List the available column names to help choose the variables to correlate
data_df.columns

In [None]:
from scipy.stats import ttest_ind, mannwhitneyu
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Compare the spatial-correlation column between subjects with a positive and a
# negative memory effect (Welch t-test and Mann-Whitney U), then plot the group
# means with SEM error bars and save the figure as SVG.

# `x_col` is defined here so that this cell works on a fresh top-to-bottom run
# (it was previously only assigned in a later cell, with this same value).
x_col = 'all_datasets_weighted_avg_r_map_pcc_inverse_r_map_average_target_in_subject'
effect_col = 'Pre_Post_Memory_Effect_Size__Cohen_s_D_'

# Split by sign of the memory effect (rows with an effect of exactly 0 fall in neither group).
# NOTE(review): this runs after the standardization step; if that step rescales
# the effect-size column, "> 0" means "above the mean" rather than "positive
# effect" — confirm this is intended.
pos_group = data_df.loc[data_df[effect_col] > 0, x_col].dropna()
neg_group = data_df.loc[data_df[effect_col] < 0, x_col].dropna()

# Welch t-test (parametric, unequal variances)
t_stat, p_val = ttest_ind(pos_group, neg_group, equal_var=False)

# Mann–Whitney U (non-parametric)
mwu_stat, mwu_p = mannwhitneyu(pos_group, neg_group, alternative='two-sided')

print(f"T-statistic: {t_stat:.4f}, p-value: {p_val:.4g}")
print(f"Mann-Whitney U statistic: {mwu_stat:.4f}, p-value: {mwu_p:.4g}")

# ── Plot mean ± SEM ────────────────────────────────────────────────────────────
# Long-format table with one row per subject; also reused by the raincloud cell.
boxplot_df = pd.concat(
    [
        pos_group.to_frame(name=x_col).assign(Group='Positive Effect'),
        neg_group.to_frame(name=x_col).assign(Group='Negative Effect')
    ],
    ignore_index=True
)

# Per-group mean and standard error; both share the same (sorted) group index,
# so bar positions and error-bar positions line up.
means = boxplot_df.groupby('Group')[x_col].mean()
sems  = boxplot_df.groupby('Group')[x_col].sem()

plt.figure(figsize=(6, 4))
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# kept as-is so the cell still runs on older seaborn versions.
ax = sns.barplot(x=means.index, y=means.values, palette="Set2", ci=None)
ax.errorbar(
    x=np.arange(len(means)),
    y=means.values,
    yerr=sems.values,
    fmt='none',
    ecolor='black',
    capsize=4,
    lw=1
)

ax.set_title('Mean Spatial Correlation by Memory Effect Group (±SEM)')
ax.set_ylabel('Spatial Correlation')
ax.set_xlabel('Memory Effect Group')
plt.tight_layout()

# Save before showing: some backends release the figure once show() returns,
# so writing the file first guarantees the saved SVG contains the plot.
fig = ax.get_figure()
fig.savefig('/Users/cu135/Library/CloudStorage/OneDrive-Personal/OneDrive_Documents/Research/2023/memory_ccm/Figures/supplement_prior_dbs-target/mean_spatial_corr_by_group.svg', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# ── Raincloud plot: violin + jitter (strip) ────────────────────────────────────
# Draws the per-group distribution of x_col as a violin ("cloud") and overlays
# the individual observations as jittered points ("rain"). Uses boxplot_df and
# x_col from the preceding cells.
plt.figure(figsize=(6, 4))

# Arguments shared by both layers so they are drawn on the same axes/categories
layer_kwargs = dict(data=boxplot_df, x='Group', y=x_col)

# Cloud: no inner box, tails trimmed to the data range, no outline
sns.violinplot(palette='Set2', inner=None, cut=0, linewidth=0, **layer_kwargs)

# Rain: one semi-transparent black point per observation
sns.stripplot(color='k', size=4, jitter=0.25, alpha=0.7, **layer_kwargs)

current_axes = plt.gca()
current_axes.set_title('Raincloud Plot of Spatial Correlation by Memory Effect Group')
current_axes.set_ylabel('Spatial Correlation')
current_axes.set_xlabel('Memory Effect Group')
plt.tight_layout()
plt.show()


In [None]:
# Keep only rows whose memory effect size lies within [-1, 1] (bounds included);
# rows with a missing effect size are dropped as well.
effect_size = data_df['Pre_Post_Memory_Effect_Size__Cohen_s_D_']
data_df = data_df[effect_size.between(-1, 1)]

In [None]:
# List the available column names to help choose the x and y variables below
data_df.columns

In [None]:
# Columns used for the scatter plot and the correlation bar plot further down:
# x_col is plotted on the x-axis ("Spatial Correlation"), y_col on the y-axis (memory effect size).
x_col = 'all_datasets_weighted_avg_r_map_pcc_inverse_r_map_average_target_in_subject'
y_col = 'Pre_Post_Memory_Effect_Size__Cohen_s_D_'

In [None]:
# data_df = data_df[~((data_df[x_col] < 0.2) & (data_df[y_col] > 0.2))]

In [None]:
# Redirect output to the figures folder; this overrides the out_dir set at the top of the notebook
out_dir='/Users/cu135/Library/CloudStorage/OneDrive-Personal/OneDrive_Documents/Research/2023/memory_ccm/Figures/supplement_prior_dbs-target'

In [None]:
from calvin_utils.statistical_utils.scatterplot import simple_scatter
# Scatter x_col against y_col with custom axis labels; out_dir is passed so the
# helper can write its output there. The fourth positional argument ('TMS') is
# presumably a dataset/title label — TODO confirm against simple_scatter.
simple_scatter(data_df, x_col, y_col, 'TMS', 
               x_label="Spatial Correlation",
               y_label='Memory Effect (Cohen\'s D)',
               out_dir=out_dir, flip_axes=False)

In [None]:
# Display the current output directory as the cell output
out_dir

# Correlate Variable with All Other Variables

In [None]:
from calvin_utils.statistical_utils.correlation_barplot import CorrelationBarPlot
# Build the plotter for x_col using the Spearman method.
# Presumably it correlates x_col with the other columns of data_df — TODO confirm.
plotter = CorrelationBarPlot(data_df, x_col=x_col, method="spearman")
# save_path=None: no output path is supplied for this run
corr_series = plotter.run(save_path=None)

Focus on correlations above a threshold

In [None]:
# Re-plot using a threshold of 0.4 and keep the helper's return value for later inspection
strong_corrs = plotter.plot_threshold(0.4)      # |r| ≥ 0.4 plot