In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import platform
import pathlib

In [None]:
## Paths Input Here
# Name of this analysis; on macOS it is also the output sub-directory created
# next to the connectivity file.
analysis = '1_way_manova/baseline_vs_memnet_correl_vs_outcome'
if platform.uname().system == 'Darwin': #------------------------------Mac OS X---------------------------------------------------------------
    conn_path = r'/Users/cu135/Dropbox (Partners HealthCare)/memory/analyses/composite_analyses/non_fl_vta_memory_derivatives/ferguson_memory_derivatives_with_clinical_data.csv'
    clin_path = r'/Users/cu135/Dropbox (Partners HealthCare)/memory/patient_data/AD_Clinical_Data_CDR_ADAS_COG_13.xlsx'
    # clin_path = 'path to clinical values'
    out_dir = os.path.join(os.path.dirname(conn_path), analysis)
    print('I will save to:', out_dir)
    x_roi_names = r'/Users/cu135/Dropbox (Partners HealthCare)/memory/analyses/roi-roi_correl/matrix_corrMx_names.csv'
    #roi_names = '<path to roi name location>'
    print('I have set pathnames in the Mac style')
else: #----------------------------------------------------------------Windows----------------------------------------------------------------
    # NOTE(review): this branch runs on every non-Darwin system (Linux included),
    # and its conn_path points at a different file than the macOS branch - confirm
    # that both are intended.
    conn_path = r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\analyses\roi-roi_correl\matrix_corrMx_AvgR.csv'
    clin_path = r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\patient_data\AD_Clinical_Data_CDR_ADAS_COG_13.xlsx'
    # clin_path = 'path to clinical values'
    out_dir = r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\analyses\roi-roi_correl\stats'
    #out_dir = r'path to out dir here'
    x_roi_names = r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\analyses\roi-roi_correl\matrix_corrMx_names.csv'
    #roi_names = '<path to roi name location>'
    print('I have set pathnames in the Windows style')

## Do you want to save the results?
save = True
if save:
    # exist_ok=True keeps the cell re-runnable and removes the gap between
    # "does it exist?" and "create it" that the separate check had.
    os.makedirs(out_dir, exist_ok=True)

## Begin Import

In [None]:
# Column renames used by the loading cell: 'original' holds the column names as
# they appear in the input table, 'destination' the replacement name at the same
# list position.
variables_to_change = {
    'original': ['% Change from baseline (ADAS-Cog11)'],
    'destination': ['percent_change_adascog11'],
}
#----------------------------------------------------------------user input above----------------------------------------------------------------

In [None]:
# Load the table from the CSV configured in the paths cell.
data_df = pd.read_csv(conn_path)

#Prepare the dataframe for statsmodels
data_df = data_df.reset_index(drop=True)
#Drop NANs (any row containing a missing value); reassigned rather than mutated in place
data_df = data_df.dropna()

# Remove the 'Unnamed: 0' column when present (presumably a saved index - confirm).
# An explicit membership test replaces the former bare `except:`, which would
# also have hidden unrelated errors.
if 'Unnamed: 0' in data_df.columns:
    data_df = data_df.drop(columns='Unnamed: 0')
else:
    print('none to pop')

#Rename variables to prevent errors
# Every original/destination pair is applied, not only the first one.
# Further renames that were considered but are disabled:
# '07_default': 'default', '02_somatomotor_seed': 'somatomotor', '03_dorsal_attention': 'dorsal', '01_visual_seed': 'visual', '04_ventral_attention': 'ventral', '05_limbic': 'limbic', '06_frontoparietal': 'frontoparietal'
rename_map = dict(zip(variables_to_change['original'], variables_to_change['destination']))
data_df = data_df.rename(columns=rename_map)
# #Organize variables
# data_df.sort_values

#Organize the columns in natural sort order
import natsort
natsorted_columns = natsort.natsorted(data_df.columns)
data_df = data_df.reindex(columns=natsorted_columns)

#place response column at start of dataframe
response_name = variables_to_change['destination'][0]
if response_name not in data_df.columns:
    # Same exception type as the implicit failure of DataFrame.pop, with a usable message.
    raise KeyError(
        f"Response column {response_name!r} not found after renaming; "
        f"available columns: {list(data_df.columns)}"
    )
response_series = data_df.pop(response_name)
data_df.insert(0, response_name, response_series)
display(data_df)

#Remove variables of low interest
# data_df.pop('SBC.')

In [None]:
# Reshape the table to long format for the MANOVA cell. Work on a copy so
# data_df itself is left untouched.
df_to_melt = data_df.copy()

# Columns stacked into the long-format label/value pair.
value_vars = ['Baseline ADAS-Cog11', 'Memory Network'] # set variables specifically
# value_vars = df_to_melt.columns.values.tolist()[1:]  # alternative: every column after the first
melted_df = df_to_melt.melt(
    id_vars=['percent_change_adascog11'],
    value_vars=value_vars,
)

display(melted_df)

In [None]:
# Box plots of the melted data, one panel per y-axis column.
# Disabled custom grey palette, kept for reference:
# color_list = ['#696969', '#BAB1B0', '#D3D3D3', '#D8D6D5']
# colors = []
# for i in range(0, len((melted_df['percent_change_adascog11'].unique()))):
#     colors.append(color_list[i])
# customPalette = sns.set_palette(sns.color_palette(colors))

fig, axs = plt.subplots(nrows=2, figsize=(30,20))
upper_panel, lower_panel = axs[0], axs[1]

# Upper panel: melted values per variable.
sns.boxplot(
    data=melted_df,
    x="variable",
    y="value",
    hue=melted_df['variable'].tolist(),
    ax=upper_panel,
)  #, palette=customPalette)
# Lower panel: the response column per variable.
sns.boxplot(
    data=melted_df,
    x="variable",
    y="percent_change_adascog11",
    hue=melted_df['variable'].tolist(),
    ax=lower_panel,
)  #, palette=customPalette)

## Perform 1-Way MANOVA

In [None]:
from statsmodels.multivariate.manova import MANOVA

# Terms left of '~' are the dependent variables; the term on the right is the predictor.
# NOTE(review): 'variable' holds the melted column labels (strings), yet it is
# listed as a dependent variable here - confirm this is the intended model.
manova_formula = 'variable + value ~ percent_change_adascog11'
fit = MANOVA.from_formula(manova_formula, data=melted_df)
print(fit.mv_test())

## Posthoc Notes

If you are interested in how the groups influence a given dependent variable, run an ANOVA.
If you are interested in how a linear combination of the variables leads to maximal separability of the groups, run LDA.
- I.e., put conversely: if you are interested in seeing whether a group has a disproportionate effect on the 'linear combination of variables', run LDA.


## Posthoc with Statsmodels

In [None]:
from scipy.stats import (
    mannwhitneyu,
    normaltest,
    tukey_hsd,
    ttest_ind,
    kruskal,
    levene,
    brunnermunzel,
    wilcoxon,
)

# Empty canvas for the post-hoc figure.
fig2, ax = plt.subplots(1, 1, figsize=(30, 20))

# Accepted names for the main statistical test.
main_tests = [
    't-test_ind',
    't-test_welch',
    't-test_paired',
    'Mann-Whitney',
    'Mann-Whitney-gt',
    'Mann-Whitney-ls',
    'Levene',
    'Wilcoxon',
    'Kruskal',
    'Brunner-Munzel',
]
# Accepted names for the correction method; None is also listed as a choice.
post_hoc_tests = [
    'bonferroni',
    'bonf',
    'Bonferroni',
    'holm-bonferroni',
    'HB',
    'Holm-Bonferroni',
    'holm',
    'benjamini-hochberg',
    'BH',
    'fdr_bh',
    'Benjamini-Hochberg',
    'fdr_by',
    'Benjamini-Yekutieli',
    'BY',
    None,
]

##----------------------------------------------------------------USER INPUT BELOW----------------------------------------------------------------
main_test = 't-test_ind'  # alternatively pick by position, e.g. main_tests[0]
post_hoc_test = None  # alternatively pick by position, e.g. post_hoc_tests[3]

## Perform Post Hoc Analysis with LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda

# Features: the response column and the melted value.
# Target: the melted variable label of each row.
feature_columns = ["percent_change_adascog11", "value"]
X = melted_df[feature_columns]
y = melted_df["variable"]

#Fit the LDA
lda_model = lda()
lda_model.fit(X=X, y=y)

In [None]:
# Evaluate the LDA classifier with repeated stratified k-fold cross-validation.
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# 10 folds, repeated 3 times, with a fixed seed so the splits are reproducible.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    estimator=lda_model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
# Mean accuracy over all folds and repeats.
mean_accuracy = scores.mean()
print(mean_accuracy)



In [None]:
#Visualize Results
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda

# Features and class labels, rebuilt here so the cell runs on its own once
# melted_df exists.
X = melted_df[["percent_change_adascog11", "value"]]
y = melted_df["variable"]

#Fit the LDA
# Keep the fitted estimator in `lda_model` and store the projection separately.
# Previously the estimator was overwritten by the transformed array, which made
# attributes such as priors_ and means_ unreachable afterwards.
lda_model = lda().fit(X=X, y=y)
lda_projection = lda_model.transform(X)

# Class labels as seen by the estimator; these are the values actually stored
# in `y`, so the per-class masks below select real rows.
target_names = lda_model.classes_

# LDA yields at most min(n_classes - 1, n_features) discriminant axes, so a
# second axis is not guaranteed to exist.
n_discriminants = lda_projection.shape[1]

lw = 2  # not used in this cell; retained in case other cells reference it

#create LDA plot
plt.figure()
for target_name in target_names:
    class_mask = (y == target_name).to_numpy()
    first_axis = lda_projection[class_mask, 0]
    if n_discriminants > 1:
        second_axis = lda_projection[class_mask, 1]
    else:
        # Only one discriminant axis: draw the points along a flat baseline.
        second_axis = np.zeros(first_axis.shape[0])
    plt.scatter(first_axis, second_axis, alpha=.8, label=target_name)
plt.xlabel('LD1')
plt.ylabel('LD2' if n_discriminants > 1 else '')
plt.legend(loc='best', shadow=False, scatterpoints=1)

# Optional diagnostics of the fitted estimator (uncomment to print):
# # get Prior probabilities of groups:
# print('Prior: \n', lda_model.priors_)

# # get group means
# print('Means: \n', lda_model.means_)

# # get Coefficients of linear discriminants
# print('Scalings: \n', lda_model.scalings_)

# # get Proportion of trace (variance explained by each of the selected components)
# print('Explained Variance: \n', lda_model.explained_variance_ratio_)

In [None]:
# plot
# `post_hoc` was not defined anywhere before this cell, so it is fitted here
# from the same features and labels that the LDA cells use.
X = melted_df[["percent_change_adascog11", "value"]]
y = melted_df["variable"]
post_hoc = lda().fit(X=X, y=y)

projected = post_hoc.transform(X)
print(projected)

# Name the columns after however many discriminant axes were returned; LDA
# yields at most min(n_classes - 1, n_features), so "lda2" may not exist.
lda_columns = [f"lda{position + 1}" for position in range(projected.shape[1])]
X_new = pd.DataFrame(projected, columns=lda_columns)
# Assign by position so the labels cannot be misaligned by index.
X_new["variable"] = melted_df["variable"].to_numpy()

if "lda2" in X_new.columns:
    sns.scatterplot(data=X_new, x="lda1", y="lda2", hue=melted_df['variable'].tolist())
else:
    # Single discriminant axis: show its distribution per class instead.
    sns.stripplot(data=X_new, x="lda1", y="variable", hue="variable")
plt.show()