In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import platform
import pathlib

In [None]:
## Paths Input Here
# Machine-specific absolute locations of the inputs and the output folder.
#   conn_path   - CSV holding the ROI-to-ROI correlation matrix
#   clin_path   - Excel workbook with the clinical scores
#   out_dir     - folder where result files are written
#   x_roi_names - CSV listing the ROI file names that label the matrix
# macOS is detected explicitly; every other operating system uses the Windows paths.
if platform.system() == 'Darwin':
    # ---- Mac OS X ----
    conn_path, clin_path, out_dir, x_roi_names = (
        r'/Users/cu135/Dropbox (Partners HealthCare)/memory/analyses/roi-roi_correl/matrix_corrMx_AvgR.csv',
        r'/Users/cu135/Dropbox (Partners HealthCare)/memory/patient_data/AD_Clinical_Data_CDR_ADAS_COG_13.xlsx',
        r'/Users/cu135/Dropbox (Partners HealthCare)/memory/analyses/roi-roi_correl/figures/multidimensional_scatterplots',
        r'/Users/cu135/Dropbox (Partners HealthCare)/memory/analyses/roi-roi_correl/matrix_corrMx_names.csv',
    )
    print('I have set pathnames in the Mac style')
else:
    # ---- Windows ----
    conn_path, clin_path, out_dir, x_roi_names = (
        r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\analyses\roi-roi_correl\matrix_corrMx_AvgR.csv',
        r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\patient_data\AD_Clinical_Data_CDR_ADAS_COG_13.xlsx',
        r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\analyses\roi-roi_correl\stats',
        r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\analyses\roi-roi_correl\matrix_corrMx_names.csv',
    )
    print('I have set pathnames in the Windows style')

In [None]:
def _short_roi_name(name, width=8):
    """Return the file's base name, cut at the first '.nii' and truncated to `width` characters."""
    return os.path.basename(name).split('.nii')[0][:width]


# Load the connectivity matrix into `x_df`.
# Primary layout: a header-less matrix CSV (conn_path) labelled by a separate
# one-column names CSV (x_roi_names). If anything in that path fails, fall back
# to reading conn_path as a CSV that carries its own header row.
try:
    name_df = pd.read_csv(x_roi_names, names=['arb'], header=None)
    colnames = name_df.arb.values.tolist()
    newname = [_short_roi_name(name) for name in colnames]

    x_df = pd.read_csv(conn_path, names=newname, header=None)
    x_df.index = newname
    # Keep rows from position 8 onward and only the first 8 columns.
    # NOTE(review): presumably the first 8 entries are the ROIs/networks and the
    # remaining rows are subjects - confirm against the matrix layout.
    x_df = x_df.iloc[8:, :8]
    x_df = x_df.reset_index(drop=True)
except Exception as load_err:  # deliberate best-effort fallback, but report the cause
    print('excepted')
    print(f'primary connectivity load failed with: {load_err!r}')
    x_df = pd.read_csv(conn_path)
    colnames = x_df.columns.values
    newname = [_short_roi_name(name) for name in colnames]
    # `inplace` was removed from set_axis in pandas 2.0; the call already returns a new frame.
    x_df = x_df.set_axis(newname, axis=1)
    # A saved index column ('Unnamed: 0') is shortened to 'Unnamed:' by the 8-character cut.
    if 'Unnamed:' in x_df.columns:
        x_df = x_df.drop(columns='Unnamed:')
    else:
        print('no x_df.pop(<name>) column to pop')


corr_df = x_df
corr_df.tail(3)

In [None]:
# Summary statistics of the connectivity values, shown with one row per corr_df column
corr_description = corr_df.describe().T
display(corr_description)

In [None]:
# Columns of interest in the clinical workbook:
#   patient id, randomization arm, age at DOS,
#   baseline ADAS-Cog 11, baseline CDR,
#   % change ADAS-Cog 11, change in CDR

sheet_name = 'AD_Clinical_Scores'
alphab_cols = 'C, D, E, F, G, J, V'  # Excel column letters to load

# Only the first 50 data rows of the sheet are read.
clin_df = pd.read_excel(
    clin_path,
    sheet_name=sheet_name,
    usecols=alphab_cols,
    nrows=50,
)
print('Num NaNs: ', clin_df.isna().to_numpy().sum())
# Preview every second row
display(clin_df.iloc[::2])

In [None]:
## Organize the clinical dataframe
# Ascending sort on the patient-number column, then renumber the rows 0..n-1
clin_df = clin_df.sort_values('Patient # CDR, ADAS', ignore_index=True)
clin_df.tail(10)

In [None]:
## Encode the randomization arm in place | sham=0 stim=1
arm_col = 'Randomization Arm'

# Rows labelled 'sham-stim' become 0
shams = clin_df[arm_col].eq('sham-stim')
clin_df.loc[shams, arm_col] = 0

# Rows labelled 'stim-sham' become 1; any other label is left untouched
stims = clin_df[arm_col].eq('stim-sham')
clin_df.loc[stims, arm_col] = 1

clin_df.tail(10)

In [None]:
## Develop Understanding of the Clinical Data
# Summary statistics of the clinical frame, one row per described column.
# Best-effort: a failure here should not stop the notebook, but the cause is
# reported instead of being swallowed (and Ctrl-C is no longer intercepted).
try:
    clin_description = clin_df.describe().transpose()
    display(clin_description)
except Exception as describe_err:
    print(f'Failed to describe clinical dataframe: {describe_err!r}')

In [None]:
# Join clinical and connectivity data side by side, matching rows purely by position.
# Both frames are renumbered 0..n-1 first, because pd.concat(axis=1) aligns on index
# labels. drop=True keeps the old index from being inserted as an extra 'index' column.
corrd_df = corr_df.reset_index(drop=True)
clin_aligned = clin_df.reset_index(drop=True)

# Unequal row counts produce NaN-padded rows; warn so they are not silently filled later.
if len(clin_aligned) != len(corrd_df):
    print(f'WARNING: clinical rows ({len(clin_aligned)}) != connectivity rows ({len(corrd_df)}); '
          'unmatched rows will contain NaN')

total_df = pd.concat([clin_aligned, corrd_df], axis=1)
display(total_df)

In [None]:
#Handle NaNs
# Forward-fill: each missing value takes the value from the row above it.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
# NOTE(review): forward-filling copies the previous row's (previous patient's) value and
# cannot fill a NaN in the first row - confirm this imputation is intended.
total_df = total_df.ffill()
remaining_nans = total_df.isna().sum().sum()
print('Num NaNs: ', remaining_nans)
if remaining_nans:
    print('WARNING: NaNs remain after forward-fill (leading rows have nothing to copy from)')

In [None]:
#Generate metrics for understanding
# Summary statistics of the combined frame, one row per described column.
# Best-effort: the notebook keeps running on failure, but the cause is printed.
try:
    total_desc = total_df.describe().transpose()
    # A bare expression inside `try` is never rendered by Jupyter, so display explicitly.
    display(total_desc)
except Exception as describe_err:
    print(f'Failed to generate total metrics: {describe_err!r}')

## Fisher Test Time

In [None]:
## Choose variables of interest.
# x is the connectivity column; z is the outcome variable.
x_name, z_name = '00_memor', '% Change from baseline (ADAS-Cog11)'

# Subgrouping by age: threshold column and cut-off value
metric, metric_val = 'Age at DOS', 65

# Set to True to swap the roles of x and z (the converse analysis)
converse_analysis = False

x_vals = total_df[x_name]
z_vals = total_df[z_name]  # outcome variable

# Boolean row masks for the two subgroups
index_one = total_df[metric].gt(metric_val)   # e.g. all individuals over 65
index_two = total_df[metric].le(metric_val)   # e.g. all individuals 65 or younger

if converse_analysis:
    # Exchange names and values so the outcome becomes x and vice versa
    x_name, z_name = z_name, x_name
    x_vals, z_vals = z_vals, x_vals

## How to use the masks:
####   clin_df.loc[index_one, '<names of relevant columns>'] = 0   ## iloc works too
#   To mask data taken from the dataframe: masked_data = data_of_interest[index_one]

In [None]:
## Choose how to bin the data
import scipy.stats as st

# Store the z-scores of x and z as new columns on total_df.
# To restrict to a subgroup, index the z-scored values with a mask, e.g. [index_one].
for bin_col, raw_vals in (('xbins', x_vals), ('ybins', z_vals)):
    total_df[bin_col] = st.zscore(raw_vals)
display(total_df)

In [None]:
#CALCULATE OBSERVED VALUES OF THE CONTINGENCY TABLE
# Rows: 10 bins of the x z-score, cut at the standard-normal quantiles 0.1 ... 0.9
#       (bin k holds  ppf(k/10) < z <= ppf((k+1)/10); the first and last bins are open-ended).
# Cols: y z-score at or below vs. above the standard-normal median, ppf(0.5).
#
# Fixes relative to the previous copy-pasted version:
#   * rows are counted from the boolean masks themselves; counting non-zero *index labels*
#     silently skipped the row whose index label is 0;
#   * the top bin is no longer capped at ppf(.9999), so extreme values are counted.
decile_edges = st.norm.ppf(np.arange(1, 10) / 10)   # 9 interior cut points
y_cut = st.norm.ppf(.5)

x_z = total_df['xbins'].to_numpy(dtype=float)
y_z = total_df['ybins'].to_numpy(dtype=float)

# Rows with a NaN z-score belong to no cell (same as the original comparisons).
valid = ~(np.isnan(x_z) | np.isnan(y_z))

# right=True gives edges[i-1] < x <= edges[i]; result is 0..9
row_bin = np.digitize(x_z[valid], decile_edges, right=True)
above = y_z[valid] > y_cut

n_bins = len(decile_edges) + 1
cont_df = pd.DataFrame({'<=50% response by prob': np.bincount(row_bin[~above], minlength=n_bins),
                         '>50% response by prob': np.bincount(row_bin[above], minlength=n_bins)})
# Per-column totals (explicit DataFrame.sum; np.sum on a DataFrame is version-dependent)
print('Sums:', cont_df.sum())

#quantile bins
# quantile = 0.25
# outcome = np.quantile(total_df[z_name], quantile)
# conn = np.quantile(total_df[x_name], quantile)
display(cont_df)

In [None]:
from scipy.stats import fisher_exact

# Two-sided Fisher exact test on every pair of adjacent rows of the contingency table:
# row i is compared with row i+1 as a 2x2 table, and the result is stored on row i.
# NOTE(review): the p-values are not corrected for multiple comparisons.

# Work on a copy so cont_df keeps only the observed counts.
outcomes_df = cont_df.copy()
counts = outcomes_df.iloc[:, :2].to_numpy()   # the two count columns

p_vals = []
odds_ratio = []
for i in range(len(counts) - 1):
    table = counts[i:i + 2]
    odds_rat, p = fisher_exact(table, alternative='two-sided')
    p_vals.append(p)
    odds_ratio.append(odds_rat)

# The last row has no following row to compare with. Pad with NaN rather than 0,
# because a p-value of 0 would read as a highly significant result.
if len(p_vals) != len(outcomes_df.index):
    p_vals.append(np.nan)
if len(odds_ratio) != len(outcomes_df.index):
    odds_ratio.append(np.nan)
print(p_vals)

outcomes_df['p_vals'] = p_vals
outcomes_df['odds_ratio'] = odds_ratio
outcomes_df['hypothesis'] = 'two-sided'

display(outcomes_df)

In [None]:
##Save outcomes
# Write the Fisher results to <out_dir>/<figname>.csv, creating out_dir if needed.
# NOTE(review): the file name ends in '_for_age_over_65', but the z-scoring cell leaves
# its `[index_one]` age mask commented out - confirm the suffix matches the data analysed.
figname = 'fishers_exact_10%prob_bins_of_' + x_name + '_by_50%_prob_bins_of_' + z_name + '_for_age_over_65'
# makedirs also creates missing parent folders and does not fail if the folder already exists.
os.makedirs(out_dir, exist_ok=True)
outcomes_df.to_csv(os.path.join(out_dir, (figname+'.csv')))
print(f'{figname} saved to: \n {out_dir}')