In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import platform
import pathlib

In [None]:
## Input/output locations -- edit the branch that matches your machine.
# NOTE(review): the two branches send `out_dir` to different folders
# (figures/multidimensional_scatterplots vs stats/distribution_analysis) -- confirm this is intended.
if platform.system() != 'Darwin':
    # ---- Windows (also used for every other non-macOS host) ----
    x_roi_names = r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\analyses\roi-roi_correl\matrix_corrMx_names.csv'
    conn_path = r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\analyses\roi-roi_correl\matrix_corrMx_AvgR.csv'
    clin_path = r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\patient_data\AD_Clinical_Data_CDR_ADAS_COG_13.xlsx'
    out_dir = r'C:\Users\calvin.howard\Dropbox (Partners HealthCare)\memory\analyses\roi-roi_correl\stats\distribution_analysis'
    print('I have set pathnames in the Windows style')
else:
    # ---- macOS ----
    x_roi_names = r'/Users/cu135/Dropbox (Partners HealthCare)/memory/analyses/roi-roi_correl/matrix_corrMx_names.csv'
    conn_path = r'/Users/cu135/Dropbox (Partners HealthCare)/memory/analyses/roi-roi_correl/matrix_corrMx_AvgR.csv'
    clin_path = r'/Users/cu135/Dropbox (Partners HealthCare)/memory/patient_data/AD_Clinical_Data_CDR_ADAS_COG_13.xlsx'
    out_dir = r'/Users/cu135/Dropbox (Partners HealthCare)/memory/analyses/roi-roi_correl/figures/multidimensional_scatterplots'
    print('I have set pathnames in the Mac style')

In [None]:
def _short_roi_names(paths):
    """Return a short label for every ROI file path in `paths`.

    Each label is the path's base name with everything from the first
    '.nii' onward removed, then truncated to its first 8 characters.
    """
    return [os.path.basename(path).split('.nii')[0][:8] for path in paths]


try:
    # Preferred layout: ROI names are stored in their own single-column CSV and
    # the connectivity matrix CSV has no header row.
    name_df = pd.read_csv(x_roi_names, names=['arb'], header=None)
    colnames = name_df.arb.values.tolist()
    newname = _short_roi_names(colnames)

    x_df = pd.read_csv(conn_path, names=newname, header=None)
    x_df.index = newname
    # Keep rows from position 8 onward and only the first 8 columns.
    # NOTE(review): presumably the first 8 entries are the ROIs of interest and the
    # remaining rows are per-subject maps -- confirm against the matrix layout.
    x_df = x_df.iloc[8:, :8]
    x_df = x_df.reset_index(drop=True)
except Exception as err:
    # Fallback layout: the connectivity CSV carries its own header row.
    # `Exception` (not a bare except) so that a kernel interrupt is not swallowed,
    # and the cause is reported instead of being hidden.
    print('excepted:', repr(err))
    x_df = pd.read_csv(conn_path)
    colnames = x_df.columns.values
    newname = _short_roi_names(colnames)
    # `set_axis` returns a new frame; its `inplace` keyword was removed in pandas 2.0.
    x_df = x_df.set_axis(newname, axis=1)
    try:
        # 'Unnamed: 0' (a saved index column) becomes 'Unnamed:' after truncation.
        x_df.pop('Unnamed:')
    except KeyError:
        print('no x_df.pop(<name>) column to pop')


corr_df = x_df
corr_df.tail(3)

In [None]:
# Summary statistics of the connectivity values, one row per connectivity column.
corr_description = corr_df.describe().T
display(corr_description)

In [None]:
# Clinical columns pulled from the spreadsheet (per the original author's notes):
#   patient id, randomization arm, age at DOS,
#   baseline ADAS-Cog 11, baseline CDR,
#   % change in ADAS-Cog 11, change in CDR
sheet_name = 'AD_Clinical_Scores'
alphab_cols = 'C, D, E, F, G, J, V'

# Only the first 50 data rows of the sheet are read.
clin_df = pd.read_excel(
    clin_path,
    sheet_name=sheet_name,
    usecols=alphab_cols,
    nrows=50,
)

nan_total = clin_df.isna().sum().sum()
print('Num NaNs: ', nan_total)

# Show every second row to keep the preview short.
display(clin_df.iloc[::2])

In [None]:
## Order the clinical rows by patient number and renumber the index from 0.
# Ascending quicksort along the rows is the pandas default, so only the key is given.
clin_df = clin_df.sort_values('Patient # CDR, ADAS', ignore_index=True)
clin_df.tail(10)

In [None]:
## Recode the randomization arm as an integer flag | sham-stim -> 0, stim-sham -> 1
arm_col = 'Randomization Arm'

shams = clin_df[arm_col].eq('sham-stim')
clin_df.loc[shams, arm_col] = 0

stims = clin_df[arm_col].eq('stim-sham')
clin_df.loc[stims, arm_col] = 1

clin_df.tail(10)

In [None]:
## Summary statistics of the clinical data, one row per described column.
try:
    clin_description = clin_df.describe().transpose()
    display(clin_description)
except Exception as err:
    # Best-effort summary: keep the notebook running, but report the actual cause
    # and let kernel interrupts through (a bare `except:` would swallow them).
    print(f'Failed to describe clinical dataframe: {err!r}')

In [None]:
# Join clinical and connectivity data side by side. `pd.concat(axis=1)` aligns rows
# on the index, so the connectivity frame is renumbered from 0 first; `drop=True`
# keeps the old index from being added as an extra 'index' column.
# The reset frame is the one concatenated (the original reset it but then
# concatenated the un-reset `corr_df`).
corrd_df = corr_df.reset_index(drop=True)
total_df = pd.concat([clin_df, corrd_df], axis=1)
display(total_df)

In [None]:
# Handle NaNs: forward-fill each column from the previous row.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
# A NaN in the very first row has nothing to fill from, so the count is printed as a check.
total_df = total_df.ffill()
print('Num NaNs: ', total_df.isna().sum().sum())

In [None]:
# Summary statistics of the combined clinical + connectivity frame.
try:
    total_desc = total_df.describe().transpose()
    # A bare expression inside `try` is not rendered by Jupyter; display it explicitly.
    display(total_desc)
except Exception as err:
    # Best-effort summary: report the actual cause and let kernel interrupts through.
    print(f'Failed to generate total metrics: {err!r}')

## Kolmogorov-Smirnov Two-Sample Test

In [None]:
# Split the cohort into two subgroups at a threshold on one clinical column.
metric = 'Age at DOS'
metric_val = 65

split_column = total_df[metric]
index_one = split_column.gt(metric_val)  # rows strictly above the threshold (here: older than 65)
index_two = split_column.le(metric_val)  # rows at or below the threshold (here: 65 or younger)

## Variables of interest.
x_name = '00_memor'
z_name = '% Change from baseline (ADAS-Cog11)'  # outcome variable

# Full (unsplit) columns; the masks above are applied where the test is run.
x_vals, z_vals = total_df[x_name], total_df[z_name]  # z_vals is the outcome variable


## The masks can also be used to edit rows in place, e.g.:
#### clin_df.loc[index_one, '<names of relevant columns>'] = 0 ## iloc works too.

In [None]:
from scipy.stats import ks_2samp

# Two-sample Kolmogorov-Smirnov test between the two subgroups, run once on the
# x variable and once on the outcome variable (reported under the 'y_' columns).
x_ks_stat, x_ks_p = ks_2samp(x_vals[index_one], x_vals[index_two])
y_ks_stat, y_ks_p = ks_2samp(z_vals[index_one], z_vals[index_two])

# One-row table holding both statistics and both p-values.
outcomes_df = pd.DataFrame(
    [[x_ks_stat, x_ks_p, y_ks_stat, y_ks_p]],
    columns=['x_ks_stat', 'x_ks_p', 'y_ks_stat', 'y_ks_p'],
    index=['stat vals'],
)
display(outcomes_df)

In [None]:
# Prepare the dataframe for seaborn plotting: add a text column naming each row's subgroup.
# The labels are built in a NEW Series. Writing strings into `index_one` itself would
# turn the boolean mask into an object column, so re-running the KS cell would fail.
vec = pd.Series(
    np.where(index_one, f'>{metric_val}', f'<={metric_val}'),
    index=index_one.index,
)
name = f'{metric}'
total_df['metric'] = vec

total_df.tail(15)

In [None]:
# KDE joint plots of x against the outcome: first the whole cohort, then coloured by subgroup.
joint_kwargs = dict(data=total_df, x=x_name, y=z_name, kind='kde')
jp1 = sns.jointplot(**joint_kwargs);
jp2 = sns.jointplot(hue='metric', **joint_kwargs);





In [None]:
# Empirical CDF of the x variable, one curve per subgroup, drawn on a new figure.
cdf = plt.figure()
sns.ecdfplot(data=total_df, x=x_name, hue='metric', ax=cdf.gca())


In [None]:
# Save the figures and the KS table into `out_dir`.
pair_tag = '_' + x_name + '_and_' + z_name
# NOTE(review): the split suffix is hard-coded; it does not follow `metric` / `metric_val`.
split_tag = 'split_by_age_65'
figname = 'kolmogorov_smirnov' + pair_tag + split_tag

# Create the output folder, including missing parents; no error if it already exists.
os.makedirs(out_dir, exist_ok=True)

# jp1 is the unsplit joint plot, so its file name has no split suffix. The original
# built this name by concatenating two overlapping pieces of `figname`.
jp1.savefig(os.path.join(out_dir, 'joint_plot_' + pair_tag + '.png'))
# jp2 is coloured by subgroup; its file name is unchanged from the original.
jp2.savefig(os.path.join(out_dir, 'joint_plot_' + pair_tag + split_tag + '.png'))
cdf.savefig(os.path.join(out_dir, 'CDF_plot_' + figname + '.png'))
outcomes_df.to_csv(os.path.join(out_dir, figname + '.csv'))
print(f'{figname} saved to: \n {out_dir}')