In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import patches as mpatches
plt.style.use('seaborn')
import seaborn as sns
import time
import sys
import os
import scipy.stats as st

from analysis_utils import metric_analysis
# ignore warnings
import warnings
warnings.filterwarnings('ignore');

# display multiple outputs within a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all";
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import pingouin as pg
import statsmodels.formula.api as smf
import statsmodels.regression.mixed_linear_model as smr
import statsmodels.api as sm
from statsmodels.stats.anova import AnovaRM
from statsmodels.regression.mixed_linear_model import MixedLM
from statsmodels.stats.multitest import fdrcorrection

In [3]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# file with patient data
clinical_file=r'C:\Users\coleb\OneDrive\Desktop\Fall 2021\Neuro\hgcn\data\MEG\MEG.clinical.csv'
clinical_df = pd.read_csv(clinical_file)

In [5]:
def digest_stat_df(stat_df,stat_type):
    """
    function will get all of the cluster stats of each type
    stat_df- a data frame with a row for each scan and columns for the embedding stats
    stat_type- rad,coh,or btw
    """
    # function to get the columns related to
    cols_to_use=[c for c in stat_df.columns if stat_type.lower() in c.lower()]
    return np.array(stat_df[cols_to_use]),cols_to_use

def slice_metric_df(stat_df_full,label_col,conditionals):
    """
    this function will do a statistical analysis based on embedding statistics vs the values in label_col
            and will plot the results 
    
    stat_df_full- a dataframe with all of the patient data, as well as columns for all of
                the cluster rad,cohesion, btw cluster data
    label_col-what metric will we consider to be our label? (diagnosis, CogTr, Pre/Post)
    conditionals-a true/false array of what data to include
    """
    conditional_df=stat_df_full[conditionals]
    scan_labels = conditional_df[label_col].values
    # subtracts 1 from columns that are labeled 1,2 (diagnosis, CogTr)
    if min(scan_labels)>0:
        scan_labels=scan_labels-1  
        
    if label_col in ('diagnosis','diagnosis_inv'):
        group_labels = ['Healthy Control', 'SCD']
    if label_col == 'CogTr':
        # group_labels=['Control','CogTr']
        group_labels=['CogTr','Control']
    if label_col == 'Pre':
        group_labels=['Post','Pre']
        
        
    rc,rc_labels=digest_stat_df(conditional_df,stat_type='rad')
    wc,wc_labels=digest_stat_df(conditional_df,stat_type='coh')
    bc,bc_labels=digest_stat_df(conditional_df,stat_type='btw')
    
    metric_analysis(wc,wc_labels,scan_labels,column_name=['Cluster Cohesion'],
                    graph_label_names=group_labels,sort_vals=True,max_plot=8)
    print('rad')
    metric_analysis(rc,rc_labels, scan_labels,column_name=['Cluster Radius from Origin'],
                    plot_title='',graph_label_names=group_labels,sort_vals=True,max_plot=8)
    print('btw clust')
    metric_analysis(bc,bc_labels,scan_labels,column_name=['Dist Btw Clusters'],
                    plot_title='',graph_label_names=group_labels,sort_vals=True,max_plot=6)
    
    


In [6]:
def train_diagnosis_label(row):
    """
    this will create a unique label for each diagnosis/cogtr combo
    """
    if row['diagnosis']==1:
        if row['CogTr']==1:
            return 0
        else:
            return 1
    else:
        if row['CogTr']==1:
            return 2
        else:
            return 3

In [86]:
def anova_analysis(stat_df_full,stat_type,dist_or_prob='dist',use_difference=False,package='smf'):
    """
    stat_df_full- a data frame with all patient information and associciated embedding cluster stats in each row
    stat_type=- 'rad','coh','btw' what kind of stat are we looking at
    dist_or_prob- for each of those stats, 
        should we look at the hyperbolic or the hyperbolic connectivity probability'dist' or 'prob'
    use_difference- if True, we look at the difference between Post-Pre. Otherwise, we look at the mixed model
    package- what package of analysis should we use (smf or pg). Only smf can do mixed nova w/ >1 between variable
        
    Here is where I need help.
    I have been trying to emulate the analysis from the paper doing mixed ANOVA models
    I wanted to avoid this kind of analysis, but now it seems like the right place to go instead of trying to manufacture
    results with different slices of the data.
    
    Mixed model is related to a repeated measure ANOVA but adds in analysis of between groups as well as within subject
    https://www.datanovia.com/en/lessons/mixed-anova-in-r/ (here is a primer on mixed effects anova)
    
    Here's what they say in the paper:
    "We performed a mixed-effects ANOVA with two between-subjects factors, 
        (i) cognitive status (i.e., HC or SCD) and
        (ii) CogTr (i.e., trained or non-trained),and 
        one within-subject factor (i) stage (i.e., pre-stage or post-stage) indicating the time point of the MEG recording
    We tested the main effects of all three factors and every second- and third-order interaction
        (stage × CogTr, stage × cognitive status, CogTr × cognitive status, stage × CogTr × cognitive status).
        
        
    Here are my problems:
    1. 
     I'm having problems doing a proper mixed effects analysis.
        There are two packages with the options:
            statsmodel-https://www.statsmodels.org/dev/generated/statsmodels.formula.api.mixedlm.html
            pingouin-https://pingouin-stats.org/generated/pingouin.mixed_anova.html
            
        pingouin will only accept a single between group variable (CogTr OR diagnosis),
             which seems to be very significant for us.

        statsmodel accepts 2 group variables, but does not give values for the interaction 
            between the within variable (time) and the between group variables. it only looks at interactions 
            between the between group variable and uses a fixed effect for the group

        there must be an easy solution, but I can't find it on python. our collaborators do their analysis in matlab, 
          which I could do with some guidance if any of y'all have experience


    2.
      Results- In general, there is a strong interaction effect between diagnosis and cognitive score
              and even some primary effects of diagnosis. It doesn't really match up with our collaborators' paper
                  which described heavy changes between pre->post.
                I don't understand how to make these into a story.
                
    3. Checking my work- there are all sorts of assumptions to make sure ANOVAs are valid. I want to make sure that I'm
        doing clean work, but I'm out of my depth.
        
    4. With all that said about the "Mixed Anova", I did a standard anova with the value being the difference between pre and
        post scan. This loses some information, but is similar to what I've done in the past. There are significances
        for FPN,VAN and DAN in the interaction between CogTr and Diagnosis. Hopefully we can find a compelling story.
        . Maybe this is the right way to go. (you can do this by adding "use_difference" in anova_analysis)
    

    """
    # TODO go back and check the chain of command for every fucking individual.
    # make sure you know every step of getting their data to your plate.
    assert stat_type in ('rad','coh','btw')
    assert dist_or_prob in ('dist','prob')
    rad_str = 'radprob' if dist_or_prob=='prob' else 'rad'
    coh_str = 'cohprob' if dist_or_prob=='prob' else 'coh'
    dist_str = 'distprob' if dist_or_prob=='prob' else 'dist'
#     conditionals=((stat_df_full['Pre']==1)) ## this would only analyze the pretraining scans
    conditionals=((stat_df_full['CogTr']<20))
    stat_df_full=stat_df_full[conditionals]
    patient_cols = ['ID','diagnosis','CogTr','Pre']
    rad_cols = [c for c in stat_df_full.columns if 'rad' in c.lower()]
    coh_cols = [c for c in stat_df_full.columns if 'coh' in c.lower()]
    bc_cols = [c for c in stat_df_full.columns if 'btw' in c.lower()]
    
    type_to_cols={'rad':rad_cols , 'coh':coh_cols, 'btw':bc_cols}
    cols=type_to_cols[stat_type]
    if dist_or_prob=='prob':
        cols=[c for c in cols if 'prob' in c.lower()]
    else:
        cols=[c for c in cols if 'prob' not in c.lower()]
    first=True
    # this defines what package we will use
    if (use_difference) and package=='smf':
        print('\"use difference\" not implemented with package smf yet')
        print('switching to package pg')
        package='pg'
    
    stat_df_full=stat_df_full.sort_values('Scan Index')
    pre_df=stat_df_full[stat_df_full['Pre']==1].sort_values('Scan Index')
    post_df=stat_df_full[stat_df_full['Pre']==0].sort_values('Scan Index')
    
    # make sure cols isn't empty
    assert cols
    for c in cols:
        print(c)
        if use_difference:
            c_use=c+'_dif'
            post_df[c_use]=post_df[c]-pre_df[c]
#             stat_df_full=post_df
        else:
            stat_df_full[c]=stat_df_full[c].astype(float)
            stat_df_full=stat_df_full
            c_use=c
        if package=='smf':
            assert not use_difference  ### should only use smf if we are doing mixed anova
            #here I added a age as
            # to check and make sure we weren't taking any of the influence from the uneven distribution of age
            # we end up with the same  error corrected q values-- there we're many more corrections, 
#                 but the p values were even lower
            # suggesting the analysis is robust (i think)
            #additionally, the age itself becomes significant... which could tell a new story
            
            
#            Here the C(x) tells the model that it is a categorial variable
#             res = smf.mixedlm(c+"~ C(diagnosis)*C(CogTr)", stat_df_full, groups=stat_df_full["ID"]).fit()
#             sjsjsj
#             smf.lm
#             res = smf.mixedlm(c+"~ C(diagnosis)", stat_df_full, groups=stat_df_full["ID"]).fit()
            res = smf.ols(c+"~ C(diagnosis)", data=stat_df_full).fit()
            print(stat_df_full[stat_df_full['diagnosis']==2][c].mean(),'SCD MEAN')
            print(stat_df_full[stat_df_full['diagnosis']==1][c].mean(),'HC MEAN')
            
            print(stat_df_full[stat_df_full['CogTr']==1][c].mean(),'Training MEAN')
            print(stat_df_full[stat_df_full['CogTr']==2][c].mean(),'NO TRAINING MEAN')
#             res = smf.mixedlm(c+"~ C(CogTr)", stat_df, groups=stat_df["ID"]).fit()
            # formatting table, eliminates rows row *intercept and *group value which do not have p values
            print(res.summary())
            res=res.summary().tables[1][1:-1] # eliminates rows w/o pvalues
            res=pd.DataFrame(res)  
            print(res,'res')
            try:
                res['p-unc']=res['P>|z|'].astype(float)
            except:
                res['p-unc']=res['P>|t|'].astype(float)
            

            
        else:
            # pg.mixed anova does not support analyzing multiple between columns
            #however, it does give you a p-value for the interaction terms related to the within group
            
            print(stat_df_full[stat_df_full['diagnosis']==2][c].mean(),'SCD MEAN')
            print(stat_df_full[stat_df_full['diagnosis']==1][c].mean(),'HC MEAN')
            print(stat_df_full[stat_df_full['CogTr']==1][c].mean(),'Training MEAN')
            print(stat_df_full[stat_df_full['CogTr']==2][c].mean(),'NO TRAINING MEAN')
            print(stat_df_full[stat_df_full['Pre']==0][c].mean(),'AFTER MEAN')
            print(stat_df_full[stat_df_full['Pre']==1][c].mean(),'BEFORE MEAN')
            
            if use_difference:
                print('use difference in pg')
                res = pg.anova(dv=c, between=['diagnosis'], data=stat_df)
#                 res = pg.anova(dv=c, between=['diagnosis'], data=stat_df_full)
#                 res = pg.rm_anova(dv=c, within=['Pre'],subject='ID', data=stat_df_full,detailed=True)[:-1]
#                 print(res,'RESULUTE')
        
#         res = pg.rm_anova(dv='num_leaves', within=['time', 'year'], subject='plants', 
#                   data=df, detailed=True)
        
    
            else:
                res = pg.mixed_anova(dv=c, between='diagnosis',within='Pre',subject='ID', data=stat_df_full,correction=True)
#                 res = pg.mixed_anova(dv=c, between='CogTr',within='Pre',subject='ID', data=stat_df_full)
                ## this is what we would like to be able to do, but only takes one between term
#                 res = pg.mixed_anova(dv=c, between=['diagnosis','CogTr'],within='Pre',subject='ID', data=stat_df)
            print(res)
#             sds


        res['metric']=c

#         print(res)
        if first:
            combined_results=res
            first=False
        else:
            combined_results=pd.concat([combined_results,res],axis=0)

    print(combined_results,'COMBINED RESULTS')
    fdr=fdrcorrection(pvals=combined_results['p-unc'].values,alpha=.05)    
    combined_results['Q Sig']=fdr[0]
    combined_results['Q Val']=fdr[1]

    return combined_results
#     print(fdrcorrection(pvals=combined_results['p-unc'].values,alpha=.05))
#     print(combined_results)

In [176]:
def simple_anova(stat_df_full,stat_type,label,dist_or_prob='dist',anova_type='anova'):
    """
    anova- (anova,rm,mixed)
    label- )'diagnosis', 'CogTr')
    """

    assert stat_type in ('rad','coh','btw')
    assert dist_or_prob in ('dist','prob')
    if type(label)==list and anova_type=='mixed':
        raise Exception('CANNOT DO MIXED ANOVA with more than one between effect in pg package')
    rad_str = 'radprob' if dist_or_prob=='prob' else 'rad'
    coh_str = 'cohprob' if dist_or_prob=='prob' else 'coh'
    dist_str = 'distprob' if dist_or_prob=='prob' else 'dist'
#     conditionals=((stat_df_full['Pre']==1)) ## this would only analyze the pretraining scans
    conditionals=((stat_df_full['CogTr']<20))
    stat_df_full=stat_df_full[conditionals]
    patient_cols = ['ID','diagnosis','CogTr','Pre']
    rad_cols = [c for c in stat_df_full.columns if 'rad' in c.lower()]
    coh_cols = [c for c in stat_df_full.columns if 'coh' in c.lower()]
    bc_cols = [c for c in stat_df_full.columns if 'btw' in c.lower()]
    
    type_to_cols={'rad':rad_cols , 'coh':coh_cols, 'btw':bc_cols}
    cols=type_to_cols[stat_type]
    if dist_or_prob=='prob':
        cols=[c for c in cols if 'prob' in c.lower()]
    else:
        cols=[c for c in cols if 'prob' not in c.lower()]
    first=True
    # this defines what package we will use

    stat_df_full=stat_df_full.sort_values('Scan Index')

    for c in cols:
        if anova_type=='anova':
            if type(label)==list:
                res = pg.anova(dv=c, between=label, data=stat_df_full)
                res=res[:-1]
            else:
                res = pg.anova(dv=c, between=[label], data=stat_df_full)
            
        elif anova_type=='rm':
            res = pg.rm_anova(dv=c, within=['Pre'], subject='ID', data=stat_df_full, detailed=True)[:1]
        elif anova_type=='mixed':
            res = pg.mixed_anova(dv=c, between=label,within='Pre',subject='ID', data=stat_df_full)
        else:
            raise Exception('anova type must be anova rm or mixed not {}'.format(anova_tye))
        
        res['metric']=c

        if first:
            combined_results=res
            first=False
        else:
            combined_results=pd.concat([combined_results,res],axis=0)

    print(combined_results,'COMBINED RESULTS')
    fdr=fdrcorrection(pvals=combined_results['p-unc'].values,alpha=.05)    
    combined_results['Q Sig']=fdr[0]
    combined_results['Q Val']=fdr[1]

    return combined_results

    

In [177]:
# def mixed_anova(data,split_labels,time_labels,ids):
def mixed_anova(data,split_labels,clinical_df):
    df = pd.DataFrame(data)
    stat_columns=df.columns
    df['label']=split_labels
    df['time_labels']=clinical_df['Pre']
    df['ID']=clinical_df['ID']
    
    print(df)
    first=True
    for c in stat_columns:
        print(c,'C')
        res = pg.mixed_anova(dv=c, between='label',within='time_labels',subject='ID', data=df)
        res['metric']=c
        if first:
            combined_results=res
            first=False
        else:
            combined_results=pd.concat([combined_results,res],axis=0)
    
    fdr=fdrcorrection(pvals=combined_results['p-unc'].values,alpha=.05)    
    combined_results['Q Sig']=fdr[0]
    combined_results['Q Val']=fdr[1]
    
    label_only=combined_results[combined_results['Source']=='label']

    return label_only

In [161]:
stat_df=pd.read_csv(os.path.join(os.getcwd(),'embedding_stats_avg5.csv'))

In [170]:
if clinical_df is not None and len(stat_df_use['ID'].unique())*2<=len(stat_df_use['ID']):
    repeated=True
else:
    repeated=False
repeated

False

In [172]:
# now we just need to add all of this to embedding utils
# and MAKE SURE THAT CLINICAL DATA MAKES IT IN MATCHED UP!!!!
stat_df_use=stat_df[stat_df['CogTr']==1]
data=stat_df_use[[c for c in stat_df.columns if 'radprob' in c.lower()]]
# time_labels=stat_df['Pre'] c.lower()]]
labels=stat_df['diagnosis']
data=np.array(data)
combined_results=mixed_anova(data,labels,stat_df)
combined_results

           0         1         2         3         4         5         6  \
0   0.695409  0.476373  0.457375  0.518196  0.683547  0.488161  0.635774   
1   0.693673  0.485168  0.448937  0.478577  0.653848  0.477987  0.628902   
2   0.674865  0.503121  0.489607  0.521319  0.661555  0.524615  0.654986   
3   0.707007  0.512073  0.394902  0.492551  0.646397  0.478060  0.685068   
4   0.701885  0.540213  0.549953  0.535716  0.689908  0.550854  0.670898   
..       ...       ...       ...       ...       ...       ...       ...   
87  0.654021  0.513796  0.498532  0.533503  0.720123  0.476602  0.602962   
88  0.690217  0.607102  0.610435  0.594593  0.651247  0.594111  0.694050   
89  0.710229  0.632334  0.601329  0.602869  0.667757  0.609225  0.698211   
90  0.697216  0.534120  0.574257  0.574912  0.698601  0.549484  0.655954   
91  0.733224  0.547590  0.584119  0.571188  0.683851  0.586361  0.696303   

           7  label  time_labels        ID  
0   0.438276      1            1  UMEC-002

Unnamed: 0,Source,SS,DF1,DF2,MS,F,p-unc,np2,eps,metric,Q Sig,Q Val
0,label,0.000438,1,44,0.000438,0.559735,0.458347,0.012561,,0,False,0.785738
0,label,0.004244,1,44,0.004244,0.903431,0.347055,0.020119,,1,False,0.750015
0,label,0.031606,1,44,0.031606,10.499876,0.002277,0.192659,,2,True,0.039662
0,label,0.010797,1,44,0.010797,9.571804,0.003428,0.178672,,3,True,0.039662
0,label,7.9e-05,1,44,7.9e-05,0.081754,0.776277,0.001855,,4,False,0.931532
0,label,0.014317,1,44,0.014317,8.753782,0.004958,0.165937,,5,True,0.039662
0,label,0.002112,1,44,0.002112,1.036162,0.314281,0.023007,,6,False,0.750015
0,label,0.020581,1,44,0.020581,4.988532,0.030649,0.101831,,7,False,0.147113


In [109]:
# extra columns
stat_df['age_stand']=(stat_df['age']-stat_df['age'].mean())*stat_df['age'].std()
stat_df['diag_train']=stat_df.apply(lambda row: train_diagnosis_label(row),axis=1 )

In [197]:
# mixed anova with Pre indicator as within group variable
# stat_df_use=stat_df[stat_df['CogTr']==1]
# stat_df_use=stat_df[((stat_df['Pre']==1) & (stat_df['CogTr']==1))]
# stat_df_use=stat_df[( (stat_df['CogTr']==1 ))]
# stat_df_use=stat_df[((stat_df['diagnosis']==1))]
stat_df_use=stat_df[((stat_df['Pre']==0))]
# stat_df_use=stat_df
# results=anova_analysis(stat_df_use,stat_type='rad',dist_or_prob='dist',use_difference=True,package='pg')
# results=simple_anova(stat_df_use,stat_type='rad',label='diagnosis',dist_or_prob='prob',anova_type='mixed')
results=simple_anova(stat_df_use,stat_type='rad',label=['diagnosis','CogTr'],dist_or_prob='dist',anova_type='anova')
# results=simple_anova(stat_df_use,stat_type='rad',label=['diagnosis'],dist_or_prob='dist',anova_type='anova')

print('significant results only')
print(results)
results
results['P Val']=results['p-unc']
results[results['Q Sig']==True][['Source','metric','P Val','Q Sig','Q Val']]

              Source        SS   DF        MS          F     p-unc       np2  \
0          diagnosis  0.017510  1.0  0.017510   0.871768  0.353080  0.010035   
1              CogTr  0.001127  1.0  0.001127   0.056106  0.813324  0.000652   
2  diagnosis * CogTr  0.048910  1.0  0.048910   2.435065  0.122322  0.027535   
0          diagnosis  0.156366  1.0  0.156366   2.056406  0.155193  0.023353   
1              CogTr  0.114383  1.0  0.114383   1.504277  0.223362  0.017191   
2  diagnosis * CogTr  0.294847  1.0  0.294847   3.877603  0.052154  0.043143   
0          diagnosis  0.046926  1.0  0.046926   1.156310  0.285240  0.013267   
1              CogTr  0.015212  1.0  0.015212   0.374847  0.541989  0.004340   
2  diagnosis * CogTr  0.589326  1.0  0.589326  14.521791  0.000260  0.144464   
0          diagnosis  0.000199  1.0  0.000199   0.015699  0.900583  0.000183   
1              CogTr  0.001179  1.0  0.001179   0.092864  0.761303  0.001079   
2  diagnosis * CogTr  0.197104  1.0  0.1

Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2,metric,Q Sig,Q Val
0,diagnosis,0.01751,1.0,0.01751,0.871768,0.35308,0.010035,pDMN_Rad,False,0.651841
1,CogTr,0.001127,1.0,0.001127,0.056106,0.813324,0.000652,pDMN_Rad,False,0.848686
2,diagnosis * CogTr,0.04891,1.0,0.04891,2.435065,0.122322,0.027535,pDMN_Rad,False,0.403401
0,diagnosis,0.156366,1.0,0.156366,2.056406,0.155193,0.023353,aDMN_Rad,False,0.413849
1,CogTr,0.114383,1.0,0.114383,1.504277,0.223362,0.017191,aDMN_Rad,False,0.536069
2,diagnosis * CogTr,0.294847,1.0,0.294847,3.877603,0.052154,0.043143,aDMN_Rad,False,0.208616
0,diagnosis,0.046926,1.0,0.046926,1.15631,0.28524,0.013267,DAN_Rad,False,0.622341
1,CogTr,0.015212,1.0,0.015212,0.374847,0.541989,0.00434,DAN_Rad,False,0.747691
2,diagnosis * CogTr,0.589326,1.0,0.589326,14.521791,0.00026,0.144464,DAN_Rad,True,0.003115
0,diagnosis,0.000199,1.0,0.000199,0.015699,0.900583,0.000183,FPN_Rad,False,0.900583


Unnamed: 0,Source,metric,P Val,Q Sig,Q Val
2,diagnosis * CogTr,DAN_Rad,0.00026,True,0.003115
2,diagnosis * CogTr,FPN_Rad,0.000165,True,0.003115
2,diagnosis * CogTr,VAN_Rad,0.001011,True,0.008084


In [39]:
## looks at change over time
# standard anova with Change~ CogTr*Diagnosis
results=anova_analysis(stat_df,stat_type='rad',dist_or_prob='dist',use_difference=True,package='pg')
print('significant results only')
results[results['Q Sig']==True]

pDMN_Rad
1.1464417400918367 SCD MEAN
1.1216149170487804 HC MEAN
1.1318267604456522 Training MEAN
1.1385869518863638 NO TRAINING MEAN
1.126768850611111 AFTER MEAN
1.1434946352444444 BEFORE MEAN
  Source        SS  DF        MS         F     p-unc       ng2  eps
0    Pre  0.012589   1  0.012589  1.456069  0.230754  0.003887  1.0
aDMN_Rad
1.8282300313979591 SCD MEAN
1.7663054559634142 HC MEAN
1.8177406145000001 Training MEAN
1.7814937946818181 NO TRAINING MEAN
1.7904081215888885 AFTER MEAN
1.809631772477778 BEFORE MEAN
  Source       SS  DF       MS         F     p-unc       ng2  eps
0    Pre  0.01663   1  0.01663  0.654986  0.420494  0.001332  1.0
DAN_Rad
1.8855045060918365 SCD MEAN
1.9049916081097562 HC MEAN
1.8631536356630434 Training MEAN
1.9270297611477274 NO TRAINING MEAN
1.8863504602888892 AFTER MEAN
1.9024134670666668 BEFORE MEAN
  Source        SS  DF        MS         F     p-unc       ng2  eps
0    Pre  0.011611   1  0.011611  0.823281  0.366673  0.001298  1.0
FPN_Rad
1.8049649

Unnamed: 0,Source,SS,DF,MS,F,p-unc,ng2,eps,metric,Q Sig,Q Val


In [12]:
from sklearn.metrics import accuracy_score, classification_report, f1_score,roc_auc_score
from sklearn.model_selection import cross_val_predict,GridSearchCV,StratifiedKFold,LeaveOneOut,train_test_split,KFold

In [13]:
def create_splits(scan_df,k,seed=100):
#     KFold(n_splits=5,shuffle=True, random_state=seed)

    pre_df=scan_df[scan_df['Pre']==1]
    post_df=scan_df[scan_df['Pre']==0]
    patients=scan_df['ID'].unique()
    train_pats,test_pats=train_test_split(patients)
    
    kf = KFold(n_splits=k)
    fold_split=kf.get_n_splits(scan_df)
    print(fold_split)
    train_pats=patients
    stat_cols=[c for c in scan_df if 'adprob' in c]
    meta_cols=['PreID','PostID','Same']
    data=[]
    for ti in train_pats:
        print(ti,'TI')
        t_pre=pre_df[pre_df['ID']==ti]
        t_pre_stat=t_pre[stat_cols]
        for tj in train_pats:
            t_post=pre_df[pre_df['ID']==tj]
            t_post_stat=t_post[stat_cols]
            difference=t_post_stat-t_pre_stat
            label=1 if tj==ti else 0
            meta=[ti,tj,label]
            difference[['ti',tj,label]]=meta
            

            data.append(difference)
    print(data.shape,'DATA SHAPE')
    
        
            
            
            

In [14]:
create_splits(stat_df,k=4,seed=100)

4
UMEC-002 TI
UMEC-020 TI
UMEC-022 TI
UMEC-026 TI
UMEC-028 TI
UMEC-031 TI
UMEC-038 TI
UMEC-066 TI
UMEC-073 TI
UMEC-075 TI
UMEC-079 TI
UMEC-082 TI
UMEC-083 TI
UMEC-124 TI
UMEC-125 TI
UMEC-147 TI
UMEC-160 TI
UMEC-161 TI
UMEC-186 TI
UMEC-208 TI
UMEC-225 TI
UMEC-226 TI
UMEC-004 TI
UMEC-005 TI
UMEC-017 TI
UMEC-030 TI
UMEC-032 TI
UMEC-053 TI
UMEC-115 TI
UMEC-132 TI
UMEC-146 TI
UMEC-155 TI
UMEC-157 TI
UMEC-159 TI
UMEC-162 TI
UMEC-164 TI
UMEC-168 TI
UMEC-173 TI
UMEC-174 TI
UMEC-176 TI
UMEC-192 TI
UMEC-194 TI
UMEC-201 TI
UMEC-206 TI
UMEC-217 TI
UMEC-219 TI
UMEC-008 TI
UMEC-009 TI
UMEC-076 TI
UMEC-078 TI
UMEC-080 TI
UMEC-081 TI
UMEC-127 TI
UMEC-136 TI
UMEC-138 TI
UMEC-150 TI
UMEC-181 TI
UMEC-182 TI
UMEC-184 TI
UMEC-185 TI
UMEC-190 TI
UMEC-196 TI
UMEC-197 TI
UMEC-210 TI
UMEC-227 TI
UMEC-036 TI
UMEC-039 TI
UMEC-054 TI
UMEC-060 TI
UMEC-061 TI
UMEC-064 TI
UMEC-068 TI
UMEC-100 TI
UMEC-101 TI
UMEC-104 TI
UMEC-121 TI
UMEC-139 TI
UMEC-141 TI
UMEC-145 TI
UMEC-148 TI
UMEC-151 TI
UMEC-152 TI
UMEC-154 TI
UM

AttributeError: 'list' object has no attribute 'shape'