In [1]:
from pathlib import Path 
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd
from collections import OrderedDict
import sys
import os
import seaborn as sns
import researchpy as rp
import statsmodels.formula.api as smf
import scipy.stats as stats
import warnings

from statsmodels.nonparametric.smoothers_lowess import lowess



#sys.path.append('/Users/alina/Desktop/MIT/code/ADHD/MTA/helper')
from helper import rr, prep, var_dict

%load_ext autoreload
%autoreload 2

In [2]:
if Path('/Volumes/Samsung_T5/MIT/mta').exists():
    data_root =     '/Volumes/Samsung_T5/MIT/mta'
    data_derived = '/Volumes/Samsung_T5/MIT/mta/output/derived_data'
else: 
    data_root = '/Users/alina/Desktop/MIT/code/data'
    data_derived = '/Users/alina/Desktop/MIT/code/data/output/derived_data'

In [3]:
baseline_var = ['src_subject_id', 'interview_date', 'interview_age', 'sex', 'site', 'days_baseline']
dtypes_baseline = { 'src_subject_id' : 'str',
                    'interview_date': 'str' , 
                    'interview_age' : 'int64' ,
                    'sex' : 'str', 
                    'site' : 'int64' ,
                    'days_baseline':  'int64',
                    'version_form': 'str'}

version_form = ['version_form']
treat_group_file = 'treatment_groups.csv'
trt_names = pd.read_csv(Path(data_derived, treat_group_file))

qsts =['snap', 'ssrs', 'masc', 'pc', 'wechsler']

In [4]:
snap_file = 'snap01.txt'
ssrs_file = 'ssrs01.txt'
masc_file = 'masc_p01.txt'
parent_child_file = 'pcrc01.txt'
wechsler_file = 'wiat_iiip201.txt'
treat_group_file = 'treatment_groups.csv'
#outcome variablles 
snap_vars = ['snainatx', 'snahix', 'snaoddx'] #inattention_mean, hyperactie mean
ssrs_vars = ['sspintx', 'ssptossx']# social skills mean, internalizing mean 
masc_vars = ['masc_masctotalt']
pc_vars = ['pcrcpax', 'pcrcprx'] # power assertion, personal closeness
wechsler_vars = ['w1readb','w2math','w3spell' ]

outcomes_list = np.concatenate((snap_vars ,ssrs_vars,masc_vars, pc_vars, wechsler_vars ))
outcomes_dict  = dict(zip(qsts, [snap_vars ,ssrs_vars,masc_vars, pc_vars, wechsler_vars ])) #, 'wechsler': wechsler_vars}

interaction_predictors = ['days_baseline', 'site', 'trtname'] #time, site, treatment group

raters = ['Teacher', 'Parent']



In [5]:

snap_file = 'snap01.txt'
ssrs_file = 'ssrs01.txt'
masc_file = 'masc_p01.txt'
parent_child_file = 'pcrc01.txt'
wechsler_file = 'wiat_iiip201.txt'
treat_group_file = 'treatment_groups.csv'

In [6]:
# load files, drop rows if missing date, drop duplicates 

snap_file = 'snap01.txt'
ssrs_file = 'ssrs01.txt'
masc_file = 'masc_p01.txt'
parent_child_file = 'pcrc01.txt'
wechsler_file = 'wiat_iiip201.txt'
treat_group_file = 'treatment_groups.csv'

treat_group = pd.read_csv(Path(data_derived, treat_group_file))


snap = prep.get_data(Path(data_root, snap_file), columns= [baseline_var, snap_vars, version_form], treat_group= treat_group, set_dtypes= True, version_form= True, split_timepoints= None)
ssrs = prep.get_data(Path(data_root, ssrs_file), columns= [baseline_var, ssrs_vars, version_form], treat_group= treat_group, set_dtypes= True, version_form= True, split_timepoints= None)
pc = prep.get_data(Path(data_root, parent_child_file), columns= [baseline_var, pc_vars], treat_group= treat_group, set_dtypes= True, version_form= False, split_timepoints= None)
masc = prep.get_data(Path(data_root,masc_file), columns= [baseline_var, masc_vars], treat_group= treat_group, set_dtypes= True, version_form= False, split_timepoints= None)
wechsler = prep.get_data(Path(data_root, wechsler_file), columns= [baseline_var, wechsler_vars], treat_group= treat_group, set_dtypes= True, version_form= False, split_timepoints= None)
data_dict = dict(zip(qsts, [snap, ssrs, masc, pc, wechsler]))

Success
Success
Success
Success
Success


In [7]:
outcomes_list

array(['snainatx', 'snahix', 'snaoddx', 'sspintx', 'ssptossx',
       'masc_masctotalt', 'pcrcpax', 'pcrcprx', 'w1readb', 'w2math',
       'w3spell'], dtype='<U15')

In [8]:
#outcome variables 
outcomes_written = ['SNAP Inattention', 'SNAP Hyperactivity-Impulsivity', 'SNAP Aggressive',
                    'SSRS Internalizing', 'SSRS Social Skills', 
                    'MASC total Score', 
                    'Parent-Child Power Assertion', 'Parent-Child Personal Closeness'
                    'Wechsler Scaled Reading Score', 'Wechsler Scaled Mathematics Score','Wechsler Scaled Spelling Score']
outcomes_dict_fig = dict(zip(outcomes_list, outcomes_written))
print(outcomes_dict_fig)

# treatment names 
trt_dict = {'M': 'Medication Management', 'P': 'Behavioral Treatment', 'C': 'Combined Treatment'}

#mediators 
# med_written = ['CD or ODD', 'Anxiety', 'Public Assistance', 'Prior Medication', 'Initial Acceptance of Treatment Arm', 'Sex']
# med_options = [['No', ''],['No', ''] ,['No', ''],['No', ''], ['Low', 'High'], ['Male', 'Female']]

# # mediator variable names spelled out 
# med_dict_fig = {} #for figure titles 
# options_dict = {}
# for i, med in enumerate(med_mod_list):
#     med_dict_fig[med] = [med_options[i][j] + ' ' + med_written[i] for j in range(2)]
# med_dict_fig


{'snainatx': 'SNAP Inattention', 'snahix': 'SNAP Hyperactivity-Impulsivity', 'snaoddx': 'SNAP Aggressive', 'sspintx': 'SSRS Internalizing', 'ssptossx': 'SSRS Social Skills', 'masc_masctotalt': 'MASC total Score', 'pcrcpax': 'Parent-Child Power Assertion', 'pcrcprx': 'Parent-Child Personal ClosenessWechsler Scaled Reading Score', 'w1readb': 'Wechsler Scaled Mathematics Score', 'w2math': 'Wechsler Scaled Spelling Score'}


In [9]:
for key in data_dict.keys():
    #for time, df in data_dict[key].items():

    data_dict[key] = data_dict[key][data_dict[key]['trtname'] != 'L']

In [10]:
df_outcomes = pd.read_csv(Path(data_derived, "outcome_means_b_14.csv"))

In [11]:
df_baseline_values = df_outcomes[
    [
        'Outcome Domain', 
        'Measure and Rater',
        'Combined Treatment - Baseline (SD) [No. of Subjects]',
        'Medication Management - Baseline (SD) [No. of Subjects]',
        'Behavioral Treatment - Baseline (SD) [No. of Subjects]',
        'Assessment and Referral - Baseline (SD) [No. of Subjects]'
    ]
]

# Display the new DataFrame
df_14_values = df_outcomes[
    [
        'Outcome Domain', 
        'Measure and Rater',
        'Combined Treatment - 14 mo (SD) [No. of Subjects]',
        'Medication Management - 14 mo (SD) [No. of Subjects]',
        'Behavioral Treatment - 14 mo (SD) [No. of Subjects]',
        'Assessment and Referral - 14 mo (SD) [No. of Subjects]'
    ]
]


In [12]:
timepoints = [46,168,319,500]

In [13]:
from scipy.stats import ttest_ind
#(data_dict[qst]["days_baseline"] > -50) &(data_dict[qst]["days_baseline"] < 150)
def get_conditions(time, qst, trt, rater = None):
    if rater is not None: 
        if time == "b":
            conditions = (data_dict[qst]["days_baseline"] < 46) & (data_dict[qst]["trtname"] == trt) & (data_dict[qst]["version_form"] == rater)
        elif time == "14":
            conditions = (data_dict[qst]["days_baseline"] >= 46 ) &(data_dict[qst]["days_baseline"] < 168)    & (data_dict[qst]["trtname"] == trt) & (data_dict[qst]["version_form"] == rater)
    else:
        if time == "b":
            conditions = (data_dict[qst]["days_baseline"]< 46)& (data_dict[qst]["trtname"] == trt)
            
        elif time == "14":
            conditions = (data_dict[qst]["days_baseline"] >= 46  ) &(data_dict[qst]["days_baseline"] < 168)  & (data_dict[qst]["trtname"] == trt)
    return conditions

In [14]:
def get_col_names(time, trt):
    
    if time == "b":     
        if trt == 'C':
            column_name = 'Replicated Combined Treatment - Baseline (SD) [No. of Subjects]'
            original_column = 'Combined Treatment - Baseline (SD) [No. of Subjects]'
            p_value_column = 'p-value (Original vs. Replicated) - Combined Treatment'
        elif trt == 'M':
            column_name = 'Replicated Medication Management - Baseline (SD) [No. of Subjects]'
            original_column = 'Medication Management - Baseline (SD) [No. of Subjects]'
            p_value_column = 'p-value (Original vs. Replicated) - Medication Management'
        elif trt == 'P':
            column_name = 'Replicated Behavioral Treatment - Baseline (SD) [No. of Subjects]'
            original_column = 'Behavioral Treatment - Baseline (SD) [No. of Subjects]'
            p_value_column = 'p-value (Original vs. Replicated) - Behavioral Treatment'
        elif trt == 'A':
            column_name = 'Replicated Assessment and Referral - Baseline (SD) [No. of Subjects]'
            original_column = 'Assessment and Referral - Baseline (SD) [No. of Subjects]'
            p_value_column = 'p-value (Original vs. Replicated) - Assessment and Referral'
            
    elif time == "14":
        if trt == "C":
            column_name = 'Replicated Combined Treatment - 14 mo (SD) [No. of Subjects]'
            original_column = 'Combined Treatment - 14 mo (SD) [No. of Subjects]'
            p_value_column = 'p-value (Original vs. Replicated) - Combined Treatment'
        elif trt == 'M':
            column_name = 'Replicated Medication Management - 14 mo (SD) [No. of Subjects]'
            original_column = 'Medication Management - 14 mo (SD) [No. of Subjects]'
            p_value_column = 'p-value (Original vs. Replicated) - Medication Management'
        elif trt == 'P':
            column_name = 'Replicated Behavioral Treatment - Baseline (SD) [No. of Subjects]'
            original_column = 'Behavioral Treatment - 14 mo (SD) [No. of Subjects]'
            p_value_column = 'p-value (Original vs. Replicated) - Behavioral Treatment'
        elif trt == 'A':
            column_name = 'Replicated Assessment and Referral - Baseline (SD) [No. of Subjects]'
            original_column = 'Assessment and Referral - 14 mo (SD) [No. of Subjects]'
            p_value_column = 'p-value (Original vs. Replicated) - Assessment and Referral'
            
    return column_name, original_column, p_value_column

In [15]:
written_out_dict = {'snainatx' : "natten", 'snahix': "yperact", 'snaoddx': "gress", 
                     'sspintx': "ntern", 'ssptossx':"SSRS",
                     'masc_masctotalt': 'MASC',
                     'pcrcpax': "ower", 'pcrcprx': "lose",
                     'w1readb':'ead','w2math': 'ath', 'w3spell':'pell'
                     }
written_out_dict

{'snainatx': 'natten',
 'snahix': 'yperact',
 'snaoddx': 'gress',
 'sspintx': 'ntern',
 'ssptossx': 'SSRS',
 'masc_masctotalt': 'MASC',
 'pcrcpax': 'ower',
 'pcrcprx': 'lose',
 'w1readb': 'ead',
 'w2math': 'ath',
 'w3spell': 'pell'}

In [22]:
time = "14"
if time == "b":
    df = df_baseline_values
elif time == "14":
    df = df_14_values
    

raters =["Parent", "Teacher"]



for qst in qsts:
    
    rater = raters[1] if (qst == 'snap' or qst == 'ssrs') else None
    outcomes = outcomes_dict[qst]  # Outcomes of interest for the current questionnaire

    # Loop over each treatment
    for trt in data_dict[qst]["trtname"].unique():
        # Filter the data for baseline (days_baseline = 0) and the current treatment

        conditions = get_conditions(time, qst, trt, rater)
        data = data_dict[qst][conditions]

        
        # Calculate the mean, std, and number of subjects for each outcome
        for outcome in outcomes:
            mean_value = data[outcome].mean(0)
            std_value = data[outcome].std()
            Nsubjects = data[outcome].count()

            # Format the string as "mean (std) [Nsubjects]"
            formatted_value = f"{mean_value:.2f} ({std_value:.2f}) [{Nsubjects}]"

            # Determine the corresponding column name for the replicated data based on the treatment
            column_name, original_column, p_value_column = get_col_names(time, trt)
            
            # Add the new column if it doesn't exist in the DataFrame
            if column_name not in df.columns:
                df[column_name] = ""

            # Locate the appropriate row in the DataFrame by matching the outcome and qst
            if rater is not None: 
                row_index = df[
                    (df['Measure and Rater'].str.contains(written_out_dict[outcome])) & 
                    (df['Measure and Rater'].str.contains(rater))
                ].index
            else: 
                row_index = df[
                    (df['Measure and Rater'].str.contains(written_out_dict[outcome])) 
                ].index
            
            # Update the DataFrame with the calculated formatted string in the new column
            df.loc[row_index, column_name] = formatted_value

            # Perform the statistical test to compare the original and replicated data
            try:
                # Extract original mean, std, and N from the original column
                original_str = df.loc[row_index, original_column].values[0]
                original_mean = float(original_str.split(' ')[0])
                original_std = float(original_str.split('(')[1].split(')')[0])
                original_n = int(original_str.split('[')[1].split(']')[0])
                
                # Generate random samples for the original and replicated values
                original_data = np.random.normal(original_mean, original_std, original_n)
                #replicated_data = np.random.normal(mean_value, std_value, Nsubjects)
                
                # Perform a t-test
                t_stat, p_value = ttest_ind(original_data, data[outcome])

                # Add the p-value to a new column in the DataFrame
                if p_value_column not in df.columns:
                    df[p_value_column] = ""

                # Insert the p-value into the appropriate cell
                df.loc[row_index, p_value_column] = f"{p_value:.4f}"

            except (IndexError, ValueError):
                # If the original data cannot be parsed or the test fails, skip the p-value calculation
                continue

In [23]:
df_baseline_values

Unnamed: 0,Outcome Domain,Measure and Rater,Combined Treatment - Baseline (SD) [No. of Subjects],Medication Management - Baseline (SD) [No. of Subjects],Behavioral Treatment - Baseline (SD) [No. of Subjects],Assessment and Referral - Baseline (SD) [No. of Subjects],Replicated Combined Treatment - Baseline (SD) [No. of Subjects],p-value (Original vs. Replicated) - Combined Treatment,Replicated Medication Management - Baseline (SD) [No. of Subjects],p-value (Original vs. Replicated) - Medication Management,Replicated Assessment and Referral - Baseline (SD) [No. of Subjects],p-value (Original vs. Replicated) - Assessment and Referral,Replicated Behavioral Treatment - Baseline (SD) [No. of Subjects],p-value (Original vs. Replicated) - Behavioral Treatment
0,ADHD symptoms,Inattention - Teacher,2.16 (0.67) [137],2.27 (0.61) [135],2.28 (0.64) [136],2.19 (0.69) [135],,,,,,,,
1,ADHD symptoms,Inattention - Parent,2.07 (0.61) [140],2.03 (0.64) [140],1.99 (0.63) [139],2.05 (0.65) [142],2.04 (0.66) [368],0.8604,2.05 (0.65) [372],0.6421,2.08 (0.68) [369],0.2806,2.04 (0.68) [365],0.1301
2,ADHD symptoms,Hyperactive/impulsive - Teacher,1.89 (0.77) [137],2.08 (0.71) [135],2.05 (0.75) [136],1.93 (0.81) [135],,,,,,,,
3,ADHD symptoms,Hyperactive/impulsive - Parent,1.91 (0.69) [140],1.89 (0.62) [140],1.89 (0.64) [140],1.95 (0.67) [142],1.88 (0.68) [368],0.9167,1.90 (0.64) [372],0.4073,1.91 (0.69) [369],0.626,1.88 (0.67) [365],0.6116
4,ADHD symptoms,Hyperactive/impulsive - Classroom observer,0.33 (0.22) [122],0.31 (0.21) [119],0.37 (0.26) [120],0.38 (0.27) [118],,,,,,,,
5,Aggression-ODD,ODD aggression - Teacher,1.29 (0.91) [137],1.39 (0.92) [120],1.43 (0.86) [136],1.35 (0.88) [135],,,,,,,,
6,Aggression-ODD,ODD aggression - Parent,1.39 (0.71) [140],1.45 (0.80) [139],1.37 (0.70) [140],1.49 (0.70) [142],1.41 (0.73) [368],0.0451,1.52 (0.79) [372],0.1266,1.53 (0.72) [369],0.808,1.43 (0.74) [365],0.0166
7,Aggression-ODD,ODD aggression - Classroom observer,0.018 (0.038) [122],0.014 (0.025) [119],0.020 (0.046) [120],0.019 (0.026) [118],,,,,,,,
8,Internalizing symptoms,SSRS internalizing symptoms - Teacher,0.73 (0.51) [113],0.79 (0.47) [117],0.82 (0.45) [115],0.78 (0.44) [115],,,,,,,,
9,Internalizing symptoms,SSRS internalizing symptoms - Parent,0.98 (0.37) [138],0.97 (0.37) [137],0.93 (0.43) [133],0.97 (0.35) [137],1.07 (0.24) [239],0.0027,1.02 (0.24) [230],0.4229,1.04 (0.25) [224],0.0,1.04 (0.23) [222],0.0018


In [24]:
def highlight_red(val):
    """
    Apply red background color if the value is below 0.05.
    """
    try:
        color = 'blue' if float(val) > 0.05 else ''
    except ValueError:
        return
    return f'background-color: {color}'

p_value_columns = [
    'p-value (Original vs. Replicated) - Combined Treatment',
    'p-value (Original vs. Replicated) - Medication Management',
    'p-value (Original vs. Replicated) - Behavioral Treatment',
    'p-value (Original vs. Replicated) - Assessment and Referral'
]

# Apply the formatting to the DataFrame for all p-value columns
styled_df_b = df_14_values.style.applymap(highlight_red, subset=p_value_columns)
styled_df_b

  styled_df_b = df_14_values.style.applymap(highlight_red, subset=p_value_columns)


Unnamed: 0,Outcome Domain,Measure and Rater,Combined Treatment - 14 mo (SD) [No. of Subjects],Medication Management - 14 mo (SD) [No. of Subjects],Behavioral Treatment - 14 mo (SD) [No. of Subjects],Assessment and Referral - 14 mo (SD) [No. of Subjects],Replicated Combined Treatment - 14 mo (SD) [No. of Subjects],p-value (Original vs. Replicated) - Combined Treatment,Replicated Medication Management - 14 mo (SD) [No. of Subjects],p-value (Original vs. Replicated) - Medication Management,Replicated Assessment and Referral - Baseline (SD) [No. of Subjects],p-value (Original vs. Replicated) - Assessment and Referral,Replicated Behavioral Treatment - Baseline (SD) [No. of Subjects],p-value (Original vs. Replicated) - Behavioral Treatment
0,ADHD symptoms,Inattention - Teacher,1.12 (0.75) [134],1.11 (0.77) [120],1.47 (0.81) [119],1.48 (0.82) [128],1.08 (0.70) [70],0.921,1.36 (0.74) [62],0.0167,1.79 (0.79) [60],0.0339,1.94 (0.67) [70],0.0
1,ADHD symptoms,Inattention - Parent,1.02 (0.66) [133],1.12 (0.70) [121],1.40 (0.68) [129],1.49 (0.67) [130],1.15 (0.55) [189],0.882,1.22 (0.70) [161],0.0175,1.59 (0.71) [170],0.0469,1.54 (0.67) [180],0.0346
2,ADHD symptoms,Hyperactive/impulsive - Teacher,0.75 (0.71) [134],0.82 (0.69) [120],1.10 (0.77) [119],1.25 (0.84) [128],0.93 (0.57) [70],0.0091,1.16 (0.74) [62],0.0197,1.53 (0.85) [60],0.0379,1.52 (0.70) [70],0.0
3,ADHD symptoms,Hyperactive/impulsive - Parent,1.85 (0.63) [133],0.91 (0.65) [121],1.24 (0.72) [129],1.35 (0.72) [130],1.09 (0.61) [189],0.0,1.08 (0.63) [161],0.0071,1.50 (0.74) [170],0.0256,1.46 (0.65) [180],0.0125
4,ADHD symptoms,Hyperactive/impulsive - Classroom observer,0.21 (0.20) [114],0.16 (0.15) [110],0.29 (0.26) [107],0.18 (0.15) [109],,,,,,,,
5,Aggression-ODD,ODD aggression - Teacher,0.61 (0.68) [134],0.65 (0.68) [120],0.97 (0.80) [119],1.00 (0.84) [128],0.58 (0.51) [70],0.8907,0.94 (0.82) [62],0.0099,1.19 (0.86) [60],0.0555,1.13 (0.76) [70],0.3796
6,Aggression-ODD,ODD aggression - Parent,0.76 (0.64) [133],0.94 (0.74) [121],1.05 (0.74) [129],1.11 (0.67) [130],0.95 (0.58) [189],0.0005,0.99 (0.74) [161],0.4233,1.20 (0.72) [170],0.0484,1.12 (0.68) [180],0.3114
7,Aggression-ODD,ODD aggression - Classroom observer,0.007 (0.015) [114],0.004 (0.011) [108],0.010 (0.018) [107],0.006 (0.014) [109],,,,,,,,
8,Internalizing symptoms,SSRS internalizing symptoms - Teacher,0.68 (0.44) [108],0.63 (0.47) [99],0.58 (0.40) [102],0.69 (0.44) [105],1.14 (0.29) [72],0.0,1.05 (0.33) [61],0.0,0.95 (0.31) [62],0.0001,0.87 (0.27) [69],0.0002
9,Internalizing symptoms,SSRS internalizing symptoms - Parent,0.67 (0.37) [127],0.67 (0.39) [120],0.77 (0.40) [131],0.82 (0.43) [125],1.12 (0.21) [182],0.0,1.08 (0.24) [162],0.0,1.11 (0.26) [170],0.0,1.08 (0.24) [178],0.0


In [None]:
˜