In [1]:
from pathlib import Path 
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd
from collections import OrderedDict
import sys
import os
import seaborn as sns
import researchpy as rp
import statsmodels.formula.api as smf
import scipy.stats as stats
import ipympl

#sys.path.append('/Users/alina/Desktop/MIT/code/ADHD/MTA/helper')
from helper import rr, prep, var_dict

%load_ext autoreload
%autoreload 2

In [2]:
if Path('/Volumes/Samsung_T5/MIT/mta').exists():
    data_root =     '/Volumes/Samsung_T5/MIT/mta'
    data_derived = '/Volumes/Samsung_T5/MIT/mta/output/derived_data'
else: 
    data_root = '/Users/alina/Desktop/MIT/code/data'
    data_derived = '/Users/alina/Desktop/MIT/code/data/output/derived_data'

In [3]:
baseline_var = ['src_subject_id', 'interview_date', 'interview_age', 'sex', 'site', 'days_baseline']
baseline_var_short = ['src_subject_id', 'days_baseline']

dtypes_baseline = { 'src_subject_id' : 'str',
                    'interview_date': 'str' , 
                    'interview_age' : 'int64' ,
                    'sex' : 'str', 
                    'site' : 'int64' ,
                    'days_baseline':  'int64',
                    'version_form': 'str'}

version_form = ['version_form']

qsts = ['snap', 'ssrs',  'masc', 'pc']##, 'wechsler'] #masc to many missing data 

In [4]:
interaction_predictors = ['days_baseline', 'site', 'trtname'] #time, site, treatment group

# mediator variables
comorb_mediators  = ['cdorodd' , 'pso', 'psoi', 'pag', 'pagi', 'pga', 'pgai' ,'psa'] #ODD/CD or anx excluding specific phobia 
services_mediators =  ['demo61'] #reciept of public assistance 
prev_med_mediators = ['hi_24'] #medication intake prior to study 

#moderator variables 
accept_moderator = ['d2dresp'] # initail acceptance of treatment 
raters = ['Teacher', 'Parent']

med_mod_list = np.concatenate([[comorb_mediators[0]], ['anx'], services_mediators, prev_med_mediators, accept_moderator])
med_mod_list

array(['cdorodd', 'anx', 'demo61', 'hi_24', 'd2dresp'], dtype='<U7')

In [5]:
treat_group_file = 'treatment_groups.csv'
treat_group = pd.read_csv(Path(data_derived, treat_group_file))

In [6]:
diags1_file  = 'diagpsx01.txt' #comorbid anx and cd/odd 
demog_file = 'demgr01.txt' ##public assistance 
health_qst_file  = 'health01.txt' #prev medication 
initial_sat_file = 'debrief01.txt' #initial acceptance of treatment arm 

diags1 = pd.read_csv(Path(data_root, diags1_file), delimiter = '\t', skiprows=[1])
demog = pd.read_csv(Path(data_root, demog_file), delimiter= '\t', skiprows=[1])
health_qst = pd.read_csv(Path(data_root, health_qst_file), delimiter='\t', skiprows=[1])
init_sat = pd.read_csv(Path(data_root, initial_sat_file), delimiter='\t', skiprows=[1])

med_qsts = ['diags', 'demog', 'heath_qst', 'init_sat']
med_qsts_dict = dict(zip(med_qsts, [diags1, demog, health_qst, init_sat]))


  diags1 = pd.read_csv(Path(data_root, diags1_file), delimiter = '\t', skiprows=[1])
  health_qst = pd.read_csv(Path(data_root, health_qst_file), delimiter='\t', skiprows=[1])


In [7]:
odd_cd_vars =[ 'cdorodd'] # CD or ODD 
anx_vars = ['pso', 'psoi', 'pag', 'pagi', 'pga', 'pgai' ,'psa', 'psai'] #poa : overanxious disorder: see if included in alaysis
comorb_vars = np.concatenate([odd_cd_vars, anx_vars])

assist_vars = ['demo61']# public assistance 

prev_med_vars = ['hi_24'] #prev medication 

accept_vars = ['d2dresp']# initial acceptance 

In [8]:
for qst in med_qsts_dict.values():
    print(version_form[0] in qst.keys())

True
False
False
False


In [9]:
# comorb = pd.read_csv(Path(data_derived, 'comorb.csv'), index_col= 0) # comorbid ODD/CD anxiety
# prev_med = pd.read_csv(Path(data_derived, 'prev_med.csv'), index_col= 0) #'prevous stimulant medictaion exposure
# assist = pd.read_csv(Path(data_derived, 'assist.csv'), index_col= 0) #reciept of public assistance 
# accept = pd.read_csv(Path(data_derived, 'accept.csv'), index_col= 0) #initial acceptance of treatment arm
# medi = {'comorb': comorb, 'prev_med': prev_med, 'assist': assist, 'accept': accept}

# comorbidities

In [10]:
comorb = diags1[np.concatenate([baseline_var_short,version_form, comorb_vars])]
assist = demog[np.concatenate([baseline_var_short, assist_vars])]
prev_med = health_qst[np.concatenate([baseline_var_short, prev_med_vars])]
init_acc = init_sat[np.concatenate([baseline_var_short, accept_vars])]

In [11]:
comorb['anx'] = pd.NA
anx_vars = [ 'pso', 'psoi', 'pag', 'pagi', 'pga', 'pgai', 'psa', 'psai']
mask = (comorb[anx_vars].fillna(0) != 0).any(axis=1)  # Extract if any anxiety disdoers, comorb anx = True 
comorb.loc[mask, 'anx'] = 1
mask = (comorb[anx_vars] == 0).all(axis=1)  # If none of the disorder, Comorb anx = False, else NaN 
comorb.loc[mask, 'anx'] = 0
comorb = comorb.drop(columns=anx_vars)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comorb['anx'] = pd.NA


## split data from timepoints so that only 1 diagnosis per timepoint per subject

In [146]:
(comorb['days_baseline'] < 150).sum()

867

In [None]:
timepoints = [50, 213, 578, 912] 

In [186]:
def find_unique_subjects(df, lower_bound_time=None, select='min'):
    if lower_bound_time is not None:
        df = df.loc[df['days_baseline'] > lower_bound_time].copy()
    
    # Fix for the error - Using value_counts and filtering properly
    duplicates = df['src_subject_id'].value_counts()[df['src_subject_id'].value_counts() > 1].index
    duplicates_df = df[df['src_subject_id'].isin(duplicates)]

    if select == 'min':
        idx_to_keep = duplicates_df.groupby('src_subject_id')['days_baseline'].idxmin()
    elif select == 'max':
        idx_to_keep = duplicates_df.groupby('src_subject_id')['days_baseline'].idxmax()
    else:
        raise ValueError('Please specify how to select unique subjects')
    
    return df.loc[df.index.difference(duplicates_df.index).union(idx_to_keep)]

def get_unique_subjects_split(df, timepoints_unique = None, select='min', timepoints_split = None):
    df_split = prep.split_data_from_timepoints(df, timepoints_split)
    
    if timepoints_unique is not None:
        df_split_unique = [
            find_unique_subjects(df_timepoint, time, select) for df_timepoint, time in zip(df_split.values(), timepoints_unique)
        ]
    else:
        df_split_unique = [
            find_unique_subjects(df_timepoint, None, select) for df_timepoint in df_split.values()
        ]
    
    for df_ in df_split_unique:
        if (df_['src_subject_id'].value_counts() > 1).sum() == 0:
            print('Success')
        else:
            print('Found {} duplicates remaining.'.format((df_['src_subject_id'].value_counts() > 1).sum()))
    
    return df_split_unique


In [173]:
timepoints

[50, 213, 578, 912]

In [174]:
comorb_split = prep.split_data_from_timepoints(comorb)
comorb_split_unique = [ find_unique_subjects(comorb_timepoint, None, select='min') for comorb_timepoint, time in zip(comorb_split.values(), timepoints) ]
for df in comorb_split_unique:
    print((df['src_subject_id'].value_counts() > 1).sum())

0
0
0
0


In [188]:
comorb_split_unique = get_unique_subjects_split(comorb, None, select='min', timepoints_split=[213, 578, 912, 1095])

Success
Success
Success
Success


In [189]:
comorb_split_unique[0].shape

(867, 5)

## Previous medication 

In [190]:
prev_med_split_unique = get_unique_subjects_split(prev_med,  None, select='min', timepoints_split=[213, 578, 912, 1095])

Success
Success
Success
Success


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  except ValueError as e :


In [193]:
prev_med

Unnamed: 0,src_subject_id,days_baseline,hi_24
0,P1001,3029,
1,P1001,3836,1.0
2,P1001,4786,1.0
3,P1001,5297,1.0
4,P1002,4942,2.0
...,...,...,...
2822,P1867,4919,1.0
2823,P1868,3114,
2824,P1868,3828,1.0
2825,P1868,4627,1.0


In [65]:
prev_med['days_baseline'].value_counts()

days_baseline
3734    8
3760    6
3821    6
3653    6
3711    6
       ..
4687    1
3935    1
5578    1
4864    1
5251    1
Name: count, Length: 1812, dtype: int64

In [68]:
prev_med_split 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for i, col in zip(range(df.shape[1]), df.columns):


In [195]:
prev_med_unique = find_unique_subjects(prev_med)
prev_med_unique

Unnamed: 0,src_subject_id,days_baseline,hi_24
0,P1001,3029,
4,P1002,4942,2.0
7,P1003,3928,
11,P1004,3646,
15,P1005,5180,2.0
...,...,...,...
2808,P1864,3019,
2812,P1865,2999,
2815,P1866,2988,
2819,P1867,2616,


### recipet of public assistance 

In [196]:
assist_unique = find_unique_subjects(assist)
assist_unique

Unnamed: 0,src_subject_id,days_baseline,demo61
0,P1001,1612,2.0
3,P1003,2647,2.0
6,P1004,2633,2.0
9,P1006,2560,2.0
12,P1007,2852,2.0
...,...,...,...
3855,P1780,4108,
3912,P1804,3901,
3946,P1818,4530,
3963,P1826,4858,


### Initial acceptance is already unique

In [197]:
comorb_b = comorb_split_unique[0]

In [198]:
print(comorb_b.shape, prev_med_unique.shape, assist_unique.shape, init_acc.shape)

(867, 5) (762, 3) (798, 3) (579, 3)


drop days baseline and version form for merging 

In [199]:
comorb_b = comorb_b.drop(columns='version_form')

In [200]:
dfs_clean = [df.drop(columns='days_baseline') for df in [comorb_b, prev_med_unique, assist_unique, init_acc]]

In [201]:
dfs_clean

[     src_subject_id  cdorodd anx
 6663          P1001      0.0   0
 6665          P1002      NaN   0
 6668          P1003      1.0   1
 6672          P1004      0.0   1
 6676          P1005      1.0   0
 ...             ...      ...  ..
 9323          P1864      0.0   0
 9325          P1865      0.0   0
 9327          P1866      0.0   0
 9329          P1867      0.0   0
 9331          P1868      0.0   1
 
 [867 rows x 3 columns],
      src_subject_id  hi_24
 0             P1001    NaN
 4             P1002    2.0
 7             P1003    NaN
 11            P1004    NaN
 15            P1005    2.0
 ...             ...    ...
 2808          P1864    NaN
 2812          P1865    NaN
 2815          P1866    NaN
 2819          P1867    NaN
 2823          P1868    NaN
 
 [762 rows x 2 columns],
      src_subject_id  demo61
 0             P1001     2.0
 3             P1003     2.0
 6             P1004     2.0
 9             P1006     2.0
 12            P1007     2.0
 ...             ...     ...

## Merge with data for the regression 

In [202]:
baseline_var = ['src_subject_id', 'interview_date', 'interview_age', 'sex', 'site', 'days_baseline']
dtypes_baseline = { 'src_subject_id' : 'str',
                    'interview_date': 'str' , 
                    'interview_age' : 'int64' ,
                    'sex' : 'str', 
                    'site' : 'int64' ,
                    'days_baseline':  'int64',
                    'version_form': 'str'}

version_form = ['version_form']

qsts = ['snap', 'ssrs',  'masc', 'pc']##, 'wechsler'] #masc to many missing data 

In [203]:
snap_file = 'snap01.txt'
ssrs_file = 'ssrs01.txt'
masc_file = 'masc_p01.txt'
parent_child_file = 'pcrc01.txt'
wechsler_file = 'wiat_iiip201.txt'
treat_group_file = 'treatment_groups.csv'
#outcome variablles 
snap_vars = ['snainatx', 'snahypax', 'snaoddx'] #inattention_mean, hyperactie mean
ssrs_vars = ['sspintx', 'ssptossx']# social skills mean, internalizing mean 
masc_vars = ['masc_masctotalt']
pc_vars = ['pcrcpax', 'pcrcprx'] # power assertion, personal closeness
#wechsler_vars = ['w1readb','w2math','w3spell' ]
outcomes_dict  = {'snap' : snap_vars, 'ssrs' : ssrs_vars, 'masc':masc_vars ,'pc': pc_vars} #, 'wechsler': wechsler_vars}

In [204]:
treat_group = pd.read_csv(Path(data_derived, treat_group_file))

snap = prep.get_data(Path(data_root, snap_file), columns= [baseline_var, snap_vars, version_form], treat_group= treat_group, set_dtypes= True, version_form= True, split_timepoints= True)
ssrs = prep.get_data(Path(data_root, ssrs_file), columns= [baseline_var, ssrs_vars, version_form], treat_group= treat_group, set_dtypes= True, version_form= True, split_timepoints= True)
masc = prep.get_data(Path(data_root, masc_file), columns= [baseline_var, masc_vars], treat_group= treat_group, set_dtypes= True, version_form= False, split_timepoints= True)

pc = prep.get_data(Path(data_root, parent_child_file), columns= [baseline_var, pc_vars], treat_group= treat_group, set_dtypes= True, version_form= False, split_timepoints= True)
#wechsler = prep.get_data(Path(data_root, wechsler_file), columns= [baseline_var, wechsler_vars], set_dtypes= True, version_form= False, split_timepoints= True)
data_dict = dict(zip(qsts, [snap, ssrs, masc, pc]))

Success
Success
Success
Success


In [205]:
data_14_months = [data['14'] for data in data_dict.values()]

In [206]:
test = [pd.merge(data, dfs_clean[0], on= 'src_subject_id') for data in data_14_months]
test2 = [pd.merge(data, dfs_clean[1], on= 'src_subject_id') for data in test]
test3 = [pd.merge(data, dfs_clean[2], on= 'src_subject_id') for data in test2]
test4 = [pd.merge(data, dfs_clean[3], on= 'src_subject_id') for data in test3]


In [207]:
# test4[0].to_csv(Path(data_derived, 'snap_14_and_mediators.csv'))
# test4[1].to_csv(Path(data_derived, 'ssrs_14_and_mediators.csv'))
# test4[2].to_csv(Path(data_derived, 'masc_14_and_mediators.csv'))
# test4[3].to_csv(Path(data_derived, 'pc_14_and_mediators.csv'))

In [143]:
len(test4)

4