# Pre-processing the childmind questionnaire data

This notebook collects helping scripts and functions aiming to:

1. Drop aggregate variables;
2. Process the diagnosis columns into binary variables;
3. Print markdown for linking to keys;
4. Print string to keep variables.

The code below is quite hacky - in most other repos I would not check it in. But since I haven't figured out versioning for this process...

### 1. Drop aggregate variables

In [1]:
import ast
import glob
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv(
    'resources/data/Dx_and_q.csv'
)
qmri = pd.read_csv(
    "resources/data/questions_and_mri_regions.csv"
)

In [3]:
df.columns

Index([u'ADHD subtype', u'APQ_P_01', u'APQ_P_02', u'APQ_P_03', u'APQ_P_04',
       u'APQ_P_05', u'APQ_P_06', u'APQ_P_07', u'APQ_P_08', u'APQ_P_09',
       ...
       u'SWAN_17', u'SWAN_18', u'Sex', u'SocAnx_01', u'SocAnx_02',
       u'SocAnx_03', u'SocAnx_04A', u'SocAnx_04B', u'SocAnx_05',
       u'financialsupport'],
      dtype='object', length=1002)

In [4]:
for col in df.columns:
    suffix = col.split('_')[-1]
    if col in ['EID', 'Sex', 'Age', 'Dx']: 
        pass
    else:
        suffix = col.split('_')[-1]
        if not suffix.isdigit():
            del df[col]

In [5]:
df.to_csv('resources/data/drop_aggregates.csv',index=False)
df = df.drop_duplicates("EID")

### 2. Process the diagnosis columns into binary variables

In [6]:
all_diagnoses = []
for diags in df['Dx'].values.flatten():
    all_diagnoses.extend(ast.literal_eval(diags))

In [7]:
n = len(df)

In [8]:
for diagnosis in np.unique(all_diagnoses):
    print diagnosis
    df[diagnosis] = df['Dx'].copy()
    def diag_to_binary(entry):
        return diagnosis in entry
    df[diagnosis] = df[diagnosis].apply(diag_to_binary)
for c in ["Dx", "Anx", "adhd", "asd"]:
    try:
        del df[c]
    except:
        pass

ADHD Inattentive Type
ADHD Inattentive type
ADHD-Combined Type
ADHD-Hyperactive/Impulsive Type
ADHD-Inattentive Type
Acute Stress Disorder
Adjustment Disorder, with Mixed Emotions and Conduct
Adjustment Disorder, with mixed disturbance of emtions and conduct
Adjustment Disorders
Agoraphobia
Alcohol Use Disorder
Attention Deficit Hyperactivity Disorder
Attention Deficit Hyperactivity Disorder Combined Presentation
Attention-Deficit Hyperactivity Disorder
Attention-Deficit/Hyperactivity Disorder
Attention-Deficit/Hyperactivity Disorder 
Autism Spectrum Disorder
Avoidant/Restrictive Food Intake Disorder
Binge-Eating Disorder
Bipolar I Disorder
Bipolar II Disorder
Borderline Intellectual Functioning
Bulimia Nervosa
Cannabis Use Disorder
Child Onset Fluency Disorder (Stuttering)
Conduct Disorder-Adolescent-onset type
Conduct Disorder-Childhood-onset type
Conversion Disorder
Developmental Coordination Disorder
Disruptive Mood Dysregulation Disorder
Encopresis
Enuresis
Excoriation (Skin-Picki

In [9]:
df

Unnamed: 0,APQ_P_01,APQ_P_02,APQ_P_03,APQ_P_04,APQ_P_05,APQ_P_06,APQ_P_07,APQ_P_08,APQ_P_09,APQ_P_10,...,Unspecified Bipolar and Related Disorder,Unspecified Intellectual Disability,autism spectrum disorder,language disorder,n/a,other specified neurodevelopmental disorder,specific learning disorder with impairment in mathematics,specific learning disorder with impairment in reading,specific learning disorder with impairment in written expression,unspecified anxiety disorder
0,,,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
1,4.0,5.0,2.0,4.0,4.0,,4.0,2.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,False
2,4.0,5.0,2.0,3.0,5.0,2.0,4.0,1.0,5.0,1.0,...,True,False,False,False,False,False,False,False,False,False
3,5.0,5.0,3.0,2.0,3.0,3.0,3.0,2.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,False
4,3.0,5.0,3.0,5.0,5.0,1.0,5.0,3.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,False
5,4.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,5.0,2.0,...,False,False,False,False,False,False,False,False,False,False
6,5.0,5.0,2.0,5.0,3.0,1.0,4.0,1.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,False
7,4.0,4.0,3.0,3.0,2.0,2.0,4.0,2.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,False
8,5.0,5.0,3.0,4.0,3.0,1.0,4.0,3.0,5.0,1.0,...,False,False,False,False,False,False,False,False,False,False
9,4.0,4.0,3.0,3.0,4.0,1.0,3.0,2.0,4.0,1.0,...,False,False,False,False,False,False,False,False,False,False


In [10]:
dx = df[
    ["EID"] + list(
        set(
            all_diagnoses
        )
    )
]

In [11]:
dx

Unnamed: 0,EID,Other Specified Anxiety Disorder,Other Specified Depressive Disorder,"Adjustment Disorder, with mixed disturbance of emtions and conduct",Language Disorder,Unspecified Anxiety Disorder,Substane/Medication-Induced Depressive Disorder,specific learning disorder with impairment in mathematics,Other Specified Attention-Deficit/Hyperactivity Disorder,ADHD Inattentive type,...,other specified neurodevelopmental disorder,Obsessive-Compulsive Disorder,Adjustment Disorders,specific learning disorder with impairment in reading,Generalized Anxiety Disorder,ADHD-Combined Type,Reactive Attachment Disorder,Attention-Deficit/Hyperactivity Disorder,"Adjustment Disorder, with Mixed Emotions and Conduct",Specific Learning Disorder with Impairment in Reading
0,NDARNN368BDH,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
1,NDARGM645PL4,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,NDARMA875ARE,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,NDARXC367LA4,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,NDARZK659DWX,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
5,NDARJT730WP0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
6,NDARNT939YMG,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,NDARNV332JF2,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
8,NDARWX380JJK,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
9,NDARPP703NX4,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [12]:
adhd_columns = {c for c in dx.columns if "ADHD" in c or "Attention" in c}
adhd_columns

{'ADHD Inattentive Type',
 'ADHD Inattentive type',
 'ADHD-Combined Type',
 'ADHD-Hyperactive/Impulsive Type',
 'ADHD-Inattentive Type',
 'Attention Deficit Hyperactivity Disorder',
 'Attention Deficit Hyperactivity Disorder Combined Presentation',
 'Attention-Deficit Hyperactivity Disorder',
 'Attention-Deficit/Hyperactivity Disorder',
 'Attention-Deficit/Hyperactivity Disorder ',
 'Other Specified Attention-Deficit/Hyperactivity Disorder',
 'Unspecified Attention-Deficit/Hyperactivity Disorder'}

In [13]:
dx=dx.assign(
    ADHD=dx[
        list(adhd_columns)
    ].any(
        axis=1
    ),
    Anxiety=dx[
        list(
            {
                c for dxlabel in [
                    "Anxiety",
                    "anxiety"
                ] for c in dx.columns if dxlabel in c
            }
        )
    ].any(
        axis=1
    ),
    Other=dx[
        list(
            {
                c for c in dx.columns if c not in [
                    "Anxiety",
                    "Autism Spectrum Disorder",
                    "ADHD",
                    "EID",
                    "No Diagnosis Given",
                    "No diagnosis given",
                    "n/a"
                ]
            }
        )
    ].any(
        axis=1
    )
)

In [14]:
dx_oi = dx[
    [
        "Anxiety",
        "Autism Spectrum Disorder",
        "ADHD",
        "EID",
        "Other"
    ]
].drop_duplicates()
dx_oi["Autism"] = dx_oi["Autism Spectrum Disorder"]
dx_oi = dx_oi.drop(
    "Autism Spectrum Disorder",
    axis=1
)

In [15]:
dx_oi = dx_oi.assign(
    EID=dx_oi.EID.str.encode(
        "UTF-8"
    ).str.decode(
        "UTF-8"
    )
)
qmri = qmri.assign(
    EID=qmri.EID.str.encode(
        "UTF-8"
    ).str.decode(
        "UTF-8"
    )
)

In [16]:
dx_oi.to_csv(
    "resources/data/dx_of_interest.csv",
    index=False
)

In [17]:
merged_dx = qmri.merge(
    dx_oi,
    on="EID",
    how="left"
)

In [18]:
merged_dx

Unnamed: 0,APQ_P_01,APQ_P_02,APQ_P_03,APQ_P_04,APQ_P_05,APQ_P_06,APQ_P_07,APQ_P_08,APQ_P_09,APQ_P_10,...,left_putamen_volume-per-freesurfer-label,right_putamen_volume-per-freesurfer-label,left_thalamus_volume-per-freesurfer-label,right_thalamus_volume-per-freesurfer-label,left_unsegmentedwhitematter_volume-per-freesurfer-label,right_unsegmentedwhitematter_volume-per-freesurfer-label,Anxiety,ADHD,Other,Autism
0,,,,,,,,,,,...,,,,,,,False,True,True,False
1,4.0,5.0,2.0,4.0,4.0,,4.0,2.0,5.0,1.0,...,,,,,,,False,False,False,True
2,4.0,5.0,2.0,3.0,5.0,2.0,4.0,1.0,5.0,1.0,...,,,,,,,False,True,True,False
3,5.0,5.0,3.0,2.0,3.0,3.0,3.0,2.0,5.0,1.0,...,,,,,,,True,False,True,False
4,3.0,5.0,3.0,5.0,5.0,1.0,5.0,3.0,5.0,1.0,...,,,,,,,False,True,True,False
5,4.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,5.0,2.0,...,,,,,,,True,False,True,False
6,5.0,5.0,2.0,5.0,3.0,1.0,4.0,1.0,5.0,1.0,...,,,,,,,False,False,False,False
7,4.0,4.0,3.0,3.0,2.0,2.0,4.0,2.0,5.0,1.0,...,,,,,,,True,True,True,False
8,5.0,5.0,3.0,4.0,3.0,1.0,4.0,3.0,5.0,1.0,...,,,,,,,True,False,True,False
9,4.0,4.0,3.0,3.0,4.0,1.0,3.0,2.0,4.0,1.0,...,,,,,,,False,True,True,False


In [19]:
merged_dx.to_csv(
    'resources/data/smri_questions.csv',
    index=False
)

### 3. Print markdown for linking to keys

In [None]:
for file in sorted(glob.glob("resources/column-keys/*.csv")):
    name = file.split('/')[-1][:-4]
    print '- [' + name + ': ' + pd.read_csv(file).columns[0].strip() + '](resources/html/%s.html)' % name
    

### 4. Print string to keep variables 

In [None]:
keepstr = ' '.join(pd.read_csv('resources/keepers.txt')['varname'].values)
print '-- keep ' + keepstr