# Selective Mutism response paradigm analysis: voice

Authors:
    - Jon Clucas, 2017  <jon.clucas@childmind.org>
Copyright ©2017‒2018, Apache v2.0 License

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# project-local code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

Imports & function definitions:

In [2]:
import json
import os
import sys
import urllib
import urllib.request

import numpy as np
import pandas as pd
# Put the parent of the current working directory on the import path so that
# the project-local `voice_functions` module can be imported below.
sm_rpa_v = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sm_rpa_v not in sys.path: 
    sys.path.append(sm_rpa_v)
# NOTE(review): the wildcard import hides which names come from
# `voice_functions`; later cells call `load_from_osf` and `SM_forest`, which
# are presumably defined there -- confirm and consider importing explicitly.
from voice_functions import *
# Read the "OSF_urls" section of the project config. The path is relative, so
# it resolves against the current working directory.
with open(os.path.join('../config/config.json')) as cfgf:
    osf = json.load(cfgf)['OSF_urls']

---
Collect all combinations of config files, experimental conditions, and noise replacement methods into a (48-item) list of 3-tuples:

In [3]:
# Experimental conditions are every key under the "emobase" entry of the OSF
# URL config except the "features" entry.
experimental_conditions = {
    k for k in osf['emobase'] if k != "features"
}
# Sets of strings iterate in an order that can change from one interpreter run
# to the next (string hashing is randomized), so sort once to make the order
# of combinations -- and therefore of every downstream loop -- reproducible.
ordered_conditions = sorted(experimental_conditions)
# Noise replacement methods are read from one reference condition of the
# "emobase" config and reused for every config file and condition.
# NOTE(review): this assumes all conditions and both config files share the
# same noise replacement methods -- confirm against config.json.
noise_replacements = sorted(
    set(osf['emobase'][ordered_conditions[0]])
) if ordered_conditions else []
# Every (config file, experimental condition, noise replacement) 3-tuple.
config_exp_nr = [
    (cf, exp, nr)
    for cf in ["emobase", "ComParE_2016"]
    for exp in ordered_conditions
    for nr in noise_replacements
]

Load features from [OSF](https://osf.io/7kemj/):

In [4]:
# Nested lookup table:
#   config file -> experimental condition -> noise replacement -> dict
# where each innermost dict stores the result of `load_from_osf` under the
# "DataFrame" key.
c = {}
for cen in config_exp_nr:
    config_name, condition_name, replacement_name = cen
    # Create the intermediate levels on first use; reuse them afterwards.
    by_condition = c.setdefault(config_name, {})
    by_replacement = by_condition.setdefault(condition_name, {})
    by_replacement[replacement_name] = {
        "DataFrame": load_from_osf(
            config_name,
            condition_name,
            replacement_name
        )
    }

/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_anal

Fill in originals where no noise replacements occurred:

In [5]:
# Table whose cell at (stranger, condition) is a string describing which rows
# of the "original" features had no noise replacement applied.
# The HTTP response is closed as soon as the CSV has been parsed.
with urllib.request.urlopen(osf["use original"]) as response:
    originals = pd.read_csv(response, index_col="Unnamed: 0")

# NOTE(review): this cell is not idempotent -- running it twice appends the
# original rows a second time. Re-run the loading cell first if needed.
for config_file in ["emobase", "ComParE_2016"]:
    for stranger in ["no", "with"]:
        for condition in ["button", "vocal"]:
            by_replacement = c[config_file][
                "{0}, {1} stranger".format(condition, stranger)
            ]
            # "adults only" and "original" are left as loaded; every other
            # noise replacement method gets the untouched originals added.
            to_fill = [
                method for method in by_replacement
                if method not in ["adults only", "original"]
            ]
            if not to_fill:
                continue
            # SECURITY NOTE(review): `eval` executes text downloaded from OSF.
            # If the cells are always plain Python literals (e.g. a list of
            # row labels), `ast.literal_eval` would be a safer drop-in --
            # confirm the cell format before switching.
            untouched_rows = by_replacement["original"]["DataFrame"].loc[
                eval(originals.loc[stranger, condition])
            ]
            for noise_replacement in to_fill:
                # `DataFrame.append` was removed in pandas 2.0; `pd.concat`
                # does the same job and exists in every pandas version.
                by_replacement[noise_replacement]["DataFrame"] = pd.concat(
                    [
                        by_replacement[noise_replacement]["DataFrame"],
                        untouched_rows
                    ],
                    ignore_index=True
                )

---
## Random Forests
Run random forests on each config file × experimental condition × noise replacement method,
outputting each model's out-of-bag (OOB) score (`oob_score_`) in a table:

In [6]:
# Empty (NaN-filled) score table: one row per (experimental condition, noise
# replacement) pair and one column per config file.
# Row labels are de-duplicated through a set and then sorted, and column
# labels are de-duplicated in order of first appearance, so the layout is
# deterministic. Recent pandas versions also refuse a `set` for `columns`.
cdf = pd.DataFrame(
    np.nan,
    index=pd.MultiIndex.from_tuples(
        sorted({(con[1], con[2]) for con in config_exp_nr}),
        names=[
            "experimental condition",
            "noise replacement"
        ]
    ),
    columns=pd.Index([con[0] for con in config_exp_nr]).unique()
).sort_index()
for cen in config_exp_nr:
    config_name, condition_name, replacement_name = cen
    # `SM_forest` returns a (features, model) pair for this combination.
    feature_table, forest = SM_forest(
        c,
        config_name,
        condition_name,
        replacement_name
    )
    # Look the slot up only after the call, as the original assignment did.
    slot = c[config_name][condition_name][replacement_name]
    slot["features"] = feature_table
    slot["random forest model"] = forest
    # Record the model's out-of-bag score in the summary table.
    cdf.loc[
        (condition_name, replacement_name),
        config_name
    ] = slot["random forest model"].oob_score_

Most predictive feature for emobase config file × button, no stranger × original was lspFreq_sma[7]_maxPos with an importance score of 0.07214965344315415
Most predictive feature for emobase config file × button, no stranger × adults replaced: clone was lspFreq_sma[7]_maxPos with an importance score of 0.08149464064208757
Most predictive feature for emobase config file × button, no stranger × adults replaced: pink noise was mfcc_sma[12]_linregc2 with an importance score of 0.055370963553855186
Most predictive feature for emobase config file × button, no stranger × adults removed was mfcc_sma[12]_linregc2 with an importance score of 0.04738973283436931
Most predictive feature for emobase config file × button, no stranger × adults only was mfcc_sma[3]_minPos with an importance score of 0.035
Most predictive feature for emobase config file × button, no stranger × adults timeshifted was lspFreq_sma[7]_maxPos with an importance score of 0.07182625929661883
Most predictive feature for emobas

In [7]:
# Display the table of out-of-bag scores filled in by the previous cell.
cdf

Unnamed: 0_level_0,Unnamed: 1_level_0,emobase,ComParE_2016
experimental condition,noise replacement,Unnamed: 2_level_1,Unnamed: 3_level_1
"button, no stranger",adults only,0.088313,0.161751
"button, no stranger",adults removed,0.006607,0.151439
"button, no stranger",adults replaced: clone,0.158022,-0.007451
"button, no stranger",adults replaced: pink noise,0.004438,0.063859
"button, no stranger",adults timeshifted,0.083865,-0.010564
"button, no stranger",original,0.07842,0.095189
"button, with stranger",adults only,0.395052,0.204719
"button, with stranger",adults removed,0.02246,-0.00083
"button, with stranger",adults replaced: clone,0.123059,-0.055061
"button, with stranger",adults replaced: pink noise,0.106113,0.007091


Features are stored in

`c[`*config file*`][`*experimental condition*`][`*noise replacement*`]["features"]`

and models are stored in

`c[`*config file*`][`*experimental condition*`][`*noise replacement*`]["random forest model"]`

In [8]:
# A feature counts as "predictive" for a model when its importance exceeds
# this threshold.
IMPORTANCE_THRESHOLD = 0.0009

# For each response type ("button" / "vocal"), count in how many models each
# feature was predictive: {response type: {feature name: count}}.
most_predictive = {
    "button": {},
    "vocal": {}
}
for cen in config_exp_nr:
    # The response type is the text before the comma in the experimental
    # condition, e.g. "button, no stranger" -> "button".
    response_type = cen[1].split(",")[0]
    feature_table = c[cen[0]][cen[1]][cen[2]]["features"]
    # Select the row labels of all sufficiently important features at once.
    # This replaces the row-by-row `.ix[i]` lookups; `.ix` was removed in
    # pandas 1.0.
    predictive_features = feature_table.loc[
        feature_table["importance"] > IMPORTANCE_THRESHOLD
    ].index
    for feature_name in predictive_features:
        most_predictive[response_type][feature_name] = most_predictive[
            response_type
        ].get(feature_name, 0) + 1

Let's see which features are most often predictive for our data:

In [9]:
# Turn the per-response-type counts into a table (one row per feature, one
# column per response type) and rank it by the "vocal" count, breaking ties
# with the "button" count, highest first.
most_often = pd.DataFrame(most_predictive)
most_often = most_often.sort_values(by=['vocal', 'button'], ascending=False)
most_often

Unnamed: 0,button,vocal
mfcc_sma_de[2]_kurtosis,,20.0
mfcc_sma_de[12]_kurtosis,1.0,15.0
mfcc_sma_de[10]_stddev,6.0,13.0
mfcc_sma[12]_kurtosis,5.0,13.0
mfcc_sma_de[10]_iqr2-3,9.0,12.0
mfcc_sma_de[10]_iqr1-2,2.0,12.0
mfcc_sma_de[3]_kurtosis,2.0,12.0
F0env_sma_quartile1,,12.0
mfcc_sma[3]_min,7.0,11.0
mfcc_sma[12]_min,4.0,11.0


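Out of a maximum of (this is the total number of fitted models; half of them are `button` conditions and half are `vocal`, so each column above can reach at most half of this number):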

In [10]:
# Number of (config file, experimental condition, noise replacement)
# combinations, i.e. how many models were fitted in total.
len(config_exp_nr)

48

In [11]:
# Every feature name in `most_often`, in the table's sorted order.
list(most_often.index)

['mfcc_sma_de[2]_kurtosis',
 'mfcc_sma_de[12]_kurtosis',
 'mfcc_sma_de[10]_stddev',
 'mfcc_sma[12]_kurtosis',
 'mfcc_sma_de[10]_iqr2-3',
 'mfcc_sma_de[10]_iqr1-2',
 'mfcc_sma_de[3]_kurtosis',
 'F0env_sma_quartile1',
 'mfcc_sma[3]_min',
 'mfcc_sma[12]_min',
 'mfcc_sma[4]_min',
 'mfcc_sma_de[10]_linregerrA',
 'mfcc_sma_de[10]_range',
 'mfcc_sma[12]_iqr1-2',
 'mfcc_sma_de[3]_iqr1-2',
 'mfcc_sma_de[3]_quartile3',
 'F0env_sma_linregc2',
 'mfcc_sma_de[10]_linregerrQ',
 'mfcc_sma[9]_max',
 'mfcc_sma[7]_min',
 'F0_sma_linregerrA',
 'mfcc_sma[2]_lpc3',
 'audSpec_Rfilt_sma[1]_quartile1',
 'mfcc_sma[12]_stddev',
 'mfcc_sma[6]_lpc3',
 'mfcc_sma[7]_lpc2',
 'mfcc_sma_de[3]_quartile1',
 'pcm_RMSenergy_sma_de_lpc0',
 'pcm_RMSenergy_sma_de_lpc1',
 'voiceProb_sma_de_stddev',
 'mfcc_sma[6]_min',
 'mfcc_sma_de[8]_stddev',
 'pcm_loudness_sma_quartile1',
 'mfcc_sma_de[4]_range',
 'mfcc_sma_de[3]_iqr1-3',
 'mfcc_sma[12]_linregerrA',
 'mfcc_sma[8]_lpc4',
 'mfcc_sma_de[6]_kurtosis',
 'audSpec_Rfilt_sma[4]_quar

In [12]:
# Rebuild the ranked feature-count table and keep only the features that
# were predictive in at least 12 "vocal" models.
ranked_counts = pd.DataFrame(most_predictive).sort_values(
    by=['vocal', 'button'],
    ascending=False
)
n = ranked_counts.loc[ranked_counts['vocal'] >= 12]

In [13]:
# Group the selected feature names by kind (the text before the first "_").
# Names carrying a bracketed coefficient (e.g. "mfcc_sma[12]_kurtosis") are
# tallied per coefficient in a nested dict; names without brackets are
# tallied with a plain integer counter.
kinds_of_features = {}
for feature_name in n.index:
    kof = feature_name.split("_")[0]
    if "[" not in feature_name:
        kinds_of_features[kof] = (
            kinds_of_features[kof] + 1 if kof in kinds_of_features else 1
        )
        continue
    if kof not in kinds_of_features:
        kinds_of_features[kof] = {}
    per_coefficient = kinds_of_features[kof]
    # The coefficient is the text between the first "[" and the next "]".
    coefficient = feature_name.split("[")[1].split("]")[0]
    per_coefficient[coefficient] = (
        per_coefficient[coefficient] + 1
        if coefficient in per_coefficient else 1
    )
kinds_of_features

{'F0env': 1, 'mfcc': {'10': 3, '12': 2, '2': 1, '3': 1}}

In [14]:
# MFCC coefficients ordered from most to least frequently selected.
mfcc_counts = pd.Series(kinds_of_features['mfcc'])
list(mfcc_counts.sort_values(ascending=False).index)

['10', '12', '3', '2']

---
Let's also describe our sample:

In [15]:
# Download the two participant tables, both indexed by "URSI".
# Each HTTP response is closed as soon as its CSV has been parsed.
with urllib.request.urlopen(osf["dx"]) as response:
    sample = pd.read_csv(response, index_col="URSI")
with urllib.request.urlopen(osf["conditions"]) as response:
    participant_conditions = pd.read_csv(response, index_col="URSI")
# Keep only the participants that also appear in the conditions table.
sample = sample.loc[
    sample.index.isin(participant_conditions.index)
].copy()

In [16]:
# Summary statistics of the Age column for the retained participants.
sample.Age.describe()

count    42.000000
mean      7.619048
std       1.899324
min       5.000000
25%       6.000000
50%       8.000000
75%       9.000000
max      11.000000
Name: Age, dtype: float64

In [17]:
# Count, number of distinct values, and most frequent value of the Sex column.
sample.Sex.describe()

count     42
unique     2
top        F
freq      24
Name: Sex, dtype: object