# Selective Mutism response paradigm analysis: voice

Authors:
    - Jon Clucas, 2017  <jon.clucas@childmind.org>
Copyright ©2017‒2018, Apache v2.0 License

In [9]:
# Notebook setup: reload edited local modules automatically before each cell
# executes (autoreload mode 2), and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

Imports & function definitions:

In [227]:
import json
import os
import sys
import urllib
import urllib.request

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Put the parent of the current working directory on the import path
# (presumably so the local `utilities` and `voice_functions` modules imported
# right after this can be found -- confirm against the repository layout).
_parent_dir = os.path.join(os.getcwd(), os.pardir)
sm_rpa_v = os.path.abspath(_parent_dir)
if sm_rpa_v not in sys.path:
    # Guarded so that re-running the cell does not add duplicate entries.
    sys.path.append(sm_rpa_v)
from utilities import fetch_data
from voice_functions import *
# Read the project configuration (path is relative to the notebook's working
# directory) and keep only its "OSF_urls" section.
config_path = '../config/config.json'
with open(config_path) as cfgf:
    osf = json.load(cfgf)['OSF_urls']

---
Collect all combinations of config files, experimental conditions, and noise replacement methods into a (48-item) list of 3-tuples:

In [142]:
# Every key of the "emobase" entry except "features" is treated as an
# experimental condition.
experimental_conditions = {
    key for key in osf['emobase'] if key != "features"
}

# The noise replacement methods are read from whichever condition the set
# happens to yield first.
# NOTE(review): this assumes every condition and both config files share the
# same noise replacement keys -- confirm against config.json.
_reference_condition = list(experimental_conditions)[0]
noise_replacements = {
    key for key in osf['emobase'][_reference_condition]
}

# Cartesian product: config file x experimental condition x noise replacement.
config_exp_nr = [
    (cf, exp, nr)
    for cf in ["emobase", "ComParE_2016"]
    for exp in experimental_conditions
    for nr in noise_replacements
]

Load features from [OSF](https://osf.io/7kemj/):

In [219]:
# Nested lookup of everything loaded or computed per combination:
#   c[config file][experimental condition][noise replacement] -> dict
# Each leaf starts with a single "DataFrame" entry holding whatever
# `load_from_osf` returns for that combination.
c = {}
for config_file, condition, noise_replacement in config_exp_nr:
    # Create the intermediate levels on first use, reuse them afterwards.
    by_condition = c.setdefault(config_file, {})
    by_noise_replacement = by_condition.setdefault(condition, {})
    by_noise_replacement[noise_replacement] = {
        "DataFrame": load_from_osf(config_file, condition, noise_replacement)
    }

/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_anal

Fill in originals where no noise replacements occurred:

In [220]:
# Download the "use original" table. It is looked up below by stranger
# presence ("no" / "with") as the row label and response type
# ("button" / "vocal") as the column label.
originals = pd.read_csv(
    urllib.request.urlopen(osf["use original"]),
    index_col="Unnamed: 0"
)

# For every non-"original" noise replacement, add the selected rows of the
# matching "original" DataFrame to that noise replacement's DataFrame.
#
# NOTE: this cell is not idempotent. It writes the enlarged frame back into
# c[...]["DataFrame"], so running it a second time without re-running the
# loading cell appends the same rows again.
for config_file in ["emobase", "ComParE_2016"]:
    for stranger in ["no", "with"]:
        for condition in ["button", "vocal"]:
            by_noise_replacement = c[config_file][
                "{0}, {1} stranger".format(condition, stranger)
            ]
            # The table cell is a string that is evaluated as a Python
            # expression; its value is used as a `.loc` row selector.
            # SECURITY NOTE(review): `eval` runs on downloaded content. If
            # the cells are always plain literals (e.g. lists of labels),
            # `ast.literal_eval` would be the safe replacement -- confirm the
            # file format before switching.
            rows_to_use = by_noise_replacement['original']["DataFrame"].loc[
                eval(originals.loc[stranger, condition])
            ]
            for noise_replacement in by_noise_replacement:
                if noise_replacement == "original":
                    continue
                # `DataFrame.append` was removed in pandas 2.0; `pd.concat`
                # with `ignore_index=True` is the equivalent call.
                by_noise_replacement[noise_replacement]["DataFrame"] = pd.concat(
                    [
                        by_noise_replacement[noise_replacement]["DataFrame"],
                        rows_to_use.copy()
                    ],
                    ignore_index=True
                )

---
## Random Forests
Run random forests on each config file × experimental condition × noise replacement method,
outputting OOB confidence in a table:

In [229]:
# Table of out-of-bag scores: one row per (experimental condition, noise
# replacement) pair, one column per config file, pre-filled with NaN.
# The axis labels are deduplicated through sets and then sorted: a bare set
# has no defined order, and newer pandas versions reject a set passed as
# `columns=`.
cdf = pd.DataFrame(
    np.nan,
    index=pd.MultiIndex.from_tuples(
        sorted({(exp, nr) for cf, exp, nr in config_exp_nr}),
        names=[
            "experimental condition",
            "noise replacement"
        ]
    ),
    columns=sorted({cf for cf, exp, nr in config_exp_nr})
).sort_index()

# Fit one model per combination; keep what `SM_forest` returns next to the
# combination's data in `c`, and record the model's `oob_score_` in the table.
for cf, exp, nr in config_exp_nr:
    features, forest = SM_forest(c, cf, exp, nr)
    entry = c[cf][exp][nr]
    entry["features"] = features
    entry["random forest model"] = forest
    cdf.loc[(exp, nr), cf] = forest.oob_score_

In [239]:
# Show the OOB score table (rows: experimental condition and noise
# replacement; columns: config file).
cdf

Unnamed: 0_level_0,Unnamed: 1_level_0,ComParE_2016,emobase
experimental condition,noise replacement,Unnamed: 2_level_1,Unnamed: 3_level_1
"button, no stranger",adults only,0.619048,0.714286
"button, no stranger",adults removed,0.642857,0.571429
"button, no stranger",adults replaced: clone,0.619048,0.642857
"button, no stranger",adults replaced: pink noise,0.619048,0.619048
"button, no stranger",adults timeshifted,0.642857,0.642857
"button, no stranger",original,0.619048,0.738095
"button, with stranger",adults only,0.595238,0.452381
"button, with stranger",adults removed,0.571429,0.595238
"button, with stranger",adults replaced: clone,0.595238,0.714286
"button, with stranger",adults replaced: pink noise,0.52381,0.642857


Features are stored in

`c[`*config file*`][`*experimental condition*`][`*noise replacement*`]["features"]`

and models are stored in

`c[`*config file*`][`*experimental condition*`][`*noise replacement*`]["random forest model"]`

In [265]:
# A feature counts as "predictive" for one model when its importance exceeds
# this threshold.
IMPORTANCE_THRESHOLD = 0.0009

# most_predictive[response type][feature name] -> number of models (across
# config files, stranger conditions and noise replacements) in which that
# feature exceeded the threshold.
most_predictive = {
    "button": {},
    "vocal": {}
}
for cf, exp, nr in config_exp_nr:
    # The response type is the part of the condition name before the comma.
    response_type = exp.split(",")[0]
    features = c[cf][exp][nr]["features"]
    # Vectorized replacement for the row-by-row `.ix[i]` lookups (`.ix` was
    # removed in pandas 1.0): select the index labels of all rows whose
    # "importance" value exceeds the threshold.
    # NOTE(review): assumes the feature names are the index of the "features"
    # frame, which is what the original `.ix[i].name` returned -- confirm
    # against SM_forest.
    predictive_features = features.index[
        features["importance"] > IMPORTANCE_THRESHOLD
    ]
    counts = most_predictive[response_type]
    for feature_name in predictive_features:
        counts[feature_name] = counts.get(feature_name, 0) + 1
Let's see which features are most often predictive for our data:

In [266]:
# Tabulate the counts (one row per feature, one column per response type) and
# rank by the "vocal" count, breaking ties on the "button" count, highest
# first.
pd.DataFrame(most_predictive).sort_values(
    by=['vocal', 'button'],
    ascending=False
)

Unnamed: 0,button,vocal
mfcc_sma_de[10]_stddev,7.0,23.0
mfcc_sma_de[2]_kurtosis,6.0,21.0
mfcc_sma_de[10]_iqr1-2,1.0,18.0
mfcc_sma[12]_kurtosis,7.0,17.0
mfcc_sma_de[10]_range,7.0,17.0
mfcc_sma_de[10]_iqr2-3,6.0,17.0
mfcc_sma_de[3]_iqr1-3,4.0,17.0
mfcc_sma_de[10]_quartile1,2.0,17.0
mfcc_sma_de[6]_stddev,2.0,17.0
mfcc_sma_de[12]_kurtosis,1.0,17.0


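Out of a maximum of half of the following total for each column (the `button` and `vocal` conditions each account for half of the combinations):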

In [267]:
# Total number of (config file, experimental condition, noise replacement)
# combinations that were modelled.
len(config_exp_nr)

48