# Selective Mutism response paradigm analysis: voice

Authors:
    - Jon Clucas, 2017  <jon.clucas@childmind.org>
Copyright ©2017‒2018, Apache v2.0 License

In [9]:
# IPython setup: load the autoreload extension in mode 2 (re-import modules
# before each cell executes, so edits to local .py files are picked up) and
# render matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

Imports & function definitions:

In [141]:
import json
from sklearn.preprocessing import LabelEncoder
import numpy as np
import os
import pandas as pd
import sys
import urllib
# Absolute path of the directory one level above the current working
# directory; appended to sys.path (once) so that the project-local modules
# imported right after this can be resolved.
sm_rpa_v = os.path.abspath(os.pardir)
if sm_rpa_v not in sys.path:
    sys.path.append(sm_rpa_v)
from utilities import fetch_data
from voice_functions import *
# Read the project configuration (path is relative to the working directory)
# and keep only its 'OSF_urls' section, which the cells below index by
# feature-set name.
with open('../config/config.json') as cfgf:
    osf = json.load(cfgf)['OSF_urls']

---
Collect all combinations of config files, experimental conditions, and noise replacement methods into a (48-item) list of 3-tuples:

In [142]:
# Every key under the 'emobase' entry except "features" is treated as an
# experimental condition.
experimental_conditions = {
    key for key in osf['emobase'] if key != "features"
}

# Cartesian product of (config file, experimental condition, noise
# replacement). The noise-replacement keys are read from the first condition
# of the 'emobase' entry only -- presumably every condition and both config
# files share the same keys; confirm against config.json.
config_exp_nr = [
    (cf, exp, nr)
    for cf in ["emobase", "ComParE_2016"]
    for exp in experimental_conditions
    for nr in {
        key for key in osf['emobase'][list(experimental_conditions)[0]]
    }
]

In [None]:
# c: nested lookup of whatever load_from_osf returns, keyed as
#    c[config file][experimental condition][noise replacement].
c = {}

# cdf: bookkeeping table with one row per (experimental condition, noise
# replacement) pair and one column per config file. Cells start as NaN and
# are set to 1 once the matching dataset has been loaded.
# Row and column labels are passed as sorted lists rather than sets: sets are
# unordered (so the layout would vary between kernels) and newer pandas
# versions refuse a set for `columns`.
cdf = pd.DataFrame(
    np.nan,
    index=pd.MultiIndex.from_tuples(
        sorted({(exp, nr) for _, exp, nr in config_exp_nr}),
        names=[
            "experimental condition",
            "noise replacement"
        ]
    ),
    columns=sorted({cf for cf, _, _ in config_exp_nr})
).sort_index()

for cen in config_exp_nr:
    config_file, condition, noise_replacement = cen
    # setdefault creates the intermediate dictionaries on first use.
    c.setdefault(config_file, {}).setdefault(condition, {})[
        noise_replacement
    ] = load_from_osf(*cen)
    # Only reached when the load above did not raise.
    cdf.loc[(condition, noise_replacement), config_file] = 1

/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_analysis/emobase/emobase_features.csv
/home/jclucas/selective-mutism-eeg/SM_response_paradigm_anal

In [None]:
# Show the nested dictionary of loaded datasets
# (config file -> experimental condition -> noise replacement).
c

In [None]:
# Show the bookkeeping table: 1 where a dataset was loaded, NaN otherwise.
cdf

---
Load data from OSF:

In [None]:
# Read the four CSV tables addressed by the OSF URL configuration, in order:
# the two feature sets, the experimental conditions, and the diagnoses.
emobase, ComParE_2016, conditions, dx = (
    pd.read_csv(url)
    for url in (
        osf['emobase']['features'],
        osf['ComParE_2016']['features'],
        osf['conditions'],
        osf['dx'],
    )
)

Harmonize data formats:

In [None]:
# Re-encode all four tables in one call to the project helper
# `update_encoding`. The three parallel lists pair a value mapping with a
# column name and a target type:
#   * "URSI": replace the ID "M00494594" with "M00494954", no type change
#     (presumably a typo correction -- confirm);
#   * "Dx?":  map "_" and NaN to False and "SM" to True, typed as bool.
# NOTE(review): how the helper applies these is not visible here -- verify
# against voice_functions.update_encoding.
emobase, ComParE_2016, conditions, dx = update_encoding(
    [emobase, ComParE_2016, conditions, dx],
    [
        {"M00494594": "M00494954"},
        {"_": False, np.nan: False, "SM": True},
    ],
    ["URSI", "Dx?"],
    [None, bool],
)

In [None]:
# Display emobase without its "Unnamed: 0" column.
# NOTE(review): the result is not assigned, so `emobase` itself still contains
# "Unnamed: 0" in the cells below -- confirm whether the drop was meant to
# persist.
emobase.drop(columns="Unnamed: 0")

Merge datatables as necessary and integerize categorical data:

In [None]:
# For each feature table, merge in the conditions and diagnosis tables with
# the project helper `combine_data`, then pass the result through
# `int_categorize`; the processed tables replace the originals.
emobase, ComParE_2016 = (
    int_categorize(combine_data(feature_table, conditions, dx))
    for feature_table in (emobase, ComParE_2016)
)

## Random Forests

Import and initialize:

In [None]:
# Bring the classifier into scope for the cells below.
from sklearn.ensemble import RandomForestClassifier
# Instantiated with default settings only so the notebook displays its repr;
# the instance is not bound to a name and is not used afterwards.
RandomForestClassifier()

We can only have 1-D Y:

In [None]:
# Split the merged emobase table into the model inputs eX and the target eY
# using the project helper `make_forest`.
# NOTE(review): presumably eY is the single "Dx?" column and eX is everything
# else -- confirm against voice_functions.make_forest.
eX, eY = make_forest(emobase)

Try with 100 estimators:

In [None]:
# Fit a random forest on the emobase features.
# n_estimators is 100, as stated in the narrative for this step (the cell
# previously passed 10). random_state is pinned so the fitted trees, and the
# feature importances read from them below, are the same on every
# Restart & Run All; the particular seed value is arbitrary.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf = clf.fit(eX, eY)

In [None]:
# Display the fitted forest's raw importance scores, one value per input
# feature, in the column order of the training matrix.
clf.feature_importances_

In [None]:
# Label each importance score with the feature it belongs to and rank them.
# clf.feature_importances_ follows the column order of the matrix the forest
# was fitted on, so the names are taken from eX itself when it carries column
# labels. Index.difference returns its result sorted, which need not match
# that order; it is kept only as a fallback for an unlabeled eX.
feature_names = (
    eX.columns
    if hasattr(eX, "columns")
    else emobase.columns.difference(["Dx?"])
)
# Constructing the frame directly (instead of zipping into a dict) raises on
# a length mismatch between names and scores rather than silently truncating.
features = pd.DataFrame(
    {"importance": clf.feature_importances_},
    index=feature_names
).sort_values("importance", ascending=False)
features