# Selective Mutism response paradigm analysis: voice

Authors:
    - Jon Clucas, 2017  <jon.clucas@childmind.org>
Copyright ©2017, Apache v2.0 License

Imports & function definitions:

In [10]:
import json
from sklearn.preprocessing import LabelEncoder
import numpy as np
import os
import pandas as pd
import sys
# Put the parent of the current working directory on the import path
# (presumably where the project-local `utilities` module lives -- confirm).
sm_eeg = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sm_eeg not in sys.path: 
    sys.path.append(sm_eeg)
from utilities import fetch_data
# Read the 'OSF_urls' section of the JSON config. The path is relative to the
# current working directory, so the notebook must be run from its own folder.
with open(os.path.join('../config/config.json')) as cfgf:
    osf = json.load(cfgf)['OSF_urls']
    
    
def combine_data(openSMILE, conditions, dx):
    """
    Function to combine imported DataFrames.

    The `stranger` column of `openSMILE` is turned into booleans (True where
    the value is 'w'), diagnostic information is attached by `URSI`, and the
    recording notes are attached by `URSI` and `stranger`. Every row of
    `openSMILE` appears in the result at least once; none of the inputs is
    modified.

    Parameters
    ----------
    openSMILE: pandas DataFrame
        features from openSMILE; must have `URSI` and `stranger` columns

    conditions: pandas DataFrame
        notes about each recording; must have `URSI` and `stranger` columns
        (`stranger` is assumed to already be boolean -- confirm with the data)

    dx: pandas DataFrame
        diagnostic information; must have a `URSI` column

    Returns
    -------
    d2: pandas DataFrame
        merged DataFrame
    """
    # Build the boolean flag on a new DataFrame instead of overwriting the
    # caller's column: a second call on the same input would otherwise compare
    # booleans with 'w' and silently produce all False.
    features = openSMILE.assign(stranger=openSMILE['stranger'] == 'w')
    # how='right' keeps recordings that have no diagnostic row.
    with_dx = pd.merge(dx, features, on='URSI', how='right')
    # how='left' keeps recordings that have no notes.
    d2 = pd.merge(with_dx, conditions, on=['URSI', 'stranger'], how='left')
    return d2


def int_categorize(df):
    """
    Function to reencode the categorical (object-dtype) columns of a
    DataFrame as integers.

    A "Dx?" column, when present, is first made boolean with missing values
    counted as False; that assignment is made on the DataFrame passed in.
    Every column whose dtype is then "object" is cast to a pandas category
    and each value is replaced by the position of its category.

    Parameter
    ---------
    df : DataFrame
        DataFrame to reencode

    Returns
    -------
    df : DataFrame
        Reencoded DataFrame
    """
    if "Dx?" in df.columns:
        df["Dx?"] = df["Dx?"].fillna(False).astype(bool)
    object_cols = [
        name for name in list(df.columns) if str(df[name].dtype) == "object"
    ]
    # First pass: an empty mapping per column leaves the values as they are
    # and only recasts the columns to 'category'.
    no_op_maps = [dict() for _ in object_cols]
    df = update_encoding(df, no_op_maps, object_cols, 'category')
    # Second pass: swap each category label for its integer position.
    for name in object_cols:
        label_to_code = {}
        for code, label in enumerate(list(df[name].cat.categories)):
            label_to_code[label] = code
        df = update_encoding(df, label_to_code, name, int)
    return df

def reencode(df, mapping, field, dtype=None):
    """
    Private function to assist `update_encoding`: remap the values of one
    column of one DataFrame and optionally recast that column.

    Values that are not keys of `mapping` are kept as they are. The column is
    reassigned on the DataFrame that was passed in, which is also the object
    returned. A DataFrame without the column is returned untouched.

    Parameters
    ----------
    df : pandas DataFrame
        DataFrame to correct data in

    mapping : dictionary
        {incorrect:correct} to be applied to the column

    field : string
        the column name to update in the given DataFrame

    dtype : type or string (optional)
        datatype to recast the column to after mapping

    Returns
    -------
    df : pandas DataFrame
        same shape as input, but with corrected values in `field`
    """
    if field not in df.columns:
        return df
    # Series.map turns values missing from the dict into NaN, so add an
    # identity entry for every value the caller did not list.
    passthrough = {value: value for value in df[field] if value not in mapping}
    recoded = df[field].map({**mapping, **passthrough})
    if dtype is not None:
        recoded = recoded.astype(dtype)
    df[field] = recoded
    return df


def update_encoding(df, mapping, field, dtype=None):
    """
    Function to update encodings in a given DataFrame or list of DataFrames.

    Parameters
    ----------
    df : pandas DataFrame or list thereof
        DataFrame to correct data in

    mapping : dictionary or list thereof
        {incorrect:correct} to be applied to (all) given DataFrame(s)

    field : string
        the column name or list thereof to update in the given DataFrame(s);
        if a list, must be the same length as `mapping` and in the same order

    dtype : type (optional)
        datatype or list thereof to recast the given column; like `field`,
        if a list, must be the same length as `mapping` and in the same order.
        A single dtype given with a list of mappings is applied to every
        field; a `None` entry leaves that field's dtype alone.

    Returns
    -------
    df(2) : pandas DataFrame or list thereof
        same shape as input, but with corrected values
    """
    if isinstance(df, list):
        # Forward `dtype` too; it used to be dropped here, so recasts were
        # silently skipped whenever a list of DataFrames was given.
        return [update_encoding(d, mapping, field, dtype) for d in df]
    if isinstance(mapping, list):
        dtype_per_field = isinstance(dtype, (list, tuple))
        for i, m in enumerate(mapping):
            # Pair each field with its own dtype when a list was supplied.
            df = reencode(
                df, m, field[i], dtype[i] if dtype_per_field else dtype
            )
        return df
    return reencode(df, mapping, field, dtype)
        

def make_forest(configed_df):
    """
    Function to get training and target data from a merged DataFrame.

    Every column except 'Dx?' is used as a feature. A feature column whose
    first value cannot be read as a float is label-encoded to integers. The
    DataFrame passed in is not modified.

    Parameters
    ----------
    configed_df : pandas DataFrame
        DataFrame with openSMILE output and demographic features; must have
        a 'Dx?' column

    Returns
    -------
    x_trees : numpy array
        array of [n_rows × n_features] size filled with training data
        (features); columns follow the order of
        `configed_df.columns.difference(['Dx?'])`

    y_trees : numpy array
        1-D array of length n_rows filled with target data (the 'Dx?' column)
    """
    ycols = ['Dx?']
    xcols = configed_df.columns.difference(ycols)
    # Encode into a separate frame so the caller's DataFrame keeps its
    # original values.
    features = configed_df[xcols].copy()
    for col in xcols:
        try:
            # Probe the first row by position: a lookup by label 0 fails on
            # any non-default index and used to push numeric columns through
            # the label encoder.
            float(features[col].iloc[0])
        except (TypeError, ValueError, IndexError):
            values = np.array(features[col])
            enc = LabelEncoder()
            enc.fit(values)
            features[col] = enc.transform(values)
    xtrees = np.array(features)
    # The target is flattened to 1-D.
    ytrees = np.array(configed_df[ycols]).ravel()
    return (xtrees, ytrees)

Load data from OSF:

In [2]:
# Each table is read as CSV from the location stored under the matching key of
# `osf` (the 'OSF_urls' section of the config file).
emobase = pd.read_csv(osf['emobase'])
ComParE_2016 = pd.read_csv(osf['ComParE_2016'])
conditions = pd.read_csv(osf['conditions'])
dx = pd.read_csv(osf['dx'])

Harmonize data formats:

In [3]:
# Apply the same two corrections to all four tables. The arguments are, in
# order: the tables, one {old: new} mapping per field, the field names, and
# the dtype requested for each field. A table that lacks a field is left
# unchanged for that field.
emobase, ComParE_2016, conditions, dx = update_encoding(
                                             [emobase, ComParE_2016, conditions, dx],
                                             [{"M00494594":"M00494954"},
                                              {"_":False, np.nan:False, "SM":True}
                                             ],
                                             ["URSI", "Dx?"],
                                             [None, bool]
                                         )

Merge the data tables as necessary and encode categorical data as integers:

In [4]:
# For each openSMILE feature set: join it with the diagnoses and recording
# notes, then replace object-dtype columns with integer codes.
emobase = int_categorize(combine_data(emobase, conditions, dx))
ComParE_2016 = int_categorize(combine_data(ComParE_2016, conditions, dx))

## Random Forests

Import and initialize:

In [5]:
from sklearn.ensemble import RandomForestClassifier
# Bare expression: the notebook displays the classifier's default parameters
# as the cell output; the instance itself is discarded.
RandomForestClassifier()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

We can only have 1-D Y:

In [11]:
# eX: feature matrix (every column except 'Dx?'); eY: flattened 'Dx?' target.
eX, eY = make_forest(emobase)

Fit a random forest with 10 estimators:

In [51]:
# No random_state is set, so the fitted forest (and the importances shown
# below) can differ from run to run.
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(eX, eY)

In [52]:
# One importance value per column of eX, in the same column order.
clf.feature_importances_

array([  8.50634095e-03,   1.40944341e-04,   1.78407353e-04,
         9.51655545e-05,   1.70747061e-04,   7.14497810e-03,
         0.00000000e+00,   3.03428326e-04,   1.42546420e-03,
         4.24884016e-04,   6.52819808e-05,   2.05507614e-04,
         1.48314946e-04,   9.57261342e-05,   1.83882760e-04,
         0.00000000e+00,   0.00000000e+00,   9.47757744e-05,
         4.94393325e-04,   3.73559818e-04,   1.75651477e-04,
         0.00000000e+00,   4.15377785e-04,   1.85538233e-04,
         1.45947786e-04,   3.70260313e-03,   4.78626072e-03,
         1.25236646e-03,   0.00000000e+00,   2.53576576e-04,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   3.06961623e-03,   9.55722450e-05,
         1.01224851e-03,   2.74733621e-03,   4.21636162e-04,
         1.66936561e-04,   4.58183557e-04,   0.00000000e+00,
         0.00000000e+00,   3.15725897e-04,   1.55677639e-04,
         4.53262453e-04,   8.38944205e-05,   2.58210587e-04,
         1.86239427e-04,

In [53]:
# Pair every importance with its feature name. `columns.difference(["Dx?"])`
# is the same expression `make_forest` uses to pick and order the feature
# columns, so names and importances line up. Sorted most important first.
features = pd.DataFrame.from_dict(dict(zip(
               emobase.columns.difference(["Dx?"]), clf.feature_importances_
               )), orient='index'
           ).rename(columns={0:"importance"}
           ).sort_values("importance", ascending=False)
print(features)

                                 importance
mfcc_sma[4]_min                    0.022855
mfcc_sma[12]_linregerrA            0.022768
lspFreq_sma[5]_linregerrQ          0.021298
URSI                               0.020257
pcm_loudness_sma_stddev            0.018530
mfcc_sma[5]_quartile1              0.015928
mfcc_sma[4]_linregerrQ             0.015332
mfcc_sma[6]_linregerrQ             0.015037
lspFreq_sma[5]_stddev              0.014630
mfcc_sma[6]_quartile3              0.013871
mfcc_sma[2]_stddev                 0.013162
mfcc_sma[1]_stddev                 0.011472
lspFreq_sma[2]_quartile3           0.011140
voiceProb_sma_linregerrQ           0.010732
mfcc_sma[4]_amean                  0.010653
mfcc_sma[7]_min                    0.010540
mfcc_sma[8]_stddev                 0.009735
mfcc_sma[8]_amean                  0.009066
pcm_loudness_sma_linregerrA        0.009056
voiceProb_sma_stddev               0.009052
mfcc_sma[5]_quartile3              0.009006
pcm_loudness_sma_quartile1      