# Selective Mutism response paradigm analysis: voice

Authors:
    - Jon Clucas, 2017  <jon.clucas@childmind.org>
Copyright ©2017, Apache v2.0 License

In [20]:
import json
from sklearn.preprocessing import LabelEncoder
import numpy as np
import os
import pandas as pd
import sys
sm_eeg = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sm_eeg not in sys.path: 
    sys.path.append(sm_eeg)
from utilities import fetch_data
with open(os.path.join('../config/config.json')) as cfgf:
    osf = json.load(cfgf)['OSF_urls']
    
    
def combine_data(openSMILE, SM, conditions, dx):
    """
    Function to combine imported DataFrames.
    
    Parameters
    ----------
    openSMILE: pandas DataFrame
        features from openSMILE
    
    SM: pandas DataFrame
        SM diagnotic measures
    
    conditions: pandas DataFrame
        notes about each recording
        
    dx: pandas DataFrame
        diagnostic information
        
    Returns
    -------
    d2: pandas DataFrame
        merged DataFrame
    """
    openSMILE['stranger'] = openSMILE['stranger'] == 'w'
    d0 = pd.merge(SM, dx, on='URSI', how='left')
    d = pd.merge(d0, openSMILE, on='URSI', how='right')
    d2 = pd.merge(d, conditions, on=['URSI', 'stranger'], how='left')
    return(d2)


def reencode(df, mapping, field, dtype=None):
    """
    Private function to assist `update_encoding` to update encodings
    in a given DataFrame or list of DataFrames.
    
    Parameters
    ----------
    df : pandas DataFrame or list thereof
        DataFrame to correct data in
    
    mapping : dictionary or list thereof
        {incorrect:correct} to be applied to (all) given DataFrame(s)
        
    field : string
        the column name or list thereof to update in the given DataFrame(s);
        if a list, must be the same length as mapping and in the same order
        
    Returns
    -------
    df(2) : pandas DataFrame or list thereof
        same shape as input, but with corrected URSIs
    """
    if field in df.columns:
        mapping = {**mapping, **{ursi:ursi for ursi in df[field] if ursi not in mapping}}
        if dtype:
            df[field] = df[field].map(mapping).astype(dtype)
        else:
            df[field] = df[field].map(mapping)
    return(df)

def update_encoding(df, mapping, field, dtype=None):
    """
    Function to update encodings in a given DataFrame or list of DataFrames.
    
    Parameters
    ----------
    df : pandas DataFrame or list thereof
        DataFrame to correct data in
    
    mapping : dictionary or list thereof
        {incorrect:correct} to be applied to (all) given DataFrame(s)
        
    field : string
        the column name or list thereof to update in the given DataFrame(s);
        if a list, must be the same length as `mapping` and in the same order
        
    dtype : type (optional)
        datatype or list thereof to recast the given column; like `field`,
        if a list, must be the same length as `mapping` and in the same order
        
    Returns
    -------
    df(2) : pandas DataFrame or list thereof
        same shape as input, but with corrected URSIs
    """
    if(type(df) == list):
        df2 = list()
        for d in df:
            df2.append(update_encoding(d, mapping, field))
        return(df2)
    else:
        if(type(mapping) == list):
            for i, m in enumerate(mapping):
                df = reencode(df, m, field[i])
        else:
            df = reencode(df, mapping, field)
        return(df)
        

def make_forest(configed_df):
    """
    Function to get training and target data, filling in unaltered rows when no
    altered row exists

    Parameters
    ----------     
    configed_df : pandas DataFrame
        DataFrame with openSMILE output and demographic features

    Returns
    -------
    x_trees : numpy array
        array of [n_participants × n_features] size
        filled with training data (features)

    y_trees : numpy array
        array of [n_participants × n_dx_features] size
        filled with target data (diagnoses)
    """
    ycols = [col for col in configed_df.columns if ('smq' in col)] + ['Dx?','URSI']
    xcols = configed_df.columns.difference(ycols)
    for col in xcols:
        try:
            float(configed_df[col][0])
        except:
            enc = LabelEncoder()
            enc.fit(np.array(configed_df[col]).dropna())
            configed_df[col] = enc.transform(np.array(configed_df[col]).dropna())
    xtrees = np.array(configed_df[xcols]).reshape(len(configed_df), len(xcols))
    ytrees = np.array(configed_df[ycols]).reshape(len(configed_df), len(ycols))
    return (xtrees, ytrees)

In [21]:
emobase = pd.read_csv(osf['emobase'])
ComParE_2016 = pd.read_csv(osf['ComParE_2016'])
SMQ = pd.read_csv(osf['SMQ'])
conditions = pd.read_csv(osf['conditions'])
dx = pd.read_csv(osf['dx'])

In [22]:
emobase, ComParE_2016, SMQ, conditions = update_encoding(
                                             [emobase, ComParE_2016, SMQ, conditions],
                                             [{"M00494594":"M00494954"},
                                              {"_":False, np.nan:False, "SM":True}],
                                             ["URSI", "Dx?"],
                                             [None, bool]
                                         )

In [23]:
emobase = combine_data(emobase, SMQ, conditions, dx)
ComParE_2016 = combine_data(ComParE_2016, SMQ, conditions, dx)

In [24]:
emobase["Dx?"].unique()

array(['_', 'SM', nan], dtype=object)

In [25]:
for c in [col for col in list(emobase.columns) if col not in ["URSI", "Dx?"]]:
        print(": ".join([c, str(emobase[c].dtype)]))

smq_as: float64
smq_hf: float64
smq_ss: float64
smq_id: float64
Unnamed: 0: int64
F0_sma_amean: float64
F0_sma_de_amean: float64
F0_sma_de_iqr1-2: float64
F0_sma_de_iqr1-3: float64
F0_sma_de_iqr2-3: float64
F0_sma_de_kurtosis: float64
F0_sma_de_linregc1: float64
F0_sma_de_linregc2: float64
F0_sma_de_linregerrA: float64
F0_sma_de_linregerrQ: float64
F0_sma_de_max: float64
F0_sma_de_maxPos: float64
F0_sma_de_min: float64
F0_sma_de_minPos: float64
F0_sma_de_quartile1: float64
F0_sma_de_quartile2: float64
F0_sma_de_quartile3: float64
F0_sma_de_range: float64
F0_sma_de_skewness: float64
F0_sma_de_stddev: float64
F0_sma_iqr1-2: float64
F0_sma_iqr1-3: float64
F0_sma_iqr2-3: float64
F0_sma_kurtosis: float64
F0_sma_linregc1: float64
F0_sma_linregc2: float64
F0_sma_linregerrA: float64
F0_sma_linregerrQ: float64
F0_sma_max: float64
F0_sma_maxPos: float64
F0_sma_min: float64
F0_sma_minPos: float64
F0_sma_quartile1: float64
F0_sma_quartile2: float64
F0_sma_quartile3: float64
F0_sma_range: float64
F

mfcc_sma[9]_quartile3: float64
mfcc_sma[9]_range: float64
mfcc_sma[9]_skewness: float64
mfcc_sma[9]_stddev: float64
mfcc_sma_de[10]_amean: float64
mfcc_sma_de[10]_iqr1-2: float64
mfcc_sma_de[10]_iqr1-3: float64
mfcc_sma_de[10]_iqr2-3: float64
mfcc_sma_de[10]_kurtosis: float64
mfcc_sma_de[10]_linregc1: float64
mfcc_sma_de[10]_linregc2: float64
mfcc_sma_de[10]_linregerrA: float64
mfcc_sma_de[10]_linregerrQ: float64
mfcc_sma_de[10]_max: float64
mfcc_sma_de[10]_maxPos: float64
mfcc_sma_de[10]_min: float64
mfcc_sma_de[10]_minPos: float64
mfcc_sma_de[10]_quartile1: float64
mfcc_sma_de[10]_quartile2: float64
mfcc_sma_de[10]_quartile3: float64
mfcc_sma_de[10]_range: float64
mfcc_sma_de[10]_skewness: float64
mfcc_sma_de[10]_stddev: float64
mfcc_sma_de[11]_amean: float64
mfcc_sma_de[11]_iqr1-2: float64
mfcc_sma_de[11]_iqr1-3: float64
mfcc_sma_de[11]_iqr2-3: float64
mfcc_sma_de[11]_kurtosis: float64
mfcc_sma_de[11]_linregc1: float64
mfcc_sma_de[11]_linregc2: float64
mfcc_sma_de[11]_linregerrA: fl

## Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier()

In [None]:
eX, eY = make_forest(emobase)

In [None]:
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(eX, eY)