# OpenSMILE Analysis
This notebook loads OpenSMILE CSV data, cleans it, and plots it.

## Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

## Load .csv data with results of OpenSMILE Analysis
First we load the .csv data and clean it (removing NaNs); then we store the information from all files in separate pandas DataFrames containing affect, emotion, level of interest and valence/arousal for all participants.

In [2]:
data = pd.read_csv("UIST2019_OpenSMILE.csv")

# Column names of the OpenSMILE outputs, grouped by voice feature.
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

# One frame per voice feature, each carrying 'Filename' so it can be linked to a speaker.
# The column selections reuse the label lists so both can never drift apart.
# .copy(deep=True) makes each frame independent of `data`; adding the 'Char_ID'
# column below therefore neither raises SettingWithCopyWarning nor touches `data`.
df_emotion = data[emotion_label + ['Filename']].copy(deep=True)
df_affect = data[affect_label + ['Filename']].copy(deep=True)
df_loi = data[loi_label + ['Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)

# The speaker ID is encoded in the filename. Stripping every non-digit character
# leaves the digits, which are stored (still as strings) in 'Char_ID'.
# The pattern is a raw string: '\D' inside a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE(review): this keeps *all* digits of the filename - confirm that filenames
# contain no digits other than the speaker ID.
df_emotion['Char_ID'] = df_emotion['Filename'].replace(r'\D+', '', regex=True)
df_affect['Char_ID'] = df_affect['Filename'].replace(r'\D+', '', regex=True)
df_loi['Char_ID'] = df_loi['Filename'].replace(r'\D+', '', regex=True)
df_ar_val['Char_ID'] = df_ar_val['Filename'].replace(r'\D+', '', regex=True)

## Let's load information about the speakers
The speaker information is stored in a single .csv file with four important columns: ID, Age, Sex and Academic Status. The previously loaded OpenSMILE files are named after the corresponding speaker ID (e.g. the speaker with ID 0 has the two files 0_a.csv and 0_b.csv), so the voice data can be linked to the speaker data.

In [3]:
char_data = pd.read_csv("UIST2019_CharacterData.csv")

# The join keys on both sides must share a dtype, so the speaker ID and every
# 'Char_ID' column (digit strings extracted from the filename) are cast to int.
char_data['ID'] = char_data['ID'].astype(int)
for voice_frame in (df_ar_val, df_emotion, df_affect, df_loi):
    voice_frame['Char_ID'] = voice_frame['Char_ID'].astype(int)

# Left join: every voice sample is kept and the columns of its speaker are
# attached by matching 'Char_ID' against the speaker table's 'ID'.
speaker_join = dict(how='left', left_on='Char_ID', right_on='ID')
df_ar_val_char = df_ar_val.merge(char_data, **speaker_join)
df_emotion_char = df_emotion.merge(char_data, **speaker_join)
df_affect_char = df_affect.merge(char_data, **speaker_join)
df_loi_char = df_loi.merge(char_data, **speaker_join)

## Now have a look at correlation
### Have a look at arousal and what it correlates with

In [4]:
# Correlate arousal with every emotion, affect and level-of-interest column.
# For each label hp.correlations prints a correlation and a Cohen's d value
# (see the cell output); its return value is kept in the cor_ar_* variables.
# NOTE(review): an earlier comment here said the pandas .corr default is
# Spearman; pandas defaults to Pearson - confirm which method hp.correlations uses.
arousal = df_ar_val['Arousal']

print('Emotions and Arousal')
cor_ar_emo = hp.correlations(arousal, df_emotion, emotion_label)

print('Affect and Arousal')
cor_ar_aff = hp.correlations(arousal, df_affect, affect_label)

print('Level of Interest and Arousal')
cor_ar_loi = hp.correlations(arousal, df_loi, loi_label)

Emotions and Arousal
Correlation between Arousal and Anger: -0.11960116467375333
Cohen d: 1.0038101525157788
Correlation between Arousal and Boredom: -0.15724156022576966
Cohen d: 0.9280649625886008
Correlation between Arousal and Disgust: 0.31142090322066945
Cohen d: 0.12619986564219535
Correlation between Arousal and Fear: -0.12065193543111381
Cohen d: 1.0088509049605552
Correlation between Arousal and Happiness: -0.12745701355512387
Cohen d: 1.0068969820425526
Correlation between Arousal and Emo_Neutral: -0.12625517556888255
Cohen d: 1.0000073098884006
Correlation between Arousal and Sadness: -0.2690114288749918
Cohen d: -5.460177225486265
Affect and Arousal
Correlation between Arousal and Aggressiv: 0.06684792387301247
Cohen d: 0.048486623323264935
Correlation between Arousal and Cheerful: -0.5678322970999742
Cohen d: -1.2806979634109281
Correlation between Arousal and Intoxicated: 0.614456354498157
Cohen d: -1.1121919431101954
Correlation between Arousal and Nervous: 0.20051069137

### Now look at valence

In [5]:
# Same procedure as for arousal, now with valence as the reference series:
# correlate it with every emotion, affect and level-of-interest column.
valence = df_ar_val['Valence']

print('Emotions and valence')
cor_val_emo = hp.correlations(valence, df_emotion, emotion_label)

print('Affect and valence')
cor_val_aff = hp.correlations(valence, df_affect, affect_label)

print('Level of Interest and valence')
cor_val_loi = hp.correlations(valence, df_loi, loi_label)

Emotions and valence
Correlation between Valence and Anger: -0.3124995201247417
Cohen d: 1.4200472018669366
Correlation between Valence and Boredom: -0.26509745567227183
Cohen d: 1.3594280990223377
Correlation between Valence and Disgust: -0.20672906887783082
Cohen d: 0.5867104641016552
Correlation between Valence and Fear: -0.27483848312832276
Cohen d: 1.4239631878102155
Correlation between Valence and Happiness: -0.3065434608342312
Cohen d: 1.422446394407046
Correlation between Valence and Emo_Neutral: -0.24688029616822277
Cohen d: 1.417063981453608
Correlation between Valence and Sadness: 0.26306821038460826
Cohen d: -4.373471293321692
Affect and valence
Correlation between Valence and Aggressiv: -0.3335699261573498
Cohen d: 0.6561805645814583
Correlation between Valence and Cheerful: -0.7241748894708327
Cohen d: -0.6553855428137522
Correlation between Valence and Intoxicated: 0.5310386061614225
Cohen d: -0.6546721565220764
Correlation between Valence and Nervous: 0.6158725167513683

### Now look at the different emotions and how they correlate with affect and level of interest

In [6]:
def _emotion_vs_affect_and_loi(column, name):
    """Correlate one emotion column with all affect and level-of-interest columns.

    Prints the same three headings the hand-written version printed
    (upper-case emotion name, 'Affect and <name>', 'Level of Interest and <name>');
    hp.correlations prints the individual results in between.

    Parameters
    ----------
    column : str
        Column of ``df_emotion`` used as the reference series.
    name : str
        Lower-case emotion name used in the printed headings.

    Returns
    -------
    tuple
        ``(affect_result, loi_result)``, each exactly as returned by
        ``hp.correlations``.
    """
    reference = df_emotion[column]
    print(name.upper())
    print('Affect and ' + name)
    affect_result = hp.correlations(reference, df_affect, affect_label)
    print('Level of Interest and ' + name)
    loi_result = hp.correlations(reference, df_loi, loi_label)
    return affect_result, loi_result


# One call per emotion, in the original order; the result names are unchanged.
cor_ang_aff, cor_ang_loi = _emotion_vs_affect_and_loi('Anger', 'anger')
cor_bor_aff, cor_bor_loi = _emotion_vs_affect_and_loi('Boredom', 'boredom')
cor_dis_aff, cor_dis_loi = _emotion_vs_affect_and_loi('Disgust', 'disgust')
cor_fea_aff, cor_fea_loi = _emotion_vs_affect_and_loi('Fear', 'fear')
cor_hap_aff, cor_hap_loi = _emotion_vs_affect_and_loi('Happiness', 'happiness')
# The neutral emotion lives in column 'Emo_Neutral' but is reported as 'neutral'.
cor_eneu_aff, cor_eneu_loi = _emotion_vs_affect_and_loi('Emo_Neutral', 'neutral')
cor_sad_aff, cor_sad_loi = _emotion_vs_affect_and_loi('Sadness', 'sadness')

ANGER
Affect and anger
Correlation between Anger and Aggressiv: 0.23964088160619282
Cohen d: -2.575375652135281
Correlation between Anger and Cheerful: 0.2766279052354653
Cohen d: -2.54244685327253
Correlation between Anger and Intoxicated: -0.19822753557486464
Cohen d: -1.8417008864025526
Correlation between Anger and Nervous: -0.13735598175422528
Cohen d: -1.3004006304271691
Correlation between Anger and Aff_Neutral: -0.16259822887082206
Cohen d: -2.34163026077441
Correlation between Anger and Tired: 0.1105955221813773
Cohen d: -1.5252241417571457
Level of Interest and anger
Correlation between Anger and Disinterest: -0.16507842650147914
Cohen d: -1.0834350225209848
Correlation between Anger and Normal: 0.010935129350005316
Cohen d: -7.514696690964597
Correlation between Anger and High Interest: 0.07164551352517692
Cohen d: -0.9999633842840118
BOREDOM
Affect and boredom
Correlation between Boredom and Aggressiv: 0.07834589344162006
Cohen d: -2.1330200744748296
Correlation between Bor

### Now have a look at affect

In [7]:
def _affect_vs_loi(column, name):
    """Correlate one affect column with all level-of-interest columns.

    Prints the upper-case affect name and 'Level of Interest and <name>' before
    delegating to hp.correlations, which prints the individual results.

    Parameters
    ----------
    column : str
        Column of ``df_affect`` used as the reference series.
    name : str
        Lower-case affect name used in the printed headings.

    Returns
    -------
    The value returned by ``hp.correlations`` for this affect column.
    """
    print(name.upper())
    print('Level of Interest and ' + name)
    return hp.correlations(df_affect[column], df_loi, loi_label)


# One call per affect class, in the original order; the result names are unchanged.
cor_agg_loi = _affect_vs_loi('Aggressiv', 'aggressiv')
cor_che_loi = _affect_vs_loi('Cheerful', 'cheerful')
cor_tox_loi = _affect_vs_loi('Intoxicated', 'intoxicated')
cor_ner_loi = _affect_vs_loi('Nervous', 'nervous')
# The neutral affect lives in column 'Aff_Neutral' but is reported as 'neutral'.
# The result name lacks the '_loi' suffix of its siblings; it is kept as-is so
# existing references to `cor_aneu` keep working.
cor_aneu = _affect_vs_loi('Aff_Neutral', 'neutral')
cor_tir_loi = _affect_vs_loi('Tired', 'tired')

AGGRESSIV
Level of Interest and aggressiv
Correlation between Aggressiv and Disinterest: -0.37600554602965947
Cohen d: 0.6172006300170615
Correlation between Aggressiv and Normal: 0.18040556873532998
Cohen d: -6.405055252844228
Correlation between Aggressiv and High Interest: -0.018773756064177527
Cohen d: -0.054025108125872405
CHEERFUL
Level of Interest and cheerful
Correlation between Cheerful and Disinterest: -0.5724896790302542
Cohen d: 1.887980017620989
Correlation between Cheerful and Normal: 0.2795570728565973
Cohen d: -3.553168536617402
Correlation between Cheerful and High Interest: -0.03429455210080225
Cohen d: 1.2709946787499047
INTOXICATED
Level of Interest and intoxicated
Correlation between Intoxicated and Disinterest: 0.28524946241995097
Cohen d: 1.4654871740406061
Correlation between Intoxicated and Normal: -0.13644370042294082
Cohen d: -2.6319160155207424
Correlation between Intoxicated and High Interest: 0.013754189913378435
Cohen d: 1.106291756382935
NERVOUS
Level of

## Chi-squared Test of Independence
We start with the characteristic sex. The null hypothesis states that the two categorical variables sex and e.g. emotion are independent.

For that, we convert the previously used data frames, which contain probabilities of e.g. emotion, affect etc., into frequency tables using the helper method calcFrequencyTable(). For each row (sample), the function selects the maximum probability and counts it as one occurrence of that class. Furthermore, the function takes an integer so that the matching voice-feature labels (e.g. the emotion labels) are generated. The second integer defines the character feature, e.g. Age, so that the data set can be split into an n x m table, where n stands for the number of character-feature groups (in most cases 2, but 3 for age) and m stands for the number of voice features, e.g. the number of different emotions.  
The frequency tables are then used as input for the stats.chi2_contingency() function.

If the p-value is significant, residuals are computed to identify the cells in which the groups differ, e.g. the data sets differ in the emotion 'anger', which contributes to the significant p-value.

In [8]:
# hp.chi2 receives: the merged frame, the speaker characteristic, an integer code
# that selects the voice-feature label set (the pairing below suggests
# 0 = emotion, 1 = affect, 2 = arousal/valence, 3 = level of interest -
# TODO confirm in Helper) and a boolean flag whose meaning is not visible here.
# NOTE(review): the saved output of this cell is
# "TypeError: reduction operation 'argmax' not allowed for this dtype", and the
# following chi2 cells were never executed - the helper needs fixing before
# these results can be trusted.
# The '_sec_' spelling in three result names is kept so existing references
# keep working.
characteristic = 'Sex'
emo_sex_chi2 = hp.chi2(df_emotion_char, characteristic, 0, True)
aff_sec_chi2 = hp.chi2(df_affect_char, characteristic, 1, True)
ar_val_sec_chi2 = hp.chi2(df_ar_val_char, characteristic, 2, True)
loi_sec_chi2 = hp.chi2(df_loi_char, characteristic, 3, True)

TypeError: reduction operation 'argmax' not allowed for this dtype

Now move on to academic status, the hypothesis being that the variables academic status and e.g. emotion are independent.

In [None]:
# Chi-squared test of independence between academic status and each voice
# feature. The integer selects the voice-feature label set inside hp.chi2
# (presumably 0 = emotion, 1 = affect, 2 = arousal/valence,
# 3 = level of interest - TODO confirm in Helper).
characteristic = 'Academic Status'
emo_aca_chi2 = hp.chi2(df_emotion_char, characteristic, 0, True)
aff_aca_chi2 = hp.chi2(df_affect_char, characteristic, 1, True)
ar_val_aca_chi2 = hp.chi2(df_ar_val_char, characteristic, 2, True)
loi_aca_chi2 = hp.chi2(df_loi_char, characteristic, 3, True)

Now let's look if age and e.g. emotion/ affect/ arousal-valence/ level of interest are independent

In [None]:
# Chi-squared test of independence between age group and each voice feature.
# The integer selects the voice-feature label set inside hp.chi2
# (presumably 0 = emotion, 1 = affect, 2 = arousal/valence,
# 3 = level of interest - TODO confirm in Helper).
# The results feed the post-hoc tests in the next section.
characteristic = 'Age'
emo_age_chi2 = hp.chi2(df_emotion_char, characteristic, 0, True)
aff_age_chi2 = hp.chi2(df_affect_char, characteristic, 1, True)
ar_val_age_chi2 = hp.chi2(df_ar_val_char, characteristic, 2, True)
loi_age_chi2 = hp.chi2(df_loi_char, characteristic, 3, True)

## Post-Hoc tests for age, as it has three different groups

In [None]:
# Post-hoc tests for the age chi2 results, Bonferroni-corrected.
# Each call receives element [1] of the corresponding hp.chi2 result
# (presumably the frequency table - TODO confirm in Helper) and returns four
# values that are unpacked as: reject list, corrected p-values, group
# combinations and residuals.
print('post-hoc emotions and different groups')
emo_reject_list, emo_corrected_p_vals, emo_combinations, emo_residuals = hp.chi2_post_hoc(
    emo_age_chi2[1], 'bonferroni', True, True)

# Fix: the affect results used to be unpacked into emo_corrected_p_vals and
# emo_combinations, silently overwriting the emotion results from the call above.
print('\n post-hoc affect and different groups')
aff_reject_list, aff_corrected_p_vals, aff_combinations, aff_residuals = hp.chi2_post_hoc(
    aff_age_chi2[1], 'bonferroni', True, True)

print('\n post-hoc arousal-valence and different groups')
ar_val_reject_list, ar_val_corrected_p_vals, ar_val_combinations, ar_val_residuals = hp.chi2_post_hoc(
    ar_val_age_chi2[1], 'bonferroni', True, True)

print('\n post-hoc level of intereset and different groups')
loi_reject_list, loi_corrected_p_vals, loi_combinations, loi_residuals = hp.chi2_post_hoc(
    loi_age_chi2[1], 'bonferroni', True, True)

## ANOVA Test

In [14]:
# Every ANOVA input keeps the voice-feature columns plus the single grouping
# column under test; identifier columns and the other speaker columns are dropped.
id_columns = ['Char_ID', 'ID', 'Filename']
drop_for_sex = id_columns + ['Age', 'Academic Status']
drop_for_age = id_columns + ['Sex', 'Academic Status']

# ----- grouping column: Sex -----
emo_temp_sex = df_emotion_char.drop(columns=drop_for_sex)
anova_emo_sex = hp.f_anova(emo_temp_sex, emotion_label, 'Sex')

aff_temp_sex = df_affect_char.drop(columns=drop_for_sex)
anova_aff_sex = hp.f_anova(aff_temp_sex, affect_label, 'Sex')

loi_temp_sex = df_loi_char.drop(columns=drop_for_sex)
anova_loi_sex = hp.f_anova(loi_temp_sex, loi_label, 'Sex')

arval_temp_sex = df_ar_val_char.drop(columns=drop_for_sex)
anova_arval_sex = hp.f_anova(arval_temp_sex, ['Valence', 'Arousal'], 'Sex')

# ----- grouping column: Age -----
emo_temp_age = df_emotion_char.drop(columns=drop_for_age)
anova_emo_age = hp.f_anova(emo_temp_age, emotion_label, 'Age')

aff_temp_age = df_affect_char.drop(columns=drop_for_age)
anova_aff_age = hp.f_anova(aff_temp_age, affect_label, 'Age')

loi_temp_age = df_loi_char.drop(columns=drop_for_age)
anova_loi_age = hp.f_anova(loi_temp_age, loi_label, 'Age')

arval_temp_age = df_ar_val_char.drop(columns=drop_for_age)
anova_arval_age = hp.f_anova(arval_temp_age, ['Valence', 'Arousal'], 'Age')

# TODO: the ANOVA for Academic Status has not been implemented yet.

# Show the affect labels next to the affect/Sex result. The displayed value is
# a pair of lists with one entry per label (presumably F statistics followed by
# p-values - confirm against hp.f_anova).
print(affect_label)
anova_aff_sex

['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']


[[8.940363845820904,
  0.20106734364852652,
  3.116329600922195,
  4.155098206181356,
  8.418495087119116,
  1.7030488881904489],
 [0.003039101449066641,
  0.6542099781445709,
  0.07860900530563346,
  0.04245554779569083,
  0.004011329140755848,
  0.19296887812501648]]

## Logistic Regression

Now let's try to predict the character features (sex, age, academic status) using the voice features (emotion, affect, arousal-valence, level of interest).

In [11]:
# Column names containing spaces are renamed before the frames are handed to the
# regression helpers (presumably because the helpers build model formulas from
# the column names - confirm in Helper).
academic_rename = {'Academic Status': 'Academic'}
temp_emotion_char = df_emotion_char.rename(columns=academic_rename)
temp_affect_char = df_affect_char.rename(columns=academic_rename)
temp_loi_char = df_loi_char.rename(columns={**academic_rename, 'High Interest': 'High_Interest'})
temp_arval_char = df_ar_val_char.rename(columns=academic_rename)

# Sex is predicted with hp.multiLogReg. The trailing True sets the helper's
# prohibitWarnings flag.
print('Prediction of Sex')
logreg_emo_sex = hp.multiLogReg(temp_emotion_char, 'Emotion', 'Sex', True)
logreg_aff_sex = hp.multiLogReg(temp_affect_char, 'Affect', 'Sex', True)
logreg_loi_sex = hp.multiLogReg(temp_loi_char, 'LOI', 'Sex', True)
# NOTE(review): an earlier comment said this fit yields "LinAlgError: Singular
# Matrix", while the saved output shows a fourth fit terminating successfully -
# re-check which of the two still holds.
logreg_ar_val = hp.multiLogReg(temp_arval_char, 'Arousal-Valence', 'Sex', True)

# Academic status is predicted with hp.multiNomiLogReg.
# NOTE(review): the saved output reports "Current function value: nan" for the
# first fit in this group and in the Age group - those models should not be
# interpreted before that is understood.
print('Prediction of Academic Status')
logreg_emo_aca = hp.multiNomiLogReg(temp_emotion_char, 'Emotion', 'Academic', True)
logreg_aff_aca = hp.multiNomiLogReg(temp_affect_char, 'Affect', 'Academic', True)
logreg_loi_aca = hp.multiNomiLogReg(temp_loi_char, 'LOI', 'Academic', True)
logreg_ar_val_aca = hp.multiNomiLogReg(temp_arval_char, 'Arousal-Valence', 'Academic', True)

# Age is predicted with hp.multiNomiLogReg as well.
# The name `logreg_ar_cal_age` (sic) is kept so existing references keep working.
print('Prediction of Age')
logreg_emo_age = hp.multiNomiLogReg(temp_emotion_char, 'Emotion', 'Age', True)
logreg_aff_age = hp.multiNomiLogReg(temp_affect_char, 'Affect', 'Age', True)
logreg_loi_age = hp.multiNomiLogReg(temp_loi_char, 'LOI', 'Age', True)
logreg_ar_cal_age = hp.multiNomiLogReg(temp_arval_char, 'Arousal-Valence', 'Age', True)

Prediction of Sex
         Current function value: 0.366786
         Iterations: 35
         Current function value: 0.419493
         Iterations: 35
         Current function value: 0.462772
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.396618
         Iterations 7
Prediction of Academic Status
Optimization terminated successfully.
         Current function value: nan
         Iterations 4
         Current function value: 1.176821
         Iterations: 35
         Current function value: 1.290644
         Iterations: 35
Optimization terminated successfully.
         Current function value: 1.264402
         Iterations 7
Prediction of Age
Optimization terminated successfully.
         Current function value: nan
         Iterations 7
         Current function value: 0.549513
         Iterations: 35
         Current function value: 0.566953
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.5

  return eXB/eXB.sum(1)[:,None]
  oldparams) > tol)):


# Test Chi2

In [None]:
# Scratch cell: first steps towards a hand-built contingency table of rounded
# 'Sadness' values per sex.

# Keep the emotion columns plus 'Sex'; drop every other speaker/identifier column.
# Fix: the merged frame has no 'CharacterID' or 'file' columns (they are called
# 'Char_ID', 'ID' and 'Filename'), so the previous drop raised a KeyError.
df_test = df_emotion_char.drop(columns=['Char_ID', 'ID', 'Filename', 'Age', 'Academic Status'])

# Fix: the emotion column is spelled 'Sadness', not 'sadness'.
# The variable names suggest Sex == 0.0 is male and 1.0 is female - TODO confirm
# against the coding used in UIST2019_CharacterData.csv.
test_sadness_m = df_test.loc[df_test['Sex'] == 0.0, 'Sadness']
test_sadness_f = df_test.loc[df_test['Sex'] == 1.0, 'Sadness']

# Distinct values after rounding to one decimal, with how often each occurs.
test_m, counts_m = np.unique(np.around(test_sadness_m, 1), return_counts=True)
test_f, counts_f = np.unique(np.around(test_sadness_f, 1), return_counts=True)

# Empty contingency table: one 'Sex' column plus one column per 0.1-wide value.
# Fix: the column list used to be assigned to `data`, which replaced the
# OpenSMILE DataFrame loaded at the top of the notebook.
cont_table_columns = ['Sex', '0.0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '1.0']
cont_table = pd.DataFrame(columns=cont_table_columns)

# NOTE: this rebinds test_m from the observed male values to the full grid
# 0.0, 0.1, ..., 1.0 (as the original cell did).
test_m = np.around(np.arange(0, 1.1, 0.1), 1)

# TODO: fill cont_table with one row of counts per sex; the table is still empty.

In [None]:
# Display the current value of test_m from the chi2 scratch cell above.
test_m

In [None]:
# Display the reduced emotion frame built in the chi2 scratch cell above.
df_test