In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

#Load Data
data = pd.read_csv("UIST_2019_short_samples_OpenSMILE.csv")

#Set Labels 
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

#Get specific data and save it into new data frames
# We use the pandas .copy(deep=True) function to prevent the SettingWithCopyWarning we would otherwise get. Since we do
# not write, but only read from the data, the warning does not affect the data frames
df_emotion = data[['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness', 'Filename']].copy(deep=True)
df_affect = data[['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired', 'Filename']].copy(deep=True)
df_loi = data[['Disinterest', 'Normal', 'High Interest', 'Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)
#For further usage, we want to append the CharacterID as a column, which is saved with other information in the filename
#Since we only want the digits, we can remove all non-digit characters of the filename column and append the column to the df

df_emotion['Char_ID'] = df_emotion['Filename'].replace('\D+','', regex = True).copy(deep=True)
df_affect['Char_ID'] = df_affect['Filename'].replace('\D+','', regex = True).copy(deep=True)
df_loi['Char_ID'] = df_loi['Filename'].replace('\D+','', regex = True).copy(deep=True)
df_ar_val['Char_ID'] = df_ar_val['Filename'].replace('\D+','', regex = True).copy(deep=True)

#Load data about speakers
char_data = pd.read_csv("UIST2019_CharacterData.csv")  

#Join above tables and Character Tables

#To Join DataFrames we have to cast the column on which we want to join to int, so that both columns have the same data type
char_data['ID'] = char_data['ID'].astype(int)
df_ar_val['Char_ID'] = df_ar_val['Char_ID'].astype(int)
df_emotion['Char_ID'] = df_emotion['Char_ID'].astype(int)
df_affect['Char_ID'] = df_affect['Char_ID'].astype(int)
df_loi['Char_ID'] = df_loi['Char_ID'].astype(int)

#Safe new data frames
df_ar_val_char = df_ar_val.merge(char_data, how = 'left', left_on='Char_ID', right_on='ID')
df_emotion_char = df_emotion.merge(char_data, how = 'left', left_on='Char_ID', right_on= 'ID')
df_affect_char = df_affect.merge(char_data, how = 'left', left_on='Char_ID', right_on= 'ID')
df_loi_char = df_loi.merge(char_data, how = 'left', left_on='Char_ID', right_on= 'ID')

## Let's start with the character feature 'sex'

In [2]:
# Let's prepare the datasets containing spaces for logistic regression
temp_emotion_char = df_emotion_char.rename(columns = {'Academic Status' : 'Academic'})
temp_affect_char = df_affect_char.rename(columns = {'Academic Status' : 'Academic'})
temp_loi_char = df_loi_char.rename(columns = {'Academic Status' : 'Academic', 'High Interest': 'High_Interest'})
temp_arval_char = df_ar_val_char.rename(columns = {'Academic Status' : 'Academic'})

# Start with model and sex
print('Emotion')
logreg_emo_sex = hp.multiLogReg(temp_emotion_char, 'Emotion', 'Sex',True) #boolean set prohibitWarnings to True
print(logreg_emo_sex.summary())
logreg_aff_sex = hp.multiLogReg(temp_affect_char, 'Affect', 'Sex', True)
print('Affect')
print(logreg_aff_sex.summary())
logreg_loi_sex = hp.multiLogReg(temp_loi_char, 'LOI', 'Sex', True) 
print('Level of Interest')
print(logreg_loi_sex.summary())
logreg_ar_val_sex = hp.multiLogReg(temp_arval_char, 'Arousal-Valence', 'Sex', True) # Yields LinAlgError: Singular Matrix
print('Arousal-Valence')
print(logreg_ar_val_sex.summary())

Emotion
         Current function value: 0.385833
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                    Sex   No. Observations:                  270
Model:                          Logit   Df Residuals:                      262
Method:                           MLE   Df Model:                            7
Date:                Mon, 05 Oct 2020   Pseudo R-squ.:                  0.1948
Time:                        15:12:31   Log-Likelihood:                -104.17
converged:                      False   LL-Null:                       -129.37
                                        LLR p-value:                 1.206e-08
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     166.6679   2.47e+05      0.001      0.999   -4.83e+05    4.83e+05
Anger        -159.9849   2.47e+05     -0.001      0.99



### Character feature = age

In [3]:
logreg_emo_age = hp.multiNomiLogReg(temp_emotion_char, 'Emotion', 'Age', True)
print('Emotion:')
print(logreg_emo_age.summary())
logreg_aff_age = hp.multiNomiLogReg(temp_affect_char, 'Affect', 'Age', True)
print('Affect: ')
print(logreg_aff_age.summary())
logreg_loi_age = hp.multiNomiLogReg(temp_loi_char, 'LOI', 'Age', True)
print('Level of Interest: ')
print(logreg_loi_age.summary())
logreg_ar_val_age = hp.multiNomiLogReg(temp_arval_char, 'Arousal-Valence', 'Age', True)
print('Arousal-Valence:')
print(logreg_ar_val_age.summary())
temp_emotion_char

         Current function value: 0.498333
         Iterations: 35
Emotion:
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  152
Model:                        MNLogit   Df Residuals:                      136
Method:                           MLE   Df Model:                           14
Date:                Mon, 05 Oct 2020   Pseudo R-squ.:                  0.1427
Time:                        15:12:31   Log-Likelihood:                -75.747
converged:                      False   LL-Null:                       -88.359
                                        LLR p-value:                   0.03242
  y=Age[Old]       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     -177.0699   5.95e+05     -0.000      1.000   -1.17e+06    1.17e+06
Anger          787.8090   5.95e+05      0.001     




         Current function value: 0.491747
         Iterations: 35
Affect: 
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  152
Model:                        MNLogit   Df Residuals:                      138
Method:                           MLE   Df Model:                           12
Date:                Mon, 05 Oct 2020   Pseudo R-squ.:                  0.1541
Time:                        15:12:31   Log-Likelihood:                -74.746
converged:                      False   LL-Null:                       -88.359
                                        LLR p-value:                  0.007169
  y=Age[Old]       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1670.7419   6.14e+05      0.003      0.998    -1.2e+06     1.2e+06
Aggressiv    -1700.3028   6.14e+05     -0.003    



Unnamed: 0,Anger,Boredom,Disgust,Fear,Happiness,Emo_Neutral,Sadness,Filename,Char_ID,ID,VideoTitle,Name,Sex,Academic,Age,VideoID,IsNativeSpeaker
0,0.015207,0.043832,0.110879,0.006380,0.008699,0.006839,0.808164,0_a_a.wav,0,0,3D printed Fabric: Techniques for Design and 3...,Haruki Takahashi,Male,PhD,Young,z07keSZOkO8,Asian Non-Native
1,0.001765,0.107574,0.003542,0.004003,0.002175,0.012042,0.868898,0_b_a.wav,0,0,3D printed Fabric: Techniques for Design and 3...,Haruki Takahashi,Male,PhD,Young,z07keSZOkO8,Asian Non-Native
2,0.002319,0.001360,0.275026,0.000123,0.001068,0.000227,0.719877,100_a_q.wav,100,100,Question in Optimizing Portrait Lighting,Scott Klemmer,Male,PhD,Intermediate,,Native Speaker
3,0.000334,0.000106,0.089250,0.000015,0.000135,0.000040,0.910122,101_a_q.wav,101,101,Question in Optimizing Portrait Lighting,,Male,,,,Asian Non-Native
4,0.009149,0.074864,0.321503,0.003828,0.006701,0.014939,0.569016,102_a_a.wav,102,102,Proxino: Enabling Prototyping of Virtual Circu...,Te-Yen Wu,Male,Grad Student,Young,_Klvg8YSrJQ,Asian Non-Native
5,0.000243,0.004381,0.183230,0.000052,0.000240,0.000524,0.811330,102_b_a.wav,102,102,Proxino: Enabling Prototyping of Virtual Circu...,Te-Yen Wu,Male,Grad Student,Young,_Klvg8YSrJQ,Asian Non-Native
6,0.001232,0.008783,0.040624,0.000219,0.000945,0.001345,0.946852,103_a_q.wav,103,103,Question in Proxino,Andrea Bianchi,Male,PhD,Intermediate,,Europ. Non-Native
7,0.035900,0.411627,0.155997,0.020447,0.032854,0.072970,0.270205,104_a_q.wav,104,104,Question in Proxino,,Female,,,,Native Speaker
8,0.003888,0.011370,0.123527,0.000637,0.002043,0.001069,0.857466,105_a_a.wav,105,105,PseudoBend: Producing Haptic Illusions of Stre...,Seongkook Heo,Male,PhD,Intermediate,rCYeWPUcMZU,Asian Non-Native
9,0.001331,0.006727,0.025838,0.000225,0.000607,0.000883,0.964390,105_b_a.wav,105,105,PseudoBend: Producing Haptic Illusions of Stre...,Seongkook Heo,Male,PhD,Intermediate,rCYeWPUcMZU,Asian Non-Native


### Character feature = academic status

In [4]:
#logreg_emo_aca = hp.multiLogReg(temp_emotion_char, 'Emotion', 'Academic', True) # Raises LinAlg Singular Matrix Error
print('Emotions: ')
#print(logreg_emo_aca.summary())
logreg_aff_aca = hp.multiLogReg(temp_affect_char, 'Affect', 'Academic', True) 
print('Affect: ')
print(logreg_aff_aca.summary())
logreg_loi_aca = hp.multiLogReg(temp_loi_char, 'LOI', 'Academic', True)
print('Level of Interest: ')
print(logreg_loi_aca.summary())
logreg_ar_val_aca = hp.multiLogReg(temp_arval_char, 'Arousal-Valence', 'Academic', True)
print('Arousal-Valence: ')
print(logreg_ar_val_aca.summary())



Emotions: 
         Current function value: 0.586154
         Iterations: 35
Affect: 
                           Logit Regression Results                           
Dep. Variable:               Academic   No. Observations:                  151
Model:                          Logit   Df Residuals:                      144
Method:                           MLE   Df Model:                            6
Date:                Mon, 05 Oct 2020   Pseudo R-squ.:                 0.08967
Time:                        15:12:31   Log-Likelihood:                -88.509
converged:                      False   LL-Null:                       -97.228
                                        LLR p-value:                  0.007805
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept   -1074.4551   2.38e+05     -0.005      0.996   -4.67e+05    4.65e+05
Aggressiv    1063.8181   2.38e+05      0.0

### Native Speaker

In [5]:
#logreg_emo_aca = hp.multiLogReg(temp_emotion_char, 'Emotion', 'Academic', True) # Raises LinAlg Singular Matrix Error
print('Emotions: ')
#print(logreg_emo_aca.summary())
logreg_aff_aca = hp.multiNomiLogReg(temp_affect_char, 'Affect', 'IsNativeSpeaker', True) 
print('Affect: ')
print(logreg_aff_aca.summary())
logreg_loi_aca = hp.multiNomiLogReg(temp_loi_char, 'LOI', 'IsNativeSpeaker', True)
print('Level of Interest: ')
print(logreg_loi_aca.summary())
logreg_ar_val_aca = hp.multiNomiLogReg(temp_arval_char, 'Arousal-Valence', 'IsNativeSpeaker', True)
print('Arousal-Valence: ')
print(logreg_ar_val_aca.summary())

Emotions: 
         Current function value: 0.947131
         Iterations: 35
Affect: 
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  270
Model:                        MNLogit   Df Residuals:                      256
Method:                           MLE   Df Model:                           12
Date:                Mon, 05 Oct 2020   Pseudo R-squ.:                 0.05450
Time:                        15:12:31   Log-Likelihood:                -255.73
converged:                      False   LL-Null:                       -270.47
                                        LLR p-value:                  0.003339
y=IsNativeSpeaker[Europ. Non-Native]       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
Intercept                              102.8165   2.53e+05      0.000   




Optimization terminated successfully.
         Current function value: 0.975689
         Iterations 6
Arousal-Valence: 
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  270
Model:                        MNLogit   Df Residuals:                      264
Method:                           MLE   Df Model:                            4
Date:                Mon, 05 Oct 2020   Pseudo R-squ.:                 0.02599
Time:                        15:12:31   Log-Likelihood:                -263.44
converged:                       True   LL-Null:                       -270.47
                                        LLR p-value:                  0.007103
y=IsNativeSpeaker[Europ. Non-Native]       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
Intercept                            