In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

# Load the openSMILE data
data = pd.read_csv("CHI_UIST2019_OpenSMILE_Data.csv")

# Column labels of the three categorical model outputs
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

# Slice the raw table into one data frame per model output.
# The slices are explicit copies because a 'Char_ID' column is added to each of
# them below; writing to a plain slice of `data` would trigger pandas'
# SettingWithCopyWarning.
df_emotion = data[emotion_label + ['Filename']].copy(deep=True)
df_affect = data[affect_label + ['Filename']].copy(deep=True)
df_loi = data[loi_label + ['Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)

# The character ID is encoded in the file name together with other information.
# Strip every non-digit character and keep the remaining digits as an integer
# 'Char_ID', so that it has the same dtype as char_data['ID'] for the join below.
# (Raw string: '\D' in a normal string literal is an invalid escape sequence.)
for frame in (df_emotion, df_affect, df_loi, df_ar_val):
    frame['Char_ID'] = frame['Filename'].replace(r'\D+', '', regex=True).astype(int)

# Load data about speakers
char_data = pd.read_csv("CHI_UIST2019_CharacterData.csv")

# Both join keys must share a dtype, so the speaker ID is cast to int as well
char_data['ID'] = char_data['ID'].astype(int)

# Save the joined tables (model output + speaker information) as new data frames.
# A left join keeps every row of the model output, even without a matching speaker.
df_ar_val_char = df_ar_val.merge(char_data, how='left', left_on='Char_ID', right_on='ID')
df_emotion_char = df_emotion.merge(char_data, how='left', left_on='Char_ID', right_on='ID')
df_affect_char = df_affect.merge(char_data, how='left', left_on='Char_ID', right_on='ID')
df_loi_char = df_loi.merge(char_data, how='left', left_on='Char_ID', right_on='ID')

## Let's start with the character feature 'sex'

In [2]:
# Column names containing spaces are renamed before they are passed to the
# regression helpers ('Academic Status' -> 'Academic', 'High Interest' -> 'High_Interest').
temp_emotion_char = df_emotion_char.rename(columns = {'Academic Status' : 'Academic'})
temp_affect_char = df_affect_char.rename(columns = {'Academic Status' : 'Academic'})
temp_loi_char = df_loi_char.rename(columns = {'Academic Status' : 'Academic', 'High Interest': 'High_Interest'})
temp_arval_char = df_ar_val_char.rename(columns = {'Academic Status' : 'Academic'})

# Logistic regression of 'Sex' on each group of model outputs.
# Every heading is printed BEFORE its model is fitted: the fit itself writes
# optimizer output ("Current function value", "Iterations"), which would
# otherwise end up under the previous model's section.
# The last argument of hp.multiLogReg is the prohibitWarnings flag.
print('Emotion')
logreg_emo_sex = hp.multiLogReg(temp_emotion_char, 'Emotion', 'Sex', True)
print(logreg_emo_sex.summary())

print('Affect')
logreg_aff_sex = hp.multiLogReg(temp_affect_char, 'Affect', 'Sex', True)
print(logreg_aff_sex.summary())

print('Level of Interest')
logreg_loi_sex = hp.multiLogReg(temp_loi_char, 'LOI', 'Sex', True)
print(logreg_loi_sex.summary())

# NOTE(review): an earlier comment on this call said it "Yields LinAlgError:
# Singular Matrix". Confirm on a fresh kernel; if it does raise, the cell stops here.
print('Arousal-Valence')
logreg_ar_val_sex = hp.multiLogReg(temp_arval_char, 'Arousal-Valence', 'Sex', True)
print(logreg_ar_val_sex.summary())

Emotion
         Current function value: 0.510461
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                    Sex   No. Observations:                  585
Model:                          Logit   Df Residuals:                      577
Method:                           MLE   Df Model:                            7
Date:                Wed, 21 Oct 2020   Pseudo R-squ.:                  0.2624
Time:                        13:17:46   Log-Likelihood:                -298.62
converged:                      False   LL-Null:                       -404.87
                                        LLR p-value:                 2.579e-42
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept    -167.0348   1.28e+05     -0.001      0.999   -2.51e+05     2.5e+05
Anger         166.5342   1.28e+05      0.001      0.99



### Character feature = academic status

In [3]:
# Logistic regression of 'Academic' on each group of model outputs.
# Every heading is printed BEFORE its model is fitted, so that the optimizer
# output written during the fit appears under the section it belongs to.

# The emotion model is not fitted: the call
#   hp.multiLogReg(temp_emotion_char, 'Emotion', 'Academic', True)
# raises a LinAlg "Singular Matrix" error. The heading is kept and the skip is
# stated in the output so the section is not mistaken for an empty result.
print('Emotions: ')
print('  skipped: fitting raises a LinAlgError (singular matrix)')

print('Affect: ')
logreg_aff_aca = hp.multiLogReg(temp_affect_char, 'Affect', 'Academic', True)
print(logreg_aff_aca.summary())

print('Level of Interest: ')
logreg_loi_aca = hp.multiLogReg(temp_loi_char, 'LOI', 'Academic', True)
print(logreg_loi_aca.summary())

print('Arousal-Valence: ')
logreg_ar_val_aca = hp.multiLogReg(temp_arval_char, 'Arousal-Valence', 'Academic', True)
print(logreg_ar_val_aca.summary())

Emotions: 
         Current function value: 0.658010
         Iterations: 35
Affect: 
                           Logit Regression Results                           
Dep. Variable:               Academic   No. Observations:                  365
Model:                          Logit   Df Residuals:                      358
Method:                           MLE   Df Model:                            6
Date:                Wed, 21 Oct 2020   Pseudo R-squ.:                 0.03917
Time:                        13:17:46   Log-Likelihood:                -240.17
converged:                      False   LL-Null:                       -249.96
                                        LLR p-value:                  0.003286
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept   -1189.3618   1.55e+05     -0.008      0.994   -3.04e+05    3.02e+05
Aggressiv    1186.4584   1.55e+05      0.0



### Native Speaker

In [4]:
# Multinomial logistic regression of 'IsNativeSpeaker' on each group of model outputs.
# Every heading is printed BEFORE its model is fitted, so that the optimizer
# output written during the fit appears under the section it belongs to.
#
# The results are stored under *_native names. This cell used to bind them to
# the *_aca names, silently overwriting the academic-status models; those names
# are still rebound here as aliases so that later cells behave as before.
# TODO: migrate later cells to the *_native names and drop the aliases.

# NOTE(review): no emotion model is fitted for IsNativeSpeaker. The previous
# commented-out line was a copy of the academic-status call, so it is unknown
# whether hp.multiNomiLogReg fails here as well - confirm.
print('Emotions: ')
print('  skipped: model not fitted')

print('Affect: ')
logreg_aff_native = logreg_aff_aca = hp.multiNomiLogReg(temp_affect_char, 'Affect', 'IsNativeSpeaker', True)
print(logreg_aff_native.summary())

print('Level of Interest: ')
logreg_loi_native = logreg_loi_aca = hp.multiNomiLogReg(temp_loi_char, 'LOI', 'IsNativeSpeaker', True)
print(logreg_loi_native.summary())

print('Arousal-Valence: ')
logreg_ar_val_native = logreg_ar_val_aca = hp.multiNomiLogReg(temp_arval_char, 'Arousal-Valence', 'IsNativeSpeaker', True)
print(logreg_ar_val_native.summary())

Emotions: 
         Current function value: 1.006842
         Iterations: 35
Affect: 
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  585
Model:                        MNLogit   Df Residuals:                      571
Method:                           MLE   Df Model:                           12
Date:                Wed, 21 Oct 2020   Pseudo R-squ.:                 0.02894
Time:                        13:17:46   Log-Likelihood:                -589.00
converged:                      False   LL-Null:                       -606.56
                                        LLR p-value:                 0.0004496
y=IsNativeSpeaker[Europ. Non-Native]       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
Intercept                              -32.2706   1.76e+05     -0.000   




         Current function value: 1.028785
         Iterations: 35
Level of Interest: 
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  585
Model:                        MNLogit   Df Residuals:                      577
Method:                           MLE   Df Model:                            6
Date:                Wed, 21 Oct 2020   Pseudo R-squ.:                0.007780
Time:                        13:17:46   Log-Likelihood:                -601.84
converged:                      False   LL-Null:                       -606.56
                                        LLR p-value:                    0.1504
y=IsNativeSpeaker[Europ. Non-Native]       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
Intercept                             -159.0550   2.46e+05     -0.001  