In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

#Load Data
data = pd.read_csv("UIST2019_OpenSMILE.csv")

#Set Labels (these are also the column names of each prediction space in `data`)
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']


def _subset_with_char_id(columns):
    """Return an independent copy of ``data[columns + ['Filename']]`` plus an int ``Char_ID`` column.

    Parameters
    ----------
    columns : list of str
        Names of the prediction columns to keep from ``data``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, ``Filename`` and ``Char_ID`` (in that order).

    Raises
    ------
    ValueError
        If a filename contains no digits, because the resulting empty string
        cannot be cast to int.
    """
    # An explicit deep copy is required because we WRITE a new column below; without it
    # pandas may raise SettingWithCopyWarning for assigning into a slice of `data`.
    subset = data[columns + ['Filename']].copy(deep=True)
    # The character ID is stored inside the filename together with other information.
    # Stripping every run of non-digit characters leaves only the digits.
    # NOTE(review): this assumes the ID is the only number in the filename; several digit
    # groups would be concatenated into one number - confirm against the file naming scheme.
    # The raw string avoids the invalid-escape-sequence warning that '\D+' triggers.
    # The cast to int gives Char_ID the same dtype as the character table's ID join key.
    subset['Char_ID'] = subset['Filename'].replace(r'\D+', '', regex=True).astype(int)
    return subset


#Get specific data and save it into new data frames
df_emotion = _subset_with_char_id(emotion_label)
df_affect = _subset_with_char_id(affect_label)
df_loi = _subset_with_char_id(loi_label)
df_ar_val = _subset_with_char_id(['Arousal', 'Valence'])

#Load data about speakers
char_data = pd.read_csv("UIST2019_CharacterData.csv")

#Join above tables and Character Tables

#To join DataFrames, both key columns need the same data type, so the ID is cast to int as well
char_data['ID'] = char_data['ID'].astype(int)

#Save new data frames (left join: every prediction row is kept, even without a matching character)
df_ar_val_char = df_ar_val.merge(char_data, how='left', left_on='Char_ID', right_on='ID')
df_emotion_char = df_emotion.merge(char_data, how='left', left_on='Char_ID', right_on='ID')
df_affect_char = df_affect.merge(char_data, how='left', left_on='Char_ID', right_on='ID')
df_loi_char = df_loi.merge(char_data, how='left', left_on='Char_ID', right_on='ID')

## Let's start with the character feature 'sex'

In [8]:
# Rename the columns whose names contain spaces before running the regressions.
# NOTE(review): presumably hp.multiLogReg builds a formula from the column names, which
# cannot reference names with spaces - confirm in Helper.
temp_emotion_char = df_emotion_char.rename(columns={'Academic Status': 'Academic'})
temp_affect_char = df_affect_char.rename(columns={'Academic Status': 'Academic'})
temp_loi_char = df_loi_char.rename(columns={'Academic Status': 'Academic', 'High Interest': 'High_Interest'})
temp_arval_char = df_ar_val_char.rename(columns={'Academic Status': 'Academic'})


def _report_sex_model(header, frame, space):
    """Fit the logistic regression of 'Sex' on one prediction space and print its summary.

    The header is printed BEFORE fitting so that any diagnostics the helper prints while
    fitting appear under the header of the model they belong to, for every model alike.

    Parameters
    ----------
    header : str
        Section title printed above the model output.
    frame : pandas.DataFrame
        Joined prediction/character table passed to the helper.
    space : str
        Name of the prediction space as expected by ``hp.multiLogReg``.

    Returns
    -------
    The fitted result object returned by ``hp.multiLogReg``.
    """
    print(header)
    model = hp.multiLogReg(frame, space, 'Sex', True)  # last argument: prohibitWarnings = True
    print(model.summary())
    return model


# Start with model and sex
# NOTE(review): the saved Emotion output reports "converged: False" with very large
# coefficients and standard errors, so these estimates should not be interpreted as-is.
logreg_emo_sex = _report_sex_model('Emotion', temp_emotion_char, 'Emotion')
logreg_aff_sex = _report_sex_model('Affect', temp_affect_char, 'Affect')
logreg_loi_sex = _report_sex_model('Level of Interest', temp_loi_char, 'LOI')
# NOTE(review): the original author recorded that this call yields
# "LinAlgError: Singular Matrix"; the saved output is truncated, so whether it still
# raises could not be confirmed.
logreg_ar_val_sex = _report_sex_model('Arousal-Valence', temp_arval_char, 'Arousal-Valence')

Emotion
         Current function value: 0.366786
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                    Sex   No. Observations:                  280
Model:                          Logit   Df Residuals:                      272
Method:                           MLE   Df Model:                            7
Date:                Thu, 24 Sep 2020   Pseudo R-squ.:                  0.2440
Time:                        13:56:49   Log-Likelihood:                -102.70
converged:                      False   LL-Null:                       -135.85
                                        LLR p-value:                 8.222e-12
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     655.7292   2.44e+05      0.003      0.998   -4.77e+05    4.78e+05
Anger       -3490.1649   2.44e+05     -0.014      0.98



### Character feature = age

In [9]:
def _report_age_model(header, frame, space):
    """Fit the multinomial logistic regression of 'Age' on one prediction space and print it.

    The header is printed BEFORE fitting so that the diagnostics the helper prints while
    fitting show up under the header of the model they belong to instead of underneath
    the previous model's summary.

    Parameters
    ----------
    header : str
        Section title printed above the model output.
    frame : pandas.DataFrame
        Joined prediction/character table passed to the helper.
    space : str
        Name of the prediction space as expected by ``hp.multiNomiLogReg``.

    Returns
    -------
    The fitted result object returned by ``hp.multiNomiLogReg``.
    """
    print(header)
    model = hp.multiNomiLogReg(frame, space, 'Age', True)  # last argument as in the sex models
    print(model.summary())
    return model


# NOTE(review): in the saved output the Emotion model ends with "Current function value: nan"
# and an all-NaN coefficient table, i.e. the fit is unusable even though it reports
# "converged: True". The cause is not visible from this notebook - investigate before
# drawing conclusions from this model.
logreg_emo_age = _report_age_model('Emotion:', temp_emotion_char, 'Emotion')
logreg_aff_age = _report_age_model('Affect: ', temp_affect_char, 'Affect')
logreg_loi_age = _report_age_model('Level of Interest: ', temp_loi_char, 'LOI')
logreg_ar_val_age = _report_age_model('Arousal-Valence:', temp_arval_char, 'Arousal-Valence')

Prediction of Age
Optimization terminated successfully.
         Current function value: nan
         Iterations 7
Emotion:
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  156
Model:                        MNLogit   Df Residuals:                      140
Method:                           MLE   Df Model:                           14
Date:                Thu, 24 Sep 2020   Pseudo R-squ.:                     nan
Time:                        13:58:50   Log-Likelihood:                    nan
converged:                       True   LL-Null:                       -89.161
                                        LLR p-value:                       nan
  y=Age[Old]       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept           nan        nan        nan        nan         nan         nan
A

  return eXB/eXB.sum(1)[:,None]
  oldparams) > tol)):
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)



Optimization terminated successfully.
         Current function value: 0.558052
         Iterations 7
Arousal-Valence:
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  156
Model:                        MNLogit   Df Residuals:                      150
Method:                           MLE   Df Model:                            4
Date:                Thu, 24 Sep 2020   Pseudo R-squ.:                 0.02361
Time:                        13:58:51   Log-Likelihood:                -87.056
converged:                       True   LL-Null:                       -89.161
                                        LLR p-value:                    0.3783
  y=Age[Old]       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -0.8478      0.775     -1.093      0.274      -2.368       0.672
Arous

### Character feature = academic status

In [12]:
def _report_academic_model(header, frame, space):
    """Fit the multinomial logistic regression of 'Academic' on one prediction space and print it.

    The header is printed BEFORE fitting so that the diagnostics the helper prints while
    fitting show up under the header of the model they belong to instead of underneath
    the previous model's summary.

    Parameters
    ----------
    header : str
        Section title printed above the model output.
    frame : pandas.DataFrame
        Joined prediction/character table passed to the helper; must contain the renamed
        'Academic' column.
    space : str
        Name of the prediction space as expected by ``hp.multiNomiLogReg``.

    Returns
    -------
    The fitted result object returned by ``hp.multiNomiLogReg``.
    """
    print(header)
    model = hp.multiNomiLogReg(frame, space, 'Academic', True)  # last argument as in the sex models
    print(model.summary())
    return model


# NOTE(review): the saved output shows an all-NaN Emotion model and "converged: False"
# with very large coefficients and standard errors for later models. The cause is not
# visible from this notebook - these fits should be checked before being interpreted.
logreg_emo_aca = _report_academic_model('Emotions: ', temp_emotion_char, 'Emotion')
logreg_aff_aca = _report_academic_model('Affect: ', temp_affect_char, 'Affect')
logreg_loi_aca = _report_academic_model('Level of Interest: ', temp_loi_char, 'LOI')
logreg_ar_val_aca = _report_academic_model('Arousal-Valence: ', temp_arval_char, 'Arousal-Valence')

  return eXB/eXB.sum(1)[:,None]
  oldparams) > tol)):
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Optimization terminated successfully.
         Current function value: nan
         Iterations 4
Emotions: 
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  155
Model:                        MNLogit   Df Residuals:                      115
Method:                           MLE   Df Model:                           35
Date:                Thu, 24 Sep 2020   Pseudo R-squ.:                     nan
Time:                        14:02:32   Log-Likelihood:                    nan
converged:                       True   LL-Null:                       -207.94
                                        LLR p-value:                       nan
y=Academic[Master Student]       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept                         nan        nan        nan        nan



                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  155
Model:                        MNLogit   Df Residuals:                      120
Method:                           MLE   Df Model:                           30
Date:                Thu, 24 Sep 2020   Pseudo R-squ.:                  0.1228
Time:                        14:02:32   Log-Likelihood:                -182.41
converged:                      False   LL-Null:                       -207.94
                                        LLR p-value:                  0.009592
y=Academic[Master Student]       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept                   1155.4801   4.79e+05      0.002      0.998   -9.38e+05     9.4e+05
Aggressiv                  -1148.5301   4.79e+05     -0.002      0.998    -9.4e+05 



                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  155
Model:                        MNLogit   Df Residuals:                      135
Method:                           MLE   Df Model:                           15
Date:                Thu, 24 Sep 2020   Pseudo R-squ.:                 0.03794
Time:                        14:02:33   Log-Likelihood:                -200.05
converged:                      False   LL-Null:                       -207.94
                                        LLR p-value:                    0.3969
y=Academic[Master Student]       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept                   1629.7605   6.83e+05      0.002      0.998   -1.34e+06    1.34e+06
Disinterest                -1644.1139   6.83e+05     -0.002      0.998   -1.34e+06 