In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

#Load Data
data = pd.read_csv("CHI_UIST2019_OpenSMILE_Data.csv")

#Set Labels 
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

#Get specific data and save it into new data frames
# We use the pandas .copy(deep=True) function to prevent the SettingWithCopyWarning we would otherwise get. Since we do
# not write, but only read from the data, the warning does not affect the data frames
df_emotion = data[['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness', 'Filename']].copy(deep=True)
df_affect = data[['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired', 'Filename']].copy(deep=True)
df_loi = data[['Disinterest', 'Normal', 'High Interest', 'Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)
#For further usage, we want to append the CharacterID as a column, which is saved with other information in the filename
#Since we only want the digits, we can remove all non-digit characters of the filename column and append the column to the df

df_emotion['Char_ID'] = df_emotion['Filename'].replace('\D+','', regex = True).copy(deep=True)
df_affect['Char_ID'] = df_affect['Filename'].replace('\D+','', regex = True).copy(deep=True)
df_loi['Char_ID'] = df_loi['Filename'].replace('\D+','', regex = True).copy(deep=True)
df_ar_val['Char_ID'] = df_ar_val['Filename'].replace('\D+','', regex = True).copy(deep=True)

#Load data about speakers
char_data = pd.read_csv("CHI_UIST2019_CharacterData.csv")  

#Join above tables and Character Tables

#To Join DataFrames we have to cast the column on which we want to join to int, so that both columns have the same data type
char_data['ID'] = char_data['ID'].astype(int)
df_ar_val['Char_ID'] = df_ar_val['Char_ID'].astype(int)
df_emotion['Char_ID'] = df_emotion['Char_ID'].astype(int)
df_affect['Char_ID'] = df_affect['Char_ID'].astype(int)
df_loi['Char_ID'] = df_loi['Char_ID'].astype(int)

#Save new data frames
df_ar_val_char = df_ar_val.merge(char_data, how = 'left', left_on='Char_ID', right_on='ID')
df_emotion_char = df_emotion.merge(char_data, how = 'left', left_on='Char_ID', right_on= 'ID')
df_affect_char = df_affect.merge(char_data, how = 'left', left_on='Char_ID', right_on= 'ID')
df_loi_char = df_loi.merge(char_data, how = 'left', left_on='Char_ID', right_on= 'ID')

## Let's start with the character feature 'sex'

In [2]:
# Start with model and sex
print('Emotion')
df_emo_X = df_emotion_char[emotion_label]
df_emo_sex_Y = df_emotion_char['Sex']
df_emo_sex_Y.replace({'Male': 0.0, 'Female':1.0}, inplace = True)
logreg_emo_sex = sm.Logit(df_emo_sex_Y, df_emo_X).fit()
print(logreg_emo_sex.summary())

print('\nAffect')
df_aff_X = df_affect_char[affect_label]
df_aff_sex_Y = df_affect_char['Sex']
df_aff_sex_Y.replace({'Male': 0.0, 'Female':1.0}, inplace = True)
logreg_aff_sex = sm.Logit(df_aff_sex_Y, df_aff_X).fit()
print(logreg_aff_sex.summary())

print('Level of Interest')
df_loi_X = df_loi_char[loi_label]
df_loi_sex_Y = df_loi_char['Sex']
df_loi_sex_Y.replace({'Male': 0.0, 'Female':1.0}, inplace = True)
logreg_loi_sex = sm.Logit(df_loi_sex_Y, df_loi_X).fit()
print(logreg_loi_sex.summary())

print('Arousal-Valence')
df_arval_X = df_ar_val_char[['Arousal', 'Valence']]
df_arval_sex_Y = df_ar_val_char['Sex']
df_arval_sex_Y.replace({'Male': 0.0, 'Female':1.0}, inplace = True)
logreg_ar_val_sex = sm.Logit(df_arval_sex_Y, df_arval_X).fit()
print(logreg_ar_val_sex.summary())

Emotion
Optimization terminated successfully.
         Current function value: 0.510460
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                    Sex   No. Observations:                  585
Model:                          Logit   Df Residuals:                      578
Method:                           MLE   Df Model:                            6
Date:                Wed, 21 Oct 2020   Pseudo R-squ.:                  0.2624
Time:                        16:35:38   Log-Likelihood:                -298.62
converged:                       True   LL-Null:                       -404.87
                                        LLR p-value:                 4.136e-43
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Anger          -0.5009      7.117     -0.070      0.944     -14.449      13.448
Boredom         0.

### Character feature = academic status

In [3]:
df_emotion_aca = df_emotion_char.drop(['Char_ID', 'ID', 'Filename', 'Name', 'VideoID','IsNativeSpeaker','VideoTitle', 'Sex'], axis = 1)
df_emotion_aca.dropna(inplace = True)
df_affect_aca = df_affect_char.drop(['Char_ID', 'ID', 'Filename', 'Name', 'VideoID','VideoTitle','IsNativeSpeaker', 'Sex'], axis = 1)
df_affect_aca.dropna(inplace = True)
df_loi_aca = df_loi_char.drop(['Char_ID', 'ID', 'Filename', 'Name', 'VideoID','VideoTitle', 'IsNativeSpeaker','Sex'], axis = 1)
df_loi_aca.dropna(inplace = True)
df_arval_aca = df_ar_val_char.drop(['Char_ID', 'ID', 'Filename', 'Name', 'VideoID','VideoTitle', 'IsNativeSpeaker','Sex'], axis = 1)
df_arval_aca.dropna(inplace = True)

# Start with model and sex
print('Emotion')
df_emo_aca_X = df_emotion_aca[emotion_label]
df_emo_aca_Y = df_emotion_aca['Academic Status']
df_emo_aca_Y.replace({'Grad Student': 0.0, 'PhD':1.0}, inplace = True)
logreg_emo_aca = sm.Logit(df_emo_aca_Y, df_emo_aca_X).fit()
print(logreg_emo_aca.summary())

print('\nAffect')
df_aff_aca_X = df_affect_aca[affect_label]
df_aff_aca_Y = df_affect_aca['Academic Status']
df_aff_aca_Y.replace({'Grad Student': 0.0, 'PhD':1.0}, inplace = True)
logreg_aff_aca = sm.Logit(df_aff_aca_Y, df_aff_aca_X).fit()
print(logreg_aff_aca.summary())

print('Level of Interest')
df_loi_aca_X = df_loi_aca[loi_label]
df_loi_aca_Y = df_loi_aca['Academic Status']
df_loi_aca_Y.replace({'Grad Student': 0.0, 'PhD':1.0}, inplace = True)
logreg_loi_aca = sm.Logit(df_loi_aca_Y, df_loi_aca_X).fit()
print(logreg_loi_aca.summary())

print('Arousal-Valence')
df_arval_aca_X = df_arval_aca[['Arousal', 'Valence']]
df_arval_aca_Y = df_arval_aca['Academic Status']
df_arval_aca_Y.replace({'Grad Student': 0.0, 'PhD':1.0}, inplace = True)
logreg_ar_val_aca = sm.Logit(df_arval_aca_Y, df_arval_aca_X).fit()
print(logreg_ar_val_aca.summary())

Emotion
Optimization terminated successfully.
         Current function value: 0.677486
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:        Academic Status   No. Observations:                  365
Model:                          Logit   Df Residuals:                      358
Method:                           MLE   Df Model:                            6
Date:                Wed, 21 Oct 2020   Pseudo R-squ.:                 0.01073
Time:                        16:35:38   Log-Likelihood:                -247.28
converged:                       True   LL-Null:                       -249.96
                                        LLR p-value:                    0.4980
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Anger           0.6556      5.300      0.124      0.902      -9.732      11.043
Boredom         6.

### Native Speaker

In [4]:
df_emotion_nat = df_emotion_char.drop(['Char_ID', 'ID', 'Filename', 'Name','Academic Status', 'VideoID','VideoTitle', 'Sex'], axis = 1)
df_emotion_nat.dropna(inplace = True)
df_affect_nat = df_affect_char.drop(['Char_ID', 'ID', 'Filename', 'Name','Academic Status', 'VideoID','VideoTitle', 'Sex'], axis = 1)
df_affect_nat.dropna(inplace = True)
df_loi_nat = df_loi_char.drop(['Char_ID', 'ID', 'Filename', 'Name', 'VideoID','Academic Status','VideoTitle', 'Sex'], axis = 1)
df_loi_nat.dropna(inplace = True)
df_arval_nat = df_ar_val_char.drop(['Char_ID', 'ID', 'Filename', 'Name', 'Academic Status','VideoID','VideoTitle', 'Sex'], axis = 1)
df_arval_nat.dropna(inplace = True)

# Start with model and sex
print('Emotion')
df_emo_nat_X = df_emotion_nat[emotion_label]
df_emo_nat_Y = df_emotion_nat['IsNativeSpeaker']
logreg_emo_nat = sm.MNLogit(df_emo_nat_Y, df_emo_nat_X).fit()
print(logreg_emo_nat.summary())

print('\nAffect')
df_aff_nat_X = df_affect_nat[affect_label]
df_aff_nat_Y = df_affect_nat['IsNativeSpeaker']
logreg_aff_nat = sm.MNLogit(df_aff_nat_Y, df_aff_nat_X).fit()
print(logreg_aff_nat.summary())

print('Level of Interest')
df_loi_nat_X = df_loi_nat[loi_label]
df_loi_nat_Y = df_loi_nat['IsNativeSpeaker']
logreg_loi_aca = sm.MNLogit(df_loi_nat_Y, df_loi_nat_X).fit()
print(logreg_loi_nat.summary())

print('Arousal-Valence')
df_arval_nat_X = df_arval_nat[['Arousal', 'Valence']]
df_arval_nat_Y = df_arval_nat['IsNativeSpeaker']
logreg_ar_val_aca = sm.MNLogit(df_arval_nat_Y, df_arval_nat_X).fit()
print(logreg_ar_val_aca.summary())

Emotion
Optimization terminated successfully.
         Current function value: 1.027729
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:        IsNativeSpeaker   No. Observations:                  585
Model:                        MNLogit   Df Residuals:                      571
Method:                           MLE   Df Model:                           12
Date:                Wed, 21 Oct 2020   Pseudo R-squ.:                0.008798
Time:                        16:35:38   Log-Likelihood:                -601.22
converged:                       True   LL-Null:                       -606.56
                                        LLR p-value:                    0.5572
IsNativeSpeaker=Europ. Non-Native       coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Anger                                 7.2905      9.71

NameError: name 'logreg_loi_nat' is not defined