In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
import sklearn.preprocessing as pp
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

#Load Data
data = pd.read_csv("CHI_2019_FULL.csv")

#Set Labels 
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

#Get specific data and save it into new data frames
# We use the pandas .copy(deep=True) function to prevent the SettingWithCopyWarning we would otherwise get. Since we do
# not write, but only read from the data, the warning does not affect the data frames
df_emotion = data[['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness', 'Filename']].copy(deep=True)
df_affect = data[['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired', 'Filename']].copy(deep=True)
df_loi = data[['Disinterest', 'Normal', 'High Interest', 'Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)
#For further usage, we want to append the CharacterID as a column, which is saved with other information in the filename
#Since we only want the digits, we can remove all non-digit characters of the filename column and append the column to the df

df_emotion['Char_ID'] = df_emotion['Filename'].replace('\D+','', regex = True).copy(deep=True)
df_affect['Char_ID'] = df_affect['Filename'].replace('\D+','', regex = True).copy(deep=True)
df_loi['Char_ID'] = df_loi['Filename'].replace('\D+','', regex = True).copy(deep=True)
df_ar_val['Char_ID'] = df_ar_val['Filename'].replace('\D+','', regex = True).copy(deep=True)

#Load data about speakers
char_data = pd.read_csv("CHI_2019_CharacterData.csv")  

#Join above tables and Character Tables

#To Join DataFrames we have to cast the column on which we want to join to int, so that both columns have the same data type
char_data['ID'] = char_data['ID'].astype(int)
df_ar_val['Char_ID'] = df_ar_val['Char_ID'].astype(int)
df_emotion['Char_ID'] = df_emotion['Char_ID'].astype(int)
df_affect['Char_ID'] = df_affect['Char_ID'].astype(int)
df_loi['Char_ID'] = df_loi['Char_ID'].astype(int)

#Save new data frames
df_ar_val_char = df_ar_val.merge(char_data, how = 'left', left_on='Char_ID', right_on='ID')
df_emotion_char = df_emotion.merge(char_data, how = 'left', left_on='Char_ID', right_on= 'ID')
df_affect_char = df_affect.merge(char_data, how = 'left', left_on='Char_ID', right_on= 'ID')
df_loi_char = df_loi.merge(char_data, how = 'left', left_on='Char_ID', right_on= 'ID')

affect_label.remove('Intoxicated')
df_affect_char = df_affect_char.drop(['Intoxicated'], axis = 1)
norm_test = pp.normalize(df_affect_char[affect_label], norm = 'l1')
df_affect_char[affect_label] = norm_test

df_loi_char['Normal Interest'] = df_loi_char['Disinterest'] + df_loi_char['Normal']
df_loi_char = df_loi_char.drop(['Disinterest', 'Normal'], axis = 1)

## Let's start with the character feature 'sex'

In [2]:
print('Emotion')
emotion_label.append('Sex')
df_emotion_char.replace({'Male': 0.0, 'Female':1.0}, inplace = True)
emo_sex_model = smf.logit("Sex ~ Anger + Boredom + Disgust + Fear + Happiness + Sadness", data = df_emotion_char[emotion_label])
emo_sex_results = emo_sex_model.fit()
print(emo_sex_results.summary())

print('\nAffect')
affect_label.append('Sex')
df_affect_char.replace({'Male': 0.0, 'Female':1.0}, inplace = True)
aff_sex_model = smf.logit("Sex ~ Aggressiv + Cheerful + Nervous + Tired", data = df_affect_char[affect_label])
aff_sex_results = aff_sex_model.fit()
print(aff_sex_results.summary())

print('Level of Interest')
loi_label.append('Sex')
df_loi_char.replace({'Male': 0.0, 'Female':1.0}, inplace = True)
loi_sex_model = smf.logit('Sex ~ Q("Normal Interest") + Q("High Interest")', data = df_loi_char[['Normal Interest', 'High Interest', 'Sex']])
loi_sex_results = loi_sex_model.fit()
print(loi_sex_results.summary())

print('Arousal-Valence')
df_ar_val_char.replace({'Male': 0.0, 'Female':1.0}, inplace = True)
ar_val_sex_model = smf.logit('Sex ~ Arousal + Valence', data = df_ar_val_char[['Arousal','Valence','Sex']])
ar_val_sex_results = ar_val_sex_model.fit()
print(ar_val_sex_results.summary())

Emotion
         Current function value: 0.580513
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                    Sex   No. Observations:                  635
Model:                          Logit   Df Residuals:                      627
Method:                           MLE   Df Model:                            7
Date:                Wed, 28 Oct 2020   Pseudo R-squ.:                  0.1576
Time:                        13:06:42   Log-Likelihood:                -368.63
converged:                      False   LL-Null:                       -437.59
                                        LLR p-value:                 1.385e-26
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const         520.7794   1.17e+05      0.004      0.996   -2.29e+05     2.3e+05
Anger        -524.8154   1.17e+05     -0.004      0.99



### Character feature = academic status

In [3]:
emotion_label.remove('Sex')
affect_label.remove('Sex')
loi_label.remove('Sex')
emotion_label.append('Academic Status')
affect_label.append('Academic Status')
loi_label.append('Academic Status')

print('Emotion')
df_emotion_char.replace({'Grad Student': 0.0, 'PhD':1.0}, inplace = True)
emo_aca_model = smf.logit('Q("Academic Status") ~ Anger + Boredom + Disgust + Fear + Happiness + Sadness', data = df_emotion_char[emotion_label])
emo_aca_results = emo_aca_model.fit()
print(emo_aca_results.summary())

print('\nAffect')
df_affect_char.replace({'Grad Student': 0.0, 'PhD':1.0}, inplace = True)
aff_aca_model = smf.logit('Q("Academic Status") ~ Aggressiv + Cheerful + Nervous + Tired', data = df_affect_char[affect_label])
aff_aca_results = aff_aca_model.fit()
print(aff_aca_results.summary())

print('Level of Interest')
df_loi_char.replace({'Grad Student': 0.0, 'PhD':1.0}, inplace = True)
loi_aca_model = smf.logit('Q("Academic Status") ~ Q("Normal Interest") + Q("High Interest")', data = df_loi_char[['Normal Interest', 'High Interest', 'Academic Status']])
loi_aca_results = loi_aca_model.fit()
print(loi_aca_results.summary())

print('Arousal-Valence')
df_ar_val_char.replace({'Grad Student': 0.0, 'PhD':1.0}, inplace = True)
ar_val_aca_model = smf.logit('Q("Academic Status") ~ Arousal + Valence', data = df_ar_val_char[['Arousal','Valence','Academic Status']])
ar_val_aca_results = ar_val_aca_model.fit()
print(ar_val_aca_results.summary())

Emotion
         Current function value: 0.665023
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:        Academic Status   No. Observations:                  434
Model:                          Logit   Df Residuals:                      426
Method:                           MLE   Df Model:                            7
Date:                Wed, 28 Oct 2020   Pseudo R-squ.:                 0.03985
Time:                        13:06:42   Log-Likelihood:                -288.62
converged:                      False   LL-Null:                       -300.60
                                        LLR p-value:                  0.001158
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const         422.7516   1.31e+05      0.003      0.997   -2.56e+05    2.57e+05
Anger        -417.8702   1.31e+05     -0.003      0.99



### Native Speaker

In [4]:
df_emotion_nat = df_emotion_char.drop(['Char_ID', 'ID', 'Filename', 'Name','Academic Status', 'VideoID','VideoTitle', 'Sex'], axis = 1)
df_emotion_nat.dropna(inplace = True)
df_affect_nat = df_affect_char.drop(['Char_ID', 'ID', 'Filename', 'Name','Academic Status', 'VideoID','VideoTitle', 'Sex'], axis = 1)
df_affect_nat.dropna(inplace = True)
df_loi_nat = df_loi_char.drop(['Char_ID', 'ID', 'Filename', 'Name', 'VideoID','Academic Status','VideoTitle', 'Sex'], axis = 1)
df_loi_nat.dropna(inplace = True)
df_arval_nat = df_ar_val_char.drop(['Char_ID', 'ID', 'Filename', 'Name', 'Academic Status','VideoID','VideoTitle', 'Sex'], axis = 1)
df_arval_nat.dropna(inplace = True)

# Start with model and sex
print('Emotion')
df_emo_nat_X = df_emotion_nat[emotion_label]
df_emo_nat_X = sm.add_constant(df_emo_nat_X)
df_emo_nat_Y = df_emotion_nat['IsNativeSpeaker']
logreg_emo_nat = sm.MNLogit(df_emo_nat_Y, df_emo_nat_X).fit()
print(logreg_emo_nat.summary())

print('\nAffect')
df_aff_nat_X = df_affect_nat[affect_label]
df_aff_nat_X = sm.add_constant(df_aff_nat_X)
df_aff_nat_Y = df_affect_nat['IsNativeSpeaker']
logreg_aff_nat = sm.MNLogit(df_aff_nat_Y, df_aff_nat_X).fit()
print(logreg_aff_nat.summary())

print('Level of Interest')
df_loi_nat_X = df_loi_nat[loi_label]
df_loi_nat_X = sm.add_constant(df_loi_nat_X)
df_loi_nat_Y = df_loi_nat['IsNativeSpeaker']
logreg_loi_nat = sm.MNLogit(df_loi_nat_Y, df_loi_nat_X).fit()
print(logreg_loi_nat.summary())

print('Arousal-Valence')
df_arval_nat_X = df_arval_nat[['Arousal', 'Valence']]
df_arval_nat_X = sm.add_constant(df_arval_nat_X)
df_arval_nat_Y = df_arval_nat['IsNativeSpeaker']
logreg_ar_val_aca = sm.MNLogit(df_arval_nat_Y, df_arval_nat_X).fit()
print(logreg_ar_val_aca.summary())

Emotion
         Current function value: 1.048679
         Iterations: 35
                          MNLogit Regression Results                          
Dep. Variable:        IsNativeSpeaker   No. Observations:                  635
Model:                        MNLogit   Df Residuals:                      611
Method:                           MLE   Df Model:                           21
Date:                Wed, 28 Oct 2020   Pseudo R-squ.:                 0.02231
Time:                        13:06:42   Log-Likelihood:                -665.91
converged:                      False   LL-Null:                       -681.10
                                        LLR p-value:                   0.08449
IsNativeSpeaker=Europ. Non-Native       coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const                               429.3267   1.53e+05      0.003      0.998      -3e+05 





Affect
         Current function value: 1.059963
         Iterations: 35




                          MNLogit Regression Results                          
Dep. Variable:        IsNativeSpeaker   No. Observations:                  635
Model:                        MNLogit   Df Residuals:                      614
Method:                           MLE   Df Model:                           18
Date:                Wed, 28 Oct 2020   Pseudo R-squ.:                 0.01179
Time:                        13:06:42   Log-Likelihood:                -673.08
converged:                      False   LL-Null:                       -681.10
                                        LLR p-value:                    0.5886
IsNativeSpeaker=Europ. Non-Native       coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const                              -599.0850   1.71e+05     -0.004      0.997   -3.35e+05    3.34e+05
Aggressiv                           597.6662   1.71e+05      0




         Current function value: 1.057934
         Iterations 9
                          MNLogit Regression Results                          
Dep. Variable:        IsNativeSpeaker   No. Observations:                  635
Model:                        MNLogit   Df Residuals:                      626
Method:                           MLE   Df Model:                            6
Date:                Wed, 28 Oct 2020   Pseudo R-squ.:                 0.01368
Time:                        13:06:42   Log-Likelihood:                -671.79
converged:                       True   LL-Null:                       -681.10
                                        LLR p-value:                  0.004829
IsNativeSpeaker=Europ. Non-Native       coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const                                 0.1464      0.190      0.771      0.441      -0.226       0.5