In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
import sklearn.preprocessing as pp
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

# Load the OpenSMILE output for the long samples
data = pd.read_csv("UIST2019_OpenSMILE.csv")

# Column names of each feature group
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

# Split the table into one frame per feature group, always keeping 'Filename'.
# .copy(deep=True) gives every frame its own data, so the column assignments
# further down neither raise SettingWithCopyWarning nor modify `data`.
df_emotion = data[emotion_label + ['Filename']].copy(deep=True)
df_affect = data[affect_label + ['Filename']].copy(deep=True)
df_loi = data[loi_label + ['Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)

# The character ID is stored, together with other information, in the filename.
# Removing every run of non-digit characters leaves only the digits; note that
# several digit groups in one filename would be joined into a single string.
# The pattern is a raw string because '\D' is not a valid string escape.
df_emotion['Char_ID'] = df_emotion['Filename'].replace(r'\D+', '', regex=True)
df_affect['Char_ID'] = df_affect['Filename'].replace(r'\D+', '', regex=True)
df_loi['Char_ID'] = df_loi['Filename'].replace(r'\D+', '', regex=True)
df_ar_val['Char_ID'] = df_ar_val['Filename'].replace(r'\D+', '', regex=True)

# Load the OpenSMILE output for the short samples
data_short = pd.read_csv("UIST_2019_short_samples_OpenSMILE.csv")

# Same per-group split as for the long samples; the label lists defined above
# supply the column names, and the deep copies keep `data_short` untouched.
df_emotion_s = data_short[emotion_label + ['Filename']].copy(deep=True)
df_affect_s = data_short[affect_label + ['Filename']].copy(deep=True)
df_loi_s = data_short[loi_label + ['Filename']].copy(deep=True)
df_ar_val_s = data_short[['Arousal', 'Valence', 'Filename']].copy(deep=True)

# The character ID is stored, together with other information, in the filename.
# Removing every run of non-digit characters leaves only the digits; note that
# several digit groups in one filename would be joined into a single string.
# The pattern is a raw string because '\D' is not a valid string escape.
df_emotion_s['Char_ID'] = df_emotion_s['Filename'].replace(r'\D+', '', regex=True)
df_affect_s['Char_ID'] = df_affect_s['Filename'].replace(r'\D+', '', regex=True)
df_loi_s['Char_ID'] = df_loi_s['Filename'].replace(r'\D+', '', regex=True)
df_ar_val_s['Char_ID'] = df_ar_val_s['Filename'].replace(r'\D+', '', regex=True)

# Mark every row as coming from a short (1.0) or a long (0.0) sample.
# The flag is numeric so it can be used directly as a binary outcome.
df_emotion_s['IsShort'] = 1.0
df_affect_s['IsShort'] = 1.0
df_loi_s['IsShort'] = 1.0
df_ar_val_s['IsShort'] = 1.0

df_emotion['IsShort'] = 0.0
df_affect['IsShort'] = 0.0
df_loi['IsShort'] = 0.0
df_ar_val['IsShort'] = 0.0

# Stack long and short samples underneath each other with a fresh 0..n-1 index.
# No `keys` argument: it would label the concatenated pieces (not columns) and
# is discarded anyway when ignore_index=True.
df_emotion = pd.concat([df_emotion, df_emotion_s], ignore_index=True)
df_affect = pd.concat([df_affect, df_affect_s], ignore_index=True)
df_loi = pd.concat([df_loi, df_loi_s], ignore_index=True)
df_ar_val = pd.concat([df_ar_val, df_ar_val_s], ignore_index=True)

#affect_label.remove('Intoxicated')
#df_affect = df_affect.drop(['Intoxicated'], axis = 1)
#norm_test = pp.normalize(df_affect[affect_label], norm = 'l1')
#df_affect[affect_label] = norm_test

#df_loi['Normal Interest'] = df_loi['Disinterest'] + df_loi['Normal']
#df_loi = df_loi.drop(['Disinterest', 'Normal'], axis = 1)

# Standardize the feature columns of every group. Each group is fitted with its
# own StandardScaler instance, which stays available under its scaler_* name.
scaler_emo, scaler_aff, scaler_arval, scaler_loi = (pp.StandardScaler() for _ in range(4))

for _frame, _columns, _scaler in (
    (df_loi, loi_label, scaler_loi),
    (df_affect, affect_label, scaler_aff),
    (df_emotion, emotion_label, scaler_emo),
    (df_ar_val, ['Arousal', 'Valence'], scaler_arval),
):
    # Only the listed feature columns are rescaled; Filename, Char_ID and
    # IsShort are left as they are.
    _frame[_columns] = _scaler.fit_transform(_frame[_columns])

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [2]:
print('Emotion')
# Add the outcome column to the label list so it is part of the subset handed to
# the model. The guard keeps the cell idempotent: an unconditional append would
# add 'IsShort' again on every re-run and duplicate the column in the subset.
if 'IsShort' not in emotion_label:
    emotion_label.append('IsShort')
# Logistic regression of IsShort (already numeric 0.0 / 1.0) on the emotion scores.
# Emo_Neutral is in the data subset but not in the formula -- presumably left out
# as reference category; TODO confirm.
# NOTE(review): the outcome modelled here is IsShort; the '_sex_' in the variable
# names looks inherited from another analysis -- confirm before renaming.
emo_sex_model = smf.logit("IsShort ~ Anger + Boredom + Disgust + Fear + Happiness + Sadness", data = df_emotion[emotion_label])
emo_sex_results = emo_sex_model.fit()
print(emo_sex_results.summary())

print('\nAffect')
# Add the outcome column to the label list so it is part of the subset handed to
# the model. The guard keeps the cell idempotent: an unconditional append would
# add 'IsShort' again on every re-run and duplicate the column in the subset.
if 'IsShort' not in affect_label:
    affect_label.append('IsShort')
# Logistic regression of IsShort (already numeric 0.0 / 1.0) on the affect scores.
# Aff_Neutral is in the data subset but not in the formula -- presumably left out
# as reference category; TODO confirm.
aff_sex_model = smf.logit("IsShort ~ Aggressiv + Cheerful + Intoxicated + Nervous + Tired", data = df_affect[affect_label])
aff_sex_results = aff_sex_model.fit()
print(aff_sex_results.summary())

print('Level of Interest')
# Keep the label list in step with the other feature groups. The guard makes the
# cell idempotent: an unconditional append would add 'IsShort' on every re-run.
if 'IsShort' not in loi_label:
    loi_label.append('IsShort')
# Logistic regression of IsShort (already numeric 0.0 / 1.0) on the interest scores.
# Q("...") quotes the column name because it contains a space. 'Normal' is in the
# data subset but not in the formula -- presumably the reference level; TODO confirm.
loi_sex_model = smf.logit('IsShort ~ Q("High Interest") + Disinterest', data = df_loi[['Normal','Disinterest', 'High Interest', 'IsShort']])
loi_sex_results = loi_sex_model.fit()
print(loi_sex_results.summary())

print('Arousal-Valence')
# Logistic regression of the short/long flag on arousal and valence.
# IsShort is already stored as 0.0 / 1.0, so no recoding is needed beforehand.
ar_val_sex_model = smf.logit(
    formula='IsShort ~ Arousal + Valence',
    data=df_ar_val[['Arousal', 'Valence', 'IsShort']],
)
ar_val_sex_results = ar_val_sex_model.fit()
print(ar_val_sex_results.summary())

Emotion
Optimization terminated successfully.
         Current function value: 0.616398
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                IsShort   No. Observations:                  540
Model:                          Logit   Df Residuals:                      533
Method:                           MLE   Df Model:                            6
Date:                Thu, 03 Dec 2020   Pseudo R-squ.:                  0.1107
Time:                        14:55:14   Log-Likelihood:                -332.85
converged:                       True   LL-Null:                       -374.30
                                        LLR p-value:                 9.028e-16
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4527      0.154      2.936      0.003       0.150       0.755
Anger          0.3377