In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
import sklearn.preprocessing as pp
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

# Load the per-file prediction table
data = pd.read_csv("CHI_2019_FULL.csv")

# Column groups, one list per prediction family
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

# Split the table into one frame per family, each keeping the 'Filename' column.
# The explicit .copy(deep=True) makes every frame independent of `data`, so adding
# the 'Char_ID' column below does not trigger a SettingWithCopyWarning.
df_emotion = data[emotion_label + ['Filename']].copy(deep=True)
df_affect = data[affect_label + ['Filename']].copy(deep=True)
df_loi = data[loi_label + ['Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)

# The character ID is stored, together with other information, in the filename.
# Only the digits are wanted, so every run of non-digit characters is removed.
# The pattern is a raw string: '\D' in a normal string literal is an invalid
# escape sequence and raises a warning on current Python versions.
for frame in (df_emotion, df_affect, df_loi, df_ar_val):
    frame['Char_ID'] = frame['Filename'].replace(r'\D+', '', regex=True)

# Speaker / character metadata; joined below on its 'ID' column
char_data = pd.read_csv("CHI_2019_CharacterData.csv")

# Both join keys must share a dtype, so cast them to int before merging
char_data['ID'] = char_data['ID'].astype(int)
for per_file in (df_ar_val, df_emotion, df_affect, df_loi):
    per_file['Char_ID'] = per_file['Char_ID'].astype(int)

# Left join: every prediction row is kept and receives the columns of its character
join_spec = dict(how='left', left_on='Char_ID', right_on='ID')
df_ar_val_char = df_ar_val.merge(char_data, **join_spec)
df_emotion_char = df_emotion.merge(char_data, **join_spec)
df_affect_char = df_affect.merge(char_data, **join_spec)
df_loi_char = df_loi.merge(char_data, **join_spec)

# Restrict the data to a single sentence type, which is encoded in the filename:
# a = answer, p = presentation, q = question.
# The digits are stripped first, then the slice [3:-4] drops the first three and the
# last four characters (presumably a fixed prefix and the file extension - TODO confirm).
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the digits in place.
# The sentence type should be the same for all tables, but it is derived per table
# just to be sure.
arval_sentence_type = df_ar_val_char.Filename.str.replace(r'\d+', '', regex=True).str[3:-4]
df_ar_val_char['SentenceType'] = arval_sentence_type
emo_sentence_type = df_emotion_char.Filename.str.replace(r'\d+', '', regex=True).str[3:-4]
df_emotion_char['SentenceType'] = emo_sentence_type
aff_sentence_type = df_affect_char.Filename.str.replace(r'\d+', '', regex=True).str[3:-4]
df_affect_char['SentenceType'] = aff_sentence_type
loi_sentence_type = df_loi_char.Filename.str.replace(r'\d+', '', regex=True).str[3:-4]
df_loi_char['SentenceType'] = loi_sentence_type

# Keep only the rows with SentenceType == 'p' (presentation).
# NOTE(review): the original comments spoke of answers ('a') while the filter selects
# 'p' - confirm which sentence type this notebook is meant to analyse.
# .copy() detaches each selection from its parent frame, because the selections are
# written to further down (new columns, per-speaker medians, scaling).
df_ar_val_char = df_ar_val_char.loc[df_ar_val_char['SentenceType'] == 'p'].copy()
df_emotion_char = df_emotion_char.loc[df_emotion_char['SentenceType'] == 'p'].copy()
df_affect_char = df_affect_char.loc[df_affect_char['SentenceType'] == 'p'].copy()
df_loi_char = df_loi_char.loc[df_loi_char['SentenceType'] == 'p'].copy()

# Affect: the 'Intoxicated' column is discarded, and the remaining affect values are
# re-normalized row-wise with the L1 norm.
affect_label.remove('Intoxicated')
df_affect_char = df_affect_char.drop(columns=['Intoxicated'])
norm_test = pp.normalize(df_affect_char[affect_label], norm='l1')
df_affect_char[affect_label] = norm_test

# Level of interest: merge 'Disinterest' and 'Normal' into a single 'Normal Interest'
# column, leaving two classes.
df_loi_char['Normal Interest'] = df_loi_char['Disinterest'].add(df_loi_char['Normal'])
df_loi_char = df_loi_char.drop(columns=['Disinterest', 'Normal'])
loi_label = ['Normal Interest', 'High Interest']

# Compensate for multiple samples of the same person: every feature value of a speaker
# is replaced by the result of hp.constructMedianSeries over that speaker's samples.
# drop_duplicates() returns a new Series, so its result has to be kept; otherwise the
# loop below repeats the same work once per row instead of once per speaker.
IDs = df_emotion_char['Char_ID'].drop_duplicates()

# Each frame is paired with the feature columns that are aggregated in it.
# NOTE(review): the IDs are taken from the emotion frame only; this assumes all four
# frames contain the same speakers.
median_targets = (
    (df_emotion_char, emotion_label),
    (df_affect_char, affect_label),
    (df_loi_char, loi_label),
    (df_ar_val_char, ['Arousal', 'Valence']),
)

# A plain loop is kept because hp.constructMedianSeries works on one speaker's Series
# at a time; the row mask is computed once per speaker and frame and reused.
for i in IDs:
    for frame, labels in median_targets:
        speaker_rows = frame['Char_ID'] == i
        for l in labels:
            frame.loc[speaker_rows, l] = hp.constructMedianSeries(frame.loc[speaker_rows, l])

# All rows of a speaker now carry the same values, so one row per person is enough.
# NOTE(review): duplicates are detected on the merged 'ID' column, not on 'Char_ID';
# rows without a match in the character table would share a missing 'ID' - confirm
# that this is intended.
df_emotion_char = df_emotion_char.drop_duplicates(subset=['ID'])
df_affect_char = df_affect_char.drop_duplicates(subset=['ID'])
df_loi_char = df_loi_char.drop_duplicates(subset=['ID'])
df_ar_val_char = df_ar_val_char.drop_duplicates(subset=['ID'])

# Standardize the feature columns of every frame. Each feature family gets its own
# StandardScaler instance, which stays available under its name after fitting.
emo_scaler, aff_scaler, loi_scaler, arval_scaler = (pp.StandardScaler() for _ in range(4))
arval_cols = ['Arousal', 'Valence']

df_emotion_char[emotion_label] = emo_scaler.fit_transform(df_emotion_char[emotion_label])
df_affect_char[affect_label] = aff_scaler.fit_transform(df_affect_char[affect_label])
df_loi_char[loi_label] = loi_scaler.fit_transform(df_loi_char[loi_label])
df_ar_val_char[arval_cols] = arval_scaler.fit_transform(df_ar_val_char[arval_cols])

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


## Let's start with the character feature 'sex'

In [2]:
# Logistic regressions with the speaker's sex as the binary outcome, one model per
# feature family. Each summary is printed under its family heading.

# Numeric coding of the outcome used by all four models
sex_codes = {'Male': 0.0, 'Female': 1.0}

# Add the outcome column to each label list exactly once. The guard keeps this cell
# re-runnable: an unconditional append would select the 'Sex' column twice on a
# second execution.
for labels in (emotion_label, affect_label, loi_label):
    if 'Sex' not in labels:
        labels.append('Sex')

print('Emotion')
df_emotion_char = df_emotion_char.replace(sex_codes)
# 'Emo_Neutral' is part of the selected columns but is not used as a predictor
emo_sex_model = smf.logit("Sex ~ Anger + Boredom + Disgust + Fear + Happiness + Sadness", data=df_emotion_char[emotion_label])
emo_sex_results = emo_sex_model.fit()
print(emo_sex_results.summary())

print('\nAffect')
df_affect_char = df_affect_char.replace(sex_codes)
# 'Aff_Neutral' is part of the selected columns but is not used as a predictor
aff_sex_model = smf.logit("Sex ~ Aggressiv + Cheerful + Nervous + Tired", data=df_affect_char[affect_label])
aff_sex_results = aff_sex_model.fit()
print(aff_sex_results.summary())

print('Level of Interest')
df_loi_char = df_loi_char.replace(sex_codes)
# Q("...") quotes the column name, which contains a space, inside the formula
loi_sex_model = smf.logit('Sex ~ Q("High Interest")', data=df_loi_char[['Normal Interest', 'High Interest', 'Sex']])
loi_sex_results = loi_sex_model.fit()
print(loi_sex_results.summary())

print('Arousal-Valence')
df_ar_val_char = df_ar_val_char.replace(sex_codes)
ar_val_sex_model = smf.logit('Sex ~ Arousal + Valence', data=df_ar_val_char[['Arousal', 'Valence', 'Sex']])
ar_val_sex_results = ar_val_sex_model.fit()
print(ar_val_sex_results.summary())

Emotion
Optimization terminated successfully.
         Current function value: 0.516328
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                    Sex   No. Observations:                   81
Model:                          Logit   Df Residuals:                       74
Method:                           MLE   Df Model:                            6
Date:                Tue, 01 Dec 2020   Pseudo R-squ.:                  0.2550
Time:                        14:29:02   Log-Likelihood:                -41.823
converged:                       True   LL-Null:                       -56.139
                                        LLR p-value:                 7.140e-05
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1075      0.282      0.381      0.703      -0.445       0.660
Anger         -0.8581

### Character feature = academic status

In [3]:
# Logistic regressions with the academic status as the binary outcome, one model per
# feature family. Each summary is printed under its family heading.

# Numeric coding of the outcome used by all four models
academic_codes = {'Grad Student': 0.0, 'PhD': 1.0}

# Swap the outcome column in the label lists from 'Sex' to 'Academic Status'.
# Both steps are guarded so the cell can be re-run: an unconditional remove() raises
# ValueError once 'Sex' is gone, and an unconditional append() duplicates the label.
for labels in (emotion_label, affect_label, loi_label):
    if 'Sex' in labels:
        labels.remove('Sex')
    if 'Academic Status' not in labels:
        labels.append('Academic Status')

print('Emotion')
df_emotion_char = df_emotion_char.replace(academic_codes)
# Q("...") quotes the column name, which contains a space, inside the formula
emo_aca_model = smf.logit('Q("Academic Status") ~ Anger + Boredom + Disgust + Fear + Happiness + Sadness', data=df_emotion_char[emotion_label])
emo_aca_results = emo_aca_model.fit()
print(emo_aca_results.summary())

print('\nAffect')
df_affect_char = df_affect_char.replace(academic_codes)
aff_aca_model = smf.logit('Q("Academic Status") ~ Aggressiv + Cheerful + Nervous + Tired', data=df_affect_char[affect_label])
aff_aca_results = aff_aca_model.fit()
print(aff_aca_results.summary())

print('Level of Interest')
df_loi_char = df_loi_char.replace(academic_codes)
loi_aca_model = smf.logit('Q("Academic Status") ~ Q("High Interest")', data=df_loi_char[['Normal Interest', 'High Interest', 'Academic Status']])
loi_aca_results = loi_aca_model.fit()
print(loi_aca_results.summary())

print('Arousal-Valence')
df_ar_val_char = df_ar_val_char.replace(academic_codes)
ar_val_aca_model = smf.logit('Q("Academic Status") ~ Arousal + Valence', data=df_ar_val_char[['Arousal', 'Valence', 'Academic Status']])
ar_val_aca_results = ar_val_aca_model.fit()
print(ar_val_aca_results.summary())

Emotion
Optimization terminated successfully.
         Current function value: 0.596540
         Iterations 7
                            Logit Regression Results                            
Dep. Variable:     Q("Academic Status")   No. Observations:                   80
Model:                            Logit   Df Residuals:                       73
Method:                             MLE   Df Model:                            6
Date:                  Tue, 01 Dec 2020   Pseudo R-squ.:                  0.1390
Time:                          14:29:03   Log-Likelihood:                -47.723
converged:                         True   LL-Null:                       -55.427
                                          LLR p-value:                   0.01732
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0128      0.269     -0.047      0.962      -0.540       0.514
Anger

### Native Speaker

In [4]:
#df_emotion_nat = df_emotion_char.drop(['Char_ID', 'ID', 'Filename', 'Name','Academic Status', 'VideoID','VideoTitle', 'Sex'], axis = 1)
#df_emotion_nat.dropna(inplace = True)
#df_affect_nat = df_affect_char.drop(['Char_ID', 'ID', 'Filename', 'Name','Academic Status', 'VideoID','VideoTitle', 'Sex'], axis = 1)
#df_affect_nat.dropna(inplace = True)
#df_loi_nat = df_loi_char.drop(['Char_ID', 'ID', 'Filename', 'Name', 'VideoID','Academic Status','VideoTitle', 'Sex'], axis = 1)
#df_loi_nat.dropna(inplace = True)
#df_arval_nat = df_ar_val_char.drop(['Char_ID', 'ID', 'Filename', 'Name', 'Academic Status','VideoID','VideoTitle', 'Sex'], axis = 1)
#df_arval_nat.dropna(inplace = True)
#
## Start with model and native speaker
#print('Emotion')
#df_emo_nat_X = df_emotion_nat[emotion_label]
#df_emo_nat_X = sm.add_constant(df_emo_nat_X)
#df_emo_nat_Y = df_emotion_nat['IsNativeSpeaker']
#logreg_emo_nat = sm.MNLogit(df_emo_nat_Y, df_emo_nat_X).fit()
#print(logreg_emo_nat.summary())
#
#print('\nAffect')
#df_aff_nat_X = df_affect_nat[affect_label]
#df_aff_nat_X = sm.add_constant(df_aff_nat_X)
#df_aff_nat_Y = df_affect_nat['IsNativeSpeaker']
#logreg_aff_nat = sm.MNLogit(df_aff_nat_Y, df_aff_nat_X).fit()
#print(logreg_aff_nat.summary())
#
#print('Level of Interest')
#df_loi_nat_X = df_loi_nat[loi_label]
#df_loi_nat_X = sm.add_constant(df_loi_nat_X)
#df_loi_nat_Y = df_loi_nat['IsNativeSpeaker']
#logreg_loi_nat = sm.MNLogit(df_loi_nat_Y, df_loi_nat_X).fit()
#print(logreg_loi_nat.summary())
#
#print('Arousal-Valence')
#df_arval_nat_X = df_arval_nat[['Arousal', 'Valence']]
#df_arval_nat_X = sm.add_constant(df_arval_nat_X)
#df_arval_nat_Y = df_arval_nat['IsNativeSpeaker']
#logreg_ar_val_aca = sm.MNLogit(df_arval_nat_Y, df_arval_nat_X).fit()
#print(logreg_ar_val_aca.summary())