In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

# Load the OpenSMILE output data.
data = pd.read_csv("CHI_UIST2019_OpenSMILE_Data.csv")

# Column labels of the individual classifier outputs.
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']


def _subset_with_char_id(source, columns):
    """Return a copy of `source[columns + ['Filename']]` with an integer `Char_ID` column.

    Parameters
    ----------
    source : pandas.DataFrame
        Frame that contains `columns` and a `Filename` column.
    columns : list of str
        Classifier columns to keep.

    Returns
    -------
    pandas.DataFrame
        Independent copy with the columns `columns`, `Filename` and `Char_ID` (in that order).

    Raises
    ------
    ValueError
        If a file name contains no digits, because the empty remainder cannot be cast to int.
    """
    # A new column is written to the subset below, so we need an explicit copy; writing to a
    # plain column selection of `data` would trigger pandas' SettingWithCopyWarning.
    subset = source[columns + ['Filename']].copy(deep=True)
    # The character ID is stored in the file name together with other information. Removing
    # every run of non-digit characters leaves only the digits. The ID is cast to int so that
    # it has the same dtype as the `ID` column of the character table we join on.
    subset['Char_ID'] = subset['Filename'].replace(r'\D+', '', regex=True).astype(int)
    return subset


def _join_characters(frame, characters):
    """Left-join the speaker information in `characters` onto `frame` (Char_ID == ID)."""
    return frame.merge(characters, how='left', left_on='Char_ID', right_on='ID')


# One data frame per classifier, each extended by the character ID.
df_emotion = _subset_with_char_id(data, emotion_label)
df_affect = _subset_with_char_id(data, affect_label)
df_loi = _subset_with_char_id(data, loi_label)
df_ar_val = _subset_with_char_id(data, ['Arousal', 'Valence'])

# Load data about speakers. The join key is cast to int to match `Char_ID`.
char_data = pd.read_csv("UIST2019_CharacterData.csv")
char_data['ID'] = char_data['ID'].astype(int)

# Save the joined tables (classifier output + speaker information) as new data frames.
df_ar_val_char = _join_characters(df_ar_val, char_data)
df_emotion_char = _join_characters(df_emotion, char_data)
df_affect_char = _join_characters(df_affect, char_data)
df_loi_char = _join_characters(df_loi, char_data)

In [2]:
# We need to know the sentence type of every sample, i.e. whether the wav file is a question,
# an answer or a snippet from the talk. This information is only available in the file name,
# which is why we work on the merged table that still contains the `Filename` column.
#
# The sentence type is coded in the last character before the extension, e.g. 9_a_q.wav is the
# first (a) sample of the person with ID = 9, and it is a question (q). If it were an answer,
# an 'a' would be in place of the 'q'. Samples taken from the presentation are marked with 'p'.
#
# Removing the digits turns '9_a_q.wav' into '_a_q.wav'; the slice [3:-4] then cuts off the
# leading '_a_' and the trailing '.wav'. `regex=True` has to be passed explicitly: since
# pandas 2.0 `Series.str.replace` treats the pattern as a literal string by default, which
# would leave the digits in place and silently produce wrong sentence types.
sentence_type = df_ar_val_char['Filename'].str.replace(r'\d+', '', regex=True).str[3:-4]
df_ar_val_char['SentenceType'] = sentence_type


def _first_sample_per_speaker(frame):
    """Keep only the first row of every `Char_ID`, so that each speaker is counted once."""
    return frame.drop_duplicates(subset=['Char_ID'], keep='first')


df_questions = _first_sample_per_speaker(df_ar_val_char.loc[df_ar_val_char['SentenceType'] == 'q'])
df_answers = _first_sample_per_speaker(df_ar_val_char.loc[df_ar_val_char['SentenceType'] == 'a'])
df_presentation = _first_sample_per_speaker(df_ar_val_char.loc[df_ar_val_char['SentenceType'] == 'p'])

# NOTE: this overwrites `df_ar_val_char` with one row per speaker, so this cell is not
# idempotent. Re-run the cell that builds `df_ar_val_char` before running this cell again,
# otherwise the question/answer/presentation subsets are computed from the reduced table.
df_ar_val_char = _first_sample_per_speaker(df_ar_val_char)

_subsets_by_type = (
    ('Questions', df_questions),
    ('Answers', df_answers),
    ('Presentations', df_presentation),
)


def _print_counts(title, column, overall):
    """Print the value counts of `column` for `overall` and for each sentence-type subset."""
    print(title + ' overall (in questions and answers):')
    print(overall[column].value_counts())
    for type_name, subset in _subsets_by_type:
        print(title + ' in ' + type_name + ':')
        print(subset[column].value_counts())


_print_counts('Male/Female', 'Sex', df_ar_val_char)

# NOTE(review): the two "overall" counts below are taken from the complete character table,
# whereas the Male/Female count above only covers speakers that occur in the audio data.
# Confirm whether `df_ar_val_char` should be used here as well.
print()
_print_counts('Grad Student/PhD', 'Academic Status', char_data)

print()
_print_counts('Native Speaker', 'IsNativeSpeaker', char_data)

Male/Female overall (in questions and answers):
Male      209
Female    111
Name: Sex, dtype: int64
Male/Female in Questions:
Male      156
Female     62
Name: Sex, dtype: int64
Male/Female in Answers:
Male      53
Female    50
Name: Sex, dtype: int64
Male/Female in Presentations:
Female    41
Name: Sex, dtype: int64

Grad Student/PhD overall (in questions and answers):
Grad Student    69
PhD             49
Name: Academic Status, dtype: int64
Grad Student/PhD in Questions:
PhD             12
Grad Student     4
Name: Academic Status, dtype: int64
Grad Student/PhD in Answers:
Grad Student    65
PhD             37
Name: Academic Status, dtype: int64
Grad Student/PhD in Presentations:
Grad Student    20
PhD             20
Name: Academic Status, dtype: int64

Native Speaker overall (in questions and answers):
Native Speaker       175
Asian Non-Native      99
Europ. Non-Native     52
-                      1
Name: IsNativeSpeaker, dtype: int64
Native Speaker in Questions:
Native Speaker     