# OpenSMILE ANOVA test

In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

# Load the OpenSMILE results (one row per audio file)
data = pd.read_csv("UIST2019_OpenSMILE.csv")

# Column names of each label group
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

# Split the results into one data frame per label group; every frame keeps the
# 'Filename' column. .copy(deep=True) makes each frame independent of `data`,
# so adding the 'Char_ID' column below does not raise a SettingWithCopyWarning.
df_emotion = data[emotion_label + ['Filename']].copy(deep=True)
df_affect = data[affect_label + ['Filename']].copy(deep=True)
df_loi = data[loi_label + ['Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)

# The character ID is stored, together with other information, in the filename.
# Removing every non-digit character leaves the ID, which is added as 'Char_ID'.
# The pattern is a raw string so that '\D' is not treated as an (invalid)
# string escape sequence.
# NOTE(review): all digits of the filename are concatenated - confirm that the
# filenames contain no digits other than the character ID.
df_emotion['Char_ID'] = df_emotion['Filename'].replace(r'\D+', '', regex=True)
df_affect['Char_ID'] = df_affect['Filename'].replace(r'\D+', '', regex=True)
df_loi['Char_ID'] = df_loi['Filename'].replace(r'\D+', '', regex=True)
df_ar_val['Char_ID'] = df_ar_val['Filename'].replace(r'\D+', '', regex=True)

# Load data about the speakers
char_data = pd.read_csv("UIST2019_CharacterData.csv")

# Join the frames above with the character table.
# Both join keys are cast to int so that they have the same data type.
char_data['ID'] = char_data['ID'].astype(int)
df_ar_val['Char_ID'] = df_ar_val['Char_ID'].astype(int)
df_emotion['Char_ID'] = df_emotion['Char_ID'].astype(int)
df_affect['Char_ID'] = df_affect['Char_ID'].astype(int)
df_loi['Char_ID'] = df_loi['Char_ID'].astype(int)

# Save the joined data frames. A left join keeps every OpenSMILE row, even
# when no matching character entry exists.
df_ar_val_char = df_ar_val.merge(char_data, how='left', left_on='Char_ID', right_on='ID')
df_emotion_char = df_emotion.merge(char_data, how='left', left_on='Char_ID', right_on='ID')
df_affect_char = df_affect.merge(char_data, how='left', left_on='Char_ID', right_on='ID')
df_loi_char = df_loi.merge(char_data, how='left', left_on='Char_ID', right_on='ID')

FileNotFoundError: File b'UIST_2019_short_samples_OpenSMILE.csv' does not exist

## ANOVA Test
### Starting with Character Feature = Sex

In [None]:
# For the Sex analysis, every character column except 'Sex' is removed
# from the joined frames before they are passed to the ANOVA helper.
drop_for_sex = ['Char_ID', 'ID', 'Filename', 'Age', 'Academic Status']

# Emotion vs. Sex
emo_temp_sex = df_emotion_char.drop(columns=drop_for_sex)
anova_emo_sex = hp.f_anova(emo_temp_sex, emotion_label, 'Sex')
hp.displayANOVA(anova_emo_sex, emotion_label, 'Emotion', 'Sex')

# Affect vs. Sex
aff_temp_sex = df_affect_char.drop(columns=drop_for_sex)
anova_aff_sex = hp.f_anova(aff_temp_sex, affect_label, 'Sex')
hp.displayANOVA(anova_aff_sex, affect_label, 'Affect', 'Sex')

# Level of Interest vs. Sex
loi_temp_sex = df_loi_char.drop(columns=drop_for_sex)
anova_loi_sex = hp.f_anova(loi_temp_sex, loi_label, 'Sex')
hp.displayANOVA(anova_loi_sex, loi_label, 'Level of Interest', 'Sex')

# Arousal-Valence vs. Sex
arval_temp_sex = df_ar_val_char.drop(columns=drop_for_sex)
anova_arval_sex = hp.f_anova(arval_temp_sex, ['Valence', 'Arousal'], 'Sex')
hp.displayANOVA(anova_arval_sex, ['Valence', 'Arousal'], 'Arousal-Valence', 'Sex')

### CharacterFeature = Age
Since age cannot be reliably inferred, 'Age' is used here as a proxy: the time elapsed since the person received their degree. Currently, people who received their degree after 2010 (> 2010) are counted as young, people who received it after 1990 and no later than 2010 (> 1990 and <= 2010) are counted as intermediate, and everyone who received it in 1990 or earlier (<= 1990) is counted as old.

In [None]:
# For the Age analysis, every character column except 'Age' is removed
# from the joined frames before they are passed to the ANOVA helper.
drop_for_age = ['Char_ID', 'ID', 'Filename', 'Sex', 'Academic Status']

# Emotion vs. Age
emo_temp_age = df_emotion_char.drop(columns=drop_for_age)
anova_emo_age = hp.f_anova(emo_temp_age, emotion_label, 'Age')
hp.displayANOVA(anova_emo_age, emotion_label, 'Emotion', 'Age')

# Affect vs. Age
aff_temp_age = df_affect_char.drop(columns=drop_for_age)
anova_aff_age = hp.f_anova(aff_temp_age, affect_label, 'Age')
hp.displayANOVA(anova_aff_age, affect_label, 'Affect', 'Age')

# Level of Interest vs. Age
loi_temp_age = df_loi_char.drop(columns=drop_for_age)
anova_loi_age = hp.f_anova(loi_temp_age, loi_label, 'Age')
hp.displayANOVA(anova_loi_age, loi_label, 'Level of Interest', 'Age')

# Arousal-Valence vs. Age
arval_temp_age = df_ar_val_char.drop(columns=drop_for_age)
anova_arval_age = hp.f_anova(arval_temp_age, ['Valence', 'Arousal'], 'Age')
hp.displayANOVA(anova_arval_age, ['Valence', 'Arousal'], 'Arousal-Valence', 'Age')

### CharacterFeature = Academic Status
For the moment we have six different academic groups (Master Student, PhD Student, PostDoc, Assistant Professor, Professor and Researcher). This may be too many, since the samples are not equally distributed among the groups. Some of the groups may later be merged to achieve a higher sample count per group.

In [None]:
# For the Academic Status analysis, every character column except
# 'Academic Status' is removed from the joined frames. The grouping column
# itself must be kept ('Age' is dropped instead), because it is the factor
# handed to the ANOVA helper below.
# Results are stored under *_status names so that the Age results
# (*_age variables) are not overwritten.

# Emotion and Academic Status
emo_temp_status = df_emotion_char.drop(['Char_ID', 'ID', 'Filename', 'Sex', 'Age'], axis=1)
anova_emo_status = hp.f_anova(emo_temp_status, emotion_label, 'Academic Status')
hp.displayANOVA(anova_emo_status, emotion_label, 'Emotion', 'Academic Status')

# Affect and Academic Status
aff_temp_status = df_affect_char.drop(['Char_ID', 'ID', 'Filename', 'Sex', 'Age'], axis=1)
anova_aff_status = hp.f_anova(aff_temp_status, affect_label, 'Academic Status')
hp.displayANOVA(anova_aff_status, affect_label, 'Affect', 'Academic Status')

# Level of Interest and Academic Status
loi_temp_status = df_loi_char.drop(['Char_ID', 'ID', 'Filename', 'Sex', 'Age'], axis=1)
anova_loi_status = hp.f_anova(loi_temp_status, loi_label, 'Academic Status')
hp.displayANOVA(anova_loi_status, loi_label, 'Level of Interest', 'Academic Status')

# Arousal-Valence and Academic Status
arval_temp_status = df_ar_val_char.drop(['Char_ID', 'ID', 'Filename', 'Sex', 'Age'], axis=1)
anova_arval_status = hp.f_anova(arval_temp_status, ['Valence', 'Arousal'], 'Academic Status')
hp.displayANOVA(anova_arval_status, ['Valence', 'Arousal'], 'Arousal-Valence', 'Academic Status')