# Kruskal-Wallis Test

## Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
import sklearn.preprocessing as pp
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


## Load .csv data with results of OpenSMILE Analysis
First we load the .csv data and clean it (removing NaNs); then we store the information from all files in separate pandas DataFrames containing the emotion, affect, level-of-interest and valence/arousal values for all participants.

In [2]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

# Load the OpenSMILE results for the long and the short samples.
data = pd.read_csv("UIST2019_OpenSMILE.csv")
data_short = pd.read_csv("UIST_2019_short_samples_OpenSMILE.csv")

# Feature columns that belong to each analysis.
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']


def _feature_frame(samples, feature_columns, is_short):
    """Build one analysis frame from a loaded OpenSMILE table.

    Parameters
    ----------
    samples : pandas.DataFrame
        Table read from one of the .csv files; must contain every name in
        `feature_columns` plus a 'Filename' column.
    feature_columns : list of str
        Feature columns to keep (in this order), followed by 'Filename'.
    is_short : str
        Value written to every row of the new 'IsShort' column.

    Returns
    -------
    pandas.DataFrame
        Independent copy with the columns
        `feature_columns + ['Filename', 'Char_ID', 'IsShort']`.
    """
    # Copy the selection: columns are added below, and writing to a slice of
    # `samples` would trigger pandas' SettingWithCopyWarning.
    frame = samples[feature_columns + ['Filename']].copy()
    # Char_ID is the filename with every run of non-digit characters removed.
    # The raw string avoids the invalid-escape warning that '\D' raises.
    # NOTE(review): this keeps *all* digits of the filename, so it assumes the
    # character ID is the only number in it -- confirm against the file names.
    frame['Char_ID'] = frame['Filename'].replace(r'\D+', '', regex=True)
    # Flag that records whether the row comes from a short or a long sample.
    frame['IsShort'] = is_short
    return frame


# Short-sample frames. 'IsShort' holds the strings 'True' / 'False' (not
# booleans); it is used as the grouping column of the tests further down.
df_emotion_s = _feature_frame(data_short, emotion_label, 'True')
df_affect_s = _feature_frame(data_short, affect_label, 'True')
df_loi_s = _feature_frame(data_short, loi_label, 'True')
df_ar_val_s = _feature_frame(data_short, ['Arousal', 'Valence'], 'True')

# Stack the long samples on top of the short ones. `ignore_index=True` gives
# the result a fresh 0..n-1 index; `keys` is not passed because it labels the
# concatenated pieces (not columns) and is discarded when the index is ignored.
df_emotion = pd.concat([_feature_frame(data, emotion_label, 'False'), df_emotion_s], ignore_index=True)
df_affect = pd.concat([_feature_frame(data, affect_label, 'False'), df_affect_s], ignore_index=True)
df_loi = pd.concat([_feature_frame(data, loi_label, 'False'), df_loi_s], ignore_index=True)
df_ar_val = pd.concat([_feature_frame(data, ['Arousal', 'Valence'], 'False'), df_ar_val_s], ignore_index=True)

## Kruskal-Wallis Test

The `scipy.stats` documentation says to pass the measurement data as input, so we go ahead and use our floating-point data.

# Kruskal-Wallis Test


In [3]:
# Kruskal-Wallis test per feature group, with 'IsShort' as the grouping column.
# NOTE(review): the result names end in `_sex` although the grouping column is
# 'IsShort' -- this looks like a leftover from an earlier analysis. The names
# are kept unchanged so that any cell relying on them keeps working.
def _kw_is_short(frame, columns):
    """Run hp.kruskal_wallis on `frame` for `columns`, grouped by 'IsShort'."""
    # The trailing True is passed through as-is; its meaning is defined in the
    # Helper module (presumably a print/verbosity switch -- TODO confirm).
    return hp.kruskal_wallis(frame, columns, 'IsShort', True)


print('DOF: 1')  # two groups, so DOF = 2 - 1 = 1

print('EMOTION\n')
emo_sex = _kw_is_short(df_emotion, emotion_label)

print('\nAFFECT\n')
aff_sex = _kw_is_short(df_affect, affect_label)

print('\nAROUSAL-VALENCE\n')
ar_val_sex = _kw_is_short(df_ar_val, ['Arousal', 'Valence'])

print('\nLEVEL OF INTEREST\n')
loi_sex = _kw_is_short(df_loi, loi_label)

DOF: 1
EMOTION



NameError: name 'df_emotion_char' is not defined

Now we move on to academic status; the null hypothesis is that academic status and, e.g., emotion are independent.

In [None]:
# Kruskal-Wallis test per feature group, with 'Academic Status' as the grouping
# column.
# NOTE(review): the `df_*_char` frames are not defined in any cell visible in
# this notebook, so this cell raises a NameError on a fresh kernel unless they
# are created elsewhere -- confirm where they come from.
# NOTE(review): the level-of-interest call uses 'Normal Interest', whereas
# `loi_label` names that column 'Normal' -- confirm the column names of
# `df_loi_char`.
def _kw_academic_status(frame, columns):
    """Run hp.kruskal_wallis on `frame` for `columns`, grouped by 'Academic Status'."""
    # The trailing True is passed through as-is; its meaning is defined in the
    # Helper module (presumably a print/verbosity switch -- TODO confirm).
    return hp.kruskal_wallis(frame, columns, 'Academic Status', True)


print('DOF: 1')  # only two groups, so DOF = 2 - 1 = 1

print('EMOTION\n')
emo_aca = _kw_academic_status(df_emotion_char, emotion_label)

print('\nAFFECT\n')
aff_aca = _kw_academic_status(df_affect_char, affect_label)

print('\nAROUSAL-VALENCE\n')
ar_val_aca = _kw_academic_status(df_ar_val_char, ['Arousal', 'Valence'])

print('\nLEVEL OF INTEREST\n')
loi_aca = _kw_academic_status(df_loi_char, ['Normal Interest', 'High Interest'])