# Kruskal-Wallis Test

## Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

## Load .csv data with results of OpenSMILE Analysis
First we load the .csv data and clean it (removing NaNs). Then we store the information from all files in separate pandas DataFrames containing information about affect, emotion, level of interest and valence/arousal for all participants.

In [2]:
data = pd.read_csv("CHI_2019_FULL.csv")

# Column labels of each OpenSMILE result group. They are defined once here and reused both for the
# column selection below and as the attribute lists handed to the tests in later cells.
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

# Split the full table into one data frame per result group, each keeping the 'Filename' column.
# The explicit .copy() gives every frame its own data: a 'Char_ID' column is written to each frame
# below, and writing to a bare selection of `data` can trigger pandas' SettingWithCopyWarning.
df_emotion = data[emotion_label + ['Filename']].copy(deep=True)
df_affect = data[affect_label + ['Filename']].copy(deep=True)
df_loi = data[loi_label + ['Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)

# The character ID is stored in the file name together with other information. Removing every run of
# non-digit characters leaves only the digits, which are appended to each frame as 'Char_ID'.
# The pattern is a raw string: '\D' inside a normal string literal is an invalid escape sequence and
# raises a warning on newer Python versions. The pattern value itself is unchanged.
non_digits = r'\D+'
df_emotion['Char_ID'] = df_emotion['Filename'].replace(non_digits, '', regex=True)
df_affect['Char_ID'] = df_affect['Filename'].replace(non_digits, '', regex=True)
df_loi['Char_ID'] = df_loi['Filename'].replace(non_digits, '', regex=True)
df_ar_val['Char_ID'] = df_ar_val['Filename'].replace(non_digits, '', regex=True)

## Let's load information about the speakers
The speaker information is saved in a single .csv file containing four important columns: ID, Age, Sex and Academic Status. The OpenSMILE .csv files loaded above are named after the corresponding speaker ID (e.g. the speaker with ID 0 has the two files 0_a.csv and 0_b.csv), so that a link between both tables can be created.

In [3]:
# Speaker table; its 'ID' column is the key the OpenSMILE frames are joined on
char_data = pd.read_csv("CHI_2019_CharacterData.csv")

# Both join columns need the same data type, so every key column is cast to int before merging
char_data['ID'] = char_data['ID'].astype(int)
for frame in (df_ar_val, df_emotion, df_affect, df_loi):
    frame['Char_ID'] = frame['Char_ID'].astype(int)

# Left join: every row of the OpenSMILE frame is kept and extended by the columns of the speaker
# whose 'ID' equals the row's 'Char_ID'. The results are stored in new data frames.
join_args = dict(how='left', left_on='Char_ID', right_on='ID')
df_ar_val_char = df_ar_val.merge(char_data, **join_args)
df_emotion_char = df_emotion.merge(char_data, **join_args)
df_affect_char = df_affect.merge(char_data, **join_args)
df_loi_char = df_loi.merge(char_data, **join_args)

## Kruskal-Wallis Test
We start with the characteristic sex. The null hypothesis states that the two categorical variables sex and e.g. emotion are independent.
Since Kruskal-Wallis does not work with measurement data, we convert our data as we did when using the chi² test. We do that by calculating the quartiles and then iterating over the data points and categorizing them into '1st quartile', '2nd quartile', '3rd quartile' and '4th quartile'. We do that for every group, e.g. once for females and once for males, and use these observation counts as input for our Kruskal-Wallis test.
The function used is from scipy.stats and is called in the hp.kruskal_wallis() function, which iterates over all attributes of a data frame (e.g. when the emotion data frame is given, it iterates over all different emotions) and generates a table of size 2x4 (Male/Female x 4 quartiles) for each attribute.

In [4]:
# Two groups are compared, so the degrees of freedom are 2 - 1 = 1
print('DOF: 1')

# One entry per result group: (header to print, merged data frame, attribute columns to test)
sex_sections = [
    ('EMOTION\n', df_emotion_char, emotion_label),
    ('\nAFFECT\n', df_affect_char, affect_label),
    ('\nAROUSAL-VALENCE\n', df_ar_val_char, ['Arousal', 'Valence']),
    ('\nLEVEL OF INTEREST\n', df_loi_char, loi_label),
]

sex_results = []
for header, frame, attributes in sex_sections:
    print(header)
    # NOTE(review): the trailing True is passed positionally; its meaning is defined by
    # hp.kruskal_wallis in the Helper module - confirm there.
    sex_results.append(hp.kruskal_wallis(frame, attributes, 'Sex', True))

# Keep the per-group results under the same names as before
emo_sex, aff_sex, ar_val_sex, loi_sex = sex_results

DOF: 1
EMOTION

Anger: 		KruskalResult(statistic=2.1341463414634134, pvalue=0.14405063390965472)
Boredom: 	KruskalResult(statistic=1.3333333333333321, pvalue=0.2482130789899204)
Disgust: 	KruskalResult(statistic=0.75, pvalue=0.3864762307712325)
Fear: 		KruskalResult(statistic=1.3333333333333321, pvalue=0.2482130789899204)
Happiness: 	KruskalResult(statistic=1.7078313253012047, pvalue=0.19126698687886493)
Emo_Neutral: 	KruskalResult(statistic=2.083333333333332, pvalue=0.14891467317876178)
Sadness: 	KruskalResult(statistic=0.33333333333333215, pvalue=0.5637028616507738)

AFFECT

Aggressiv: 	KruskalResult(statistic=0.75, pvalue=0.3864762307712325)
Cheerful: 	KruskalResult(statistic=0.19936708860759494, pvalue=0.6552321995577776)
Intoxicated: 	KruskalResult(statistic=1.3333333333333321, pvalue=0.2482130789899204)
Nervous: 	KruskalResult(statistic=1.7078313253012047, pvalue=0.19126698687886493)
Aff_Neutral: 	KruskalResult(statistic=4.1829268292682915, pvalue=0.040833125861299606)
Tired: 		K

Now move on to academic status, the hypothesis being that the variables academic status and e.g. emotion are independent.

In [5]:
# Only two groups are compared, so the degrees of freedom are 2 - 1 = 1
print('DOF: 1')

# One entry per result group: (header to print, merged data frame, attribute columns to test)
academic_sections = [
    ('EMOTION\n', df_emotion_char, emotion_label),
    ('\nAFFECT\n', df_affect_char, affect_label),
    ('\nAROUSAL-VALENCE\n', df_ar_val_char, ['Arousal', 'Valence']),
    ('\nLEVEL OF INTEREST\n', df_loi_char, loi_label),
]

academic_results = []
for header, frame, attributes in academic_sections:
    print(header)
    # NOTE(review): the trailing True is passed positionally; its meaning is defined by
    # hp.kruskal_wallis in the Helper module - confirm there.
    academic_results.append(hp.kruskal_wallis(frame, attributes, 'Academic', True))

# Keep the per-group results under the same names as before
emo_aca, aff_aca, ar_val_aca, loi_aca = academic_results

DOF: 1
EMOTION

Anger: 		KruskalResult(statistic=0.5271084337349385, pvalue=0.4678250772849709)
Boredom: 	KruskalResult(statistic=0.5271084337349385, pvalue=0.4678250772849709)
Disgust: 	KruskalResult(statistic=0.08333333333333215, pvalue=0.7728299926844492)
Fear: 		KruskalResult(statistic=1.4177215189873404, pvalue=0.23377879418404465)
Happiness: 	KruskalResult(statistic=1.0331325301204808, pvalue=0.30942406036465636)
Emo_Neutral: 	KruskalResult(statistic=1.3333333333333321, pvalue=0.2482130789899204)
Sadness: 	KruskalResult(statistic=0.33734939759036026, pvalue=0.5613632102341244)

AFFECT

Aggressiv: 	KruskalResult(statistic=0.021084337349396392, pvalue=0.8845494388529656)
Cheerful: 	KruskalResult(statistic=0.19207317073170732, pvalue=0.6611967079369339)
Intoxicated: 	KruskalResult(statistic=0.75, pvalue=0.3864762307712325)
Nervous: 	KruskalResult(statistic=0.21283783783783786, pvalue=0.6445521705158992)
Aff_Neutral: 	KruskalResult(statistic=0.5271084337349385, pvalue=0.4678250772849

Now let's look at Native Speaker

In [6]:
# Three groups are compared, so the degrees of freedom are 3 - 1 = 2
print('DOF: 2')

# One entry per result group: (header to print, merged data frame, attribute columns to test)
speaker_sections = [
    ('EMOTION\n', df_emotion_char, emotion_label),
    ('\nAFFECT\n', df_affect_char, affect_label),
    ('\nAROUSAL-VALENCE\n', df_ar_val_char, ['Arousal', 'Valence']),
    ('\nLEVEL OF INTEREST\n', df_loi_char, loi_label),
]

speaker_results = []
for header, frame, attributes in speaker_sections:
    print(header)
    # NOTE(review): the trailing True is passed positionally; its meaning is defined by
    # hp.kruskal_wallis in the Helper module - confirm there.
    speaker_results.append(hp.kruskal_wallis(frame, attributes, 'IsNativeSpeaker', True))

# Keep the per-group results under the same names as before
emo_sp, aff_sp, ar_val_sp, loi_sp = speaker_results

DOF: 2
EMOTION

Anger: 		KruskalResult(statistic=7.4201754385964955, pvalue=0.024475376214658978)
Boredom: 	KruskalResult(statistic=7.4726148409894035, pvalue=0.023841979190511814)
Disgust: 	KruskalResult(statistic=7.4201754385964955, pvalue=0.024475376214658978)
Fear: 		KruskalResult(statistic=7.555160142348758, pvalue=0.02287798759336179)
Happiness: 	KruskalResult(statistic=7.523767605633803, pvalue=0.023239919686917417)
Emo_Neutral: 	KruskalResult(statistic=7.449122807017548, pvalue=0.024123678667676726)
Sadness: 	KruskalResult(statistic=7.59154929577465, pvalue=0.02246549622526589)

AFFECT

Aggressiv: 	KruskalResult(statistic=7.423076923076927, pvalue=0.024439894496176142)
Cheerful: 	KruskalResult(statistic=7.410526315789476, pvalue=0.024593744477959093)
Intoxicated: 	KruskalResult(statistic=7.384615384615387, pvalue=0.024914440787632705)
Nervous: 	KruskalResult(statistic=7.645390070921987, pvalue=0.02186878425495871)
Aff_Neutral: 	KruskalResult(statistic=7.4753521126760605, pvalue