# Kruskal Wallis Test

## Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
import seaborn as sns
import Helper as hp

## Load .csv data with results of OpenSMILE Analysis
First we load the .csv data and clean it (removing NaNs). Then we store the information in separate pandas DataFrames, one each for emotion, affect, level of interest and valence/arousal, covering all participants.

In [2]:
# Load the combined OpenSMILE analysis results.
data = pd.read_csv("CHI_2019_FULL.csv")

# Column names of each score group; also used to select the columns below.
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

# Split the table into one frame per score group, each keeping the 'Filename' column.
# The slices are copied explicitly because a 'Char_ID' column is written to every
# frame below; writing to an un-copied slice triggers pandas' SettingWithCopyWarning.
df_emotion = data[emotion_label + ['Filename']].copy(deep=True)
df_affect = data[affect_label + ['Filename']].copy(deep=True)
df_loi = data[loi_label + ['Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)

# The character ID is stored in the file name together with other information.
# Removing every run of non-digit characters leaves only the digits, which are
# appended to each frame as 'Char_ID' (still as strings at this point).
# The pattern is a raw string: '\D' inside a normal string literal is an invalid
# escape sequence and makes newer Python versions emit a warning.
# NOTE(review): all digits of the file name are concatenated, so this assumes the
# ID is the only number in the name - confirm against the actual file names.
df_emotion['Char_ID'] = df_emotion['Filename'].replace(r'\D+', '', regex=True)
df_affect['Char_ID'] = df_affect['Filename'].replace(r'\D+', '', regex=True)
df_loi['Char_ID'] = df_loi['Filename'].replace(r'\D+', '', regex=True)
df_ar_val['Char_ID'] = df_ar_val['Filename'].replace(r'\D+', '', regex=True)

## Let's load information about the speakers
The speaker information is saved in a single .csv file containing four important columns: ID, Age, Sex and Academic Status. The previously loaded OpenSMILE .csv files are named using the corresponding speaker ID (e.g. the speaker with ID 0 has two files, 0_a.csv and 0_b.csv), so the two tables can be linked via this ID.

In [3]:
# Read the speaker table; its 'ID' column is the join key and is cast to int.
char_data = pd.read_csv("CHI_2019_CharacterData.csv").astype({'ID': int})

# Both sides of a join need the same dtype, so the 'Char_ID' column of every
# score frame is cast to int as well.
for _frame in (df_ar_val, df_emotion, df_affect, df_loi):
    _frame['Char_ID'] = _frame['Char_ID'].astype(int)

# Left join: every score row is kept and the speaker columns are added where
# 'Char_ID' matches the speaker 'ID'.
_join_kwargs = dict(how='left', left_on='Char_ID', right_on='ID')

# Save the joined tables as new data frames.
df_ar_val_char = pd.merge(df_ar_val, char_data, **_join_kwargs)
df_emotion_char = pd.merge(df_emotion, char_data, **_join_kwargs)
df_affect_char = pd.merge(df_affect, char_data, **_join_kwargs)
df_loi_char = pd.merge(df_loi, char_data, **_join_kwargs)

## Kruskal-Wallis Test

The SciPy stats documentation says to pass the raw measurement data as input, so we use our floating-point data directly.

# Kruskal-Wallis Test


In [7]:
# Kruskal-Wallis tests with 'Sex' as the grouping column.
print('DOF: 1')  # two groups, so DOF = 2 - 1 = 1

# One entry per score group: (heading to print, joined frame, columns to test).
_sex_runs = [
    ('EMOTION\n', df_emotion_char, emotion_label),
    ('\nAFFECT\n', df_affect_char, affect_label),
    ('\nAROUSAL-VALENCE\n', df_ar_val_char, ['Arousal', 'Valence']),
    ('\nLEVEL OF INTEREST\n', df_loi_char, loi_label),
]

# The trailing True is passed on to the helper unchanged; presumably it enables
# printing of the per-column results - confirm in Helper.kruskal_wallis.
_sex_results = []
for _heading, _frame, _columns in _sex_runs:
    print(_heading)
    _sex_results.append(hp.kruskal_wallis(_frame, _columns, 'Sex', True))

emo_sex, aff_sex, ar_val_sex, loi_sex = _sex_results

DOF: 1
EMOTION

Anger: 		KruskalResult(statistic=12.10792219532011, pvalue=0.00050208051136296)
Boredom: 	KruskalResult(statistic=0.059596815474917526, pvalue=0.8071343284761268)
Disgust: 	KruskalResult(statistic=116.73646522514221, pvalue=3.2786242675600375e-27)
Fear: 		KruskalResult(statistic=0.45714063050047216, pvalue=0.49896334396893793)
Happiness: 	KruskalResult(statistic=7.595306743680559, pvalue=0.005852043797254626)
Emo_Neutral: 	KruskalResult(statistic=0.4818464621593766, pvalue=0.48758713346749205)
Sadness: 	KruskalResult(statistic=98.20940354467814, pvalue=3.763983175886507e-23)

AFFECT

Aggressiv: 	KruskalResult(statistic=63.570988787629176, pvalue=1.546906203680449e-15)
Cheerful: 	KruskalResult(statistic=7.884114171044922, pvalue=0.004987089918653156)
Intoxicated: 	KruskalResult(statistic=45.271512799143714, pvalue=1.7152601582290883e-11)
Nervous: 	KruskalResult(statistic=14.971382699118505, pvalue=0.00010915404621161421)
Aff_Neutral: 	KruskalResult(statistic=5.4971237171

Now we move on to academic status. The null hypothesis is that academic status and, e.g., emotion are independent.

In [8]:
# Kruskal-Wallis tests with 'Academic Status' as the grouping column.
print('DOF: 1')  # only two groups, so DOF = 2 - 1 = 1

# One entry per score group: (heading to print, joined frame, columns to test).
_aca_runs = [
    ('EMOTION\n', df_emotion_char, emotion_label),
    ('\nAFFECT\n', df_affect_char, affect_label),
    ('\nAROUSAL-VALENCE\n', df_ar_val_char, ['Arousal', 'Valence']),
    ('\nLEVEL OF INTEREST\n', df_loi_char, loi_label),
]

# The trailing True is passed on to the helper unchanged; presumably it enables
# printing of the per-column results - confirm in Helper.kruskal_wallis.
_aca_results = []
for _heading, _frame, _columns in _aca_runs:
    print(_heading)
    _aca_results.append(hp.kruskal_wallis(_frame, _columns, 'Academic Status', True))

emo_aca, aff_aca, ar_val_aca, loi_aca = _aca_results

DOF: 1
EMOTION

Anger: 		KruskalResult(statistic=2.4693325441855127, pvalue=0.11608721155874828)
Boredom: 	KruskalResult(statistic=0.10868541662321658, pvalue=0.7416457623387273)
Disgust: 	KruskalResult(statistic=1.548572015012951, pvalue=0.2133463739853536)
Fear: 		KruskalResult(statistic=1.317738251925727, pvalue=0.2509983674584841)
Happiness: 	KruskalResult(statistic=1.9286795815813658, pvalue=0.16490297771826984)
Emo_Neutral: 	KruskalResult(statistic=0.04338775089714166, pvalue=0.8349968664836238)
Sadness: 	KruskalResult(statistic=0.24435273315930278, pvalue=0.6210798663964207)

AFFECT

Aggressiv: 	KruskalResult(statistic=2.488623035421142, pvalue=0.11467201191321429)
Cheerful: 	KruskalResult(statistic=0.2985469884655837, pvalue=0.5847947655265564)
Intoxicated: 	KruskalResult(statistic=2.1282107670656387, pvalue=0.14460946913225348)
Nervous: 	KruskalResult(statistic=0.8843445531604823, pvalue=0.34701449602497425)
Aff_Neutral: 	KruskalResult(statistic=4.610846821487257, pvalue=0.031