In [None]:
import pandas
import numpy
import matplotlib.pyplot as plt

In [None]:
# Read the semicolon-delimited CSV into a DataFrame and display it.
mydata = pandas.read_csv('/content/StudentStatistics.csv', delimiter=';')
mydata

In [None]:
# Contingency table: one row per Location, one column per Mix_NrAct answer.
pandas.crosstab(index=mydata['Location'], columns=mydata['Mix_NrAct'])

In [None]:
!pip install https://raw.githubusercontent.com/stikpet/stikpetP/main/stikpetP.tar.gz
import stikpetP as ps

In [None]:
# Ordinal coding of the Mix_NrAct answer labels, numbered 1 (lowest) to 5 (highest).
# NOTE(review): 'too little' is lower-case unlike the other labels — confirm it
# matches the spelling used in the CSV.
mycoding = dict(zip(
    ['Far too little', 'too little', 'Enough', 'Too much', 'Far too much'],
    range(1, 6),
))

In [None]:
# stikpetP plotting helper; judging by its name it draws stacked bars of Mix_NrAct
# per Location (TODO confirm). mycoding is passed as the third argument, presumably
# to supply the ordinal order of the answer labels — verify in the stikpetP docs.
ps.vi_bar_stacked_multiple(mydata['Location'], mydata['Mix_NrAct'], mycoding)

In [None]:
'''
objective - test whether the nominal variable influences the ordinal variable

nominal variable - school location (column Location)
ordinal variable - rating in column Mix_NrAct ('Far too little' to 'Far too much')

H0 - school location has no influence on the Mix_NrAct rating
Ha - school location has an influence on the Mix_NrAct rating


'''

#Kruskal-Wallis H test

In [None]:
from scipy.stats import kruskal

In [None]:
# Split the Mix_NrAct answers into one Series per school location.
rotterdamScores = mydata.loc[mydata['Location'] == 'Rotterdam', 'Mix_NrAct']
HaarlemScores = mydata.loc[mydata['Location'] == 'Haarlem', 'Mix_NrAct']
DiemenScores = mydata.loc[mydata['Location'] == 'Diemen', 'Mix_NrAct']

In [None]:
# Keep only the respondents that gave an answer (drops the missing values).
rotterdamScores = rotterdamScores[rotterdamScores.notna()]
HaarlemScores = HaarlemScores[HaarlemScores.notna()]
DiemenScores = DiemenScores[DiemenScores.notna()]

In [None]:
# Recode the answer labels to their ordinal ranks from mycoding
# (values that are not keys of mycoding are left unchanged by replace).
rotterdamScores, HaarlemScores, DiemenScores = (
    scores.replace(mycoding)
    for scores in (rotterdamScores, HaarlemScores, DiemenScores)
)

In [None]:
'''
KruskalResult(statistic=3.335068971240797, pvalue=0.1887117641109804)
'''

# Kruskal-Wallis H test over the three per-location samples; the string above is
# presumably the output pasted from an earlier run.
kruskal(*(rotterdamScores, HaarlemScores, DiemenScores))

In [None]:
# Take the p-value from the test result instead of pasting it from an earlier
# output, so both the number and the conclusion follow the data on every
# Restart & Run All.
# Recorded run: pvalue=0.1887117641109804, i.e. p > 0.05 - null hypothesis cannot be rejected.
alpha = 0.05  # significance level used for the decision
pvalue = kruskal(rotterdamScores, HaarlemScores, DiemenScores).pvalue

'null hypothesis rejected' if pvalue < alpha else 'null hypothesis cannot be rejected'

Unfortunately, there is no single agreed-upon effect size measure for the Kruskal-Wallis test. However, epsilon squared (ε<sup>2</sup>) (Kelley, 1935) seems to be a good choice (see King & Minium (2009), as cited in Tomczak & Tomczak, 2014).

An epsilon squared of 0 would mean no differences (and no influence), while a value of 1 would indicate a full dependency. Unfortunately, there is no formal way to determine whether a given value (e.g. 0.40) is high or low, and I have not been able to find any rules of thumb for its interpretation. Since this is a squared measure, I would use the same rule of thumb as for a correlation coefficient, but with the lower and upper bound of each bin squared. Using the interpretation of r from Rea and Parker (2014), this gives the following:

|ε<sup>2</sup>| Interpretation|
|-------|---------------|
|0.00 < 0.01| Negligible|
|0.01 < 0.04 |Weak|
|0.04 < 0.16| Moderate|
|0.16 < 0.36| Relatively strong|
|0.36 < 0.64| Strong|
|0.64 <= 1.00| Very strong|

Let's find out how we can determine this ε<sup>2</sup> with Python, by example. In the formula below, H is the Kruskal-Wallis test statistic and n is the total number of cases.


\begin{equation*}
\epsilon_{KW}^2 = H\times\frac{n+1}{n^2-1}
\end{equation*}

In [None]:
# Grand total of the Location x Mix_NrAct contingency table.
pandas.crosstab(mydata['Location'], mydata['Mix_NrAct']).to_numpy().sum()

In [None]:
# Epsilon squared for the Kruskal-Wallis test: H * (n + 1) / (n**2 - 1),
# which is algebraically the same as H / (n - 1).
#
# H - test statistic, read from the test result rather than pasted from an
#     earlier output, so it cannot go stale when the data change.
# n - number of observations that actually entered the test, i.e. the combined
#     size of the three samples. A crosstab total would also count rows from any
#     location other than the three tested ones.
H = kruskal(rotterdamScores, HaarlemScores, DiemenScores).statistic
n = len(rotterdamScores) + len(HaarlemScores) + len(DiemenScores)

esq = H * ((n + 1)/(n**2 - 1))
esq

In [None]:
# Verbal label for epsilon squared: the first band whose (exclusive) upper bound
# is above esq wins; anything not below 0.64 falls through to 'Very strong'.
qual = next(
    (
        label
        for upper, label in (
            (.01, 'Negligible'),
            (.04, 'Weak'),
            (.16, 'Moderate'),
            (.36, 'Relatively strong'),
            (.64, 'Strong'),
        )
        if esq < upper
    ),
    'Very strong',
)

qual

In [None]:
# Column names used by the second analysis in the cells below (bare strings, no effect).
'Location'
'Teach_Motivate'

# Outline of the statistical-testing steps and the questions to answer;
# as the last expression of the cell this string is what the cell displays.
'''
statistical testing

1- table
2-visualization
3-hypothesis
4-testing
5-effectsize

Q- what is pvalue
Q- conclusion
Q- effect size
'''

In [None]:
# Contingency table: one row per Location, one column per Teach_Motivate answer.
pandas.crosstab(index=mydata['Location'], columns=mydata['Teach_Motivate'])

In [None]:
# Ordinal coding of the Teach_Motivate answer labels, numbered 1 to 5.
# This rebinds mycoding, so cells that expect the Mix_NrAct coding must not be
# re-run after this one.
# NOTE(review): 'Fully Disagree' capitalises its second word while 'Fully agree'
# does not — confirm both match the spelling used in the CSV.
mycoding = dict(zip(
    ['Fully Disagree', 'Disagree', 'Neither disagree nor agree', 'Agree', 'Fully agree'],
    range(1, 6),
))

In [None]:
# stikpetP plotting helper; judging by its name it draws stacked bars of
# Teach_Motivate per Location (TODO confirm). mycoding is passed as the third
# argument, presumably to supply the ordinal order of the answer labels — verify
# in the stikpetP docs.
ps.vi_bar_stacked_multiple(mydata['Location'], mydata['Teach_Motivate'], mycoding)

In [None]:
'''
objective - test whether the nominal variable influences the ordinal variable

nominal variable - school location (column Location)
ordinal variable - motivation level of teachers (column Teach_Motivate)

H0 - school location has no influence on motivation level of teachers
Ha - school location has an influence on motivation level of teachers


'''

In [None]:
from scipy.stats import kruskal

In [None]:
# Split the Teach_Motivate answers into one Series per school location.
rotterdamScores = mydata.loc[mydata['Location'] == 'Rotterdam', 'Teach_Motivate']
HaarlemScores = mydata.loc[mydata['Location'] == 'Haarlem', 'Teach_Motivate']
DiemenScores = mydata.loc[mydata['Location'] == 'Diemen', 'Teach_Motivate']

In [None]:
# Keep only the respondents that gave an answer (drops the missing values).
rotterdamScores = rotterdamScores[rotterdamScores.notna()]
HaarlemScores = HaarlemScores[HaarlemScores.notna()]
DiemenScores = DiemenScores[DiemenScores.notna()]

In [None]:
# Recode the answer labels to their ordinal ranks from mycoding
# (values that are not keys of mycoding are left unchanged by replace).
rotterdamScores, HaarlemScores, DiemenScores = (
    scores.replace(mycoding)
    for scores in (rotterdamScores, HaarlemScores, DiemenScores)
)

In [None]:
# Kruskal-Wallis H test for Teach_Motivate across the three locations.
# Recorded run (matches the values quoted in the cells that follow):
#   KruskalResult(statistic=21.328066442489817, pvalue=2.3370565284336716e-05)
kruskal(rotterdamScores, HaarlemScores, DiemenScores)

In [None]:
# Take the p-value from the test result instead of pasting it from an earlier
# output, and derive the conclusion from it so the two cannot disagree.
# Recorded run: pvalue=2.3370565284336716e-05, i.e. p < 0.05 - null hypothesis is rejected.
alpha = 0.05  # significance level used for the decision
pvalue = kruskal(rotterdamScores, HaarlemScores, DiemenScores).pvalue

'null hypothesis rejected' if pvalue < alpha else 'null hypothesis cannot be rejected'

In [None]:
# Epsilon squared for the Kruskal-Wallis test: H * (n + 1) / (n**2 - 1),
# which is algebraically the same as H / (n - 1).
#
# H - test statistic, read from the test result rather than pasted from an
#     earlier output, so it cannot go stale when the data change.
# n - number of observations that actually entered the test, i.e. the combined
#     size of the three samples. A crosstab total would also count rows from any
#     location other than the three tested ones.
H = kruskal(rotterdamScores, HaarlemScores, DiemenScores).statistic
n = len(rotterdamScores) + len(HaarlemScores) + len(DiemenScores)

esq = H * ((n + 1)/(n**2 - 1))
esq

In [None]:
# Verbal label for epsilon squared: the first band whose (exclusive) upper bound
# is above esq wins; anything not below 0.64 falls through to 'Very strong'.
qual = next(
    (
        label
        for upper, label in (
            (.01, 'Negligible'),
            (.04, 'Weak'),
            (.16, 'Moderate'),
            (.36, 'Relatively strong'),
            (.64, 'Strong'),
        )
        if esq < upper
    ),
    'Very strong',
)

qual