In [8]:
import numpy as np
import scipy.stats as stats
from sklearn.utils import resample
from sklearn.datasets import load_iris


# Exercise 1

## a)
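The Shapiro-Wilk p-value (about 0.46) is well above the usual 0.05 significance level, so we cannot reject H0 (the data are normally distributed). We therefore have no evidence that the sample deviates from a normal distribution.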

In [11]:
# Load the iris dataset and keep feature column 0 (stored as `sepal_length`)
# for the observations whose target label is 0.
iris = load_iris()
is_class_zero = iris.target == 0
sepal_length = iris.data[is_class_zero, 0]
print(sepal_length)

# Shapiro-Wilk test on the selected sample; the result carries both the
# test statistic and the p-value.
shapiro_test = stats.shapiro(sepal_length)
print("Shapiro-Wilk test:", shapiro_test)

[5.1 4.9 4.7 4.6 5.  5.4 4.6 5.  4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1
 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.  5.  5.2 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.
 5.5 4.9 4.4 5.1 5.  4.5 4.4 5.  5.1 4.8 5.1 4.6 5.3 5. ]
Shapiro-Wilk test: ShapiroResult(statistic=np.float64(0.977698549796646), pvalue=np.float64(0.4595131499174534))


## b)

In [16]:
# Parameters of the reference normal distribution, estimated from the sample
# itself: the sample mean and the ddof=1 standard deviation.
sample_mean = sepal_length.mean()
sample_sd = sepal_length.std(ddof=1)

# One-sample Kolmogorov-Smirnov test against N(sample_mean, sample_sd).
# Only the test statistic is reported by this cell.
ks_result = stats.kstest(sepal_length, 'norm', args=(sample_mean, sample_sd))
D_statistic, p_val = ks_result
print("KS test statistic:", D_statistic)

KS test statistic: 0.11485990669608126


## c)

In [18]:
# Monte Carlo simulation of the KS statistic under normality when the normal
# parameters are estimated from each sample.
MC = 1000               # number of simulated samples
n = len(sepal_length)   # simulated samples have the same size as the observed one
D_simulated = np.zeros(MC)

# Parameters of the generating normal distribution. They do not change
# between iterations, so they are computed once outside the loop.
null_mean = np.mean(sepal_length)
null_sd = np.std(sepal_length, ddof=1)

np.random.seed(42)  # fixed seed so the simulated statistics are reproducible
for i in range(MC):
    sample_data = np.random.normal(null_mean, null_sd, size=n)
    # Mean and sd are re-estimated from every simulated sample, mirroring how
    # the statistic is computed on the observed data. Reading `.statistic`
    # avoids assigning to `_`, which IPython uses for the last cell output.
    D_simulated[i] = stats.kstest(
        sample_data,
        'norm',
        args=(np.mean(sample_data), np.std(sample_data, ddof=1)),
    ).statistic
D_simulated

array([0.06348822, 0.06705608, 0.0685632 , 0.10366147, 0.10360938,
       0.04282221, 0.06712948, 0.06573137, 0.07742559, 0.08907513,
       0.09763026, 0.12146784, 0.05970394, 0.06484729, 0.08391585,
       0.08376318, 0.11071651, 0.10854447, 0.07823841, 0.06544974,
       0.0554619 , 0.09984325, 0.07815942, 0.08676994, 0.11915152,
       0.09209523, 0.08900777, 0.07128814, 0.07218411, 0.05445068,
       0.0653243 , 0.07150796, 0.10127406, 0.0721623 , 0.06763599,
       0.07055909, 0.09018982, 0.10130643, 0.07702605, 0.09319373,
       0.07736798, 0.07194725, 0.06896811, 0.08747589, 0.07069654,
       0.10662964, 0.11312019, 0.10469308, 0.0585508 , 0.06998809,
       0.11071148, 0.0679622 , 0.0635838 , 0.14485301, 0.06317996,
       0.1066234 , 0.12870242, 0.09754828, 0.07697878, 0.07239531,
       0.08653261, 0.05823866, 0.07041475, 0.10973209, 0.09796116,
       0.08519117, 0.1187268 , 0.08364424, 0.07719093, 0.11407824,
       0.0863812 , 0.0693306 , 0.06354893, 0.07659883, 0.07552

## d)
The approximate Monte Carlo p-value (0.093) is above 0.05, so we cannot reject H0 at the 5% significance level. We therefore have no strong evidence that the data are not normally distributed.

In [19]:
# Approximate p-value: the fraction of simulated KS statistics that are
# strictly larger than the statistic observed on the data.
exceeds_observed = D_simulated > D_statistic
p_value_approx = exceeds_observed.mean()
print("Approximate p-value:", p_value_approx)

Approximate p-value: 0.093
