In [1]:
import math
import random

import numpy as np
import scipy.stats

import functions

In [2]:
# Seed passed to random.seed / np.random.seed at the top of every simulation cell.
seed = 42

### **Statistical Analysis of Simulated Data**

In [3]:
# exercise 1

In [4]:
# Sequentially draw Z ~ N(0, 1) until at least 100 values have been generated
# and the estimated standard error of the sample mean, sqrt(S^2 / n), is <= 0.01.
random.seed(seed)

n = 1
# mu and sigma are passed explicitly: their default values only exist on
# Python >= 3.11, and (0.0, 1.0) yields exactly the same draws.
Z = random.normalvariate(0.0, 1.0)

# Every draw is stored; nothing in this cell reads the list back.
values = [Z]

mean = Z
variance = 0

MSE = variance / n

while (n < 100) or (math.sqrt(MSE) > 0.01):

    n = n + 1
    Z = random.normalvariate(0.0, 1.0)

    values.append(Z)

    # One-pass recursive update of the sample mean and the sample variance.
    prev = mean
    mean = prev + (Z - prev) / n
    variance = (n - 2) / (n - 1) * variance + n * (mean - prev)**2

    # Estimated variance of the sample mean with n draws.
    MSE = variance / n

print(f'- Nsim = {n}\nmean = {mean}\nvariance = {variance}')

- Nsim = 10280
mean = -0.004564181439224627
variance = 1.0278139080411315


In [5]:
# exercise 2

In [6]:
def fun_a(x):
    """Return exp(x) / sqrt(2 * x)."""
    return math.exp(x) / math.sqrt(2 * x)


def aux_b(x):
    """Return x**2 * exp(-x**2)."""
    return x**2 * math.exp(-x**2)


def fun_b(y):
    """Return 2 * aux_b(x) / y**2 evaluated at x = (1 - y) / y.

    This is aux_b after the change of variable x = 1/y - 1 (Jacobian 1/y**2).
    The factor 2 presumably accounts for the symmetric negative half-line --
    confirm against the exercise statement.
    """
    x = (1 - y) / y
    return 2 * aux_b(x) / y**2

In [7]:
# Monte Carlo estimate of the mean of fun(U), U ~ Uniform(0, 1): keep sampling
# until at least 100 draws were made and sqrt(S^2 / n) <= tol.
random.seed(seed)

fun = fun_a # fun_a, fun_b
tol = 0.01

# NOTE(review): fun_a(x)**2 = exp(2x) / (2x) is not integrable near 0, so the
# variance estimated below does not converge for fun_a and the stopping rule is
# unreliable for it -- confirm whether that is acceptable for the exercise.

###################################################################

n = 1
U = random.random()
f = fun(U)

mean = f
variance = 0

MSE = variance / n

while (n < 100) or (math.sqrt(MSE) > tol):

    n = n + 1
    U = random.random()
    f = fun(U)

    # One-pass recursive update of the sample mean and the sample variance.
    prev = mean
    mean = prev + (f - prev) / n
    variance = (n - 2) / (n - 1) * variance + n * (mean - prev)**2

    # Estimated variance of the sample mean with n draws.
    MSE = variance / n

###################################################################

print(f'- Nsim = {n}\nI = {mean}\nS = {math.sqrt(variance)}')

- Nsim = 42120
I = 2.067024565424683
S = 2.0522839920653744


In [8]:
def MonteCarlo_integration_v1(fun, tol, n_min=100):
    """Estimate the mean of ``fun(U)``, U ~ Uniform(0, 1), by sequential sampling.

    Values ``fun(random.random())`` are generated one at a time while a running
    sample mean and sample variance are updated recursively. Sampling stops as
    soon as at least ``n_min`` values were generated and the estimated standard
    error of the mean, ``sqrt(variance / n)``, is at most ``tol``.

    Parameters
    ----------
    fun : callable
        Function of one float; it is called on draws from ``random.random()``.
    tol : float
        Upper bound required for the estimated standard error of the mean.
    n_min : int, optional
        Minimum number of draws before the stopping rule may trigger
        (default 100, the value that used to be hard-coded). Because the
        variance starts at 0, values below 2 allow stopping after one draw.

    Returns
    -------
    tuple
        ``(n, mean, variance)``: number of draws, sample mean and sample
        variance of the generated ``fun`` values.
    """
    n = 1
    mean = fun(random.random())
    variance = 0

    # variance / n is the estimated variance of the sample mean.
    while (n < n_min) or (math.sqrt(variance / n) > tol):

        n = n + 1
        f = fun(random.random())

        # One-pass recursive update of the sample mean and sample variance.
        prev = mean
        mean = prev + (f - prev) / n
        variance = (n - 2) / (n - 1) * variance + n * (mean - prev)**2

    return n, mean, variance

# test: re-seeding reproduces the same draws as the inline loop of the previous
# cell, so the printed values can be compared against that cell's output.

random.seed(seed)

n, mean, variance = MonteCarlo_integration_v1(fun_a, tol=0.01)
print(f'- Nsim = {n}\nI = {mean}\nS = {math.sqrt(variance)}')

- Nsim = 42120
I = 2.067024565424683
S = 2.0522839920653744


In [9]:
# exercise 3

In [10]:
def aux_c(x):
    """Return sin(x) / x."""
    return math.sin(x) / x


# Integration limits used by fun_c.
a = math.pi
b = 2 * math.pi


def fun_c(y):
    """Return aux_c at a + (b - a) * y, scaled by (b - a).

    This maps y in (0, 1) linearly onto (a, b); a and b are read from the
    notebook globals at call time.
    """
    return aux_c(a + (b - a) * y) * (b - a)


def aux_d(x):
    """Return 3 / (3 + x**4)."""
    return 3 / (3 + x**4)


def fun_d(y):
    """Return aux_d((1 - y) / y) / y**2.

    This is aux_d after the change of variable x = 1/y - 1 (Jacobian 1/y**2).
    """
    return aux_d((1 - y) / y) / y**2

In [11]:
# CI(90%) ~> Z_(alpha/2) = 1.64
# CI(95%) ~> Z_(alpha/2) = 1.96
# CI(98%) ~> Z_(alpha/2) = 2.33
# CI(99%) ~> Z_(alpha/2) = 2.58

In [12]:
# Monte Carlo estimate of the mean of fun(U), U ~ Uniform(0, 1), sampled until
# the normal-approximation confidence interval has width at most L.
random.seed(seed)

fun = fun_c # fun_c, fun_d
Z = 1.96    # Z := Z_(alpha/2) => P(|Z| <= 1.96) = 0.95
L = 0.001   # 2 * Z_(alpha/2) * sigma / sqrt(n) <= L 

###################################################################

# Interval width 2 * Z * sqrt(S^2 / n) <= L  <=>  sqrt(S^2 / n) <= L / (2 * Z).
tol = L / (2 * Z)

n = 1
U = random.random()
f = fun(U)

mean = f
variance = 0

MSE = variance / n

while (n < 100) or (math.sqrt(MSE) > tol):

    n = n + 1
    U = random.random()
    f = fun(U)

    # One-pass recursive update of the sample mean and the sample variance.
    prev = mean
    mean = prev + (f - prev) / n
    variance = (n - 2) / (n - 1) * variance + n * (mean - prev)**2 

    # Estimated variance of the sample mean with n draws.
    MSE = variance / n

# Interval: mean -/+ Z * estimated standard error.
lower_limit = mean - Z * math.sqrt(variance / n)
upper_limit = mean + Z * math.sqrt(variance / n)

###################################################################

print(f'- Nsim = {n}\nI = {mean}\nS = {math.sqrt(variance)}\nCI(95%) = ({lower_limit}, {upper_limit})')

- Nsim = 681318
I = -0.4338076518672658
S = 0.21056621047634985
CI(95%) = (-0.4343076516564167, -0.43330765207811484)


In [13]:
def MonteCarlo_integration_v2(fun, Z, L, n_min=100):
    """Estimate the mean of ``fun(U)``, U ~ Uniform(0, 1), to a target interval width.

    Values ``fun(random.random())`` are generated one at a time while a running
    sample mean and sample variance are updated recursively. Sampling stops as
    soon as at least ``n_min`` values were generated and the interval
    ``mean -/+ Z * sqrt(variance / n)`` has width at most ``L``.

    Parameters
    ----------
    fun : callable
        Function of one float; it is called on draws from ``random.random()``.
    Z : float
        Quantile multiplying the standard error in the interval
        (e.g. 1.96 for 95%). Must be non-zero.
    L : float
        Maximum total width allowed for the interval.
    n_min : int, optional
        Minimum number of draws before the stopping rule may trigger
        (default 100, the value that used to be hard-coded). Because the
        variance starts at 0, values below 2 allow stopping after one draw.

    Returns
    -------
    tuple
        ``(n, mean, variance)``: number of draws, sample mean and sample
        variance of the generated ``fun`` values. The interval itself is left
        to the caller.
    """
    # Width 2 * Z * sqrt(variance / n) <= L  <=>  sqrt(variance / n) <= L / (2 * Z).
    tol = L / (2 * Z)

    n = 1
    mean = fun(random.random())
    variance = 0

    # variance / n is the estimated variance of the sample mean.
    while (n < n_min) or (math.sqrt(variance / n) > tol):

        n = n + 1
        f = fun(random.random())

        # One-pass recursive update of the sample mean and sample variance.
        prev = mean
        mean = prev + (f - prev) / n
        variance = (n - 2) / (n - 1) * variance + n * (mean - prev)**2

    return n, mean, variance

# test

random.seed(seed)

# Z is set in this cell instead of being inherited from an earlier one, so the
# interval below always uses the same quantile that is passed to the estimator.
Z = 1.96

n, mean, variance = MonteCarlo_integration_v2(fun_c, Z=Z, L=0.001)
lower_limit = mean - Z * math.sqrt(variance / n)
upper_limit = mean + Z * math.sqrt(variance / n)
print(f'- Nsim = {n}\nI = {mean}\nS = {math.sqrt(variance)}\nCI(95%) = ({lower_limit}, {upper_limit})')

- Nsim = 681318
I = -0.4338076518672658
S = 0.21056621047634985
CI(95%) = (-0.4343076516564167, -0.43330765207811484)


In [14]:
# exercise 4

In [15]:
# exercise 5

In [16]:
# exercise 6

In [17]:
# Estimate pi from the fraction p of uniform points in the square (-1, 1)^2
# that fall inside the unit circle; the printed estimate is 4 * p.
random.seed(seed)

tol = 0.01

# NOTE(review): tol bounds the standard error of p; the printed estimate 4 * p
# has four times that standard error -- confirm which one the exercise constrains.

####################################################

n = 0
p = 0

# `n < 100` is evaluated first, so the division by n is never reached with n == 0.
while n < 100 or math.sqrt((p * (1 - p)) / n) > tol:

    n = n + 1

    # Uniform point in the square (-1, 1) x (-1, 1).
    U = 2 * random.random() - 1
    V = 2 * random.random() - 1

    # Indicator of the point lying inside the unit circle.
    if U**2 + V**2 < 1:
        X = 1
    else:
        X = 0

    # Running mean of the indicators.
    p = p + (X - p) / n

####################################################

print(f'- Nsim = {n}\npi = {4 * p}')

- Nsim = 1683
pi = 3.144385026737968


In [18]:
# Same estimator of pi as the previous cell, now sampled until the
# normal-approximation interval for p has width at most L.
random.seed(seed)

Z = 1.96    # Z_(alpha/2) => P(|Z| <= 1.96) = 0.95
L = 0.01    # 2 * Z_(alpha/2) * sigma / sqrt(n) <= L

# NOTE(review): L bounds the width of the interval for p; the printed interval
# for pi is that interval multiplied by 4, hence up to 4 * L wide -- confirm
# which width the exercise asks for.

####################################################

# Width 2 * Z * se <= L  <=>  se <= L / (2 * Z).
tol = L / (2 * Z)

n = 0
p = 0

# `n < 100` is evaluated first, so the division by n is never reached with n == 0.
while n < 100 or math.sqrt((p * (1 - p)) / n) > tol:

    n = n + 1

    # Uniform point in the square (-1, 1) x (-1, 1).
    U = 2 * random.random() - 1
    V = 2 * random.random() - 1

    # Indicator of the point lying inside the unit circle.
    if U**2 + V**2 < 1:
        X = 1
    else:
        X = 0

    # Running mean of the indicators.
    p = p + (X - p) / n

# Interval for p: p -/+ Z * sqrt(p * (1 - p) / n).
lower_limit = p - Z * math.sqrt((p * (1 - p)) / n)
upper_limit = p + Z * math.sqrt((p * (1 - p)) / n)

####################################################

print(f'- Nsim = {n}\npi = {4 * p}\nIC(95%) = ({4 * lower_limit}, {4 * upper_limit})')

- Nsim = 25963
pi = 3.138774409736924
IC(95%) = (3.1187750013294018, 3.1587738181444465)


### **Statistical Validation Techniques**

In [19]:
# exercise 7

In [20]:
# H0: Mendel's genetic theory holds (class probabilities given by p_vec)
# H1: Mendel's genetic theory does NOT hold
#
# Statistic: T = sum_{i=1}^{k} (N_i - n * p_i)^2 / (n * p_i),
#            approximately Chi-Squared(k - 1) under H0

# Observed class counts and the class probabilities under H0.
freq_obs = np.array([141, 291, 132])
p_vec = np.array([0.25, 0.50, 0.25])

# Sample size and the expected counts n * p_i.
n_obs = 564
freq_exp = p_vec * n_obs

# Number of classes and observed value of the statistic.
k = freq_obs.size
t_obs = sum(np.square(freq_obs - freq_exp) / freq_exp)

print(f't_obs = {t_obs}')

t_obs = 0.8617021276595745


In [21]:
# Pearson's Chi-Squared Test: p-value taken as the upper-tail probability of a
# chi-squared distribution with k - 1 degrees of freedom at the observed statistic.

p_value = scipy.stats.chi2(k - 1).sf(t_obs)
print(f'-Pearson\'s Chi-Squared Test\np-value = {p_value}')

-Pearson's Chi-Squared Test
p-value = 0.6499557054800363


In [22]:
np.random.seed(seed)

# Simulation: estimate the p-value as the fraction of simulated count vectors
# (drawn under H0) whose statistic exceeds the observed one.

Nsim = 100000

p_value = 0
for _ in range(Nsim):

    freq_sim = np.zeros(k, dtype=int)

    # The counts are drawn class by class: the first is Binomial(n_obs, p_1);
    # each following one is binomial over the observations not yet assigned,
    # with its probability renormalised by the remaining probability mass.
    freq_sim[0] = scipy.stats.binom.rvs(n_obs, p_vec[0])
    for i in range(1, k-1):
        n = n_obs - sum(freq_sim[:i])
        p = p_vec[i] / (1 - sum(p_vec[:i]))
        freq_sim[i] = scipy.stats.binom.rvs(n, p)
    # The last class takes whatever is left so the counts add up to n_obs.
    freq_sim[-1] = n_obs - sum(freq_sim[:-1])

    t_sim = sum((freq_sim - freq_exp)**2 / freq_exp)
    # NOTE(review): the comparison is strict, so simulated ties with t_obs are
    # not counted; the statistic is discrete here -- confirm `>` vs `>=`.
    if t_sim > t_obs:
        p_value = p_value + 1

p_value = p_value / Nsim
print(f'- Simulation\np-value = {p_value}')

- Simulation
p-value = 0.64231


In [23]:
# exercise 8

In [24]:
# H0: the die is fair (D ~ U{1, 6})
# H1: the die is NOT fair

# Observed face counts and the face probabilities under H0.
freq_obs = np.array([158, 172, 164, 181, 160, 165])
p_vec = np.full(6, 1 / 6)

# Number of rolls and the expected counts n * p_i.
n_obs = 1000
freq_exp = p_vec * n_obs

# Number of classes and observed value of the chi-squared statistic.
k = freq_obs.size
t_obs = sum(np.square(freq_obs - freq_exp) / freq_exp)

print(f't_obs = {t_obs}')

t_obs = 2.18


In [25]:
# Pearson's Chi-Squared Test: p-value taken as the upper-tail probability of a
# chi-squared distribution with k - 1 degrees of freedom at the observed statistic.

p_value = scipy.stats.chi2(k - 1).sf(t_obs)
print(f'-Pearson\'s Chi-Squared Test\np-value = {p_value}')

-Pearson's Chi-Squared Test
p-value = 0.8237195392577814


In [26]:
np.random.seed(seed)

# Simulation: estimate the p-value as the fraction of simulated count vectors
# (drawn under H0) whose statistic exceeds the observed one.

Nsim = 100000

p_value = 0
for _ in range(Nsim):

    freq_sim = np.zeros(k, dtype=int)

    # The counts are drawn class by class: the first is Binomial(n_obs, p_1);
    # each following one is binomial over the observations not yet assigned,
    # with its probability renormalised by the remaining probability mass.
    freq_sim[0] = scipy.stats.binom.rvs(n_obs, p_vec[0])
    for i in range(1, k-1):
        n = n_obs - sum(freq_sim[:i])
        p = p_vec[i] / (1 - sum(p_vec[:i]))
        freq_sim[i] = scipy.stats.binom.rvs(n, p)
    # The last class takes whatever is left so the counts add up to n_obs.
    freq_sim[-1] = n_obs - sum(freq_sim[:-1])

    t_sim = sum((freq_sim - freq_exp)**2 / freq_exp)
    # NOTE(review): the comparison is strict, so simulated ties with t_obs are
    # not counted; the statistic is discrete here -- confirm `>` vs `>=`.
    if t_sim > t_obs:
        p_value = p_value + 1

p_value = p_value / Nsim
print(f'- Simulation\np-value = {p_value}')

- Simulation
p-value = 0.82249


In [27]:
# exercise 9

In [28]:
def Kolmogorov_Smirnov(data, F):
    """Return the Kolmogorov-Smirnov statistic of ``data`` against the CDF ``F``.

    With X_(1) <= ... <= X_(n) the sorted sample, the value returned is

        D = max_j max( j/n - F(X_(j)),  F(X_(j)) - (j - 1)/n ),   j = 1..n,

    i.e. the largest gap between the empirical CDF and ``F``, checked just
    after and just before each jump of the empirical CDF.

    Parameters
    ----------
    data : sequence of numbers
        The sample. Any sized iterable works (list, tuple, NumPy array); it is
        not modified because ``sorted`` builds a new list.
    F : callable
        Hypothesised CDF, called once per sample point with a single number.

    Returns
    -------
    number
        The statistic D; 0 for an empty sample.
    """
    n = len(data)

    d = 0
    # enumerate is 0-based, so j here plays the role of (j - 1) in the formula.
    for j, x in enumerate(sorted(data)):
        Fx = F(x)  # evaluated once and reused for both one-sided gaps
        d = max(d, (j + 1) / n - Fx, Fx - j / n)

    return d

In [29]:
# CDF of the Uniform(0, 1) distribution: 0 up to 0, x on (0, 1), and 1 from 1 on
# (the upper branch used to return 0, which is not a valid CDF value there).
G = lambda x: 0 if x <= 0 else (1 if x >= 1 else x)

In [30]:
# H0: the following 10 numbers are random (tested below against the CDF G)
# H1: the following 10 numbers are NOT random

# D = sup |Fe(x) - F(x)|
#   = max {max(j/n - F(X_(j))), max(F(X_(j)) - (j-1)/n)} , j = 1,...,n

data = np.array([0.12, 0.18, 0.06, 0.33, 0.72, 0.83, 0.36, 0.27, 0.77, 0.74])

# Sample size (reused by the simulation cell) and observed KS statistic.
n = len(data)
d_obs = Kolmogorov_Smirnov(data, G)
d_obs

0.24

In [31]:
np.random.seed(seed)

# Simulation: estimate the p-value as the fraction of uniform samples of size n
# whose KS statistic against G exceeds the observed one.

Nsim = 100000

p_value = 0
for _ in range(Nsim):

    # n draws from np.random.random, compared against the same CDF G.
    sample = np.random.random(n)
    d_sim = Kolmogorov_Smirnov(sample, G)
    if d_sim > d_obs:
        p_value = p_value + 1

p_value = p_value / Nsim
print(f'p-value = {p_value}')

p-value = 0.5325


In [32]:
# exercise 10

In [33]:
# H0: the following 13 values come from an exponential distribution with mean 50
# H1: the following 13 values do NOT come from an exponential distribution

# Exponential CDF with mean 50 (rate 1/50).
F = lambda x: 1 - np.exp(- x / 50)
data = np.array([86, 133, 75, 22, 11, 144, 78, 122, 8, 146, 33, 41, 99], dtype=float)

# Sample size (reused by the simulation cell) and observed KS statistic.
n = len(data)
d_obs = Kolmogorov_Smirnov(data, F)
d_obs

0.3922544552361856

In [34]:
np.random.seed(seed)

# Simulation: estimate the p-value as the fraction of simulated samples of
# size n whose KS statistic exceeds the observed one.
# The replicates are Uniform(0, 1) draws compared against G rather than
# exponential draws compared against F; this relies on the null distribution of
# the KS statistic being the same for any fully specified continuous CDF.

Nsim = 100000

p_value = 0
for _ in range(Nsim):

    sample = np.random.random(n)
    d_sim = Kolmogorov_Smirnov(sample, G)
    if d_sim > d_obs:
        p_value = p_value + 1

p_value = p_value / Nsim
print(f'p-value = {p_value}')

p-value = 0.02573


In [35]:
# exercise 11

In [36]:
# H0: the data come from a binomial distribution with parameters (n = 8, p), p unknown
# H1: the data do NOT come from a binomial distribution

data = np.array([6, 7, 3, 4, 7, 3, 7, 2, 6, 3, 7, 8, 2, 1, 3, 5, 8, 7])
n = 8
# p is estimated from the sample: the mean of Binomial(n, p) is n * p.
p = np.mean(data) / n

n_obs = len(data)
# Probabilities of the n + 1 possible values 0..n under the fitted binomial.
p_vec = np.array([scipy.stats.binom.pmf(i, n, p) for i in range(n + 1)])
# minlength guarantees one count per possible value 0..n even when the largest
# value n is absent from the data; otherwise freq_obs would be shorter than p_vec.
freq_obs = np.bincount(data, minlength=n + 1)
freq_exp = n_obs * p_vec

k = len(freq_obs)
m = 1  # number of parameters estimated from the data (p)
t_obs = sum((freq_obs - freq_exp)**2 / freq_exp)

print(f't_obs = {t_obs}')

t_obs = 31.499330934155324


In [37]:
# Pearson's Chi-Squared Test: p-value taken as the upper-tail probability of a
# chi-squared distribution with k - m - 1 degrees of freedom at the observed
# statistic.

p_value = scipy.stats.chi2(k - m - 1).sf(t_obs)
print(f'-Pearson\'s Chi-Squared Test\np-value = {p_value}')

-Pearson's Chi-Squared Test
p-value = 5.027994320424078e-05


In [38]:
np.random.seed(seed)

# Simulation: estimate the p-value as the fraction of replicates whose
# statistic exceeds the observed one. Each replicate is drawn from the fitted
# Binomial(n, p) and then has its own p re-estimated, as was done for the data.

Nsim = 10000

p_value = 0
for _ in range(Nsim):

    # n_obs draws from Binomial(n, p), with p the estimate from the data.
    sample = [scipy.stats.binom.rvs(n, p) for _ in range(n_obs)]
    # Parameter re-estimated from the simulated sample.
    p_hat = np.mean(sample) / n

    # Class probabilities and expected counts under the re-fitted binomial.
    p_sim = np.array([scipy.stats.binom.pmf(i, n, p_hat) for i in range(n + 1)])
    # Counts of each value 0..n in the simulated sample.
    freq_sim = np.zeros(n + 1, dtype=int)
    for i in sample:
        freq_sim[i] = freq_sim[i] + 1       
    freq_est = n_obs * p_sim

    t_sim = sum((freq_sim - freq_est)**2 / freq_est)
    # NOTE(review): the comparison is strict, so simulated ties with t_obs are
    # not counted; the statistic is discrete here -- confirm `>` vs `>=`.
    if t_sim > t_obs:
        p_value = p_value + 1

p_value = p_value / Nsim
print(f'- Simulation\np-value = {p_value}')

- Simulation
p-value = 0.0101


In [39]:
# exercise 12

In [40]:
# exercise 13

In [41]:
# exercise 14

In [42]:
# exercise 15

In [43]:
# H0: the observations come from an exponential random variable with unknown parameter lambda
# H1: the observations do NOT come from an exponential random variable

data = np.array([1.6, 10.3, 3.5, 13.5, 18.4, 7.7, 24.3, 10.7, 8.4, 4.9, 7.9, 12, 16.2, 6.8, 14.7])

n_obs = len(data)
# Rate estimated as the reciprocal of the sample mean.
l_hat = 1 / np.mean(data)
# Exponential CDF with the estimated rate.
F_hat = lambda x: 1 - np.exp(-l_hat*x)

d_obs = Kolmogorov_Smirnov(data, F_hat)
print(f'd_obs = {d_obs}')

d_obs = 0.26949936321059237


In [44]:
np.random.seed(seed)

# Simulation 1: fraction of Uniform(0, 1) samples of size n_obs whose KS
# statistic against G exceeds d_obs. No parameter is re-estimated per
# replicate here (the next cell does that), so the fitted CDF is treated as
# if it had been fully specified in advance.

Nsim = 10000

p_value = 0
for _ in range(Nsim):

    sample = np.random.random(n_obs)
    d_sim = Kolmogorov_Smirnov(sample, G)
    if d_sim > d_obs:
        p_value = p_value + 1

p_value = p_value / Nsim
print(f'p-value = {p_value}')

p-value = 0.1871


In [45]:
# NOTE(review): only NumPy's global RNG is seeded here; confirm that
# functions.Exp draws from it, otherwise this cell is not reproducible.
np.random.seed(seed)

# Simulation 2: each replicate is generated with the rate estimated from the
# data, then has its own rate re-estimated before its KS statistic is computed.

Nsim = 100000

p_value = 0
for _ in range(Nsim):

    # functions.Exp is a project helper; presumably it returns one exponential
    # variate with rate l_hat -- TODO confirm its parameterisation (rate vs mean).
    sample = [functions.Exp(l_hat) for _ in range(n_obs)]

    # Rate and CDF re-fitted to the simulated sample.
    l_sim = 1 / np.mean(sample)
    F_sim = lambda x: 1 - np.exp(-l_sim*x) 

    d_sim = Kolmogorov_Smirnov(sample, F_sim)
    if d_sim > d_obs:
        p_value = p_value + 1

p_value = p_value / Nsim
print(f'p-value = {p_value}')

p-value = 0.04821


In [46]:
# exercise 16