In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import uniform, norm
from sklearn.linear_model import LinearRegression

from Tool_Functions import *
from Tool_Functions import data_gen_process


In [2]:
# Unpack the 15 values returned by data_gen_process() (imported from Tool_Functions)
# into notebook-level globals.
# NOTE(review): the meaning, shape and dtype of each value are defined in
# Tool_Functions and are not visible here — confirm there. Of these names, the
# functions defined later in this notebook read `n_test` directly.
(beta1, beta2_values, M, n_train, n_test, seed, epsilon,
           x, x_train, x_test, epsilon_train, epsilon_test,
           liste_y, liste_y_train, liste_y_test)  = data_gen_process()


In [None]:
# models_fit is not defined in this notebook; presumably it comes from the
# star import of Tool_Functions — TODO confirm. It is called without arguments,
# so it likely relies on globals or defaults (verify in Tool_Functions).
coefs_A, coefs_B, erreurs_A, erreurs_B = models_fit()
# Keep only the last entry of each error collection.
# NOTE(review): presumably one array of per-test-observation errors per model —
# confirm against models_fit.
pa = erreurs_A[-1]
pb = erreurs_B[-1]
# Paired difference A - B; the functions below read pa, pb and diff as globals.
diff = pa - pb

In [None]:
def standard_error(errors_A=None, errors_B=None):
    """Return the means and standard errors of the mean for A, B and A - B.

    The standard error uses the sample standard deviation (``ddof=1``), which
    matches the ``n - 1`` degrees of freedom used by the Student-t based
    procedures elsewhere in this notebook. ``np.std`` defaults to ``ddof=0``.

    Parameters
    ----------
    errors_A, errors_B : array-like, optional
        Paired per-observation values for the two models. When both are
        omitted, the notebook globals ``pa``, ``pb``, ``diff`` and ``n_test``
        are used, so existing zero-argument calls keep working.

    Returns
    -------
    tuple
        ``(mean_A, mean_B, mean_diff, se_A, se_B, se_diff)``.

    Raises
    ------
    ValueError
        If only one of the two arrays is given, or if their shapes differ.

    Notes
    -----
    With a single observation the ``ddof=1`` standard deviation is undefined
    and numpy returns ``nan`` (with a RuntimeWarning).
    """
    if errors_A is None and errors_B is None:
        # Backward-compatible path: read the notebook-level globals.
        sample_A, sample_B, sample_diff, n_obs = pa, pb, diff, n_test
    elif errors_A is None or errors_B is None:
        raise ValueError("errors_A and errors_B must be provided together")
    else:
        sample_A = np.asarray(errors_A, dtype=float)
        sample_B = np.asarray(errors_B, dtype=float)
        if sample_A.shape != sample_B.shape:
            raise ValueError("errors_A and errors_B must have the same shape")
        sample_diff = sample_A - sample_B
        n_obs = sample_A.size

    root_n = n_obs ** 0.5

    mean_A = np.mean(sample_A)
    mean_B = np.mean(sample_B)
    mean_diff = np.mean(sample_diff)

    # ddof=1: unbiased sample variance, consistent with df = n - 1.
    se_A = np.std(sample_A, ddof=1) / root_n
    se_B = np.std(sample_B, ddof=1) / root_n
    se_diff = np.std(sample_diff, ddof=1) / root_n

    return mean_A, mean_B, mean_diff, se_A, se_B, se_diff

def standard_error_boot(errors_A=None, errors_B=None, B=250):
    """Bootstrap estimates of the mean and its standard error for A, B and A - B.

    One set of resampling indices is drawn with ``np.random.choice`` and reused
    for the three series, so the A, B and difference replicates stay paired.

    Parameters
    ----------
    errors_A, errors_B : array-like, optional
        Paired per-observation values for the two models. When both are
        omitted, the notebook globals ``pa``, ``pb``, ``diff`` and ``n_test``
        are used, so existing zero-argument calls keep working.
    B : int, default 250
        Number of bootstrap resamples (previously hard-coded).

    Returns
    -------
    tuple
        ``(mean_A_boot, mean_B_boot, mean_diff_boot,
        se_boot_A, se_boot_B, se_boot_diff)`` where each mean is the average of
        the bootstrap means and each standard error is their standard deviation
        with the ``B - 1`` denominator.

    Raises
    ------
    ValueError
        If ``B < 2``, if only one of the two arrays is given, or if their
        shapes differ.
    """
    if B < 2:
        raise ValueError("B must be at least 2 to estimate a standard error")

    if errors_A is None and errors_B is None:
        # Backward-compatible path: read the notebook-level globals.
        sample_A, sample_B, sample_diff, n_obs = pa, pb, diff, n_test
    elif errors_A is None or errors_B is None:
        raise ValueError("errors_A and errors_B must be provided together")
    else:
        sample_A = np.asarray(errors_A, dtype=float)
        sample_B = np.asarray(errors_B, dtype=float)
        if sample_A.shape != sample_B.shape:
            raise ValueError("errors_A and errors_B must have the same shape")
        sample_diff = sample_A - sample_B
        n_obs = sample_A.size

    # Shape (B, n_obs): one row of resampled positions per bootstrap replicate.
    bootstrap_indices = np.random.choice(n_obs, size=(B, n_obs), replace=True)

    def _boot_means(sample):
        # Mean of the sample over each row of resampled indices.
        return [np.mean(sample[indices]) for indices in bootstrap_indices]

    boot_A = _boot_means(sample_A)
    boot_B = _boot_means(sample_B)
    boot_diff = _boot_means(sample_diff)

    mean_A_boot = np.mean(boot_A)
    mean_B_boot = np.mean(boot_B)
    mean_diff_boot = np.mean(boot_diff)

    # ddof=1: standard bootstrap SE estimator (B - 1 denominator).
    se_boot_A = np.std(boot_A, ddof=1)
    se_boot_B = np.std(boot_B, ddof=1)
    se_boot_diff = np.std(boot_diff, ddof=1)

    return mean_A_boot, mean_B_boot, mean_diff_boot, se_boot_A, se_boot_B, se_boot_diff

def confidence_interval_mean(alpha=0.05):
    """Student-t confidence intervals for the means of A, B and A - B.

    The means and standard errors come from ``standard_error()``. The critical
    value is the ``1 - alpha/2`` quantile of the Student-t distribution with
    ``n_test - 1`` degrees of freedom, the same distribution used by
    ``statistical_testing``. It was previously hard-coded to the normal
    quantile 1.96, which is only the large-sample limit at ``alpha = 0.05``.

    Parameters
    ----------
    alpha : float, default 0.05
        Significance level; the intervals have nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple of pandas.Interval
        ``(interval_A, interval_B, interval_diff)``, each closed on both sides.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")

    mean_A, mean_B, mean_diff, se_A, se_B, se_diff = standard_error()

    # n_test is the notebook-level test-sample size.
    t_critical = stats.t.ppf(1 - alpha / 2, df=n_test - 1)

    def _symmetric_interval(center, se):
        # center +/- t_critical * se, closed on both ends.
        half_width = t_critical * se
        return pd.Interval(left=center - half_width, right=center + half_width, closed='both')

    interval_A = _symmetric_interval(mean_A, se_A)
    interval_B = _symmetric_interval(mean_B, se_B)
    interval_diff = _symmetric_interval(mean_diff, se_diff)

    return interval_A, interval_B, interval_diff

def confidence_interval_perc_boot(errors_A=None, errors_B=None, alpha=0.05, B=250):
    """Percentile-bootstrap confidence intervals for the means of A, B and A - B.

    One set of resampling indices is drawn with ``np.random.choice`` and reused
    for the three series. Each interval runs from the ``alpha/2`` to the
    ``1 - alpha/2`` percentile of the bootstrap means.

    Parameters
    ----------
    errors_A, errors_B : array-like, optional
        Paired per-observation values for the two models. When both are
        omitted, the notebook globals ``pa``, ``pb``, ``diff`` and ``n_test``
        are used, so existing zero-argument calls keep working.
    alpha : float, default 0.05
        Significance level (previously hard-coded).
    B : int, default 250
        Number of bootstrap resamples (previously hard-coded).

    Returns
    -------
    tuple of pandas.Interval
        ``(interval_A_boot, interval_B_boot, interval_diff_boot)``, each closed
        on both sides.

    Raises
    ------
    ValueError
        If ``alpha`` is not in (0, 1), if ``B < 1``, if only one of the two
        arrays is given, or if their shapes differ.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if B < 1:
        raise ValueError("B must be a positive integer")

    if errors_A is None and errors_B is None:
        # Backward-compatible path: read the notebook-level globals.
        sample_A, sample_B, sample_diff, n_obs = pa, pb, diff, n_test
    elif errors_A is None or errors_B is None:
        raise ValueError("errors_A and errors_B must be provided together")
    else:
        sample_A = np.asarray(errors_A, dtype=float)
        sample_B = np.asarray(errors_B, dtype=float)
        if sample_A.shape != sample_B.shape:
            raise ValueError("errors_A and errors_B must have the same shape")
        sample_diff = sample_A - sample_B
        n_obs = sample_A.size

    # Shape (B, n_obs): one row of resampled positions per bootstrap replicate.
    bootstrap_indices = np.random.choice(n_obs, size=(B, n_obs), replace=True)
    tail_percentiles = [100 * (alpha / 2), 100 * (1 - alpha / 2)]

    def _percentile_interval(sample):
        # Bootstrap means of `sample`, then their lower/upper tail percentiles.
        boot_means = [np.mean(sample[indices]) for indices in bootstrap_indices]
        lower, upper = np.percentile(boot_means, tail_percentiles)
        return pd.Interval(left=lower, right=upper, closed='both')

    interval_A_boot = _percentile_interval(sample_A)
    interval_B_boot = _percentile_interval(sample_B)
    interval_diff_boot = _percentile_interval(sample_diff)

    return interval_A_boot, interval_B_boot, interval_diff_boot

def confidence_interval_basic_boot(errors_A=None, errors_B=None, alpha=0.05, B=250):
    """Basic (reflected) bootstrap confidence intervals for the means of A, B and A - B.

    One set of resampling indices is drawn with ``np.random.choice`` and reused
    for the three series. With ``q_lo`` and ``q_hi`` the ``alpha/2`` and
    ``1 - alpha/2`` percentiles of the bootstrap means and ``m`` the observed
    sample mean, each interval is ``[2*m - q_hi, 2*m - q_lo]``.

    Parameters
    ----------
    errors_A, errors_B : array-like, optional
        Paired per-observation values for the two models. When both are
        omitted, the notebook globals ``pa``, ``pb``, ``diff`` and ``n_test``
        are used, so existing zero-argument calls keep working.
    alpha : float, default 0.05
        Significance level (previously hard-coded).
    B : int, default 250
        Number of bootstrap resamples (previously hard-coded).

    Returns
    -------
    tuple of pandas.Interval
        ``(interval_A_boot, interval_B_boot, interval_diff_boot)``, each closed
        on both sides.

    Raises
    ------
    ValueError
        If ``alpha`` is not in (0, 1), if ``B < 1``, if only one of the two
        arrays is given, or if their shapes differ.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if B < 1:
        raise ValueError("B must be a positive integer")

    if errors_A is None and errors_B is None:
        # Backward-compatible path: read the notebook-level globals.
        sample_A, sample_B, sample_diff, n_obs = pa, pb, diff, n_test
    elif errors_A is None or errors_B is None:
        raise ValueError("errors_A and errors_B must be provided together")
    else:
        sample_A = np.asarray(errors_A, dtype=float)
        sample_B = np.asarray(errors_B, dtype=float)
        if sample_A.shape != sample_B.shape:
            raise ValueError("errors_A and errors_B must have the same shape")
        sample_diff = sample_A - sample_B
        n_obs = sample_A.size

    # Shape (B, n_obs): one row of resampled positions per bootstrap replicate.
    bootstrap_indices = np.random.choice(n_obs, size=(B, n_obs), replace=True)
    tail_percentiles = [100 * (alpha / 2), 100 * (1 - alpha / 2)]

    def _basic_interval(sample):
        # Reflect the percentile bounds of the bootstrap means around the
        # observed mean: the upper percentile produces the lower bound.
        boot_means = [np.mean(sample[indices]) for indices in bootstrap_indices]
        lower, upper = np.percentile(boot_means, tail_percentiles)
        pivot = 2 * np.mean(sample)
        return pd.Interval(left=pivot - upper, right=pivot - lower, closed='both')

    interval_A_boot = _basic_interval(sample_A)
    interval_B_boot = _basic_interval(sample_B)
    interval_diff_boot = _basic_interval(sample_diff)

    return interval_A_boot, interval_B_boot, interval_diff_boot
    

def statistical_testing():
    """Two-sided test that the mean of ``diff`` is zero, analytic and bootstrap.

    The test statistic is ``mean_diff / se_diff``, computed once from
    ``standard_error()`` and once from ``standard_error_boot()``. Both are
    compared with a Student-t distribution with ``n_test - 1`` degrees of
    freedom at the 5% level.

    Fixes relative to the previous version:

    * The two-sided p-value is now ``2 * P(T > |t|)``; it previously returned
      only one tail, i.e. half the bilateral p-value.
    * The two-sided power now includes both rejection regions
      (``Z > c - t`` and ``Z < -c - t``); it previously ignored the lower
      one, which gave near-zero power for strongly negative statistics.
    * The right-tailed quantities were computed but never returned and have
      been removed.

    Returns
    -------
    tuple
        ``(p_value_bil, p_value_bil_boot, puissance_bil, puissance_bil_boot)``.

    Notes
    -----
    The power is a normal approximation that plugs the observed statistic in
    as the shift, as the original code did. If a standard error is zero, the
    statistic is ``inf`` or ``nan`` and propagates to the outputs.
    """
    _, _, mean_diff, _, _, se_diff = standard_error()
    _, _, mean_diff_boot, _, _, se_diff_boot = standard_error_boot()

    dof = n_test - 1
    # Two-sided 5% critical value of the Student-t distribution.
    student_bil = stats.t.ppf(0.975, df=dof)

    def _two_sided(test_stat):
        # sf(x) = 1 - cdf(x), but more accurate in the far tail.
        p_value = 2 * stats.t.sf(abs(test_stat), df=dof)
        power = (stats.norm.sf(student_bil - test_stat, 0, 1)
                 + stats.norm.cdf(-student_bil - test_stat, 0, 1))
        return p_value, power

    p_value_bil, puissance_bil = _two_sided(mean_diff / se_diff)
    p_value_bil_boot, puissance_bil_boot = _two_sided(mean_diff_boot / se_diff_boot)

    return p_value_bil, p_value_bil_boot, puissance_bil, puissance_bil_boot
