# Exercise 5.1: Writing functions for bootstrap replicates

In [None]:
import numpy as np
import pandas as pd

import iqplot

import bokeh.io
import bokeh.plotting

bokeh.io.output_notebook()

In [1]:
def draw_bs_reps(data, func, rg, size=1, args=()):
    '''
    Draws multiple bootstrap replicates and returns a numy array. 

    Parameters:
        data (array-like): The input data from which bootstrap replicates are generated
        func (function): The statistic to be computed on each bootstrap sample
            It should have the signature func(data, *args)
        rg (numpy.random.Generator): An instance of a Numpy random number generator
        size (int, optional): The number of replicates to generate (default is 1)
        args (tuple, optional): Additional arguments to be passed to func (default is an empty tuple).

    Returns:
        numpy.ndarray: An array of bootstrap replicates
    '''
    n = len(data)
    replicates = [func(rg.choice(data, size=n), *args) for _ in range(size)]
    return np.array(replicates)

In [None]:
# not completed yet

def draw_bs_pairs(data1, data2, func, rg, size=1, args=()):
    '''
    Draws multiple bootstrap replicates and returns a numy array. 

    Parameters:
        data (array-like): The input data from which bootstrap replicates are generated
        func (function): The statistic to be computed on each bootstrap sample
            It should have the signature func(data1, data2, *args)
        rg (numpy.random.Generator): An instance of a Numpy random number generator
        size (int, optional): The number of replicates to generate (default is 1)
        args (tuple, optional): Additional arguments to be passed to func (default is an empty tuple).

    Returns:
        numpy.ndarray: An array of bootstrap replicates
    '''
    n = len(data)
    replicates = [func(rg.choice(data, size=n), *args) for _ in range(size)]
    return np.array(replicates)

# Exercise 5.2: Hacker stats with bee sperm data

In [2]:
import numpy as np
import pandas as pd

In [5]:
cd ..

/Users/fabienne/git/bootcamp


In [7]:
df_bee_sperm = pd.read_csv('data/bee_sperm.csv', comment='#')
df_bee_sperm.head(20)

Unnamed: 0,Specimen,Treatment,Environment,TreatmentNCSS,Sample ID,Colony,Cage,Sample,Sperm Volume per 500 ul,Quantity,ViabilityRaw (%),Quality,Age (d),Infertil,AliveSperm,Quantity Millions,Alive Sperm Millions,Dead Sperm Millions
0,227,Control,Cage,1,C2-1-1,2,1,1,2150000,2150000,96.7263814616756,96.726381,14,0,2079617,2.15,2.079617,0.070383
1,228,Control,Cage,1,C2-1-2,2,1,2,2287500,2287500,96.3498079760595,96.349808,14,0,2204001,2.2875,2.204001,0.083499
2,229,Control,Cage,1,C2-1-3,2,1,3,87500,87500,98.75,98.75,14,0,86406,0.0875,0.086406,0.001094
3,230,Control,Cage,1,C2-1-4,2,1,4,1875000,1875000,93.2874208336941,93.287421,14,0,1749139,1.875,1.749139,0.125861
4,231,Control,Cage,1,C2-1-5,2,1,5,1587500,1587500,97.7925061050061,97.792506,14,0,1552456,1.5875,1.552456,0.035044
5,232,Control,Cage,1,C2-1-6,2,1,6,2600000,2600000,99.3110435663627,99.311044,14,0,2582087,2.6,2.582087,0.017913
6,233,Control,Cage,1,C2-1-7,2,1,7,3412500,3412500,96.9269279143752,96.926928,14,0,3307631,3.4125,3.307631,0.104869
7,234,Control,Cage,1,C2-1-8,2,1,8,2562500,2562500,90.4013147346481,90.401315,14,0,2316533,2.5625,2.316533,0.245967
8,235,Control,Cage,1,C2-1-9,2,1,9,87500,87500,97.3765432098765,97.376543,14,0,85204,0.0875,0.085204,0.002296
9,236,Control,Cage,1,C2-2-1,2,2,1,1325000,1325000,100.0,100.0,14,0,1325000,1.325,1.325,0.0


In [18]:
df_bee_weight = pd.read_csv('data/bee_weight.csv', comment='#')
df_bee_weight

Unnamed: 0,Specimen,Colony,Cage,Sample-Nr.,Weight,Treatment,TreatmentNCSS
0,1,3,1,1,292.0,Control,1
1,2,3,1,2,296.0,Control,1
2,3,3,1,3,298.0,Control,1
3,4,3,1,4,290.0,Control,1
4,5,3,1,5,304.0,Control,1
...,...,...,...,...,...,...,...
315,316,84,3,6,296.6,Pesticide,2
316,317,84,3,7,280.6,Pesticide,2
317,318,84,3,8,274.8,Pesticide,2
318,319,84,3,9,258.7,Pesticide,2


In [16]:
import numpy as np
import pandas as pd

import iqplot

import bokeh.io
import bokeh.plotting

bokeh.io.output_notebook()
p = iqplot.ecdf(
    data=df_bee_weight,
    q="Weight",
    cats="Treatment",
    style="staircase",
)

bokeh.io.show(p)

In [20]:
# Pull our data sets as Numpy arrays
df_weight_control = df_bee_weight.loc[
    df_bee_weight['Treatment']=='Control', 'Weight'].values
df_weight_pesticide = df_bee_weight.loc[
    df_bee_weight['Treatment']=='Pesticide', 'Weight'].values

# Compute the means
np.mean(df_weight_control), np.mean(df_weight_pesticide)

(277.0563, 278.27333333333337)

In [24]:
rng = np.random.default_rng(3252)

bs_sample = rng.choice(df_weight_control, replace=True, size=len(df_weight_control))

In [26]:
# Original data set
p = iqplot.ecdf(
    data=df_weight_control.loc[df_weight_control['Treatment']=='Control', :],
    q='Weight',
)

# Bootstrap data set
p = iqplot.ecdf(
    data=bs_sample,
    q='bootstrap',
    cats=None,
    p=p,
    marker_kwargs=dict(
        fill_color=None,
        line_color='gray'
    ),
)

bokeh.io.show(p)

AttributeError: 'numpy.ndarray' object has no attribute 'loc'