# Bootstrap Sampling

In [5]:
import pandas as pd
import numpy as np

In [3]:
# Load the `stack_overflow.feather` file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will not
# run on another machine as-is — consider a path relative to a configurable data directory.
so_df = pd.read_feather("/Users/codexplore/Developer/repos/ab-testing/data/stack_overflow.feather")

In [None]:
# "converted_comp" column: the salary of data-scientist
# Sample mean of the column: the point estimate that the bootstrap
# confidence interval computed later in this notebook is built around.
so_df["converted_comp"].mean()

np.float64(119574.71738168952)

In [9]:
# Number of bootstrap resamples to draw (passed as n_iterations and shown in the plot title).
N = 5000
def bootstrap_sample(data, col_name, n_iterations, random_state=None):
    """Return bootstrap estimates of the mean of one column.

    Each iteration draws a resample of the column that has the same size as
    the original data (``frac=1``), sampling *with replacement*, and records
    the mean of that resample.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to resample.
    col_name : str
        Name of the column whose mean is bootstrapped.
    n_iterations : int
        Number of bootstrap resamples to draw.
    random_state : int, array-like, numpy.random.RandomState, numpy.random.Generator or None, optional
        Seed or random generator that makes the resampling reproducible.
        With the default ``None`` the global NumPy random state is used,
        exactly as before this parameter existed.

    Returns
    -------
    list
        One resample mean per iteration (``n_iterations`` values).

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``data``.
    """
    # Only one column is ever needed, so select it once instead of
    # resampling every column of the frame on each iteration.
    column = data[col_name]

    # Build ONE generator up front and reuse it: passing the same integer seed
    # to every .sample() call would produce the identical resample each time.
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    return [
        column.sample(frac=1, replace=True, random_state=rng).mean()
        for _ in range(n_iterations)
    ]

# Draw N bootstrap resamples of the "converted_comp" column and keep each resample's mean
bootstrap_means = bootstrap_sample(
    data=so_df,
    col_name="converted_comp",
    n_iterations=N,
)

In [None]:
# 95% percentile bootstrap confidence interval: the 2.5th and 97.5th
# percentiles of the bootstrap means bound the middle 95% of the estimates.
ci_lower, ci_upper = np.percentile(bootstrap_means, [2.5, 97.5])

print(f"95% Confidence Interval: ({ci_lower}, {ci_upper})")

95% Confidence Interval: (108776.53386775762, 130661.09273551525)


In [24]:
import plotly.graph_objects as go

fig = go.Figure()

# Histogram of the bootstrap means
means_histogram = go.Histogram(
    x=bootstrap_means,
    nbinsx=30,
    marker={'color': 'lightblue', 'line': {'color': 'black', 'width': 1}},
    opacity=0.7,
)
fig.add_trace(means_histogram)

# Both confidence-interval bounds are drawn the same way, so describe them once
ci_bounds = [(ci_lower, 'CI Lower'), (ci_upper, 'CI Upper')]

# Dashed red vertical line at each bound
for bound_value, bound_label in ci_bounds:
    fig.add_vline(
        x=bound_value,
        line={'color': 'red', 'dash': 'dash'},
        name=bound_label,
    )

# Arrow annotation showing the numeric value of each bound
for bound_value, _ in ci_bounds:
    fig.add_annotation(
        x=bound_value,
        y=0,
        text=f"{bound_value:.2f}",
        showarrow=True,
        arrowhead=1,
        ax=0,
        ay=-250,
    )

# Titles and bar spacing
fig.update_layout(
    title=f'Distribution of Bootstrap Means (Number of Bootstrap Samples = {N})',
    xaxis_title='Mean Data Scientist Salary',
    yaxis_title='Frequency',
    bargap=0.05,
)

fig.show()


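- The histogram of the bootstrap means is bell-shaped, i.e. the bootstrap distribution of the sample mean is approximately normal. This is what the central limit theorem predicts for the mean of a large sample, even when the underlying salaries themselves are not normally distributed.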