In [2]:
# Common libraries
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
import seaborn as sns

# Chapter-specific libraries
import statsmodels.stats.proportion as ssprop # To calculate the standardized effect size
import statsmodels.stats.power as ssp #To calculate the standard power

In [6]:
# Required sample size for a one-sided, two-sample t-test.
# Standardized effect size between the desired booking proportion (19.25%)
# and the observed booking rate (18.25%).
effect_size = ssprop.proportion_effectsize(0.1925, 0.1825)
ssp.tt_ind_solve_power(
    effect_size=effect_size,  # difference of the two means over the standard deviation
    nobs1=None,               # number of observations: left as None, the one unspecified input
    alpha=0.05,               # significance level: chance of rejecting a true null
    power=0.8,                # 1 - P(type II error): chance of rejecting a false null
    alternative='larger',     # one-sided (right-tailed) test
)



18835.143119883356

In [7]:
# Sample size in a regression setting: fit the analysis model on historical
# rows that carry a randomly assigned 'oneclick' flag.
hist_data_df = pd.read_csv(
    r'/home/beastman/Projects/HomeSchool/data-analysis/BehavioralDataAnalysis/Chapter 8 - Experimental Design - the Basics/chap8-historical_data.csv'
)

# DataFrame.sample already returns a new frame, so no explicit copy is needed.
exp_null_data_df = hist_data_df.sample(n=2000)
# Coin-flip assignment: 1 when the uniform draw exceeds 0.5, otherwise 0.
exp_null_data_df['oneclick'] = (
    np.random.uniform(0, 1, len(exp_null_data_df)) > 0.5
).astype(int)
mod = smf.logit('booked ~ oneclick + age + gender', data=exp_null_data_df)
mod.fit(disp=0).summary()

0,1,2,3
Dep. Variable:,booked,No. Observations:,2000.0
Model:,Logit,Df Residuals:,1996.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 18 Nov 2023",Pseudo R-squ.:,0.265
Time:,11:29:27,Log-Likelihood:,-686.19
converged:,True,LL-Null:,-933.63
Covariance Type:,nonrobust,LLR p-value:,6.152999999999999e-107

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,9.9210,0.638,15.539,0.000,8.670,11.172
gender[T.male],0.1345,0.138,0.977,0.328,-0.135,0.404
oneclick,-0.0543,0.137,-0.396,0.692,-0.323,0.215
age,-0.3091,0.018,-17.644,0.000,-0.343,-0.275


In [11]:
# In the null-data regression above, the oneclick coefficient is not significant
# (p = 0.692 > 0.1 in the run shown), as expected for a randomly assigned flag.

def log_reg_fun(df):
    """Fit the booking logit on ``df`` and return the 'oneclick' coefficient.

    The model is ``booked ~ oneclick + age + gender``; fitting output is
    silenced with ``disp=0``.
    """
    fitted = smf.logit('booked ~ oneclick + age + gender', data=df).fit(disp=0)
    return fitted.params['oneclick']

def boot_CI_fun(df, metric_fun, B=100, conf_level=0.9):
    """Percentile-bootstrap confidence interval for a metric computed on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to resample (with replacement, same number of rows as ``df``).
    metric_fun : callable
        Called once per bootstrap sample as ``metric_fun(sample_df)``; must
        return a sortable scalar.
    B : int
        Number of bootstrap samples.
    conf_level : float
        Confidence level of the interval, in (0, 1].

    Returns
    -------
    list
        ``[lower_bound, upper_bound]`` taken from the sorted bootstrap values.

    Raises
    ------
    ValueError
        If ``B`` is smaller than 1 or ``conf_level`` is outside (0, 1].
    """
    if B < 1:
        raise ValueError("B must be at least 1")
    if not 0 < conf_level <= 1:
        raise ValueError("conf_level must be in (0, 1]")

    N = len(df)
    coeffs = sorted(metric_fun(df.sample(n=N, replace=True)) for _ in range(B))

    # Trim the same number of draws from each tail. Indexing the upper bound as
    # B - 1 - tail (rather than -tail) keeps the interval symmetric and still
    # works when tail == 0, where -0 would point back at the smallest draw.
    # The clamp keeps the lower index from overtaking the upper one.
    tail = min(round(B * (1 - conf_level) / 2), (B - 1) // 2)
    return [coeffs[tail], coeffs[B - 1 - tail]]

def decision_fun(df, metric_fun, B = 100, conf_level=0.9):
    """Return 1 when the bootstrap interval lies strictly above zero, else 0.

    Only the lower bound returned by ``boot_CI_fun`` is inspected.
    """
    lower_bound = boot_CI_fun(df, metric_fun, B, conf_level)[0]
    return int(lower_bound > 0)

def single_sim_fun(Nexp, df, metric_fun, eff_size, B = 100, conf_level = 0.9):
    """Simulate one experiment of size ``Nexp`` and return the 0/1 decision.

    Steps, in order:
      1. Fit ``booked ~ age + gender + period`` on ``df`` and store the fitted
         booking probability of every row.
      2. Draw ``Nexp`` rows and assign each to control (0) or treatment (1)
         with a uniform draw.
      3. Add ``eff_size`` to the probability of treated rows and simulate the
         ``booked`` outcome against a second uniform draw.
      4. Pass the simulated data to ``decision_fun``.
    """
    # Baseline booking probability from the historical model.
    baseline_fit = smf.logit('booked ~ age + gender + period', data=df).fit(disp=0)
    sim_df = df.copy()
    sim_df['pred_prob_bkg'] = baseline_fit.predict()

    # Keep only the rows that take part in this simulated experiment.
    sim_df = sim_df.sample(Nexp)

    # Random group assignment: draws at or below 0.5 are control.
    assignment_draws = np.random.uniform(size=Nexp)
    sim_df['oneclick'] = (assignment_draws > 0.5).astype(int)

    # Shift the booking probability of the treatment group by the effect size.
    is_treated = sim_df['oneclick'] == 1
    baseline_prob = sim_df['pred_prob_bkg']
    sim_df['pred_prob_bkg'] = np.where(
        is_treated, baseline_prob + eff_size, baseline_prob
    )

    # A row books when its probability is at least the uniform draw.
    booking_draws = np.random.uniform(size=Nexp)
    sim_df['booked'] = (sim_df['pred_prob_bkg'] >= booking_draws).astype(int)

    # With a real positive effect the decision should be 1.
    return decision_fun(sim_df, metric_fun=metric_fun, B=B, conf_level=conf_level)

def power_sim_fun(df, metric_fun, Nexp, eff_size, Nsim, B = 100, conf_level=0.9):
    """Estimate power as the mean decision over ``Nsim`` simulated experiments.

    Each simulation is one call to ``single_sim_fun`` with the same settings;
    a progress line is printed before every run.
    """
    sim_settings = dict(
        Nexp=Nexp,
        df=df,
        metric_fun=metric_fun,
        eff_size=eff_size,
        B=B,
        conf_level=conf_level,
    )

    decisions = []
    for sim_number in range(Nsim):
        print(f"starting simulation number {sim_number}\n")
        decisions.append(single_sim_fun(**sim_settings))

    return np.mean(decisions)



# Simulated power for 40,000 experimental rows and an effect of 0.01 on the
# booking probability, estimated over 20 simulated experiments.
power_sim_fun(
    df=hist_data_df,
    metric_fun=log_reg_fun,
    Nexp=40_000,
    eff_size=0.01,
    Nsim=20,
)

starting simulation number 0

starting simulation number 1

starting simulation number 2

starting simulation number 3

starting simulation number 4

starting simulation number 5

starting simulation number 6

starting simulation number 7

starting simulation number 8

starting simulation number 9

starting simulation number 10

starting simulation number 11

starting simulation number 12

starting simulation number 13

starting simulation number 14

starting simulation number 15

starting simulation number 16

starting simulation number 17

starting simulation number 18

starting simulation number 19



0.95

In [3]:
import statsmodels.formula.api as smf

# Analysis of the experimental data: logistic regression of booking on age,
# gender and the one-click treatment flag.
exp_data_df = pd.read_csv(
    r'/home/beastman/Projects/HomeSchool/data-analysis/BehavioralDataAnalysis/Chapter 8 - Experimental Design - the Basics/chap8-experimental_data.csv'
)
model = smf.logit(formula='booked ~ age + gender + oneclick', data=exp_data_df)
res = model.fit()  # fitted results, kept in `res`
res.summary()

Optimization terminated successfully.
         Current function value: 0.161220
         Iterations 9


0,1,2,3
Dep. Variable:,booked,No. Observations:,40160.0
Model:,Logit,Df Residuals:,40156.0
Method:,MLE,Df Model:,3.0
Date:,"Sun, 26 Nov 2023",Pseudo R-squ.:,0.3311
Time:,12:47:39,Log-Likelihood:,-6474.6
converged:,True,LL-Null:,-9679.1
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,11.6928,0.226,51.819,0.000,11.251,12.135
gender[T.male],0.2542,0.049,5.182,0.000,0.158,0.350
age,-0.3941,0.006,-61.282,0.000,-0.407,-0.381
oneclick,0.1578,0.047,3.357,0.001,0.066,0.250


In [4]:
def diff_prob_fun(df, reg_model = model):
    """Average change in predicted booking probability from showing the button.

    Every row of ``df`` is scored twice with the same regression, once with
    ``oneclick`` forced to 0 and once forced to 1; the mean of the row-wise
    difference (button minus no button) is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score; must contain the predictors the regression uses other
        than ``oneclick``. The frame is not modified.
    reg_model : statsmodels model or fitted results
        Regression used for the predictions. A fitted results object is used
        as is. An unfitted model (such as the default) is fitted first, so
        pass fitted results when calling this repeatedly to avoid refitting.

    Returns
    -------
    float
        Mean difference in predicted booking probability.
    """
    # Fitted results expose ``params``; anything without it is treated as an
    # unfitted model and fitted quietly before predicting.
    fitted = reg_model if hasattr(reg_model, 'params') else reg_model.fit(disp=0)

    # ``assign`` returns new frames, so ``df`` is left untouched and the result
    # does not depend on the order of the columns in ``df``.
    pred_no_button = fitted.predict(df.assign(oneclick=0))
    pred_button = fitted.predict(df.assign(oneclick=1))

    return (pred_button - pred_no_button).mean()

# Mean predicted booking-probability difference (button vs. no button) over
# the experimental rows.
diff_prob_fun(df=exp_data_df, reg_model=model)

0.007129714313552712