In [1]:
# Common libraries
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
import seaborn as sns

# Chapter-specific libraries
# To rescale numeric variables
from sklearn.preprocessing import MinMaxScaler
# To one-hot encode cat. variables
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the four CSV inputs in one pass; paths are relative to the working directory.
# Only hist_df is used by the cells visible in this part of the notebook.
es, sig, hist_df, exp_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ES_dat.csv',
        'sig_dat.csv',
        'chap10-historical_data.csv',
        'chap10-experimental_data.csv',
    )
)

In [3]:
# Mixed model of call_CSAT on reason and age, with calls grouped by call center
mixed = smf.mixedlm(
    "call_CSAT ~ reason + age",
    data=hist_df,
    groups=hist_df['center_ID'],
)

# Fit the model and print the plain-text summary table
print(mixed.fit().summary())

            Mixed Linear Model Regression Results
Model:              MixedLM Dependent Variable: call_CSAT    
No. Observations:   695205  Method:             REML         
No. Groups:         10      Scale:              1.1217       
Min. group size:    54203   Log-Likelihood:     -1026427.7247
Max. group size:    79250   Converged:          Yes          
Mean group size:    69520.5                                  
-------------------------------------------------------------
                   Coef. Std.Err.    z    P>|z| [0.025 0.975]
-------------------------------------------------------------
Intercept          3.899    0.335  11.641 0.000  3.243  4.556
reason[T.property] 0.199    0.003  74.786 0.000  0.194  0.205
age                0.020    0.000 176.747 0.000  0.020  0.020
Group Var          1.122    0.407                            



In [4]:

# Variance components for the model below.
# Each key names a variance component; its value is the formula that builds the
# design columns for that component. "0+C(rep_ID)" treats rep_ID as categorical
# with no intercept (one indicator column per rep), i.e. a random intercept for
# every rep, nested within the grouping variable passed as `groups`.
vcf = {"rep_ID": "0+C(rep_ID)"}

# Same fixed effects as the previous model, with calls still grouped by call center
mixed2 = smf.mixedlm(
    "call_CSAT ~ reason + age",
    data=hist_df,
    groups=hist_df["center_ID"],
    vc_formula=vcf,
)
print(mixed2.fit().summary())

            Mixed Linear Model Regression Results
Model:             MixedLM  Dependent Variable:  call_CSAT   
No. Observations:  695205   Method:              REML        
No. Groups:        10       Scale:               0.3904      
Min. group size:   54203    Log-Likelihood:      -660498.6462
Max. group size:   79250    Converged:           Yes         
Mean group size:   69520.5                                   
-------------------------------------------------------------
                   Coef. Std.Err.    z    P>|z| [0.025 0.975]
-------------------------------------------------------------
Intercept          3.874    0.099  38.992 0.000  3.679  4.069
reason[T.property] 0.200    0.002 126.789 0.000  0.196  0.203
age                0.020    0.000 298.301 0.000  0.020  0.020
rep_ID Var         1.904    0.303                            



In [7]:
# Aggregate the call-level history to one row per call center
center_data_df = (
    hist_df
    .groupby('center_ID')
    .agg(
        # Number of distinct reps in the center
        nreps=('rep_ID', 'nunique'),
        avg_call_CSAT=('call_CSAT', 'mean'),
        avg_age=('age', 'mean'),
        # Share of calls whose reason is 'payment' (vectorised comparison,
        # no Python-level loop over individual calls)
        pct_reason_pmt=('reason', lambda reasons: (reasons == 'payment').mean()),
    )
    # strat_prep_fun only keeps float64 columns, so the integer rep count
    # has to be cast to float to take part in the matching
    .astype({'nreps': float})
)

center_data_df

Unnamed: 0_level_0,nreps,avg_call_CSAT,avg_age,pct_reason_pmt
center_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,18.0,3.66443,39.96288,0.601027
2,21.0,3.958169,39.959532,0.599237
3,22.0,4.030376,39.98183,0.599508
4,15.0,5.296561,40.063354,0.59969
5,21.0,5.921405,39.977681,0.600679
6,21.0,4.750132,39.991947,0.59882
7,19.0,4.442338,40.008159,0.599171
8,19.0,4.138269,40.004682,0.597199
9,19.0,4.060705,40.033742,0.59722
10,18.0,7.528956,39.978467,0.598759


In [9]:
### Function to prep the data
def strat_prep_fun(dat_df):
    """Split a frame into its row labels and a min-max scaled numeric matrix.

    Parameters
    ----------
    dat_df : pandas.DataFrame
        One row per unit to be matched; the index holds the unit IDs.

    Returns
    -------
    center_ID : list
        The index labels of dat_df, in row order.
    num_np : numpy.ndarray
        The float64 columns of dat_df, each rescaled with MinMaxScaler.
        Columns of any other dtype are left out.
    """
    # Row labels identify the units
    center_ID = list(dat_df.index)

    # Keep only the columns stored as float64
    is_float_col = dat_df.dtypes == 'float64'
    float_df = dat_df.loc[:, is_float_col]

    # Rescale every retained column to [0,1]
    num_np = MinMaxScaler().fit_transform(float_df)

    return center_ID, num_np
    
def pair_fun(dat_df, K = 2):
    """Greedily group units into sets of K nearest neighbours.

    The units are prepared with strat_prep_fun (IDs + scaled numeric matrix),
    their pairwise Euclidean distances are computed, and then each unit, taken
    in row order, is grouped with its K-1 closest units that are still
    unassigned. The neighbourhood of a unit is widened one neighbour at a time
    until exactly K-1 unassigned units are found in it.

    Parameters
    ----------
    dat_df : pandas.DataFrame
        One row per unit; the index holds the unit IDs.
    K : int, default 2
        Size of each group (2 = pairs). Must be at least 2.

    Returns
    -------
    numpy.ndarray
        One row per group, holding unit IDs. In each row the matched
        neighbours come first and the unit that initiated the group comes
        last. Empty if no complete group can be formed.

    Raises
    ------
    ValueError
        If K is smaller than 2.
    """
    from scipy.spatial import distance_matrix

    if K < 2:
        raise ValueError("K must be at least 2 to form matched groups")

    match_len = K - 1 # Number of matches we want to find
    match_idx = match_len - 1 # Accounting for 0-indexing

    center_ID, data_np = strat_prep_fun(dat_df)
    N = len(data_np)

    # At most N // K complete groups can exist
    lim = N // K
    if lim == 0:
        return np.array([])

    #Calculate distance matrix
    d_mat = distance_matrix(data_np, data_np)
    # A unit must never be matched with itself: make it its own farthest neighbour
    np.fill_diagonal(d_mat, np.inf)

    # Set up variables
    available = set(range(N))
    matches_lst = []

    # After partitioning on kth, the first kth+1 entries of a row are the
    # indices of the kth+1 closest units (in no particular order)
    closest = np.argpartition(d_mat, kth=match_idx, axis=1)

    for n in range(N):
        if len(matches_lst) == lim: break
        if n not in available:
            continue
        # match_lim goes up to N-1 so that every other unit, including the
        # farthest one, can be considered as a candidate
        for match_lim in range(match_idx, N):
            possible_matches = closest[n,:match_lim].tolist()
            matches = sorted(available.intersection(possible_matches))
            if len(matches) == match_len:
                matches.append(n)
                matches_lst.append(matches)
                available.difference_update(matches)
                break
            else:
                # Widen the neighbourhood by one unit for the next iteration
                closest[n,:] = np.argpartition(d_mat[n,:], kth=match_lim)

    #Map center indices to their proper IDs (all K members of each group)
    matches_id_lst = [[center_ID[i] for i in group] for group in matches_lst]
    return np.array(matches_id_lst)
# Pair up the call centers (groups of K = 2) on their aggregated characteristics
pair_fun(center_data_df, K = 2)

array([[ 2,  1],
       [ 6,  3],
       [ 7,  4],
       [10,  5],
       [ 9,  8]])

In [None]:
def hlm_metric_fun(dat_df):
    """Fit the hierarchical model and return the coefficient of the group term.

    The model regresses call_CSAT on reason, age and group, with a random
    intercept per center_ID and a rep_ID variance component.

    Parameters
    ----------
    dat_df : pandas.DataFrame
        Must contain the columns call_CSAT, reason, age, group, center_ID
        and rep_ID.

    Returns
    -------
    The fixed-effect estimate of the first term generated by `group`
    (e.g. 'group[T.treat]' when group takes the values 'ctrl' / 'treat').

    Raises
    ------
    ValueError
        If the fitted model has no fixed effect for `group`.
    """
    vcf = {"rep_ID": "0+C(rep_ID)"}
    h_mod = smf.mixedlm("call_CSAT ~ reason + age + group",
                        data = dat_df,
                        groups = dat_df["center_ID"],
                        re_formula = '1',
                        vc_formula=vcf)
    fe_params = h_mod.fit().fe_params

    # Select the coefficient by term name rather than by position: the position
    # depends on how the design matrix orders its columns and on the number of
    # levels of the other categorical predictors
    group_terms = [term for term in fe_params.index
                   if term == 'group' or term.startswith('group[')]
    if not group_terms:
        raise ValueError("no fixed effect for 'group' in the fitted model")
    coeff = fe_params[group_terms[0]]
    return coeff

##### Simulation function #####
def power_sim_fun(dat_df, metric_fun = hlm_metric_fun, Ncalls_rep = 1000, eff_size = 1, B = 20, conf_level = 0.9):
    """Simulate the experiment for every month and every within-pair assignment.

    For each month in dat_df, Ncalls_rep calls are drawn with replacement for
    every rep. Then, for each of the 2 ** Npairs ways of treating exactly one
    unit per pair, the treated calls get eff_size added to call_CSAT (capped
    at 10) and decision_fun is evaluated on the simulated data.

    Parameters
    ----------
    dat_df : pandas.DataFrame
        Call-level data with the columns month, rep_ID, center_ID and call_CSAT.
    metric_fun : callable
        Passed through to decision_fun.
    Ncalls_rep : int
        Number of calls sampled per rep within a month.
    eff_size : number
        Amount added to call_CSAT for calls in treated centers.
    B, conf_level
        Passed through to decision_fun.

    Returns
    -------
    list
        The values returned by decision_fun, one per (month, assignment).
    """
    # NOTE(review): stratified_assgnt_fun and decision_fun are not defined in
    # the cells shown here; the result of stratified_assgnt_fun is indexed as a
    # 2-D array with one row per pair -- confirm against their definitions.
    stratified_pairs = stratified_assgnt_fun(dat_df, K=2)
    Npairs = len(stratified_pairs)
    Nperm = 2 ** Npairs
    power_list = []

    for m in dat_df.month.unique():
        # Restrict to the month, then resample a fixed number of calls per rep
        month_df = dat_df.loc[dat_df.month == m]
        sample_data_df = (
            month_df
            .groupby('rep_ID')
            .sample(n=Ncalls_rep, replace=True)
            .reset_index(drop=True)
        )

        for perm in range(Nperm):
            # One binary digit per pair: digit i picks the treated member of pair i
            bin_str = f'{perm:0{Npairs}b}'
            picks = [int(d) for d in bin_str]
            treat = [stratified_pairs[i, picks[i]] for i in range(Npairs)]

            # Label every call by the assignment of its center
            sim_data_df = sample_data_df.copy()
            is_treated = sim_data_df.center_ID.isin(treat)
            sim_data_df['group'] = 'ctrl'
            sim_data_df.loc[is_treated, 'group'] = 'treat'

            # Apply the simulated effect, then cap the score at 10
            sim_data_df.loc[is_treated, 'call_CSAT'] += eff_size
            above_cap = sim_data_df.call_CSAT > 10
            sim_data_df.loc[above_cap, 'call_CSAT'] = 10

            # Alternative (for visualization): collect
            # boot_CI_fun(sim_data_df, lm_metric_fun) instead of the decision.
            D = decision_fun(sim_data_df, metric_fun, B = B, conf_level = conf_level)
            power_list.append(D)

    return power_list

In [None]:
# Excerpt of the assignment loop inside power_sim_fun, shown on its own.
# NOTE(review): Nperm, Npairs, stratified_pairs and sample_data_df exist only as
# locals of power_sim_fun in the cells above, so this cell cannot run on a fresh
# kernel unless they are defined at top level first.
for perm in range(Nperm):
    # Binary representation of perm, zero-padded to one digit per pair
    bin_str = f'{perm:0{Npairs}b}'
    # Column 0: pair number; column 1: which member of that pair is treated
    idx = np.array([[i for i in range(Npairs)],
                    [int(d) for d in bin_str]]).T
    treat = [stratified_pairs[tuple(idx[i])] for i in range(Npairs)]
    sim_data_df = sample_data_df.copy()
    sim_data_df['group'] = 'ctrl'
    sim_data_df.loc[(sim_data_df.center_ID.isin(treat)), 'group'] = 'treat'