In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Read the three NLS CSV files from ./Data in one unpacking assignment; the
# tuple order matches the order of the target names on the left.
experiment, employee, local_offices = map(
    pd.read_csv,
    (
        './Data/nls_experiment.csv',
        './Data/nls_employees.csv',
        './Data/nls_local_offices.csv',
    ),
)

In [3]:
# Keep only the join key (Office_ID) and the office's name; no other office
# column is used by the cells below.
local_offices = local_offices.loc[:, ['Office_ID', 'Office_Name']]

In [4]:
# Attach Office_Name to each experiment row by joining on Office_ID.
# The default join is inner, so experiment rows whose Office_ID is absent
# from local_offices are dropped.
experiment = pd.merge(experiment, local_offices, on='Office_ID')
experiment.head()

Unnamed: 0,Employee_ID,Office_ID,Course,Intake_Proficiency_Score,Intake_Applications_Score,Outcome_Proficiency_Score,Outcome_Applications_Score,Office_Name
0,900268,1,103,0.235537,0.22998,0.523067,0.675717,Miami Local
1,900861,1,103,0.190878,0.221601,0.740856,0.623202,Miami Local
2,901864,1,103,0.106225,0.236564,0.586822,0.66194,Miami Local
3,904483,1,103,0.29842,0.236571,0.736453,0.584302,Miami Local
4,906274,1,103,0.293189,0.321137,0.672396,0.698116,Miami Local


In [13]:
# Treatment indicators keyed by office name: 1 = treated office, 0 = control
# office. Offices missing from a mapping come back as NaN, i.e. they take no
# part in that comparison. Both comparisons use the same two control offices.
experiment['treatA'] = experiment['Office_Name'].map({
    'Miami Local': 1,
    'Houston Local': 1,
    'New York Local': 0,
    'Los Angeles Local': 0,
})
experiment['treatB'] = experiment['Office_Name'].map({
    'Denver Local': 1,
    'Detroit Local': 1,
    'New York Local': 0,
    'Los Angeles Local': 0,
})

## 1. Proficiency Score

In [45]:
# Proficiency view of the experiment: identifiers, the intake and outcome
# proficiency scores, and both treatment flags.
exp_prof = experiment.loc[:, [
    'Employee_ID',
    'Office_Name',
    'Intake_Proficiency_Score',
    'Outcome_Proficiency_Score',
    'treatA',
    'treatB',
]]

In [46]:
# Reshape from wide (separate intake and outcome columns) to long (one row
# per employee per measurement) and flag outcome rows with After = 1.
exp_prof = (
    exp_prof
    .melt(
        id_vars=['Employee_ID', 'Office_Name', 'treatA', 'treatB'],
        value_vars=['Intake_Proficiency_Score', 'Outcome_Proficiency_Score'],
        var_name='Score_Type',
        value_name='Score',
    )
    # After is 1 when the melted column name contains 'Outcome', otherwise 0
    .assign(After=lambda long_df: long_df['Score_Type']
            .str.contains('Outcome', regex=False)
            .astype('int64'))
    # Score_Type is redundant once After has been derived from it
    .drop(columns=['Score_Type'])
)

exp_prof.head()

Unnamed: 0,Employee_ID,Office_Name,treatA,treatB,Score,After
0,900268,Miami Local,1.0,,0.235537,0
1,900861,Miami Local,1.0,,0.190878,0
2,901864,Miami Local,1.0,,0.106225,0
3,904483,Miami Local,1.0,,0.29842,0
4,906274,Miami Local,1.0,,0.293189,0


### Compare A with Current

In [47]:
# Mean proficiency score by treatment-A group (rows) and period (columns:
# 0 = intake, 1 = outcome), rounded to two decimals.
# Rows, columns and values are all taken from exp_prof: this section must not
# depend on exp_app, which is only created further down in the notebook.
pd.crosstab(exp_prof['treatA'],
            exp_prof['After'],
            values=exp_prof['Score'],
            aggfunc='mean').round(2)

After,0,1
treatA,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.26,0.6
1.0,0.29,0.7


In [37]:
# Difference-in-differences regression for treatment A on proficiency.
# 'treatA * After' expands to both main effects plus the treatA:After
# interaction; the interaction coefficient is the DiD estimate.
# NOTE(review): every employee contributes two rows (intake and outcome), so
# the default nonrobust standard errors may be understated - consider
# clustering by Employee_ID.
DiD_A_Prof = smf.ols(formula='Score ~ treatA * After', data=exp_prof).fit()
print(DiD_A_Prof.summary())

                            OLS Regression Results                            
Dep. Variable:                  Score   R-squared:                       0.902
Model:                            OLS   Adj. R-squared:                  0.902
Method:                 Least Squares   F-statistic:                     2484.
Date:                Sat, 07 Dec 2024   Prob (F-statistic):               0.00
Time:                        16:52:20   Log-Likelihood:                 1116.0
No. Observations:                 814   AIC:                            -2224.
Df Residuals:                     810   BIC:                            -2205.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.2613      0.004     65.334   

### Compare B with Current

In [48]:
# Mean proficiency score by treatment-B group (rows) and period (columns:
# 0 = intake, 1 = outcome), rounded to two decimals.
# DataFrame.round is used instead of the element-wise applymap, which is
# deprecated in recent pandas releases.
pd.crosstab(exp_prof['treatB'],
            exp_prof['After'],
            values=exp_prof['Score'],
            aggfunc='mean').round(2)

After,0,1
treatB,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.26,0.6
1.0,0.3,0.73


In [36]:
# Difference-in-differences regression for treatment B on proficiency.
# 'treatB * After' expands to both main effects plus the treatB:After
# interaction; the interaction coefficient is the DiD estimate.
DiD_B_Prof = smf.ols(formula='Score ~ treatB * After', data=exp_prof).fit()
print(DiD_B_Prof.summary())

                            OLS Regression Results                            
Dep. Variable:                  Score   R-squared:                       0.904
Model:                            OLS   Adj. R-squared:                  0.904
Method:                 Least Squares   F-statistic:                     2648.
Date:                Sat, 07 Dec 2024   Prob (F-statistic):               0.00
Time:                        16:52:11   Log-Likelihood:                 1131.5
No. Observations:                 846   AIC:                            -2255.
Df Residuals:                     842   BIC:                            -2236.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.2613      0.004     63.183   

## 2. Application Score

In [42]:
# Applications view of the experiment: identifiers, the intake and outcome
# applications scores, and both treatment flags.
exp_app = experiment.loc[:, [
    'Employee_ID',
    'Office_Name',
    'Intake_Applications_Score',
    'Outcome_Applications_Score',
    'treatA',
    'treatB',
]]

In [43]:
# Reshape from wide (separate intake and outcome columns) to long (one row
# per employee per measurement) and flag outcome rows with After = 1.
exp_app = (
    exp_app
    .melt(
        id_vars=['Employee_ID', 'Office_Name', 'treatA', 'treatB'],
        value_vars=['Intake_Applications_Score', 'Outcome_Applications_Score'],
        var_name='Score_Type',
        value_name='Score',
    )
    # After is 1 when the melted column name contains 'Outcome', otherwise 0
    .assign(After=lambda long_df: long_df['Score_Type']
            .str.contains('Outcome', regex=False)
            .astype('int64'))
    # Score_Type is redundant once After has been derived from it
    .drop(columns=['Score_Type'])
)

exp_app.head()

Unnamed: 0,Employee_ID,Office_Name,treatA,treatB,Score,After
0,900268,Miami Local,1.0,,0.22998,0
1,900861,Miami Local,1.0,,0.221601,0
2,901864,Miami Local,1.0,,0.236564,0
3,904483,Miami Local,1.0,,0.236571,0
4,906274,Miami Local,1.0,,0.321137,0


### Compare A with Current

In [44]:
# Mean applications score by treatment-A group (rows) and period (columns:
# 0 = intake, 1 = outcome), rounded to two decimals.
# DataFrame.round is used instead of the element-wise applymap, which is
# deprecated in recent pandas releases.
pd.crosstab(exp_app['treatA'],
            exp_app['After'],
            values=exp_app['Score'],
            aggfunc='mean').round(2)

After,0,1
treatA,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.27,0.62
1.0,0.31,0.68


In [35]:
# Difference-in-differences regression for treatment A on applications.
# 'treatA * After' expands to both main effects plus the treatA:After
# interaction; the interaction coefficient is the DiD estimate.
DiD_A_App = smf.ols(formula='Score ~ treatA * After', data=exp_app).fit()
print(DiD_A_App.summary())

                            OLS Regression Results                            
Dep. Variable:                  Score   R-squared:                       0.884
Model:                            OLS   Adj. R-squared:                  0.883
Method:                 Least Squares   F-statistic:                     2054.
Date:                Sat, 07 Dec 2024   Prob (F-statistic):               0.00
Time:                        16:51:58   Log-Likelihood:                 1048.2
No. Observations:                 814   AIC:                            -2088.
Df Residuals:                     810   BIC:                            -2069.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.2661      0.004     61.206   

### Compare B with Current

In [41]:
# Mean applications score by treatment-B group (rows) and period (columns:
# 0 = intake, 1 = outcome), rounded to two decimals.
# DataFrame.round is used instead of the element-wise applymap, which is
# deprecated in recent pandas releases.
pd.crosstab(exp_app['treatB'],
            exp_app['After'],
            values=exp_app['Score'],
            aggfunc='mean').round(2)

After,0,1
treatB,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.27,0.62
1.0,0.31,0.58


In [26]:
# Difference-in-differences regression for treatment B on applications.
# 'treatB * After' expands to both main effects plus the treatB:After
# interaction; the interaction coefficient is the DiD estimate.
DiD_B_App = smf.ols(formula='Score ~ treatB * After', data=exp_app).fit()
print(DiD_B_App.summary())

                            OLS Regression Results                            
Dep. Variable:                  Score   R-squared:                       0.873
Model:                            OLS   Adj. R-squared:                  0.872
Method:                 Least Squares   F-statistic:                     1922.
Date:                Sat, 07 Dec 2024   Prob (F-statistic):               0.00
Time:                        16:44:38   Log-Likelihood:                 1160.4
No. Observations:                 846   AIC:                            -2313.
Df Residuals:                     842   BIC:                            -2294.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.2661      0.004     66.574   