In [2]:
from functools import partial

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
import seaborn as sns
from scipy import stats
from statsmodels.api import OLS
import statsmodels.api as sm

import sklearn.model_selection as skm
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GB, 
    GradientBoostingClassifier as GC)
# NOTE: this rebinds RF from RandomForestRegressor to RandomForestClassifier;
# it must stay after the import above so that RF keeps its current meaning.
from sklearn.ensemble import RandomForestClassifier as RF
from imblearn.over_sampling import RandomOverSampler

from ISLP import load_data
from ISLP.models import ModelSpec as MS
from ISLP.bart import BART

In [19]:
# Load the analysis dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load "data.pkl"
# when it comes from a trusted source.
# Presumably a DataFrame: the callback-gap cell indexes it by the columns
# "callback", "female", "public_facing" and "type_1".."type_4".
data = pd.read_pickle("data.pkl")

In [31]:
# Significance stars
def stars(p):
    """Return the significance marker for a p-value.

    "***" for p < 0.01, "**" for p < 0.05, "*" for p < 0.1, and an empty
    string otherwise (including when no comparison succeeds, e.g. NaN).
    """
    # Cutoffs are checked from strictest to loosest; the first match wins.
    for cutoff, marker in ((0.01, "***"), (0.05, "**"), (0.1, "*")):
        if p < cutoff:
            return marker
    return ""

def callback_gap_row(subset, label):
    """Summarise the female-minus-male callback gap within one slice of the data.

    Parameters
    ----------
    subset : pandas.DataFrame
        Rows to compare; must contain the columns ``female`` and ``callback``.
    label : str
        Text placed in the ``Group`` column of the returned row.

    Returns
    -------
    dict
        One table row: formatted female and male mean callback, the gap
        (female - male) suffixed with significance stars, the standard error
        in parentheses, and the number of observations in the two groups.
    """
    female_cb = subset.loc[subset["female"] == 1, "callback"]
    male_cb = subset.loc[subset["female"] == 0, "callback"]

    gap = female_cb.mean() - male_cb.mean()
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    # `stats` is scipy.stats, imported in the notebook's import cell.
    p_value = stats.ttest_ind(female_cb, male_cb, equal_var=False).pvalue
    # Unpooled standard error of the difference in means.
    std_err = (female_cb.var(ddof=1) / len(female_cb)
               + male_cb.var(ddof=1) / len(male_cb)) ** 0.5

    return {
        "Group": label,
        "Female mean": f"{female_cb.mean():.3f}",
        "Male mean": f"{male_cb.mean():.3f}",
        "Gap (F - M)": f"{gap:.3f}{stars(p_value)}",
        "SE": f"({std_err:.3f})",
        "N": len(female_cb) + len(male_cb),
    }


# (label, subset) pairs, in the order the rows appear in the table.
# By treatment (Female vs Male)
groups = [
    (t.replace("type_", "Treatment "), data[data[t] == 1])
    for t in ["type_1", "type_2", "type_3", "type_4"]
]
# By occupation type, then the full sample
groups += [
    ("Public-facing occupations", data[data["public_facing"] == 1]),
    ("Non-public occupations", data[data["public_facing"] == 0]),
    ("Overall", data),
]

rows = [callback_gap_row(subset, label) for label, subset in groups]

# Combine into summary table
summary = pd.DataFrame(rows)

# Display nicely
print(summary)

# Export to LaTeX
summary.to_latex(
    "callback_gaps.tex",
    index=False,
    caption="Gender Callback Gaps by Treatment and Occupation Type",
    label="tab:callback_gaps",
    escape=False
)


                       Group Female mean Male mean Gap (F - M)       SE     N
0                Treatment 1       0.121     0.095       0.026  (0.016)  1408
1                Treatment 2       0.093     0.094      -0.002  (0.021)   747
2                Treatment 3       0.062     0.081      -0.019  (0.021)   616
3                Treatment 4       0.074     0.048       0.026  (0.021)   522
4  Public-facing occupations       0.138     0.111     0.027**  (0.013)  2498
5     Non-public occupations       0.083     0.057     0.027**  (0.013)  1612
6                    Overall       0.116     0.090    0.026***  (0.009)  4110
