# NOTEBOOK FOR LPM AND LOGIT MODELS AS BASELINE

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
from statsmodels.api import OLS
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from functools import partial
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree, 
                          export_text)
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GB, 
    GradientBoostingClassifier as GC)
from ISLP.bart import BART
import sklearn.model_selection as skm
import seaborn as sns
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

## Read the data

In [2]:
# Load the data that every cell below reads as `df`.
# NOTE(review): unpickling can execute arbitrary code, so only load data.pkl
# when it comes from a trusted source. The path is relative, so the notebook
# has to be started from the directory that contains the file.
df = pd.read_pickle("data.pkl")

In [6]:
## Create interaction terms for LPM table
# Adds the product columns in place on `df`:
#   type_{k}_female_public = type_{k} * female * public_facing   (k = 1..4)
#   type_{k}_female        = type_{k} * female                   (k = 1..4)
#   female_public          = female * public_facing
female = df["female"]
public_facing = df["public_facing"]

for type_idx in (1, 2, 3, 4):
    type_col = df[f"type_{type_idx}"]
    # Three-way product first, then the two-way product, so the new columns
    # are appended to df in the same order as before.
    df[f"type_{type_idx}_female_public"] = type_col * female * public_facing
    df[f"type_{type_idx}_female"] = type_col * female

df["female_public"] = female * public_facing

In [10]:
#### LOGIT ####
# Logit of `callback` on the four triple-interaction columns plus the control
# columns listed below. Prints the statsmodels summary.

# Define triple interactions
triple_interactions = [
    "type_1_female_public",
    "type_2_female_public",
    "type_3_female_public",
    "type_4_female_public",
]

# Controls
control_vars = [
    "ba_quality", "language_skills",
    "exp_highquality", "ma", "certificate",
    'occ_Administrative', 'occ_Biotech and Pharmacy', 'occ_Civil Engineer',
    'occ_Clerical', 'occ_Ecommerce', 'occ_Education',
    'occ_Electrical Engineer', 'occ_Executive Assistant', 'occ_Finance',
    'occ_Food Services Managers', 'occ_Human Resources Payroll',
    'occ_Insurance', 'occ_Maintenance Technician',
    'occ_Marketing and Sales', 'occ_Media and Arts', 'occ_Production',
    'occ_Programmer', 'occ_Retail', 'occ_Social Worker', 'occ_Technology',
]

# Full regressor list: interactions first, then controls. Built from the two
# lists above instead of retyping the interaction names, so the names cannot
# drift apart. `X_vars` is also read by the OLS cell that follows.
X_vars = triple_interactions + control_vars

# Outcome
y = df["callback"]

# Design matrix; add_constant supplies the intercept column, which
# sm.Logit does not add on its own.
X = sm.add_constant(df[X_vars])

# Run model
logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())


Optimization terminated successfully.
         Current function value: 0.312766
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               callback   No. Observations:                12910
Model:                          Logit   Df Residuals:                    12880
Method:                           MLE   Df Model:                           29
Date:                Sun, 30 Nov 2025   Pseudo R-squ.:                 0.03536
Time:                        21:17:14   Log-Likelihood:                -4037.8
converged:                       True   LL-Null:                       -4185.8
Covariance Type:            nonrobust   LLR p-value:                 4.931e-46
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                          -3.0048      0.230    -13.083      0.000   

In [11]:
#### OLS MODEL ####
# Linear probability model: OLS of `callback` on the columns named in
# `X_vars` plus an intercept. This cell expects `X_vars` to still hold the
# triple-interaction regressor list.
# NOTE(review): the fit uses the default (nonrobust) covariance. `callback`
# looks binary (it is also modelled with Logit), in which case the errors are
# heteroskedastic -- consider fit(cov_type="HC1") and confirm with the authors.

y = df["callback"]               # outcome
X = sm.add_constant(df[X_vars])  # regressors + intercept column

ols_model = sm.OLS(y, X).fit()
print(ols_model.summary())


                            OLS Regression Results                            
Dep. Variable:               callback   R-squared:                       0.025
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     11.39
Date:                Sun, 30 Nov 2025   Prob (F-statistic):           4.43e-52
Time:                        21:21:09   Log-Likelihood:                -2589.5
No. Observations:               12910   AIC:                             5239.
Df Residuals:                   12880   BIC:                             5463.
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

In [13]:
#### EXPORT AS LATEX TABLE ####
# Print a LaTeX table (coefficient, standard error, p-value) for the four
# triple-interaction terms of the fitted OLS model.
triple_interactions = [
    "type_1_female_public",
    "type_2_female_public",
    "type_3_female_public",
    "type_4_female_public"
]

# NOTE: `ols_model` is rebound by the two-way interaction cell further down,
# so run this cell directly after the triple-interaction OLS fit. If the
# two-way model is bound instead, the lookups below raise a KeyError.
res = ols_model

table = pd.DataFrame({
    "Coefficient": res.params[triple_interactions],
    "Std. Error": res.bse[triple_interactions],
    "p-value": res.pvalues[triple_interactions]
}).round(3)

print(
    table.to_latex(
        index=True,
        # The row labels contain underscores, which LaTeX rejects in text mode
        # unless escaped; escape=True makes the printed table compile as-is.
        escape=True,
        caption="OLS results for Triple Interactions",
        label="tab:triple_interactions",
        float_format="%.3f"
    )
)


\begin{table}
\caption{OLS results for Triple Interactions}
\label{tab:triple_interactions}
\begin{tabular}{lrrr}
\toprule
 & Coefficient & Std. Error & p-value \\
\midrule
type_1_female_public & 0.010 & 0.011 & 0.373 \\
type_2_female_public & 0.011 & 0.014 & 0.431 \\
type_3_female_public & -0.015 & 0.014 & 0.290 \\
type_4_female_public & -0.039 & 0.014 & 0.005 \\
\bottomrule
\end{tabular}
\end{table}



In [10]:
#### OLS MODEL ONLY TWO WAY INTERACTIONS ####
# OLS of `callback` on the type x female products, female_public and the
# control columns, plus an intercept.
# NOTE: this cell rebinds `X_vars` and `ols_model`, which the
# triple-interaction OLS and export cells above also read; re-running those
# cells after this one will pick up the values set here.

# Interaction terms of interest
two_way_terms = [
    "type_1_female", "type_2_female", "type_3_female", "type_4_female",
    "female_public",
]

# Controls
two_way_controls = [
    "ba_quality", "language_skills",
    "exp_highquality", "ma", "certificate",
    'occ_Administrative', 'occ_Biotech and Pharmacy', 'occ_Civil Engineer',
    'occ_Clerical', 'occ_Ecommerce', 'occ_Education',
    'occ_Electrical Engineer', 'occ_Executive Assistant', 'occ_Finance',
    'occ_Food Services Managers', 'occ_Human Resources Payroll',
    'occ_Insurance', 'occ_Maintenance Technician',
    'occ_Marketing and Sales', 'occ_Media and Arts', 'occ_Production',
    'occ_Programmer', 'occ_Retail', 'occ_Social Worker', 'occ_Technology',
]

# Same column order as before: interactions first, then controls.
X_vars = two_way_terms + two_way_controls

# Outcome and design matrix (add_constant supplies the intercept column)
y = df["callback"]
X = sm.add_constant(df[X_vars])

# Fit OLS
ols_model = sm.OLS(y, X).fit()

In [11]:
#### EXPORT AS LATEX TABLE ####
# Print a LaTeX table (coefficient, standard error, p-value) for the two-way
# interaction terms of the fitted OLS model.
vars_interest = ["type_1_female", "type_2_female", "type_3_female", "type_4_female", "female_public"]

# Expects `ols_model` to be the two-way interaction fit; with any other model
# bound to that name the lookups below raise a KeyError.
res2 = ols_model

table = pd.DataFrame({
    "Coefficient": res2.params[vars_interest],
    "Std. Error": res2.bse[vars_interest],
    "p-value": res2.pvalues[vars_interest],
}).round(3)

print(
    table.to_latex(
        index=True,
        # The row labels contain underscores, which LaTeX rejects in text mode
        # unless escaped; escape=True makes the printed table compile as-is.
        escape=True,
        caption="OLS results for Two Way Interactions",
        # Own label for this table: reusing "tab:triple_interactions" would
        # define the same LaTeX label twice and make \ref ambiguous.
        label="tab:two_way_interactions",
        float_format="%.3f"
    )
)

\begin{table}
\caption{OLS results for Two Way Interactions}
\label{tab:triple_interactions}
\begin{tabular}{lrrr}
\toprule
 & Coefficient & Std. Error & p-value \\
\midrule
type_1_female & -0.021 & 0.008 & 0.014 \\
type_2_female & -0.025 & 0.010 & 0.013 \\
type_3_female & -0.045 & 0.011 & 0.000 \\
type_4_female & -0.064 & 0.011 & 0.000 \\
female_public & 0.059 & 0.009 & 0.000 \\
\bottomrule
\end{tabular}
\end{table}

